ASLP-lab committed on
Commit
d0690fd
·
1 Parent(s): 512c889

add one-click func

config.json ADDED
@@ -0,0 +1,10 @@
1
+ {
2
+ "architectures": [
3
+ "SongFormerModel"
4
+ ],
5
+ "model_type": "songformer",
6
+ "auto_map": {
7
+ "AutoConfig": "configuration_songformer.SongFormerConfig",
8
+ "AutoModel": "modeling_songformer.SongFormerModel"
9
+ }
10
+ }
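Note: the `auto_map` above is what enables the one-click loading added in this commit. Below is a minimal usage sketch; the repo id and audio path are placeholders rather than values from this commit, and `trust_remote_code=True` is required because the modeling code ships inside the repo.

```
# Hypothetical one-click usage; repo id and audio path are placeholders.
from transformers import AutoModel

model = AutoModel.from_pretrained("ASLP-lab/SongFormer", trust_remote_code=True)
segments = model("/path/to/song.wav")  # forward() accepts a file path or a 24 kHz waveform
# -> [{"label": "intro", "start": 0.0, "end": 12.5}, ...]
```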
configuration_songformer.py ADDED
@@ -0,0 +1,24 @@
1
+ from transformers import PretrainedConfig
2
+
3
+ class SongFormerConfig(PretrainedConfig):
4
+ """Configuration class to store the configuration of a custom model."""
5
+
6
+ model_type = "songformer"  # keep consistent with "model_type" in config.json
7
+
8
+ def __init__(
9
+ self,
10
+ win_size=420,
11
+ hop_size=420,
12
+ num_classes=128,
13
+ no_rule_post_processing=False,
14
+ local_maxima_filter_size=3,
15
+ frame_rates=8.333,
16
+ **kwargs
17
+ ):
18
+ super().__init__(**kwargs)
19
+ self.win_size = win_size
20
+ self.hop_size = hop_size
21
+ self.num_classes = num_classes
22
+ self.no_rule_post_processing = no_rule_post_processing
23
+ self.local_maxima_filter_size = local_maxima_filter_size
24
+ self.frame_rates = frame_rates
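Note: a quick sketch of how this class round-trips to the `config.json` shipped above (the export directory is hypothetical).

```
# Instantiate the config with the shipped defaults and serialize it.
from configuration_songformer import SongFormerConfig

cfg = SongFormerConfig(win_size=420, hop_size=420, num_classes=128)
cfg.save_pretrained("./export")  # writes export/config.json
print(cfg.frame_rates)           # 8.333 frames per second after downsampling
```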
dataset/custom_types.py ADDED
@@ -0,0 +1,14 @@
1
+ """
2
+ MsaInfo
3
+ A list of (timestamp, label) tuples used to represent music structure
4
+ analysis (MSA). The first element of the tuple is a float timestamp
5
+ (in seconds) and the second is a string label.
6
+
7
+ Example
8
+ -------
9
+ >>> msa: MsaInfo = [(0.0, "intro"), (12.5, "verse"), (34.0, "chorus")]
10
+ """
11
+
12
+ from typing import List, Tuple
13
+
14
+ MsaInfo = List[Tuple[float, str]]
dataset/label2id.py ADDED
@@ -0,0 +1,163 @@
1
+ LABEL_TO_ID = {
2
+ "intro": 0,
3
+ "verse": 1,
4
+ "chorus": 2,
5
+ "bridge": 3,
6
+ "inst": 4,
7
+ "outro": 5,
8
+ "silence": 6,
9
+ "intchorus": 7,
10
+ "prechorus": 8,
11
+ "gtrbreak": 9,
12
+ "solo": 10,
13
+ "quietchorus": 11,
14
+ "bre": 12,
15
+ "break": 13,
16
+ "introverse": 14,
17
+ "mainriff": 15,
18
+ "chorushalf": 16,
19
+ "instintro": 17,
20
+ "gtr": 18,
21
+ "vocaloutro": 19,
22
+ "verse_slow": 20,
23
+ "fadein": 21,
24
+ "saxobeat": 22,
25
+ "transition": 23,
26
+ "verse1a": 24,
27
+ "build": 25,
28
+ "pre-chorus": 26,
29
+ "outroa": 27,
30
+ "bigoutro": 28,
31
+ "fast": 29,
32
+ "instrumentalverse": 30,
33
+ "section": 31,
34
+ "choruspart": 32,
35
+ "instbridge": 33,
36
+ "guitar": 34,
37
+ "instrumental": 35,
38
+ "breakdown": 36,
39
+ "rhythmlessintro": 37,
40
+ "intropt": 38,
41
+ "interlude": 39,
42
+ "postchorus": 40,
43
+ "postverse": 41,
44
+ "opening": 42,
45
+ "altchorus": 43,
46
+ "stutter": 44,
47
+ "oddriff": 45,
48
+ "synth": 46,
49
+ "preverse": 47,
50
+ "quiet": 48,
51
+ "raps": 49,
52
+ "verseinst": 50,
53
+ "instchorus": 51,
54
+ "chorus_instrumental": 52,
55
+ "slowverse": 53,
56
+ "slow": 54,
57
+ "worstthingever": 55,
58
+ "transition2a": 56,
59
+ "miniverse": 57,
60
+ "refrain": 58,
61
+ "introchorus": 59,
62
+ "drumroll": 60,
63
+ "guitarsolo": 61,
64
+ "versepart": 62,
65
+ "chorusinst": 63,
66
+ "ending": 64,
67
+ "no-vocal-intro": 65,
68
+ "no-vocal-interlude": 66,
69
+ "no-vocal-outro": 67,
70
+ "NO_LABEL": 68, # Only referring to cases without labels, this portion of labels will be ignored during the loss calculation process.
71
+ }
72
+
73
+ ID_TO_LABEL = {v: k for k, v in LABEL_TO_ID.items()}
74
+
75
+ # Reserve 64 embedding positions for dataset identifiers in the model.
76
+ DATASET_LABEL_TO_DATASET_ID = {
77
+ "SongForm-HX-7Class": 0, # Categories after rule mapping for HarmonixSet
78
+ "SongForm-HX-Widen": 1, # Original HarmonixSet
79
+ "SongForm-Private-Raw": 2,
80
+ "SongForm-Private": 3,
81
+ "SongForm-HX-Gemini-Relabeled": 4, # Rule-mapped HarmonixSet corrected by Gemini
82
+ "SongForm-HX-8Class": 5, # Rule-mapped (pre-chorus retained)
83
+ "SongForm-Hook": 6,
84
+ "SongForm-Gem": 7,
85
+ "SongForm-Gem-Only-Label": 8, # Use only segments with labels in SongForm-Gem
86
+ }
87
+
88
+ DATASET_ID_TO_DATASET_LABEL = {v: k for k, v in DATASET_LABEL_TO_DATASET_ID.items()}
89
+
90
+ DATASET_ID_ALLOWED_LABEL_IDS = {
91
+ 0: [0, 1, 2, 3, 4, 5, 6],
92
+ 1: [
93
+ 0,
94
+ 1,
95
+ 2,
96
+ 3,
97
+ 4,
98
+ 5,
99
+ 6,
100
+ 7,
101
+ 8,
102
+ 9,
103
+ 10,
104
+ 11,
105
+ 12,
106
+ 13,
107
+ 14,
108
+ 15,
109
+ 16,
110
+ 17,
111
+ 18,
112
+ 19,
113
+ 20,
114
+ 21,
115
+ 22,
116
+ 23,
117
+ 24,
118
+ 25,
119
+ 27,
120
+ 28,
121
+ 29,
122
+ 30,
123
+ 31,
124
+ 32,
125
+ 33,
126
+ 34,
127
+ 35,
128
+ 36,
129
+ 37,
130
+ 38,
131
+ 40,
132
+ 41,
133
+ 42,
134
+ 43,
135
+ 44,
136
+ 45,
137
+ 46,
138
+ 47,
139
+ 48,
140
+ 49,
141
+ 50,
142
+ 51,
143
+ 52,
144
+ 53,
145
+ 54,
146
+ 55,
147
+ 56,
148
+ 57,
149
+ 58,
150
+ 59,
151
+ 60,
152
+ 61,
153
+ 62,
154
+ 63,
155
+ ],
156
+ 2: [0, 1, 2, 3, 26, 39, 64, 65, 66, 67],
157
+ 3: [0, 1, 2, 3, 4, 5, 6, 26, 39, 64, 65, 66, 67],
158
+ 4: [0, 1, 2, 3, 4, 5, 6, 26],
159
+ 5: [0, 1, 2, 3, 4, 5, 6, 26],
160
+ 6: [0, 1, 2, 3, 4, 5, 6, 26],
161
+ 7: [0, 1, 2, 3, 4, 5, 6, 26],
162
+ 8: [0, 1, 2, 3, 4, 5, 6, 26],
163
+ }
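Note: these tables are combined downstream into per-dataset label masks; the sketch below mirrors the mask construction in `modeling_songformer.py` and assumes the default `num_classes=128` from `model_config.py`.

```
# Build a boolean "disallowed label" mask for one dataset id (True = masked out).
import numpy as np
from dataset.label2id import DATASET_ID_ALLOWED_LABEL_IDS, DATASET_LABEL_TO_DATASET_ID

num_classes = 128
dataset_id = DATASET_LABEL_TO_DATASET_ID["SongForm-HX-8Class"]  # -> 5
mask = np.ones(num_classes, dtype=bool)
mask[DATASET_ID_ALLOWED_LABEL_IDS[dataset_id]] = False  # allowed ids stay unmasked
```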
model.py ADDED
@@ -0,0 +1,527 @@
1
+ import pdb
2
+ import scipy
3
+ import numpy as np
4
+
5
+ scipy.inf = np.inf
6
+
7
+ import torch
8
+ import torch.nn as nn
9
+ import numpy as np
10
+ import torch.nn.functional as F
11
+ from dataset.custom_types import MsaInfo
12
+ from msaf.eval import compute_results
13
+ from postprocessing.functional import postprocess_functional_structure
14
+ from x_transformers import Encoder
15
+ import bisect
16
+
17
+
18
+ class Head(nn.Module):
19
+ def __init__(self, input_dim, output_dim, hidden_dims=None, activation="silu"):
20
+ super().__init__()
21
+ hidden_dims = hidden_dims or []
22
+ act_layers = {"relu": nn.ReLU, "silu": nn.SiLU, "gelu": nn.GELU}
23
+ act_layer = act_layers.get(activation.lower())
24
+ if not act_layer:
25
+ raise ValueError(f"Unsupported activation: {activation}")
26
+
27
+ dims = [input_dim] + hidden_dims + [output_dim]
28
+ layers = []
29
+ for i in range(len(dims) - 1):
30
+ layers.append(nn.Linear(dims[i], dims[i + 1]))
31
+ if i < len(dims) - 2:
32
+ layers.append(act_layer())
33
+ self.net = nn.Sequential(*layers)
34
+
35
+ def reset_parameters(self, confidence):
36
+ bias_value = -torch.log(torch.tensor((1 - confidence) / confidence))
37
+ self.net[-1].bias.data.fill_(bias_value.item())
38
+
39
+ def forward(self, x):
40
+ batch, T, C = x.shape
41
+ x = x.reshape(-1, C)
42
+ x = self.net(x)
43
+ return x.reshape(batch, T, -1)
44
+
45
+
46
+ class WrapedTransformerEncoder(nn.Module):
47
+ def __init__(
48
+ self, input_dim, transformer_input_dim, num_layers=1, nhead=8, dropout=0.1
49
+ ):
50
+ super().__init__()
51
+ self.input_dim = input_dim
52
+ self.transformer_input_dim = transformer_input_dim
53
+
54
+ if input_dim != transformer_input_dim:
55
+ self.input_proj = nn.Sequential(
56
+ nn.Linear(input_dim, transformer_input_dim),
57
+ nn.LayerNorm(transformer_input_dim),
58
+ nn.GELU(),
59
+ nn.Dropout(dropout * 0.5),
60
+ nn.Linear(transformer_input_dim, transformer_input_dim),
61
+ )
62
+ else:
63
+ self.input_proj = nn.Identity()
64
+
65
+ self.transformer = Encoder(
66
+ dim=transformer_input_dim,
67
+ depth=num_layers,
68
+ heads=nhead,
69
+ layer_dropout=dropout,
70
+ attn_dropout=dropout,
71
+ ff_dropout=dropout,
72
+ attn_flash=True,
73
+ rotary_pos_emb=True,
74
+ )
75
+
76
+ def forward(self, x, src_key_padding_mask=None):
77
+ """
78
+ The input src_key_padding_mask is a B x T boolean mask, where True indicates masked positions.
79
+ However, in x-transformers, False indicates masked positions.
80
+ Therefore, it needs to be converted so that False represents masked positions.
81
+ """
82
+ x = self.input_proj(x)
83
+ mask = (
84
+ ~torch.tensor(src_key_padding_mask, dtype=torch.bool, device=x.device)
85
+ if src_key_padding_mask is not None
86
+ else None
87
+ )
88
+ return self.transformer(x, mask=mask)
89
+
90
+
91
+ def prefix_dict(d, prefix: str):
92
+ if not prefix:
93
+ return d
94
+ return {prefix + key: value for key, value in d.items()}
95
+
96
+
97
+ class TimeDownsample(nn.Module):
98
+ def __init__(
99
+ self, dim_in, dim_out=None, kernel_size=5, stride=5, padding=0, dropout=0.1
100
+ ):
101
+ super().__init__()
102
+ self.dim_out = dim_out or dim_in
103
+ assert self.dim_out % 2 == 0
104
+
105
+ self.depthwise_conv = nn.Conv1d(
106
+ in_channels=dim_in,
107
+ out_channels=dim_in,
108
+ kernel_size=kernel_size,
109
+ stride=stride,
110
+ padding=padding,
111
+ groups=dim_in,
112
+ bias=False,
113
+ )
114
+ self.pointwise_conv = nn.Conv1d(
115
+ in_channels=dim_in,
116
+ out_channels=self.dim_out,
117
+ kernel_size=1,
118
+ bias=False,
119
+ )
120
+ self.pool = nn.AvgPool1d(kernel_size, stride, padding=padding)
121
+ self.norm1 = nn.LayerNorm(self.dim_out)
122
+ self.act1 = nn.GELU()
123
+ self.dropout1 = nn.Dropout(dropout)
124
+
125
+ if dim_in != self.dim_out:
126
+ self.residual_conv = nn.Conv1d(
127
+ dim_in, self.dim_out, kernel_size=1, bias=False
128
+ )
129
+ else:
130
+ self.residual_conv = None
131
+
132
+ def forward(self, x):
133
+ residual = x # [B, T, D_in]
134
+ # Convolutional module
135
+ x_c = x.transpose(1, 2) # [B, D_in, T]
136
+ x_c = self.depthwise_conv(x_c) # [B, D_in, T_down]
137
+ x_c = self.pointwise_conv(x_c) # [B, D_out, T_down]
138
+
139
+ # Residual module
140
+ res = self.pool(residual.transpose(1, 2)) # [B, D_in, T]
141
+ if self.residual_conv:
142
+ res = self.residual_conv(res) # [B, D_out, T_down]
143
+ x_c = x_c + res # [B, D_out, T_down]
144
+ x_c = x_c.transpose(1, 2) # [B, T_down, D_out]
145
+ x_c = self.norm1(x_c)
146
+ x_c = self.act1(x_c)
147
+ x_c = self.dropout1(x_c)
148
+ return x_c
149
+
150
+
151
+ class AddFuse(nn.Module):
152
+ def __init__(self):
153
+ super(AddFuse, self).__init__()
154
+
155
+ def forward(self, x, cond):
156
+ return x + cond
157
+
158
+
159
+ class TVLoss1D(nn.Module):
160
+ def __init__(
161
+ self, beta=1.0, lambda_tv=0.4, boundary_threshold=0.01, reduction_weight=0.1
162
+ ):
163
+ """
164
+ Args:
165
+ beta: Exponential parameter for TV loss (recommended 0.5~1.0)
166
+ lambda_tv: Overall weight for TV loss
167
+ boundary_threshold: Label threshold to determine if a region is a "boundary area" (e.g., 0.01)
168
+ reduction_weight: Scaling factor for TV penalty within boundary regions (e.g., 0.1, meaning only 10% penalty)
169
+ """
170
+ super().__init__()
171
+ self.beta = beta
172
+ self.lambda_tv = lambda_tv
173
+ self.boundary_threshold = boundary_threshold
174
+ self.reduction_weight = reduction_weight
175
+
176
+ def forward(self, pred, target=None):
177
+ """
178
+ Args:
179
+ pred: (B, T) or (B, T, 1), float boundary scores output by the model
180
+ target: (B, T) or (B, T, 1), ground truth labels (optional, used for spatial weighting if provided)
181
+
182
+ Returns:
183
+ scalar: weighted TV loss
184
+ """
185
+ if pred.dim() == 3:
186
+ pred = pred.squeeze(-1)
187
+ if target is not None and target.dim() == 3:
188
+ target = target.squeeze(-1)
189
+
190
+ diff = pred[:, 1:] - pred[:, :-1]
191
+ tv_base = torch.pow(torch.abs(diff) + 1e-8, self.beta)
192
+
193
+ if target is None:
194
+ return self.lambda_tv * tv_base.mean()
195
+
196
+ left_in_boundary = target[:, :-1] > self.boundary_threshold
197
+ right_in_boundary = target[:, 1:] > self.boundary_threshold
198
+ near_boundary = left_in_boundary | right_in_boundary
199
+ weight_mask = torch.where(
200
+ near_boundary,
201
+ self.reduction_weight * torch.ones_like(tv_base),
202
+ torch.ones_like(tv_base),
203
+ )
204
+ tv_weighted = (tv_base * weight_mask).mean()
205
+ return self.lambda_tv * tv_weighted
206
+
207
+
208
+ class SoftmaxFocalLoss(nn.Module):
209
+ """
210
+ Softmax Focal Loss for single-label multi-class classification.
211
+ Suitable for mutually exclusive classes.
212
+ """
213
+
214
+ def __init__(self, alpha: float = 0.25, gamma: float = 2.0):
215
+ super().__init__()
216
+ self.alpha = alpha
217
+ self.gamma = gamma
218
+
219
+ def forward(self, pred: torch.Tensor, targets: torch.Tensor) -> torch.Tensor:
220
+ """
221
+ Args:
222
+ pred: [B, T, C], raw logits
223
+ targets: [B, T, C] (soft) or [B, T] (hard, dtype=long)
224
+ Returns:
225
+ loss: scalar or [B, T] depending on reduction
226
+ """
227
+ log_probs = F.log_softmax(pred, dim=-1)
228
+ probs = torch.exp(log_probs)
229
+
230
+ if targets.dtype == torch.long:
231
+ targets_onehot = F.one_hot(targets, num_classes=pred.size(-1)).float()
232
+ else:
233
+ targets_onehot = targets
234
+
235
+ p_t = (probs * targets_onehot).sum(dim=-1)
236
+ p_t = p_t.clamp(min=1e-8, max=1.0 - 1e-8)
237
+
238
+ if self.alpha > 0:
239
+ alpha_t = self.alpha * targets_onehot + (1 - self.alpha) * (
240
+ 1 - targets_onehot
241
+ )
242
+ alpha_weight = (alpha_t * targets_onehot).sum(dim=-1)
243
+ else:
244
+ alpha_weight = 1.0
245
+
246
+ focal_weight = (1 - p_t) ** self.gamma
247
+ ce_loss = -log_probs * targets_onehot
248
+ ce_loss = ce_loss.sum(dim=-1)
249
+
250
+ loss = alpha_weight * focal_weight * ce_loss
251
+ return loss
252
+
253
+
254
+ class Model(nn.Module):
255
+ def __init__(self, config):
256
+ super().__init__()
257
+ self.config = config
258
+
259
+ self.input_norm = nn.LayerNorm(config.input_dim)
260
+ self.mixed_win_downsample = nn.Linear(config.input_dim_raw, config.input_dim)
261
+ self.dataset_class_prefix = nn.Embedding(
262
+ num_embeddings=config.num_dataset_classes,
263
+ embedding_dim=config.transformer_encoder_input_dim,
264
+ )
265
+ self.down_sample_conv = TimeDownsample(
266
+ dim_in=config.input_dim,
267
+ dim_out=config.transformer_encoder_input_dim,
268
+ kernel_size=config.down_sample_conv_kernel_size,
269
+ stride=config.down_sample_conv_stride,
270
+ dropout=config.down_sample_conv_dropout,
271
+ padding=config.down_sample_conv_padding,
272
+ )
273
+ self.AddFuse = AddFuse()
274
+ self.transformer = WrapedTransformerEncoder(
275
+ input_dim=config.transformer_encoder_input_dim,
276
+ transformer_input_dim=config.transformer_input_dim,
277
+ num_layers=config.num_transformer_layers,
278
+ nhead=config.transformer_nhead,
279
+ dropout=config.transformer_dropout,
280
+ )
281
+ self.boundary_TVLoss1D = TVLoss1D(
282
+ beta=config.boundary_tv_loss_beta,
283
+ lambda_tv=config.boundary_tv_loss_lambda,
284
+ boundary_threshold=config.boundary_tv_loss_boundary_threshold,
285
+ reduction_weight=config.boundary_tv_loss_reduction_weight,
286
+ )
287
+ self.label_focal_loss = SoftmaxFocalLoss(
288
+ alpha=config.label_focal_loss_alpha, gamma=config.label_focal_loss_gamma
289
+ )
290
+ self.boundary_head = Head(config.transformer_input_dim, 1)
291
+ self.function_head = Head(config.transformer_input_dim, config.num_classes)
292
+
293
+ def cal_metrics(self, gt_info: MsaInfo, msa_info: MsaInfo):
294
+ assert gt_info[-1][1] == "end" and msa_info[-1][1] == "end", (
295
+ "gt_info and msa_info should end with 'end'"
296
+ )
297
+ gt_info_labels = [label for time_, label in gt_info][:-1]
298
+ gt_info_inters = [time_ for time_, label in gt_info]
299
+ gt_info_inters = np.column_stack(
300
+ [np.array(gt_info_inters[:-1]), np.array(gt_info_inters[1:])]
301
+ )
302
+
303
+ msa_info_labels = [label for time_, label in msa_info][:-1]
304
+ msa_info_inters = [time_ for time_, label in msa_info]
305
+ msa_info_inters = np.column_stack(
306
+ [np.array(msa_info_inters[:-1]), np.array(msa_info_inters[1:])]
307
+ )
308
+ result = compute_results(
309
+ ann_inter=gt_info_inters,
310
+ est_inter=msa_info_inters,
311
+ ann_labels=gt_info_labels,
312
+ est_labels=msa_info_labels,
313
+ bins=11,
314
+ est_file="test.txt",
315
+ weight=0.58,
316
+ )
317
+ return result
318
+
319
+ def cal_acc(
320
+ self, ann_info: MsaInfo | str, est_info: MsaInfo | str, post_digit: int = 3
321
+ ):
322
+ ann_info_time = [
323
+ int(round(time_, post_digit) * (10**post_digit))
324
+ for time_, label in ann_info
325
+ ]
326
+ est_info_time = [
327
+ int(round(time_, post_digit) * (10**post_digit))
328
+ for time_, label in est_info
329
+ ]
330
+
331
+ common_start_time = max(ann_info_time[0], est_info_time[0])
332
+ common_end_time = min(ann_info_time[-1], est_info_time[-1])
333
+
334
+ time_points = {common_start_time, common_end_time}
335
+ time_points.update(
336
+ {
337
+ time_
338
+ for time_ in ann_info_time
339
+ if common_start_time <= time_ <= common_end_time
340
+ }
341
+ )
342
+ time_points.update(
343
+ {
344
+ time_
345
+ for time_ in est_info_time
346
+ if common_start_time <= time_ <= common_end_time
347
+ }
348
+ )
349
+
350
+ time_points = sorted(time_points)
351
+ total_duration, total_score = 0, 0
352
+
353
+ for idx in range(len(time_points) - 1):
354
+ duration = time_points[idx + 1] - time_points[idx]
355
+ ann_label = ann_info[
356
+ bisect.bisect_right(ann_info_time, time_points[idx]) - 1
357
+ ][1]
358
+ est_label = est_info[
359
+ bisect.bisect_right(est_info_time, time_points[idx]) - 1
360
+ ][1]
361
+ total_duration += duration
362
+ if ann_label == est_label:
363
+ total_score += duration
364
+ return total_score / total_duration
365
+
366
+ def infer_with_metrics(self, batch, prefix: str = None):
367
+ with torch.no_grad():
368
+ logits = self.forward_func(batch)
369
+
370
+ losses = self.compute_losses(logits, batch, prefix=None)
371
+
372
+ expanded_mask = batch["label_id_masks"].expand(
373
+ -1, logits["function_logits"].size(1), -1
374
+ )
375
+ logits["function_logits"] = logits["function_logits"].masked_fill(
376
+ expanded_mask, -float("inf")
377
+ )
378
+
379
+ msa_info = postprocess_functional_structure(
380
+ logits=logits, config=self.config
381
+ )
382
+ gt_info = batch["msa_infos"][0]
383
+ results = self.cal_metrics(gt_info=gt_info, msa_info=msa_info)
384
+
385
+ ret_results = {
386
+ "loss": losses["loss"].item(),
387
+ "HitRate_3P": results["HitRate_3P"],
388
+ "HitRate_3R": results["HitRate_3R"],
389
+ "HitRate_3F": results["HitRate_3F"],
390
+ "HitRate_0.5P": results["HitRate_0.5P"],
391
+ "HitRate_0.5R": results["HitRate_0.5R"],
392
+ "HitRate_0.5F": results["HitRate_0.5F"],
393
+ "PWF": results["PWF"],
394
+ "PWP": results["PWP"],
395
+ "PWR": results["PWR"],
396
+ "Sf": results["Sf"],
397
+ "So": results["So"],
398
+ "Su": results["Su"],
399
+ "acc": self.cal_acc(ann_info=gt_info, est_info=msa_info),
400
+ }
401
+ if prefix:
402
+ ret_results = prefix_dict(ret_results, prefix)
403
+
404
+ return ret_results
405
+
406
+ def infer(
407
+ self,
408
+ input_embeddings,
409
+ dataset_ids,
410
+ label_id_masks,
411
+ prefix: str = None,
412
+ with_logits=False,
413
+ ):
414
+ with torch.no_grad():
415
+ input_embeddings = self.mixed_win_downsample(input_embeddings)
416
+ input_embeddings = self.input_norm(input_embeddings)
417
+ logits = self.down_sample_conv(input_embeddings)
418
+
419
+ dataset_prefix = self.dataset_class_prefix(dataset_ids)
420
+ dataset_prefix_expand = dataset_prefix.unsqueeze(1).expand(
421
+ logits.size(0), 1, -1
422
+ )
423
+ logits = self.AddFuse(x=logits, cond=dataset_prefix_expand)
424
+ logits = self.transformer(x=logits, src_key_padding_mask=None)
425
+
426
+ function_logits = self.function_head(logits)
427
+ boundary_logits = self.boundary_head(logits).squeeze(-1)
428
+
429
+ logits = {
430
+ "function_logits": function_logits,
431
+ "boundary_logits": boundary_logits,
432
+ }
433
+
434
+ expanded_mask = label_id_masks.expand(
435
+ -1, logits["function_logits"].size(1), -1
436
+ )
437
+ logits["function_logits"] = logits["function_logits"].masked_fill(
438
+ expanded_mask, -float("inf")
439
+ )
440
+
441
+ msa_info = postprocess_functional_structure(
442
+ logits=logits, config=self.config
443
+ )
444
+
445
+ return (msa_info, logits) if with_logits else msa_info
446
+
447
+ def compute_losses(self, outputs, batch, prefix: str = None):
448
+ loss = 0.0
449
+ losses = {}
450
+
451
+ loss_section = F.binary_cross_entropy_with_logits(
452
+ outputs["boundary_logits"],
453
+ batch["widen_true_boundaries"],
454
+ reduction="none",
455
+ )
456
+ loss_section += self.config.boundary_tvloss_weight * self.boundary_TVLoss1D(
457
+ pred=outputs["boundary_logits"],
458
+ target=batch["widen_true_boundaries"],
459
+ )
460
+ loss_function = F.cross_entropy(
461
+ outputs["function_logits"].transpose(1, 2),
462
+ batch["true_functions"].transpose(1, 2),
463
+ reduction="none",
464
+ )
465
+ # input is [B, T, C]
466
+ ttt = self.config.label_focal_loss_weight * self.label_focal_loss(
467
+ pred=outputs["function_logits"], targets=batch["true_functions"]
468
+ )
469
+ loss_function += ttt
470
+
471
+ float_masks = (~batch["masks"]).float()
472
+ boundary_mask = batch.get("boundary_mask", None)
473
+ function_mask = batch.get("function_mask", None)
474
+ if boundary_mask is not None:
475
+ boundary_mask = (~boundary_mask).float()
476
+ else:
477
+ boundary_mask = 1
478
+
479
+ if function_mask is not None:
480
+ function_mask = (~function_mask).float()
481
+ else:
482
+ function_mask = 1
483
+
484
+ loss_section = torch.mean(boundary_mask * float_masks * loss_section)
485
+ loss_function = torch.mean(function_mask * float_masks * loss_function)
486
+
487
+ loss_section *= self.config.loss_weight_section
488
+ loss_function *= self.config.loss_weight_function
489
+
490
+ if self.config.learn_label:
491
+ loss += loss_function
492
+ if self.config.learn_segment:
493
+ loss += loss_section
494
+
495
+ losses.update(
496
+ loss=loss,
497
+ loss_section=loss_section,
498
+ loss_function=loss_function,
499
+ )
500
+ if prefix:
501
+ losses = prefix_dict(losses, prefix)
502
+ return losses
503
+
504
+ def forward_func(self, batch):
505
+ input_embeddings = batch["input_embeddings"]
506
+ input_embeddings = self.mixed_win_downsample(input_embeddings)
507
+ input_embeddings = self.input_norm(input_embeddings)
508
+ logits = self.down_sample_conv(input_embeddings)
509
+
510
+ dataset_prefix = self.dataset_class_prefix(batch["dataset_ids"])
511
+ logits = self.AddFuse(x=logits, cond=dataset_prefix.unsqueeze(1))
512
+ src_key_padding_mask = batch["masks"]
513
+ logits = self.transformer(x=logits, src_key_padding_mask=src_key_padding_mask)
514
+
515
+ function_logits = self.function_head(logits)
516
+ boundary_logits = self.boundary_head(logits).squeeze(-1)
517
+
518
+ logits = {
519
+ "function_logits": function_logits,
520
+ "boundary_logits": boundary_logits,
521
+ }
522
+ return logits
523
+
524
+ def forward(self, batch):
525
+ logits = self.forward_func(batch)
526
+ losses = self.compute_losses(logits, batch, prefix=None)
527
+ return logits, losses["loss"], losses
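Note: a minimal shape sanity-check for the backbone above, assuming the defaults in `model_config.py`; the dummy tensors are arbitrary (750 frames is roughly 30 s of 25 Hz SSL features, reduced to 250 frames by the stride-3 downsampling).

```
# Dummy forward pass; shapes only, no pretrained weights involved.
import torch
from model import Model
from model_config import ModelConfig

model = Model(ModelConfig()).eval()
batch = {
    "input_embeddings": torch.randn(1, 750, 4096),       # [B, T, input_dim_raw]
    "dataset_ids": torch.tensor([5], dtype=torch.long),   # SongForm-HX-8Class
    "masks": torch.zeros(1, 250, dtype=torch.bool),       # False = keep position
}
with torch.no_grad():
    out = model.forward_func(batch)
print(out["function_logits"].shape)  # torch.Size([1, 250, 128])
print(out["boundary_logits"].shape)  # torch.Size([1, 250])
```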
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:8dcabf4ea19973edd51b9e5794004775fa7e8de3ecfa07eb1dbce00f516ce7f7
3
+ size 2755035132
model_config.py ADDED
@@ -0,0 +1,65 @@
1
+ # config.py
2
+ from transformers import PretrainedConfig
3
+
4
+ class ModelConfig(PretrainedConfig):
5
+ model_type = "SongFormer"
6
+
7
+ def __init__(
8
+ self,
9
+ input_dim=2048,
10
+ input_dim_raw=4096,
11
+ transformer_encoder_input_dim=1024,
12
+ transformer_input_dim=512,
13
+ num_transformer_layers=4,
14
+ transformer_nhead=8,
15
+ transformer_dropout=0.1,
16
+ num_classes=128,
17
+ num_dataset_classes=64,
18
+ down_sample_conv_kernel_size=3,
19
+ down_sample_conv_stride=3,
20
+ down_sample_conv_dropout=0.1,
21
+ down_sample_conv_padding=0,
22
+ boundary_tv_loss_beta=0.6,
23
+ boundary_tv_loss_lambda=0.4,
24
+ boundary_tv_loss_boundary_threshold=0.01,
25
+ boundary_tv_loss_reduction_weight=0.1,
26
+ boundary_tvloss_weight=0.05,
27
+ label_focal_loss_alpha=0.25,
28
+ label_focal_loss_gamma=2.0,
29
+ label_focal_loss_weight=0.2,
30
+ loss_weight_section=0.2,
31
+ loss_weight_function=0.8,
32
+ learn_label=True,
33
+ learn_segment=True,
34
+ local_maxima_filter_size=3,
35
+ frame_rates=8.333,
36
+ **kwargs
37
+ ):
38
+ super().__init__(**kwargs)
39
+ self.input_dim = input_dim
40
+ self.input_dim_raw = input_dim_raw
41
+ self.transformer_encoder_input_dim = transformer_encoder_input_dim
42
+ self.transformer_input_dim = transformer_input_dim
43
+ self.num_transformer_layers = num_transformer_layers
44
+ self.transformer_nhead = transformer_nhead
45
+ self.transformer_dropout = transformer_dropout
46
+ self.num_classes = num_classes
47
+ self.num_dataset_classes = num_dataset_classes
48
+ self.down_sample_conv_kernel_size = down_sample_conv_kernel_size
49
+ self.down_sample_conv_stride = down_sample_conv_stride
50
+ self.down_sample_conv_dropout = down_sample_conv_dropout
51
+ self.down_sample_conv_padding = down_sample_conv_padding
52
+ self.boundary_tv_loss_beta = boundary_tv_loss_beta
53
+ self.boundary_tv_loss_lambda = boundary_tv_loss_lambda
54
+ self.boundary_tv_loss_boundary_threshold = boundary_tv_loss_boundary_threshold
55
+ self.boundary_tv_loss_reduction_weight = boundary_tv_loss_reduction_weight
56
+ self.boundary_tvloss_weight = boundary_tvloss_weight
57
+ self.label_focal_loss_alpha = label_focal_loss_alpha
58
+ self.label_focal_loss_gamma = label_focal_loss_gamma
59
+ self.label_focal_loss_weight = label_focal_loss_weight
60
+ self.loss_weight_section = loss_weight_section
61
+ self.loss_weight_function = loss_weight_function
62
+ self.learn_label = learn_label
63
+ self.learn_segment = learn_segment
64
+ self.local_maxima_filter_size = local_maxima_filter_size
65
+ self.frame_rates = frame_rates
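Note: the `frame_rates=8.333` default appears to follow from the constants in `modeling_songformer.py` (an inference, not stated in this commit): 25 Hz SSL features divided by the stride-3 downsampling convolution configured above.

```
# 25 Hz input features / stride-3 temporal downsampling ≈ 8.333 frames per second.
BEFORE_DOWNSAMPLING_FRAME_RATES = 25
DOWN_SAMPLE_CONV_STRIDE = 3
print(BEFORE_DOWNSAMPLING_FRAME_RATES / DOWN_SAMPLE_CONV_STRIDE)  # 8.333...
```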
modeling_songformer.py ADDED
@@ -0,0 +1,328 @@
1
+ import pdb
2
+ from typing import Tuple
3
+ import torch
4
+ import torch.nn as nn
5
+ from transformers import PreTrainedModel
6
+ import argparse
7
+ import importlib
8
+ import json
9
+ import math
10
+ import multiprocessing as mp
11
+ import os
12
+ import time
13
+ from argparse import Namespace
14
+ from pathlib import Path
15
+
16
+ # monkey patch to fix issues in msaf
17
+ import scipy
18
+ import numpy as np
19
+
20
+ scipy.inf = np.inf
21
+
22
+ import librosa
23
+ import torch
24
+ from ema_pytorch import EMA
25
+ from loguru import logger
26
+ from muq import MuQ
27
+ from musicfm.model.musicfm_25hz import MusicFM25Hz
28
+ from omegaconf import OmegaConf
29
+ from tqdm import tqdm
30
+ import torch
31
+ import torch.nn as nn
32
+ from transformers import PreTrainedModel
33
+ from transformers.modeling_outputs import CausalLMOutputWithPast
34
+ from configuration_songformer import SongFormerConfig
35
+ from model_config import ModelConfig
36
+
37
+ from model import Model
38
+ from omegaconf import OmegaConf
39
+
40
+ # MUSICFM_HOME_PATH = os.path.join("ckpts", "MusicFM")
41
+ MUSICFM_HOME_PATH = "/home/node59_tmpdata3/cbhao/SongFormer_kaiyuan_test/github_test/SongFormer/src/SongFormer/ckpts/MusicFM"
42
+
43
+ BEFORE_DOWNSAMPLING_FRAME_RATES = 25
44
+ AFTER_DOWNSAMPLING_FRAME_RATES = 8.333
45
+
46
+ DATASET_LABEL = "SongForm-HX-8Class"
47
+ DATASET_IDS = [5]
48
+
49
+ TIME_DUR = 420
50
+ INPUT_SAMPLING_RATE = 24000
51
+
52
+ from dataset.label2id import DATASET_ID_ALLOWED_LABEL_IDS, DATASET_LABEL_TO_DATASET_ID
53
+ from postprocessing.functional import postprocess_functional_structure
54
+
55
+
56
+ def rule_post_processing(msa_list):
57
+ if len(msa_list) <= 2:
58
+ return msa_list
59
+
60
+ result = msa_list.copy()
61
+
62
+ while len(result) > 2:
63
+ first_duration = result[1][0] - result[0][0]
64
+ if first_duration < 1.0 and len(result) > 2:
65
+ result[0] = (result[0][0], result[1][1])
66
+ result = [result[0]] + result[2:]
67
+ else:
68
+ break
69
+
70
+ while len(result) > 2:
71
+ last_label_duration = result[-1][0] - result[-2][0]
72
+ if last_label_duration < 1.0:
73
+ result = result[:-2] + [result[-1]]
74
+ else:
75
+ break
76
+
77
+ while len(result) > 2:
78
+ if result[0][1] == result[1][1] and result[1][0] <= 10.0:
79
+ result = [(result[0][0], result[0][1])] + result[2:]
80
+ else:
81
+ break
82
+
83
+ while len(result) > 2:
84
+ last_duration = result[-1][0] - result[-2][0]
85
+ if result[-2][1] == result[-3][1] and last_duration <= 10.0:
86
+ result = result[:-2] + [result[-1]]
87
+ else:
88
+ break
89
+
90
+ return result
91
+
92
+
93
+ class SongFormerModel(PreTrainedModel):
94
+ config_class = SongFormerConfig
95
+
96
+ def __init__(self, config: SongFormerConfig):
97
+ super().__init__(config)
98
+ device = "cpu"
99
+
100
+ with open("muq_config2.json", "r") as f:
101
+ muq_config_file = OmegaConf.load(f)
102
+ # self.muq = MuQ.from_pretrained("OpenMuQ/MuQ-large-msd-iter", device_map=None)
103
+ self.muq = MuQ(muq_config_file)
104
+
105
+ self.musicfm = MusicFM25Hz(
106
+ is_flash=False,
107
+ stat_path="msd_stats.json",
108
+ # model_path=os.path.join(MUSICFM_HOME_PATH, "pretrained_msd.pt"),
109
+ )
110
+ self.songformer = Model(ModelConfig())
111
+
112
+ num_classes = config.num_classes
113
+ dataset_id2label_mask = {}
114
+ for key, allowed_ids in DATASET_ID_ALLOWED_LABEL_IDS.items():
115
+ dataset_id2label_mask[key] = np.ones(config.num_classes, dtype=bool)
116
+ dataset_id2label_mask[key][allowed_ids] = False
117
+
118
+ self.num_classes = num_classes
119
+ self.dataset_id2label_mask = dataset_id2label_mask
120
+ self.config = config
121
+
122
+ def forward(self, input):
123
+ with torch.no_grad():
124
+ INPUT_SAMPLING_RATE = 24000
125
+
126
+ device = next(self.parameters()).device
127
+ # If the input is already a torch.Tensor or a numpy array, use it directly
128
+ if isinstance(input, (torch.Tensor, np.ndarray)):
129
+ audio = torch.tensor(input).to(device)
130
+ elif os.path.exists(input):
131
+ wav, sr = librosa.load(input, sr=INPUT_SAMPLING_RATE)
132
+ audio = torch.tensor(wav).to(device)
133
+ else:
134
+ raise ValueError("input should be a tensor/numpy or a valid file path")
135
+
136
+ win_size = self.config.win_size
137
+ hop_size = self.config.hop_size
138
+ num_classes = self.config.num_classes
139
+ total_len = (
140
+ (audio.shape[0] // INPUT_SAMPLING_RATE) // TIME_DUR
141
+ ) * TIME_DUR + TIME_DUR
142
+ total_frames = math.ceil(total_len * AFTER_DOWNSAMPLING_FRAME_RATES)
143
+
144
+ logits = {
145
+ "function_logits": np.zeros([total_frames, num_classes]),
146
+ "boundary_logits": np.zeros([total_frames]),
147
+ }
148
+ logits_num = {
149
+ "function_logits": np.zeros([total_frames, num_classes]),
150
+ "boundary_logits": np.zeros([total_frames]),
151
+ }
152
+
153
+ lens = 0
154
+ i = 0
155
+ while True:
156
+ start_idx = i * INPUT_SAMPLING_RATE
157
+ end_idx = min((i + win_size) * INPUT_SAMPLING_RATE, audio.shape[-1])
158
+ if start_idx >= audio.shape[-1]:
159
+ break
160
+ if end_idx - start_idx <= 1024:
161
+ break  # trailing chunk is too short to embed; "continue" would loop forever since i is not advanced
162
+ audio_seg = audio[start_idx:end_idx]
163
+
164
+ # MuQ embedding
165
+ muq_output = self.muq(audio_seg.unsqueeze(0), output_hidden_states=True)
166
+ muq_embd_420s = muq_output["hidden_states"][10]
167
+ del muq_output
168
+ torch.cuda.empty_cache()
169
+
170
+ # MusicFM embedding
171
+ _, musicfm_hidden_states = self.musicfm.get_predictions(
172
+ audio_seg.unsqueeze(0)
173
+ )
174
+ musicfm_embd_420s = musicfm_hidden_states[10]
175
+ del musicfm_hidden_states
176
+ torch.cuda.empty_cache()
177
+
178
+ wraped_muq_embd_30s = []
179
+ wraped_musicfm_embd_30s = []
180
+
181
+ for idx_30s in range(i, i + hop_size, 30):
182
+ start_idx_30s = idx_30s * INPUT_SAMPLING_RATE
183
+ end_idx_30s = min(
184
+ (idx_30s + 30) * INPUT_SAMPLING_RATE,
185
+ audio.shape[-1],
186
+ (i + hop_size) * INPUT_SAMPLING_RATE,
187
+ )
188
+ if start_idx_30s >= audio.shape[-1]:
189
+ break
190
+ if end_idx_30s - start_idx_30s <= 1024:
191
+ continue
192
+ wraped_muq_embd_30s.append(
193
+ self.muq(
194
+ audio[start_idx_30s:end_idx_30s].unsqueeze(0),
195
+ output_hidden_states=True,
196
+ )["hidden_states"][10]
197
+ )
198
+ torch.cuda.empty_cache()
199
+ wraped_musicfm_embd_30s.append(
200
+ self.musicfm.get_predictions(
201
+ audio[start_idx_30s:end_idx_30s].unsqueeze(0)
202
+ )[1][10]
203
+ )
204
+ torch.cuda.empty_cache()
205
+
206
+ wraped_muq_embd_30s = torch.concatenate(wraped_muq_embd_30s, dim=1)
207
+ wraped_musicfm_embd_30s = torch.concatenate(
208
+ wraped_musicfm_embd_30s, dim=1
209
+ )
210
+ all_embds = [
211
+ wraped_musicfm_embd_30s,
212
+ wraped_muq_embd_30s,
213
+ musicfm_embd_420s,
214
+ muq_embd_420s,
215
+ ]
216
+
217
+ if len(all_embds) > 1:
218
+ embd_lens = [x.shape[1] for x in all_embds]
219
+ max_embd_len = max(embd_lens)
220
+ min_embd_len = min(embd_lens)
221
+ if abs(max_embd_len - min_embd_len) > 4:
222
+ raise ValueError(
223
+ f"Embedding shapes differ too much: {max_embd_len} vs {min_embd_len}"
224
+ )
225
+
226
+ for idx in range(len(all_embds)):
227
+ all_embds[idx] = all_embds[idx][:, :min_embd_len, :]
228
+
229
+ embd = torch.concatenate(all_embds, axis=-1)
230
+
231
+ dataset_label = DATASET_LABEL
232
+ dataset_ids = torch.Tensor(DATASET_IDS).to(device, dtype=torch.long)
233
+ msa_info, chunk_logits = self.songformer.infer(
234
+ input_embeddings=embd,
235
+ dataset_ids=dataset_ids,
236
+ label_id_masks=torch.Tensor(
237
+ self.dataset_id2label_mask[
238
+ DATASET_LABEL_TO_DATASET_ID[dataset_label]
239
+ ]
240
+ )
241
+ .to(device, dtype=bool)
242
+ .unsqueeze(0)
243
+ .unsqueeze(0),
244
+ with_logits=True,
245
+ )
246
+
247
+ start_frame = int(i * AFTER_DOWNSAMPLING_FRAME_RATES)
248
+ end_frame = start_frame + min(
249
+ math.ceil(hop_size * AFTER_DOWNSAMPLING_FRAME_RATES),
250
+ chunk_logits["boundary_logits"][0].shape[0],
251
+ )
252
+
253
+ logits["function_logits"][start_frame:end_frame, :] += (
254
+ chunk_logits["function_logits"][0].detach().cpu().numpy()
255
+ )
256
+ logits["boundary_logits"][start_frame:end_frame] = (
257
+ chunk_logits["boundary_logits"][0].detach().cpu().numpy()
258
+ )
259
+ logits_num["function_logits"][start_frame:end_frame, :] += 1
260
+ logits_num["boundary_logits"][start_frame:end_frame] += 1
261
+ lens += end_frame - start_frame
262
+
263
+ i += hop_size
264
+ logits["function_logits"] /= logits_num["function_logits"]
265
+ logits["boundary_logits"] /= logits_num["boundary_logits"]
266
+
267
+ logits["function_logits"] = torch.from_numpy(
268
+ logits["function_logits"][:lens]
269
+ ).unsqueeze(0)
270
+ logits["boundary_logits"] = torch.from_numpy(
271
+ logits["boundary_logits"][:lens]
272
+ ).unsqueeze(0)
273
+
274
+ msa_infer_output = postprocess_functional_structure(logits, self.config)
275
+
276
+ assert msa_infer_output[-1][-1] == "end"
277
+ if not self.config.no_rule_post_processing:
278
+ msa_infer_output = rule_post_processing(msa_infer_output)
279
+ msa_json = []
280
+ for idx in range(len(msa_infer_output) - 1):
281
+ msa_json.append(
282
+ {
283
+ "label": msa_infer_output[idx][1],
284
+ "start": msa_infer_output[idx][0],
285
+ "end": msa_infer_output[idx + 1][0],
286
+ }
287
+ )
288
+ return msa_json
289
+
290
+ @staticmethod
291
+ def _fix_state_dict_key_on_load(key: str) -> Tuple[str, bool]:
292
+ """Replace legacy parameter names with their modern equivalents. E.g. beta -> bias, gamma -> weight."""
293
+
294
+ # ---- begin: ignore muq ----
295
+ if key.startswith("muq."):
296
+ return key, False
297
+ # ---- end ---
298
+
299
+ # Rename LayerNorm beta & gamma params for some early models ported from Tensorflow (e.g. Bert)
300
+ # This rename is logged.
301
+ if key.endswith("LayerNorm.beta"):
302
+ return key.replace("LayerNorm.beta", "LayerNorm.bias"), True
303
+ if key.endswith("LayerNorm.gamma"):
304
+ return key.replace("LayerNorm.gamma", "LayerNorm.weight"), True
305
+
306
+ # Rename weight norm parametrizations to match changes across torch versions.
307
+ # Impacts a number of speech/wav2vec models. e.g. Hubert, Wav2Vec2, and others.
308
+ # This rename is not logged.
309
+ if hasattr(nn.utils.parametrizations, "weight_norm"):
310
+ if key.endswith("weight_g"):
311
+ return key.replace(
312
+ "weight_g", "parametrizations.weight.original0"
313
+ ), True
314
+ if key.endswith("weight_v"):
315
+ return key.replace(
316
+ "weight_v", "parametrizations.weight.original1"
317
+ ), True
318
+ else:
319
+ if key.endswith("parametrizations.weight.original0"):
320
+ return key.replace(
321
+ "parametrizations.weight.original0", "weight_g"
322
+ ), True
323
+ if key.endswith("parametrizations.weight.original1"):
324
+ return key.replace(
325
+ "parametrizations.weight.original1", "weight_v"
326
+ ), True
327
+
328
+ return key, False
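Note: for reference, a small worked example of the `rule_post_processing` helper above; the timestamps are made up. A leading segment shorter than 1 s is merged into its successor and takes the successor's label.

```
# Illustrative only: the 0.4 s "silence" lead-in is absorbed into "intro".
# (Assumes modeling_songformer and its dependencies are importable.)
from modeling_songformer import rule_post_processing

msa = [(0.0, "silence"), (0.4, "intro"), (12.5, "verse"), (200.0, "end")]
print(rule_post_processing(msa))
# -> [(0.0, "intro"), (12.5, "verse"), (200.0, "end")]
```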
msd_stats.json ADDED
@@ -0,0 +1,65 @@
1
+ {
2
+ "spec_256_cnt": 14394344256,
3
+ "spec_256_mean": -23.34296658431829,
4
+ "spec_256_std": 26.189295587132637,
5
+ "spec_512_cnt": 28677104448,
6
+ "spec_512_mean": -21.31267396860235,
7
+ "spec_512_std": 26.52644536245769,
8
+ "spec_1024_cnt": 57242624832,
9
+ "spec_1024_mean": -18.852271129208273,
10
+ "spec_1024_std": 26.443154583585663,
11
+ "spec_2048_cnt": 114373665600,
12
+ "spec_2048_mean": -15.638743433896792,
13
+ "spec_2048_std": 26.115825961611545,
14
+ "spec_4096_cnt": 228635747136,
15
+ "spec_4096_mean": -11.715532502794836,
16
+ "spec_4096_std": 25.763972210234062,
17
+ "melspec_256_cnt": 14282760192,
18
+ "melspec_256_mean": -26.962600400166156,
19
+ "melspec_256_std": 36.13614100912126,
20
+ "melspec_512_cnt": 14282760192,
21
+ "melspec_512_mean": -9.108344167718862,
22
+ "melspec_512_std": 24.71910937988429,
23
+ "melspec_1024_cnt": 14282760192,
24
+ "melspec_1024_mean": 0.37302579246531126,
25
+ "melspec_1024_std": 18.684082325919388,
26
+ "melspec_2048_cnt": 14282760192,
27
+ "melspec_2048_mean": 6.768444971712967,
28
+ "melspec_2048_std": 18.417922652295623,
29
+ "melspec_4096_cnt": 14282760192,
30
+ "melspec_4096_mean": 13.617164614990036,
31
+ "melspec_4096_std": 18.08552130124525,
32
+ "cqt_cnt": 9373061376,
33
+ "cqt_mean": 0.46341379757927165,
34
+ "cqt_std": 0.9543998080910191,
35
+ "mfcc_256_cnt": 1339008768,
36
+ "mfcc_256_mean": -11.681755459447485,
37
+ "mfcc_256_std": 29.183186444668316,
38
+ "mfcc_512_cnt": 1339008768,
39
+ "mfcc_512_mean": -2.540581461792183,
40
+ "mfcc_512_std": 31.93752185832081,
41
+ "mfcc_1024_cnt": 1339008768,
42
+ "mfcc_1024_mean": 6.606636263169779,
43
+ "mfcc_1024_std": 34.151644801729624,
44
+ "mfcc_2048_cnt": 1339008768,
45
+ "mfcc_2048_mean": 5.281600844245184,
46
+ "mfcc_2048_std": 33.12784541220003,
47
+ "mfcc_4096_cnt": 1339008768,
48
+ "mfcc_4096_mean": 4.7616569480166095,
49
+ "mfcc_4096_std": 32.61458906894133,
50
+ "chromagram_256_cnt": 1339008768,
51
+ "chromagram_256_mean": 55.15596556703181,
52
+ "chromagram_256_std": 73.91858278719991,
53
+ "chromagram_512_cnt": 1339008768,
54
+ "chromagram_512_mean": 175.73092252759895,
55
+ "chromagram_512_std": 248.48485148525953,
56
+ "chromagram_1024_cnt": 1339008768,
57
+ "chromagram_1024_mean": 589.2947481634608,
58
+ "chromagram_1024_std": 913.857929063196,
59
+ "chromagram_2048_cnt": 1339008768,
60
+ "chromagram_2048_mean": 2062.286388327397,
61
+ "chromagram_2048_std": 3458.92657915397,
62
+ "chromagram_4096_cnt": 1339008768,
63
+ "chromagram_4096_mean": 7673.039107997085,
64
+ "chromagram_4096_std": 13009.883158267234
65
+ }
muq_config2.json ADDED
@@ -0,0 +1,143 @@
1
+ {
2
+ "label_rate": 25,
3
+ "num_codebooks": 1,
4
+ "codebook_dim": 16,
5
+ "codebook_size": 8192,
6
+ "features": [
7
+ "melspec_2048"
8
+ ],
9
+ "hop_length": 240,
10
+ "n_mels": 128,
11
+ "conv_dim": 512,
12
+ "encoder_dim": 1024,
13
+ "encoder_depth": 12,
14
+ "mask_hop": 0.4,
15
+ "mask_prob": 0.6,
16
+ "is_flash": false,
17
+ "stat": {
18
+ "melspec_2048_cnt": 14282760192,
19
+ "melspec_2048_mean": 6.768444971712967,
20
+ "melspec_2048_std": 18.417922652295623
21
+ },
22
+ "w2v2_config": {
23
+ "activation_dropout": 0.1,
24
+ "adapter_kernel_size": 3,
25
+ "adapter_stride": 2,
26
+ "add_adapter": false,
27
+ "apply_spec_augment": true,
28
+ "architectures": [
29
+ "Wav2Vec2ConformerForCTC"
30
+ ],
31
+ "attention_dropout": 0.1,
32
+ "bos_token_id": 1,
33
+ "classifier_proj_size": 256,
34
+ "codevector_dim": 768,
35
+ "conformer_conv_dropout": 0.1,
36
+ "contrastive_logits_temperature": 0.1,
37
+ "conv_bias": true,
38
+ "conv_depthwise_kernel_size": 31,
39
+ "conv_dim": [
40
+ 512,
41
+ 512,
42
+ 512,
43
+ 512,
44
+ 512,
45
+ 512,
46
+ 512
47
+ ],
48
+ "conv_kernel": [
49
+ 10,
50
+ 3,
51
+ 3,
52
+ 3,
53
+ 3,
54
+ 2,
55
+ 2
56
+ ],
57
+ "conv_stride": [
58
+ 5,
59
+ 2,
60
+ 2,
61
+ 2,
62
+ 2,
63
+ 2,
64
+ 2
65
+ ],
66
+ "ctc_loss_reduction": "sum",
67
+ "ctc_zero_infinity": false,
68
+ "diversity_loss_weight": 0.1,
69
+ "do_stable_layer_norm": true,
70
+ "eos_token_id": 2,
71
+ "feat_extract_activation": "gelu",
72
+ "feat_extract_dropout": 0.0,
73
+ "feat_extract_norm": "layer",
74
+ "feat_proj_dropout": 0.1,
75
+ "feat_quantizer_dropout": 0.0,
76
+ "final_dropout": 0.1,
77
+ "gradient_checkpointing": false,
78
+ "hidden_act": "swish",
79
+ "hidden_dropout": 0.1,
80
+ "hidden_dropout_prob": 0.1,
81
+ "hidden_size": 1024,
82
+ "initializer_range": 0.02,
83
+ "intermediate_size": 4096,
84
+ "layer_norm_eps": 1e-05,
85
+ "layerdrop": 0.0,
86
+ "mask_feature_length": 10,
87
+ "mask_feature_min_masks": 0,
88
+ "mask_feature_prob": 0.0,
89
+ "mask_time_length": 10,
90
+ "mask_time_min_masks": 2,
91
+ "mask_time_prob": 0.05,
92
+ "max_source_positions": 5000,
93
+ "model_type": "wav2vec2-conformer",
94
+ "num_adapter_layers": 3,
95
+ "num_attention_heads": 16,
96
+ "num_codevector_groups": 2,
97
+ "num_codevectors_per_group": 320,
98
+ "num_conv_pos_embedding_groups": 16,
99
+ "num_conv_pos_embeddings": 128,
100
+ "num_feat_extract_layers": 7,
101
+ "num_hidden_layers": 24,
102
+ "num_negatives": 100,
103
+ "output_hidden_size": 1024,
104
+ "pad_token_id": 0,
105
+ "position_embeddings_type": "rotary",
106
+ "proj_codevector_dim": 768,
107
+ "rotary_embedding_base": 10000,
108
+ "tdnn_dilation": [
109
+ 1,
110
+ 2,
111
+ 3,
112
+ 1,
113
+ 1
114
+ ],
115
+ "tdnn_dim": [
116
+ 512,
117
+ 512,
118
+ 512,
119
+ 512,
120
+ 1500
121
+ ],
122
+ "tdnn_kernel": [
123
+ 5,
124
+ 3,
125
+ 3,
126
+ 1,
127
+ 1
128
+ ],
129
+ "torch_dtype": "float32",
130
+ "transformers_version": "4.19.0.dev0",
131
+ "use_weighted_layer_sum": false,
132
+ "vocab_size": 32,
133
+ "xvector_output_dim": 512
134
+ },
135
+ "use_rvq_target": true,
136
+ "use_vq_target": false,
137
+ "use_encodec_target": false,
138
+ "rvq_ckpt_path": null,
139
+ "recon_loss_ratio": null,
140
+ "resume_checkpoint": null,
141
+ "rvq_n_codebooks": 8,
142
+ "rvq_multi_layer_num": 1
143
+ }
musicfm/.gitignore ADDED
@@ -0,0 +1,10 @@
1
+ # mac
2
+ .DS_Store
3
+
4
+ # cache
5
+ *.pyc
6
+
7
+ # data
8
+ *.json
9
+ *.pt
10
+
musicfm/LICENSE ADDED
@@ -0,0 +1,224 @@
1
+ Dual Licensing Information
2
+ -------------------------
3
+
4
+ This software is dual-licensed under both the MIT License and the Apache License, Version 2.0.
5
+
6
+ - The file `modules/flash_conformer.py` is distributed under the terms of the Apache License, Version 2.0.
7
+ - All other files and modules in this software are distributed under the terms of the MIT License.
8
+
9
+ ### MIT License
10
+
11
+ Copyright 2023 ByteDance Inc.
12
+
13
+ Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
14
+
15
+ The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
16
+
17
+ THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
18
+
19
+
20
+ ### Apache License, Version 2.0
21
+
22
+ Copyright 2018- The Hugging Face team. All rights reserved.
23
+
24
+ Apache License
25
+ Version 2.0, January 2004
26
+ http://www.apache.org/licenses/
27
+
28
+ TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION
29
+
30
+ 1. Definitions.
31
+
32
+ "License" shall mean the terms and conditions for use, reproduction,
33
+ and distribution as defined by Sections 1 through 9 of this document.
34
+
35
+ "Licensor" shall mean the copyright owner or entity authorized by
36
+ the copyright owner that is granting the License.
37
+
38
+ "Legal Entity" shall mean the union of the acting entity and all
39
+ other entities that control, are controlled by, or are under common
40
+ control with that entity. For the purposes of this definition,
41
+ "control" means (i) the power, direct or indirect, to cause the
42
+ direction or management of such entity, whether by contract or
43
+ otherwise, or (ii) ownership of fifty percent (50%) or more of the
44
+ outstanding shares, or (iii) beneficial ownership of such entity.
45
+
46
+ "You" (or "Your") shall mean an individual or Legal Entity
47
+ exercising permissions granted by this License.
48
+
49
+ "Source" form shall mean the preferred form for making modifications,
50
+ including but not limited to software source code, documentation
51
+ source, and configuration files.
52
+
53
+ "Object" form shall mean any form resulting from mechanical
54
+ transformation or translation of a Source form, including but
55
+ not limited to compiled object code, generated documentation,
56
+ and conversions to other media types.
57
+
58
+ "Work" shall mean the work of authorship, whether in Source or
59
+ Object form, made available under the License, as indicated by a
60
+ copyright notice that is included in or attached to the work
61
+ (an example is provided in the Appendix below).
62
+
63
+ "Derivative Works" shall mean any work, whether in Source or Object
64
+ form, that is based on (or derived from) the Work and for which the
65
+ editorial revisions, annotations, elaborations, or other modifications
66
+ represent, as a whole, an original work of authorship. For the purposes
67
+ of this License, Derivative Works shall not include works that remain
68
+ separable from, or merely link (or bind by name) to the interfaces of,
69
+ the Work and Derivative Works thereof.
70
+
71
+ "Contribution" shall mean any work of authorship, including
72
+ the original version of the Work and any modifications or additions
73
+ to that Work or Derivative Works thereof, that is intentionally
74
+ submitted to Licensor for inclusion in the Work by the copyright owner
75
+ or by an individual or Legal Entity authorized to submit on behalf of
76
+ the copyright owner. For the purposes of this definition, "submitted"
77
+ means any form of electronic, verbal, or written communication sent
78
+ to the Licensor or its representatives, including but not limited to
79
+ communication on electronic mailing lists, source code control systems,
80
+ and issue tracking systems that are managed by, or on behalf of, the
81
+ Licensor for the purpose of discussing and improving the Work, but
82
+ excluding communication that is conspicuously marked or otherwise
83
+ designated in writing by the copyright owner as "Not a Contribution."
84
+
85
+ "Contributor" shall mean Licensor and any individual or Legal Entity
86
+ on behalf of whom a Contribution has been received by Licensor and
87
+ subsequently incorporated within the Work.
88
+
89
+ 2. Grant of Copyright License. Subject to the terms and conditions of
90
+ this License, each Contributor hereby grants to You a perpetual,
91
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
92
+ copyright license to reproduce, prepare Derivative Works of,
93
+ publicly display, publicly perform, sublicense, and distribute the
94
+ Work and such Derivative Works in Source or Object form.
95
+
96
+ 3. Grant of Patent License. Subject to the terms and conditions of
97
+ this License, each Contributor hereby grants to You a perpetual,
98
+ worldwide, non-exclusive, no-charge, royalty-free, irrevocable
99
+ (except as stated in this section) patent license to make, have made,
100
+ use, offer to sell, sell, import, and otherwise transfer the Work,
101
+ where such license applies only to those patent claims licensable
102
+ by such Contributor that are necessarily infringed by their
103
+ Contribution(s) alone or by combination of their Contribution(s)
104
+ with the Work to which such Contribution(s) was submitted. If You
105
+ institute patent litigation against any entity (including a
106
+ cross-claim or counterclaim in a lawsuit) alleging that the Work
107
+ or a Contribution incorporated within the Work constitutes direct
108
+ or contributory patent infringement, then any patent licenses
109
+ granted to You under this License for that Work shall terminate
110
+ as of the date such litigation is filed.
111
+
112
+ 4. Redistribution. You may reproduce and distribute copies of the
113
+ Work or Derivative Works thereof in any medium, with or without
114
+ modifications, and in Source or Object form, provided that You
115
+ meet the following conditions:
116
+
117
+ (a) You must give any other recipients of the Work or
118
+ Derivative Works a copy of this License; and
119
+
120
+ (b) You must cause any modified files to carry prominent notices
121
+ stating that You changed the files; and
122
+
123
+ (c) You must retain, in the Source form of any Derivative Works
124
+ that You distribute, all copyright, patent, trademark, and
125
+ attribution notices from the Source form of the Work,
126
+ excluding those notices that do not pertain to any part of
127
+ the Derivative Works; and
128
+
129
+ (d) If the Work includes a "NOTICE" text file as part of its
130
+ distribution, then any Derivative Works that You distribute must
131
+ include a readable copy of the attribution notices contained
132
+ within such NOTICE file, excluding those notices that do not
133
+ pertain to any part of the Derivative Works, in at least one
134
+ of the following places: within a NOTICE text file distributed
135
+ as part of the Derivative Works; within the Source form or
136
+ documentation, if provided along with the Derivative Works; or,
137
+ within a display generated by the Derivative Works, if and
138
+ wherever such third-party notices normally appear. The contents
139
+ of the NOTICE file are for informational purposes only and
140
+ do not modify the License. You may add Your own attribution
141
+ notices within Derivative Works that You distribute, alongside
142
+ or as an addendum to the NOTICE text from the Work, provided
143
+ that such additional attribution notices cannot be construed
144
+ as modifying the License.
145
+
146
+ You may add Your own copyright statement to Your modifications and
147
+ may provide additional or different license terms and conditions
148
+ for use, reproduction, or distribution of Your modifications, or
149
+ for any such Derivative Works as a whole, provided Your use,
150
+ reproduction, and distribution of the Work otherwise complies with
151
+ the conditions stated in this License.
152
+
153
+ 5. Submission of Contributions. Unless You explicitly state otherwise,
154
+ any Contribution intentionally submitted for inclusion in the Work
155
+ by You to the Licensor shall be under the terms and conditions of
156
+ this License, without any additional terms or conditions.
157
+ Notwithstanding the above, nothing herein shall supersede or modify
158
+ the terms of any separate license agreement you may have executed
159
+ with Licensor regarding such Contributions.
160
+
161
+ 6. Trademarks. This License does not grant permission to use the trade
162
+ names, trademarks, service marks, or product names of the Licensor,
163
+ except as required for reasonable and customary use in describing the
164
+ origin of the Work and reproducing the content of the NOTICE file.
165
+
166
+ 7. Disclaimer of Warranty. Unless required by applicable law or
167
+ agreed to in writing, Licensor provides the Work (and each
168
+ Contributor provides its Contributions) on an "AS IS" BASIS,
169
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or
170
+ implied, including, without limitation, any warranties or conditions
171
+ of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A
172
+ PARTICULAR PURPOSE. You are solely responsible for determining the
173
+ appropriateness of using or redistributing the Work and assume any
174
+ risks associated with Your exercise of permissions under this License.
175
+
176
+ 8. Limitation of Liability. In no event and under no legal theory,
177
+ whether in tort (including negligence), contract, or otherwise,
178
+ unless required by applicable law (such as deliberate and grossly
179
+ negligent acts) or agreed to in writing, shall any Contributor be
180
+ liable to You for damages, including any direct, indirect, special,
181
+ incidental, or consequential damages of any character arising as a
182
+ result of this License or out of the use or inability to use the
183
+ Work (including but not limited to damages for loss of goodwill,
184
+ work stoppage, computer failure or malfunction, or any and all
185
+ other commercial damages or losses), even if such Contributor
186
+ has been advised of the possibility of such damages.
187
+
188
+ 9. Accepting Warranty or Additional Liability. While redistributing
189
+ the Work or Derivative Works thereof, You may choose to offer,
190
+ and charge a fee for, acceptance of support, warranty, indemnity,
191
+ or other liability obligations and/or rights consistent with this
192
+ License. However, in accepting such obligations, You may act only
193
+ on Your own behalf and on Your sole responsibility, not on behalf
194
+ of any other Contributor, and only if You agree to indemnify,
195
+ defend, and hold each Contributor harmless for any liability
196
+ incurred by, or claims asserted against, such Contributor by reason
197
+ of your accepting any such warranty or additional liability.
198
+
199
+ END OF TERMS AND CONDITIONS
200
+
201
+ APPENDIX: How to apply the Apache License to your work.
202
+
203
+ To apply the Apache License to your work, attach the following
204
+ boilerplate notice, with the fields enclosed by brackets "[]"
205
+ replaced with your own identifying information. (Don't include
206
+ the brackets!) The text should be enclosed in the appropriate
207
+ comment syntax for the file format. We also recommend that a
208
+ file or class name and description of purpose be included on the
209
+ same "printed page" as the copyright notice for easier
210
+ identification within third-party archives.
211
+
212
+ Copyright [yyyy] [name of copyright owner]
213
+
214
+ Licensed under the Apache License, Version 2.0 (the "License");
215
+ you may not use this file except in compliance with the License.
216
+ You may obtain a copy of the License at
217
+
218
+ http://www.apache.org/licenses/LICENSE-2.0
219
+
220
+ Unless required by applicable law or agreed to in writing, software
221
+ distributed under the License is distributed on an "AS IS" BASIS,
222
+ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
223
+ See the License for the specific language governing permissions and
224
+ limitations under the License.
musicfm/README.md ADDED
@@ -0,0 +1,173 @@
1
+ # MusicFM 🤖
2
+ [![License: MIT](https://img.shields.io/badge/License-MIT-yellow.svg)](https://opensource.org/licenses/MIT)
3
+ [![License](https://img.shields.io/github/license/openshift/source-to-image.svg)](https://www.apache.org/licenses/LICENSE-2.0.html)
4
+
5
+
6
+ **A Foundation Model for Music Informatics**, ICASSP 2024 [[paper](https://arxiv.org/abs/2311.03318)]
7
+
8
+ -- Minz Won, Yun-Ning Hung, and Duc Le
9
+
10
+
11
+ ## Quick start
12
+ ### Download models
13
+
14
+ **MusicFM-FMA**
15
+
16
+ - Pretrained using [FMA-large](https://github.com/mdeff/fma) data
17
+
18
+ ```
19
+ wget -P YOUR_HOME_PATH/musicfm/data/ https://huggingface.co/minzwon/MusicFM/resolve/main/fma_stats.json
20
+ wget -P YOUR_HOME_PATH/musicfm/data/ https://huggingface.co/minzwon/MusicFM/resolve/main/pretrained_fma.pt
21
+ ```
22
+ ⚠️ The model checkpoint prior to Feb 13, 2024, was incorrect. Please make sure to re-download these files if you've been using previous versions.
23
+
24
+
25
+ **MusicFM-MSD**
26
+
27
+ - Pretrained with the entire [Million Song Dataset](http://millionsongdataset.com/)
28
+ - This version performs better than the FMA version
29
+ - This version is not introduced in the paper
30
+
31
+ ```
32
+ wget -P YOUR_HOME_PATH/musicfm/data/ https://huggingface.co/minzwon/MusicFM/resolve/main/msd_stats.json
33
+ wget -P YOUR_HOME_PATH/musicfm/data/ https://huggingface.co/minzwon/MusicFM/resolve/main/pretrained_msd.pt
34
+ ```
35
+
36
+ ### Get embeddings
37
+ ```
38
+ HOME_PATH = "/home/dev" # path where you cloned musicfm
39
+
40
+ import os
41
+ import sys
42
+ import torch
43
+
44
+ sys.path.append(HOME_PATH)
45
+ from musicfm.model.musicfm_25hz import MusicFM25Hz
46
+
47
+ # dummy audio (30 seconds, 24kHz)
48
+ wav = (torch.rand(4, 24000 * 30) - 0.5) * 2
49
+
50
+ # load MusicFM
51
+ musicfm = MusicFM25Hz(
52
+ is_flash=False,
53
+ stat_path=os.path.join(HOME_PATH, "musicfm", "data", "msd_stats.json"),
54
+ model_path=os.path.join(HOME_PATH, "musicfm", "data", "pretrained_msd.pt"),
55
+ )
56
+
57
+ # to GPUs
58
+ wav = wav.cuda()
59
+ musicfm = musicfm.cuda()
60
+
61
+ # get embeddings
62
+ musicfm.eval()
63
+ emb = musicfm.get_latent(wav, layer_ix=7)
64
+ ```
65
+
66
+ ### Mixed precision and Flash attention
67
+ Suffering from memory issues? [Mixed precision](https://pytorch.org/tutorials/recipes/recipes/amp_recipe.html) and [Flash attention](https://arxiv.org/abs/2205.14135) will be good friends of yours!
68
+
69
+ ```
70
+ # dummy audio (30 seconds, 24kHz)
71
+ wav = (torch.rand(4, 24000 * 30) - 0.5) * 2
72
+
73
+ # load MusicFM
74
+ musicfm = MusicFM25Hz(is_flash=True)
75
+
76
+ # to GPUs
77
+ wav = wav.cuda().half()
78
+ musicfm = musicfm.cuda().half()
79
+
80
+ # get embeddings
81
+ musicfm.eval()
82
+ emb = musicfm.get_latent(wav, layer_ix=7)
83
+ ```
84
+
85
+ However, I highly recommend using `float32` for better performance in specific downstream tasks, such as beat tracking.
86
+
87
+ ### Usage in downstream tasks
88
+ The pretrained model operates at a 25Hz frame rate, but our downstream tasks demand varying temporal resolutions. To address this, we either summarize the sequence through global average pooling or adjust the temporal resolution using adaptive average pooling.
89
+
90
+ ```
91
+ from torch import nn
92
+
93
+ # Sequence-level representation
94
+ seq_emb = emb.mean(1) # (batch, time, channel) -> (batch, channel)
95
+
96
+ # Frame-level representation
97
+ """
98
+ n_frame = desired_temporal_resolution * sequence_length_in_sec
99
+ 300 frames = 10Hz * 30s in this example
100
+ As a result, the sequence length is reduced from 750 (25Hz * 30s) to 300
101
+ """
102
+ n_frame = 300
103
+ token_emb = nn.AdaptiveAvgPool1d(n_frame)(emb.transpose(1, 2)).transpose(1, 2) # (batch, time, channel) -> (batch, time', channel)
104
+ ```
105
+ We share the details of our downstream evaluation as follows. The selection of input lengths and temporal resolutions is based on our prior experience with each task.
106
+
107
+ | | Beat | Chord | Structure | Key | Tagging |
108
+ | :--------: | :--------: | :--------: | :--------: | :--------: | :--------: |
109
+ | Input length | 6s | 12s | 24s | 12s | 29.1s |
110
+ | Temporal resolution | 50Hz | 16Hz | 8Hz | 0.5Hz | - |
111
+ | n_frame | 300 | 192 | 192 | 6 | 1 |
112
+
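Each `n_frame` above is just the input length times the temporal resolution (for example 8Hz * 24s = 192 for structure analysis). Below is a minimal sketch that wraps the pooling into a helper; the function name is illustrative and a (batch, time, channel) layout of `emb` is assumed, not something defined by this repository:

```
import torch
from torch import nn

def pool_to_frame_rate(emb, target_hz, input_sec):
    """Average-pool (batch, time, channel) embeddings down to a target frame rate."""
    n_frame = max(1, round(target_hz * input_sec))               # e.g. 8 Hz * 24 s = 192
    pooled = nn.AdaptiveAvgPool1d(n_frame)(emb.transpose(1, 2))  # pool over the time axis
    return pooled.transpose(1, 2)                                # (batch, n_frame, channel)

# 24 s at the 25 Hz backbone rate gives 600 frames; structure analysis pools them to 192
emb = torch.randn(2, 600, 1024)
assert pool_to_frame_rate(emb, target_hz=8, input_sec=24).shape == (2, 192, 1024)
```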
113
+ ### Fine-tuning
114
+ You can expect better performance in downstream tasks by fine-tuning the foundation model. In this scenario, employ `musicfm.train()` and extract the final embeddings by setting `layer_ix=12`. However, when optimizing the model with the same learning rate, there's a risk of [catastrophic forgetting](https://en.wikipedia.org/wiki/Catastrophic_interference). To mitigate this issue, we utilized a learning rate of 1e-5 for the foundation model and 1e-4 for the probing layers.
115
+
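As a rough sketch of the two learning rates described above, the backbone and the probing head can be placed in separate optimizer parameter groups. This reuses the `musicfm` object from the quick-start snippet; the linear probing head is a hypothetical stand-in, not part of this repository:

```
from torch import nn, optim

probe = nn.Linear(1024, 10)  # hypothetical 10-class probing head on 1024-dim embeddings
optimizer = optim.AdamW([
    {"params": musicfm.parameters(), "lr": 1e-5},  # foundation model: small learning rate
    {"params": probe.parameters(), "lr": 1e-4},    # probing layers: larger learning rate
])
musicfm.train()  # fine-tune end to end and read embeddings with layer_ix=12
```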
116
+
117
+
118
+ ## Results
119
+
120
+ <img src="figs/Table1.png" width="800">
121
+
122
+ \* FM1 is pretrained [MERT](https://arxiv.org/abs/2306.00107).
123
+
124
+ \*\*FM8 mirrors the [BEST-RQ](https://arxiv.org/abs/2202.01855) but with the distinction that it was trained using music data.
125
+
126
+
127
+ - Random tokenization generalizes well to music data.
128
+
129
+ - Frame-level classification offers a more comprehensive understanding of foundation models. While FM4 excels in music tagging, its performance in structural analysis is subpar.
130
+
131
+ - Input length used during training is critical for capturing
132
+ long-term contexts. Compare the 5s models (FM1, FM2, and FM4) with the 30s model (FM5) on downbeat tracking and structure analysis.
133
+
134
+ - Temporal resolution has less impact in our experimental setup. See FM5, FM6, and FM7.
135
+
136
+ - Model architecture makes a significant difference. Conformer (FM5) consistently outperformed the BERT encoder (FM3) across all downstream tasks.
137
+
138
+ - The influence of model size was relatively minimal (FM7 and FM8). However, we observed that FM8's performance continued to improve, which is typically indicative of underfitting. All models were trained for two weeks to ensure a fair comparison.
139
+
140
+ - Data is undeniably crucial, as in any data-driven approach. Please compare FM7 and FM9.
141
+
142
+ - Fine-tuning the foundation model further enhances downstream performance. However, we did observe a performance
143
+ drop in the tagging task, primarily attributed to overfitting.
144
+
145
+ ## Masked token modeling
146
+ <img src="figs/Fig1.png" width="300">
147
+
148
+ MusicFM follows the training scheme of [BEST-RQ](https://arxiv.org/abs/2202.01855). Input audio is masked with noise, and the model predicts the masked representation. Target tokens are generated by random projection and a random codebook. Both the projection layer and codebook are **randomly initialized** and remain **non-trainable**. Isn't it fascinating?
149
+
150
+ Note that input normalization is exceptionally crucial, considering the usage of random projection. You can check the details [here](https://github.com/minzwon/musicfm/blob/d5d0f313add9f3c32c41f95521760b1a136809ed/model/musicfm_25hz.py#L148).
151
+
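The target tokenizer can be summarized in a few lines. The following is a simplified sketch of the BEST-RQ idea only; it is not the `RandomProjectionQuantizer` shipped with this code, and the dimensions are illustrative:

```
import torch

class RandomTokenizer(torch.nn.Module):
    """Random projection + random codebook; both frozen, as in BEST-RQ."""

    def __init__(self, input_dim=512, codebook_dim=16, codebook_size=4096, seed=142):
        super().__init__()
        g = torch.Generator().manual_seed(seed)
        # buffers, not parameters: initialized once and never trained
        self.register_buffer("proj", torch.randn(input_dim, codebook_dim, generator=g))
        self.register_buffer("codebook", torch.randn(codebook_size, codebook_dim, generator=g))

    @torch.no_grad()
    def forward(self, x):                        # x: (batch, time, input_dim), normalized features
        b, t, _ = x.shape
        z = (x @ self.proj).reshape(b * t, -1)   # project into the codebook space
        dist = torch.cdist(z, self.codebook)     # distance to every code vector
        return dist.argmin(-1).reshape(b, t)     # integer target tokens
```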
152
+ ## Limitations
153
+ - Self-supervised foundation models in music, such as [JukeMIR](https://arxiv.org/abs/2107.05677), [MERT](https://arxiv.org/abs/2306.00107), and [MusicFM](https://arxiv.org/abs/2311.03318), consistently report relatively low performance in key detection. While fine-tuning the model can help bridge the performance gap, the foundation model itself does not appear to learn musical keys inherently. Further investigation is required to develop more advanced music foundation models.
154
+
155
+ - We share our model trained with the [FMA Dataset](https://github.com/mdeff/fma), which comprises 8k hours of Creative Commons-licensed audio. While using a larger dataset (160k hours) can enhance performance, we've chosen to release the model trained on FMA to avoid potential licensing complications.
156
+
157
+ - Fine-tuned models for downstream tasks are not made publicly available as they are primarily used for evaluation purposes. It is expected that carefully designed backends beyond simple probing layers will improve downstream performance. I look forward to the contributions of other researchers with more expertise in each specific task.
158
+
159
+ - The downstream evaluation pipeline is not provided in this repository. Nonetheless, I believe creating a comprehensive evaluation pipeline is essential to expedite progress in music informatics research. I'm very open to discussing it together.
160
+
161
+
162
+ ## Acknowledgement
163
+ We acknowledge and extend our sincere gratitude to Ju-Chiang Wang for his valuable contributions to data refinement and providing a crucial codebase for our downstream evaluation.
164
+
165
+ ## Citation
166
+ ```
167
+ @article{won2023musicfm,
168
+ title={A Foundation Model for Music Informatics},
169
+ author = {Won, Minz and Hung, Yun-Ning and Le, Duc},
170
+ journal={arXiv preprint arXiv:2311.03318},
171
+ year={2023}
172
+ }
173
+ ```
musicfm/data/.gitkeep ADDED
File without changes
musicfm/figs/Fig1.png ADDED

Git LFS Details

  • SHA256: bbbb7a435402555125e996c747a619585906bc2cb7911afa5521ac35af1201e3
  • Pointer size: 131 Bytes
  • Size of remote file: 396 kB
musicfm/figs/Table1.png ADDED

Git LFS Details

  • SHA256: 773da09077da92f3e41fb9a53aff4efc559dcfc07d2e65b142cf23af2de512d7
  • Pointer size: 131 Bytes
  • Size of remote file: 807 kB
musicfm/model/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+
2
+
musicfm/model/musicfm_25hz.py ADDED
@@ -0,0 +1,252 @@
1
+ # MIT License
2
+ #
3
+ # Copyright 2023 ByteDance Inc.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”),
6
+ # to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
7
+ # and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
8
+ #
9
+ # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
10
+ #
11
+ # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
12
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
13
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
14
+ # IN THE SOFTWARE.
15
+
16
+ import json
17
+ import random
18
+ import torch
19
+ from torch import nn
20
+ from einops import rearrange
21
+
22
+ from musicfm.modules.random_quantizer import RandomProjectionQuantizer
23
+ from musicfm.modules.features import MelSTFT
24
+ from musicfm.modules.conv import Conv2dSubsampling
25
+
26
+
27
+ class MusicFM25Hz(nn.Module):
28
+ """
29
+ MusicFM
30
+
31
+ Input: 128-band mel spectrogram
32
+ Frontend: 2-layer Residual convolution
33
+ Backend: 12-layer Conformer
34
+ Quantizer: a codebook for mel spectrogram
35
+ """
36
+
37
+ def __init__(
38
+ self,
39
+ num_codebooks=1,
40
+ codebook_dim=16,
41
+ codebook_size=4096,
42
+ features=["melspec_2048"],
43
+ hop_length=240,
44
+ n_mels=128,
45
+ conv_dim=512,
46
+ encoder_dim=1024,
47
+ encoder_depth=12,
48
+ mask_hop=0.4,
49
+ mask_prob=0.6,
50
+ is_flash=False,
51
+ stat_path="./data/fma_stats.json",
52
+ # model_path="./data/pretrained_fma.pt",
53
+ ):
54
+ super(MusicFM25Hz, self).__init__()
55
+
56
+ # global variables
57
+ self.hop_length = hop_length
58
+ self.mask_hop = mask_hop
59
+ self.mask_prob = mask_prob
60
+ self.num_codebooks = num_codebooks
61
+ self.codebook_size = codebook_size
62
+ self.features = features
63
+
64
+ # load feature mean / std stats
65
+ with open(stat_path, "r") as f:
66
+ self.stat = json.load(f)
67
+
68
+ # feature extractor
69
+ self.preprocessor_melspec_2048 = MelSTFT(
70
+ n_fft=2048, hop_length=hop_length, is_db=True
71
+ )
72
+
73
+ # random quantizer
74
+ seed = 142
75
+ for feature in self.features:
76
+ for i in range(num_codebooks):
77
+ setattr(
78
+ self,
79
+ f"quantizer_{feature}_{i}",
80
+ RandomProjectionQuantizer(
81
+ n_mels * 4, codebook_dim, codebook_size, seed=seed + i
82
+ ),
83
+ )
84
+
85
+ # two residual convolution layers + one projection layer
86
+ self.conv = Conv2dSubsampling(
87
+ 1, conv_dim, encoder_dim, strides=[2, 2], n_bands=n_mels
88
+ )
89
+
90
+ # Conformer
91
+ if is_flash:
92
+ from musicfm.modules.flash_conformer import (
93
+ Wav2Vec2ConformerEncoder,
94
+ Wav2Vec2ConformerConfig,
95
+ )
96
+ else:
97
+ from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import (
98
+ Wav2Vec2ConformerEncoder,
99
+ Wav2Vec2ConformerConfig,
100
+ )
101
+ config = Wav2Vec2ConformerConfig.from_pretrained(
102
+ "facebook/wav2vec2-conformer-rope-large-960h-ft"
103
+ )
104
+ config.num_hidden_layers = encoder_depth
105
+ config.hidden_size = encoder_dim
106
+
107
+ self.conformer = Wav2Vec2ConformerEncoder(config)
108
+
109
+ # projection
110
+ self.linear = nn.Linear(encoder_dim, codebook_size)
111
+
112
+ # loss function
113
+ self.loss = nn.CrossEntropyLoss()
114
+
115
+ # cls token (used for sequence classification)
116
+ random.seed(seed)
117
+ self.cls_token = nn.Parameter(torch.randn(encoder_dim))
118
+
119
+ # load model
120
+ # if model_path:
121
+ # S = torch.load(model_path)["state_dict"]
122
+ # SS = {k[6:]: v for k, v in S.items()}
123
+ # self.load_state_dict(SS, strict=True)
124
+
125
+ def masking(self, x):
126
+ """random masking of 400ms with given probability"""
127
+ mx = x.clone()
128
+ b, t = mx.shape
129
+ len_masking_raw = int(24000 * self.mask_hop)
130
+ len_masking_token = int(24000 / self.hop_length / 2 / 2 * self.mask_hop)
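+ # with the defaults (24 kHz audio, hop_length=240, two stride-2 conv layers), tokens come at
+ # 24000 / 240 / 2 / 2 = 25 Hz, so a 0.4 s mask_hop spans 9600 samples and 10 tokens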
131
+
132
+ # get random mask indices
133
+ start_indices = torch.rand(b, t // len_masking_raw) < self.mask_prob
134
+ time_domain_masked_indices = torch.nonzero(
135
+ start_indices.repeat_interleave(len_masking_raw, dim=1)
136
+ )
137
+ token_domain_masked_indices = torch.nonzero(
138
+ start_indices.repeat_interleave(len_masking_token, dim=1)
139
+ )
140
+
141
+ # mask with random values
142
+ masking_noise = (
143
+ torch.randn(time_domain_masked_indices.shape[0], dtype=x.dtype) * 0.1
144
+ ) # 0 mean 0.1 std
145
+ mx[tuple(time_domain_masked_indices.t())] = masking_noise.to(x.device)
146
+
147
+ return mx, token_domain_masked_indices
148
+
149
+ @torch.no_grad()
150
+ def preprocessing(self, x, features):
151
+ """extract classic audio features"""
152
+ # check precision
153
+ if x.dtype == torch.float16:
154
+ precision = 16
155
+ else:
156
+ precision = 32
157
+
158
+ out = {}
159
+ for key in features:
160
+ layer = getattr(self, "preprocessor_%s" % key)
161
+ out[key] = layer.float()(x.float())[..., :-1]
162
+ if precision == 16:
163
+ out[key] = out[key].half()
164
+ return out
165
+
166
+ def encoder(self, x):
167
+ """2-layer conv + w2v-conformer"""
168
+ x = self.conv(x)
169
+ out = self.conformer(x, output_hidden_states=True)
170
+ hidden_emb = out["hidden_states"]
171
+ last_emb = out["last_hidden_state"]
172
+ logits = self.linear(last_emb)
173
+ logits = {
174
+ key: logits[:, :, i * self.codebook_size : (i + 1) * self.codebook_size]
175
+ for i, key in enumerate(self.features)
176
+ }
177
+ return logits, hidden_emb
178
+
179
+ @torch.no_grad()
180
+ def normalize(self, x):
181
+ """normalize the input features to have zero mean and unit variance"""
182
+ for key in x.keys():
183
+ x[key] = (x[key] - self.stat["%s_mean" % key]) / self.stat["%s_std" % key]
184
+ return x
185
+
186
+ @torch.no_grad()
187
+ def rearrange(self, x):
188
+ """rearrange the batch to flatten every 4 steps"""
189
+ for key in x.keys():
190
+ if key == "chromagram":
191
+ x[key] = rearrange(x[key], "b f t -> b t f")
192
+ else:
193
+ x[key] = rearrange(x[key], "b f (t s) -> b t (s f)", s=4)
194
+ return x
195
+
196
+ @torch.no_grad()
197
+ def tokenize(self, x):
198
+ out = {}
199
+ for key in x.keys():
200
+ layer = getattr(self, "quantizer_%s" % key)
201
+ out[key] = layer(x[key])
202
+ return out
203
+
204
+ def get_targets(self, x):
205
+ x = self.preprocessing(x, features=self.features)
206
+ x = self.normalize(x)
207
+ x = self.rearrange(x)
208
+ target_tokens = self.tokenize(x)
209
+ return target_tokens
210
+
211
+ def get_predictions(self, x):
212
+ # preprocessing
213
+ x = self.preprocessing(x, features=["melspec_2048"])
214
+ x = self.normalize(x)
215
+
216
+ # encoding
217
+ logits, hidden_emb = self.encoder(x["melspec_2048"])
218
+
219
+ return logits, hidden_emb
220
+
221
+ def get_latent(self, x, layer_ix=12):
222
+ _, hidden_states = self.get_predictions(x)
223
+ emb = hidden_states[layer_ix]
224
+ return emb
225
+
226
+ def get_loss(self, logits, target_tokens, masked_indices):
227
+ losses = {}
228
+ accuracies = {}
229
+ for key in logits.keys():
230
+ masked_logits = logits[key][tuple(masked_indices.t())]
231
+ masked_tokens = target_tokens[key][tuple(masked_indices.t())]
232
+ losses[key] = self.loss(masked_logits, masked_tokens)
233
+ accuracies[key] = (
234
+ torch.sum(masked_logits.argmax(-1) == masked_tokens)
235
+ / masked_tokens.numel()
236
+ )
237
+ return losses, accuracies
238
+
239
+ def forward(self, x):
240
+ # get target feature tokens
241
+ target_tokens = self.get_targets(x)
242
+
243
+ # masking
244
+ x, masked_indices = self.masking(x)
245
+
246
+ # forward
247
+ logits, hidden_emb = self.get_predictions(x)
248
+
249
+ # get loss
250
+ losses, accuracies = self.get_loss(logits, target_tokens, masked_indices)
251
+
252
+ return logits, hidden_emb, losses, accuracies
musicfm/modules/__init__.py ADDED
@@ -0,0 +1,2 @@
1
+
2
+
musicfm/modules/conv.py ADDED
@@ -0,0 +1,82 @@
1
+ # MIT License
2
+ #
3
+ # Copyright 2023 ByteDance Inc.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”),
6
+ # to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
7
+ # and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
8
+ #
9
+ # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
10
+ #
11
+ # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
12
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
13
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
14
+ # IN THE SOFTWARE.
15
+
16
+ from torch import nn
17
+ from einops import rearrange
18
+
19
+
20
+ class Res2dModule(nn.Module):
21
+ def __init__(self, idim, odim, stride=(2, 2)):
22
+ super(Res2dModule, self).__init__()
23
+ self.conv1 = nn.Conv2d(idim, odim, 3, padding=1, stride=stride)
24
+ self.bn1 = nn.BatchNorm2d(odim)
25
+ self.conv2 = nn.Conv2d(odim, odim, 3, padding=1)
26
+ self.bn2 = nn.BatchNorm2d(odim)
27
+ self.relu = nn.ReLU()
28
+
29
+ # residual
30
+ self.diff = False
31
+ if (idim != odim) or (stride[0] > 1):
32
+ self.conv3 = nn.Conv2d(idim, odim, 3, padding=1, stride=stride)
33
+ self.bn3 = nn.BatchNorm2d(odim)
34
+ self.diff = True
35
+
36
+ def forward(self, x):
37
+ out = self.bn2(self.conv2(self.relu(self.bn1(self.conv1(x)))))
38
+ if self.diff:
39
+ x = self.bn3(self.conv3(x))
40
+ out = x + out
41
+ out = self.relu(out)
42
+ return out
43
+
44
+
45
+ class Conv2dSubsampling(nn.Module):
46
+ """Convolutional 2D subsampling (to 1/4 length).
47
+
48
+ Args:
49
+ idim (int): Input dimension.
50
+ hdim (int): Hidden dimension.
51
+ odim (int): Output dimension.
52
+ strides (list): Sizes of strides.
53
+ n_bands (int): Number of frequency bands.
54
+ """
55
+
56
+ def __init__(self, idim, hdim, odim, strides=[2, 2], n_bands=64):
57
+ """Construct an Conv2dSubsampling object."""
58
+ super(Conv2dSubsampling, self).__init__()
59
+
60
+ self.conv = nn.Sequential(
61
+ Res2dModule(idim, hdim, (2, strides[0])),
62
+ Res2dModule(hdim, hdim, (2, strides[1])),
63
+ )
64
+ self.linear = nn.Linear(hdim * n_bands // 2 // 2, odim)
65
+
66
+ def forward(self, x):
67
+ """Subsample x.
68
+
69
+ Args:
70
+ x (torch.Tensor): Input tensor (#batch, n_bands, time).
71
+
72
+ Returns:
73
+ torch.Tensor: Subsampled tensor (#batch, time', odim),
74
+ where time' = time // 4.
75
+ """
76
+
77
+ if x.dim() == 3:
78
+ x = x.unsqueeze(1) # (b, c, f, t)
79
+ x = self.conv(x)
80
+ x = rearrange(x, "b c f t -> b t (c f)")
81
+ x = self.linear(x)
82
+ return x
musicfm/modules/features.py ADDED
@@ -0,0 +1,45 @@
1
+ # MIT License
2
+ #
3
+ # Copyright 2023 ByteDance Inc.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”),
6
+ # to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
7
+ # and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
8
+ #
9
+ # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
10
+ #
11
+ # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
12
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
13
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
14
+ # IN THE SOFTWARE.
15
+
16
+ import torchaudio
17
+ from torch import nn
18
+
19
+
20
+ class MelSTFT(nn.Module):
21
+ def __init__(
22
+ self,
23
+ sample_rate=24000,
24
+ n_fft=2048,
25
+ hop_length=240,
26
+ n_mels=128,
27
+ is_db=False,
28
+ ):
29
+ super(MelSTFT, self).__init__()
30
+
31
+ # spectrogram
32
+ self.mel_stft = torchaudio.transforms.MelSpectrogram(
33
+ sample_rate=sample_rate, n_fft=n_fft, hop_length=hop_length, n_mels=n_mels
34
+ )
35
+
36
+ # amplitude to decibel
37
+ self.is_db = is_db
38
+ if is_db:
39
+ self.amplitude_to_db = torchaudio.transforms.AmplitudeToDB()
40
+
41
+ def forward(self, waveform):
42
+ if self.is_db:
43
+ return self.amplitude_to_db(self.mel_stft(waveform))
44
+ else:
45
+ return self.mel_stft(waveform)
musicfm/modules/flash_conformer.py ADDED
@@ -0,0 +1,2114 @@
1
+ # coding=utf-8
2
+ # Copyright 2022 The Fairseq Authors and the HuggingFace Inc. team. All rights reserved.
3
+ #
4
+ # Licensed under the Apache License, Version 2.0 (the "License");
5
+ # you may not use this file except in compliance with the License.
6
+ # You may obtain a copy of the License at
7
+ #
8
+ # http://www.apache.org/licenses/LICENSE-2.0
9
+ #
10
+ # Unless required by applicable law or agreed to in writing, software
11
+ # distributed under the License is distributed on an "AS IS" BASIS,
12
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13
+ # See the License for the specific language governing permissions and
14
+ # limitations under the License.
15
+ """ PyTorch Wav2Vec2-Conformer model."""
16
+
17
+ import math
18
+ from dataclasses import dataclass
19
+ from typing import Optional, Tuple, Union
20
+
21
+ import numpy as np
22
+ import torch
23
+ import torch.utils.checkpoint
24
+ from torch import nn
25
+ from torch.nn import CrossEntropyLoss
26
+ from torch.nn import functional as F
27
+
28
+ from transformers.activations import ACT2FN
29
+ from transformers.deepspeed import is_deepspeed_zero3_enabled
30
+ from transformers.modeling_outputs import (
31
+ BaseModelOutput,
32
+ CausalLMOutput,
33
+ SequenceClassifierOutput,
34
+ TokenClassifierOutput,
35
+ Wav2Vec2BaseModelOutput,
36
+ XVectorOutput,
37
+ )
38
+ from transformers.modeling_utils import PreTrainedModel
39
+ from transformers.utils import (
40
+ ModelOutput,
41
+ add_code_sample_docstrings,
42
+ add_start_docstrings,
43
+ add_start_docstrings_to_model_forward,
44
+ logging,
45
+ replace_return_docstrings,
46
+ )
47
+ from transformers.models.wav2vec2_conformer.configuration_wav2vec2_conformer import Wav2Vec2ConformerConfig
48
+
49
+
50
+ logger = logging.get_logger(__name__)
51
+
52
+
53
+ _HIDDEN_STATES_START_POSITION = 2
54
+
55
+ # General docstring
56
+ _CONFIG_FOR_DOC = "Wav2Vec2ConformerConfig"
57
+
58
+ # Base docstring
59
+ _CHECKPOINT_FOR_DOC = "facebook/wav2vec2-conformer-rope-large-960h-ft"
60
+ _EXPECTED_OUTPUT_SHAPE = [1, 292, 1024]
61
+
62
+ # CTC docstring
63
+ _CTC_EXPECTED_OUTPUT = "'MISTER QUILTER IS THE APOSTLE OF THE MIDDLE CLASSES AND WE ARE GLAD TO WELCOME HIS GOSPEL'"
64
+ _CTC_EXPECTED_LOSS = 64.21
65
+
66
+
67
+ WAV2VEC2_CONFORMER_PRETRAINED_MODEL_ARCHIVE_LIST = [
68
+ "facebook/wav2vec2-conformer-rel-pos-large",
69
+ # See all Wav2Vec2Conformer models at https://huggingface.co/models?filter=wav2vec2-conformer
70
+ ]
71
+
72
+
73
+ @dataclass
74
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTrainingOutput with Wav2Vec2->Wav2Vec2Conformer
75
+ class Wav2Vec2ConformerForPreTrainingOutput(ModelOutput):
76
+ """
77
+ Output type of [`Wav2Vec2ConformerForPreTraining`], with potential hidden states and attentions.
78
+
79
+ Args:
80
+ loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
81
+ Total loss as the sum of the contrastive loss (L_m) and the diversity loss (L_d) as stated in the [official
82
+ paper](https://arxiv.org/pdf/2006.11477.pdf) . (classification) loss.
83
+ projected_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
84
+ Hidden-states of the model projected to *config.proj_codevector_dim* that can be used to predict the masked
85
+ projected quantized states.
86
+ projected_quantized_states (`torch.FloatTensor` of shape `(batch_size, sequence_length, config.proj_codevector_dim)`):
87
+ Quantized extracted feature vectors projected to *config.proj_codevector_dim* representing the positive
88
+ target vectors for contrastive loss.
89
+ hidden_states (`tuple(torch.FloatTensor)`, *optional*, returned when `output_hidden_states=True` is passed or when `config.output_hidden_states=True`):
90
+ Tuple of `torch.FloatTensor` (one for the output of the embeddings + one for the output of each layer) of
91
+ shape `(batch_size, sequence_length, hidden_size)`.
92
+
93
+ Hidden-states of the model at the output of each layer plus the initial embedding outputs.
94
+ attentions (`tuple(torch.FloatTensor)`, *optional*, returned when `output_attentions=True` is passed or when `config.output_attentions=True`):
95
+ Tuple of `torch.FloatTensor` (one for each layer) of shape `(batch_size, num_heads, sequence_length,
96
+ sequence_length)`.
97
+
98
+ Attentions weights after the attention softmax, used to compute the weighted average in the self-attention
99
+ heads.
100
+ contrastive_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
101
+ The contrastive loss (L_m) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) .
102
+ diversity_loss (*optional*, returned when `sample_negative_indices` are passed, `torch.FloatTensor` of shape `(1,)`):
103
+ The diversity loss (L_d) as stated in the [official paper](https://arxiv.org/pdf/2006.11477.pdf) .
104
+ """
105
+
106
+ loss: Optional[torch.FloatTensor] = None
107
+ projected_states: torch.FloatTensor = None
108
+ projected_quantized_states: torch.FloatTensor = None
109
+ codevector_perplexity: torch.FloatTensor = None
110
+ hidden_states: Optional[Tuple[torch.FloatTensor]] = None
111
+ attentions: Optional[Tuple[torch.FloatTensor]] = None
112
+ contrastive_loss: Optional[torch.FloatTensor] = None
113
+ diversity_loss: Optional[torch.FloatTensor] = None
114
+
115
+
116
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2._compute_mask_indices
117
+ def _compute_mask_indices(
118
+ shape: Tuple[int, int],
119
+ mask_prob: float,
120
+ mask_length: int,
121
+ attention_mask: Optional[torch.LongTensor] = None,
122
+ min_masks: int = 0,
123
+ ) -> np.ndarray:
124
+ """
125
+ Computes random mask spans for a given shape. Used to implement [SpecAugment: A Simple Data Augmentation Method for
126
+ ASR](https://arxiv.org/abs/1904.08779). Note that this method is not optimized to run on TPU and should be run on
127
+ CPU as part of the preprocessing during training.
128
+
129
+ Args:
130
+ shape: The shape for which to compute masks. This should be of a tuple of size 2 where
131
+ the first element is the batch size and the second element is the length of the axis to span.
132
+ mask_prob: The percentage of the whole axis (between 0 and 1) which will be masked. The number of
133
+ independently generated mask spans of length `mask_length` is computed by
134
+ `mask_prob*shape[1]/mask_length`. Note that due to overlaps, `mask_prob` is an upper bound and the
135
+ actual percentage will be smaller.
136
+ mask_length: size of the mask
137
+ min_masks: minimum number of masked spans
138
+ attention_mask: A (right-padded) attention mask which independently shortens the feature axis of
139
+ each batch dimension.
140
+ """
141
+ batch_size, sequence_length = shape
142
+
143
+ if mask_length < 1:
144
+ raise ValueError("`mask_length` has to be bigger than 0.")
145
+
146
+ if mask_length > sequence_length:
147
+ raise ValueError(
148
+ f"`mask_length` has to be smaller than `sequence_length`, but got `mask_length`: {mask_length}"
149
+ f" and `sequence_length`: {sequence_length}`"
150
+ )
151
+
152
+ # epsilon is used for probabilistic rounding
153
+ epsilon = np.random.rand(1).item()
154
+
155
+ def compute_num_masked_span(input_length):
156
+ """Given input length, compute how many spans should be masked"""
157
+ num_masked_span = int(mask_prob * input_length / mask_length + epsilon)
158
+ num_masked_span = max(num_masked_span, min_masks)
159
+
160
+ # make sure num masked span <= sequence_length
161
+ if num_masked_span * mask_length > sequence_length:
162
+ num_masked_span = sequence_length // mask_length
163
+
164
+ # make sure num_masked span is also <= input_length - (mask_length - 1)
165
+ if input_length - (mask_length - 1) < num_masked_span:
166
+ num_masked_span = max(input_length - (mask_length - 1), 0)
167
+
168
+ return num_masked_span
169
+
170
+ # compute number of masked spans in batch
171
+ input_lengths = (
172
+ attention_mask.sum(-1).detach().tolist()
173
+ if attention_mask is not None
174
+ else [sequence_length for _ in range(batch_size)]
175
+ )
176
+
177
+ # SpecAugment mask to fill
178
+ spec_aug_mask = np.zeros((batch_size, sequence_length), dtype=bool)
179
+ spec_aug_mask_idxs = []
180
+
181
+ max_num_masked_span = compute_num_masked_span(sequence_length)
182
+
183
+ if max_num_masked_span == 0:
184
+ return spec_aug_mask
185
+
186
+ for input_length in input_lengths:
187
+ # compute num of masked spans for this input
188
+ num_masked_span = compute_num_masked_span(input_length)
189
+
190
+ # get random indices to mask
191
+ spec_aug_mask_idx = np.random.choice(
192
+ np.arange(input_length - (mask_length - 1)), num_masked_span, replace=False
193
+ )
194
+
195
+ # pick first sampled index that will serve as a dummy index to pad vector
196
+ # to ensure same dimension for all batches due to probabilistic rounding
197
+ # Picking first sample just pads those vectors twice.
198
+ if len(spec_aug_mask_idx) == 0:
199
+ # this case can only happen if `input_length` is strictly smaller then
200
+ # `sequence_length` in which case the last token has to be a padding
201
+ # token which we can use as a dummy mask id
202
+ dummy_mask_idx = sequence_length - 1
203
+ else:
204
+ dummy_mask_idx = spec_aug_mask_idx[0]
205
+
206
+ spec_aug_mask_idx = np.concatenate(
207
+ [spec_aug_mask_idx, np.ones(max_num_masked_span - num_masked_span, dtype=np.int32) * dummy_mask_idx]
208
+ )
209
+ spec_aug_mask_idxs.append(spec_aug_mask_idx)
210
+
211
+ spec_aug_mask_idxs = np.array(spec_aug_mask_idxs)
212
+
213
+ # expand masked indices to masked spans
214
+ spec_aug_mask_idxs = np.broadcast_to(
215
+ spec_aug_mask_idxs[:, :, None], (batch_size, max_num_masked_span, mask_length)
216
+ )
217
+ spec_aug_mask_idxs = spec_aug_mask_idxs.reshape(batch_size, max_num_masked_span * mask_length)
218
+
219
+ # add offset to the starting indexes so that indexes now create a span
220
+ offsets = np.arange(mask_length)[None, None, :]
221
+ offsets = np.broadcast_to(offsets, (batch_size, max_num_masked_span, mask_length)).reshape(
222
+ batch_size, max_num_masked_span * mask_length
223
+ )
224
+ spec_aug_mask_idxs = spec_aug_mask_idxs + offsets
225
+
226
+ # ensure that we cannot have indices larger than sequence_length
227
+ if spec_aug_mask_idxs.max() > sequence_length - 1:
228
+ spec_aug_mask_idxs[spec_aug_mask_idxs > sequence_length - 1] = sequence_length - 1
229
+
230
+ # scatter indices to mask
231
+ np.put_along_axis(spec_aug_mask, spec_aug_mask_idxs, 1, -1)
232
+
233
+ return spec_aug_mask
234
+
235
+
236
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2._sample_negative_indices
237
+ def _sample_negative_indices(
238
+ features_shape: Tuple, num_negatives: int, mask_time_indices: Optional[np.ndarray] = None
239
+ ):
240
+ """
241
+ Sample `num_negatives` vectors from feature vectors.
242
+ """
243
+ batch_size, sequence_length = features_shape
244
+
245
+ # generate indices of the positive vectors themselves, repeat them `num_negatives` times
246
+ sequence_length_range = np.arange(sequence_length)
247
+
248
+ # get `num_negatives` random vector indices from the same utterance
249
+ sampled_negative_indices = np.zeros(shape=(batch_size, sequence_length, num_negatives), dtype=np.int32)
250
+
251
+ mask_time_indices = (
252
+ mask_time_indices.astype(bool) if mask_time_indices is not None else np.ones(features_shape, dtype=bool)
253
+ )
254
+
255
+ for batch_idx in range(batch_size):
256
+ high = mask_time_indices[batch_idx].sum() - 1
257
+ mapped_masked_indices = sequence_length_range[mask_time_indices[batch_idx]]
258
+
259
+ feature_indices = np.broadcast_to(np.arange(high + 1)[:, None], (high + 1, num_negatives))
260
+ sampled_indices = np.random.randint(0, high, size=(high + 1, num_negatives))
261
+ # avoid sampling the same positive vector, but keep the distribution uniform
262
+ sampled_indices[sampled_indices >= feature_indices] += 1
263
+
264
+ # remap to actual indices
265
+ sampled_negative_indices[batch_idx][mask_time_indices[batch_idx]] = mapped_masked_indices[sampled_indices]
266
+
267
+ # correct for batch size
268
+ sampled_negative_indices[batch_idx] += batch_idx * sequence_length
269
+
270
+ return sampled_negative_indices
271
+
272
+
273
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2NoLayerNormConvLayer with Wav2Vec2->Wav2Vec2Conformer
274
+ class Wav2Vec2ConformerNoLayerNormConvLayer(nn.Module):
275
+ def __init__(self, config, layer_id=0):
276
+ super().__init__()
277
+ self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
278
+ self.out_conv_dim = config.conv_dim[layer_id]
279
+
280
+ self.conv = nn.Conv1d(
281
+ self.in_conv_dim,
282
+ self.out_conv_dim,
283
+ kernel_size=config.conv_kernel[layer_id],
284
+ stride=config.conv_stride[layer_id],
285
+ bias=config.conv_bias,
286
+ )
287
+ self.activation = ACT2FN[config.feat_extract_activation]
288
+
289
+ def forward(self, hidden_states):
290
+ hidden_states = self.conv(hidden_states)
291
+ hidden_states = self.activation(hidden_states)
292
+ return hidden_states
293
+
294
+
295
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2LayerNormConvLayer with Wav2Vec2->Wav2Vec2Conformer
296
+ class Wav2Vec2ConformerLayerNormConvLayer(nn.Module):
297
+ def __init__(self, config, layer_id=0):
298
+ super().__init__()
299
+ self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
300
+ self.out_conv_dim = config.conv_dim[layer_id]
301
+
302
+ self.conv = nn.Conv1d(
303
+ self.in_conv_dim,
304
+ self.out_conv_dim,
305
+ kernel_size=config.conv_kernel[layer_id],
306
+ stride=config.conv_stride[layer_id],
307
+ bias=config.conv_bias,
308
+ )
309
+ self.layer_norm = nn.LayerNorm(self.out_conv_dim, elementwise_affine=True)
310
+ self.activation = ACT2FN[config.feat_extract_activation]
311
+
312
+ def forward(self, hidden_states):
313
+ hidden_states = self.conv(hidden_states)
314
+
315
+ hidden_states = hidden_states.transpose(-2, -1)
316
+ hidden_states = self.layer_norm(hidden_states)
317
+ hidden_states = hidden_states.transpose(-2, -1)
318
+
319
+ hidden_states = self.activation(hidden_states)
320
+ return hidden_states
321
+
322
+
323
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GroupNormConvLayer with Wav2Vec2->Wav2Vec2Conformer
324
+ class Wav2Vec2ConformerGroupNormConvLayer(nn.Module):
325
+ def __init__(self, config, layer_id=0):
326
+ super().__init__()
327
+ self.in_conv_dim = config.conv_dim[layer_id - 1] if layer_id > 0 else 1
328
+ self.out_conv_dim = config.conv_dim[layer_id]
329
+
330
+ self.conv = nn.Conv1d(
331
+ self.in_conv_dim,
332
+ self.out_conv_dim,
333
+ kernel_size=config.conv_kernel[layer_id],
334
+ stride=config.conv_stride[layer_id],
335
+ bias=config.conv_bias,
336
+ )
337
+ self.activation = ACT2FN[config.feat_extract_activation]
338
+
339
+ self.layer_norm = nn.GroupNorm(num_groups=self.out_conv_dim, num_channels=self.out_conv_dim, affine=True)
340
+
341
+ def forward(self, hidden_states):
342
+ hidden_states = self.conv(hidden_states)
343
+ hidden_states = self.layer_norm(hidden_states)
344
+ hidden_states = self.activation(hidden_states)
345
+ return hidden_states
346
+
347
+
348
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2PositionalConvEmbedding with Wav2Vec2->Wav2Vec2Conformer
349
+ class Wav2Vec2ConformerPositionalConvEmbedding(nn.Module):
350
+ def __init__(self, config):
351
+ super().__init__()
352
+ self.conv = nn.Conv1d(
353
+ config.hidden_size,
354
+ config.hidden_size,
355
+ kernel_size=config.num_conv_pos_embeddings,
356
+ padding=config.num_conv_pos_embeddings // 2,
357
+ groups=config.num_conv_pos_embedding_groups,
358
+ )
359
+
360
+ if is_deepspeed_zero3_enabled():
361
+ import deepspeed
362
+
363
+ with deepspeed.zero.GatheredParameters(self.conv.weight, modifier_rank=0):
364
+ self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
365
+ deepspeed.zero.register_external_parameter(self, self.conv.weight_v)
366
+ deepspeed.zero.register_external_parameter(self, self.conv.weight_g)
367
+ else:
368
+ self.conv = nn.utils.weight_norm(self.conv, name="weight", dim=2)
369
+
370
+ self.padding = Wav2Vec2ConformerSamePadLayer(config.num_conv_pos_embeddings)
371
+ self.activation = ACT2FN[config.feat_extract_activation]
372
+
373
+ def forward(self, hidden_states):
374
+ hidden_states = hidden_states.transpose(1, 2)
375
+
376
+ hidden_states = self.conv(hidden_states)
377
+ hidden_states = self.padding(hidden_states)
378
+ hidden_states = self.activation(hidden_states)
379
+
380
+ hidden_states = hidden_states.transpose(1, 2)
381
+ return hidden_states
382
+
383
+
384
+ class Wav2Vec2ConformerRotaryPositionalEmbedding(nn.Module):
385
+ """Rotary positional embedding
386
+ Reference : https://blog.eleuther.ai/rotary-embeddings/ Paper: https://arxiv.org/pdf/2104.09864.pdf
387
+ """
388
+
389
+ def __init__(self, config):
390
+ super().__init__()
391
+ dim = config.hidden_size // config.num_attention_heads
392
+ base = config.rotary_embedding_base
393
+
394
+ inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2).float() / dim))
395
+ self.register_buffer("inv_freq", inv_freq)
396
+ self.cached_sequence_length = None
397
+ self.cached_rotary_positional_embedding = None
398
+
399
+ def forward(self, hidden_states):
400
+ sequence_length = hidden_states.shape[1]
401
+
402
+ if sequence_length == self.cached_sequence_length and self.cached_rotary_positional_embedding is not None:
403
+ return self.cached_rotary_positional_embedding
404
+
405
+ self.cached_sequence_length = sequence_length
406
+ time_stamps = torch.arange(sequence_length).type_as(self.inv_freq)
407
+ freqs = torch.einsum("i,j->ij", time_stamps, self.inv_freq)
408
+ embeddings = torch.cat((freqs, freqs), dim=-1)
409
+
410
+ cos_embeddings = embeddings.cos()[:, None, None, :]
411
+ sin_embeddings = embeddings.sin()[:, None, None, :]
412
+ self.cached_rotary_positional_embedding = torch.stack([cos_embeddings, sin_embeddings])
413
+ return self.cached_rotary_positional_embedding
414
+
415
+
416
+ class Wav2Vec2ConformerRelPositionalEmbedding(nn.Module):
417
+ """Relative positional encoding module."""
418
+
419
+ def __init__(self, config):
420
+ super().__init__()
421
+ self.max_len = config.max_source_positions
422
+ self.d_model = config.hidden_size
423
+ self.pe = None
424
+ self.extend_pe(torch.tensor(0.0).expand(1, self.max_len))
425
+
426
+ def extend_pe(self, x):
427
+ # Reset the positional encodings
428
+ if self.pe is not None:
429
+ # self.pe contains both positive and negative parts
430
+ # the length of self.pe is 2 * input_len - 1
431
+ if self.pe.size(1) >= x.size(1) * 2 - 1:
432
+ if self.pe.dtype != x.dtype or self.pe.device != x.device:
433
+ self.pe = self.pe.to(dtype=x.dtype, device=x.device)
434
+ return
435
+ # Suppose `i` is the position of query vector and `j` is the
436
+ # position of key vector. We use positive relative positions when keys
437
+ # are to the left (i>j) and negative relative positions otherwise (i<j).
438
+ pe_positive = torch.zeros(x.size(1), self.d_model)
439
+ pe_negative = torch.zeros(x.size(1), self.d_model)
440
+ position = torch.arange(0, x.size(1), dtype=torch.float32).unsqueeze(1)
441
+ div_term = torch.exp(
442
+ torch.arange(0, self.d_model, 2, dtype=torch.float32) * -(math.log(10000.0) / self.d_model)
443
+ )
444
+ pe_positive[:, 0::2] = torch.sin(position * div_term)
445
+ pe_positive[:, 1::2] = torch.cos(position * div_term)
446
+ pe_negative[:, 0::2] = torch.sin(-1 * position * div_term)
447
+ pe_negative[:, 1::2] = torch.cos(-1 * position * div_term)
448
+
449
+ # Reverse the order of positive indices and concat both positive and
450
+ # negative indices. This is used to support the shifting trick
451
+ # as in https://arxiv.org/abs/1901.02860
452
+ pe_positive = torch.flip(pe_positive, [0]).unsqueeze(0)
453
+ pe_negative = pe_negative[1:].unsqueeze(0)
454
+ pe = torch.cat([pe_positive, pe_negative], dim=1)
455
+ self.pe = pe.to(device=x.device, dtype=x.dtype)
456
+
457
+ def forward(self, hidden_states: torch.Tensor):
458
+ self.extend_pe(hidden_states)
459
+ start_idx = self.pe.size(1) // 2 - hidden_states.size(1) + 1
460
+ end_idx = self.pe.size(1) // 2 + hidden_states.size(1)
461
+ relative_position_embeddings = self.pe[:, start_idx:end_idx]
462
+
463
+ return relative_position_embeddings
464
+
465
+
466
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2SamePadLayer with Wav2Vec2->Wav2Vec2Conformer
467
+ class Wav2Vec2ConformerSamePadLayer(nn.Module):
468
+ def __init__(self, num_conv_pos_embeddings):
469
+ super().__init__()
470
+ self.num_pad_remove = 1 if num_conv_pos_embeddings % 2 == 0 else 0
471
+
472
+ def forward(self, hidden_states):
473
+ if self.num_pad_remove > 0:
474
+ hidden_states = hidden_states[:, :, : -self.num_pad_remove]
475
+ return hidden_states
476
+
477
+
478
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureEncoder with Wav2Vec2->Wav2Vec2Conformer
479
+ class Wav2Vec2ConformerFeatureEncoder(nn.Module):
480
+ """Construct the features from raw audio waveform"""
481
+
482
+ def __init__(self, config):
483
+ super().__init__()
484
+
485
+ if config.feat_extract_norm == "group":
486
+ conv_layers = [Wav2Vec2ConformerGroupNormConvLayer(config, layer_id=0)] + [
487
+ Wav2Vec2ConformerNoLayerNormConvLayer(config, layer_id=i + 1)
488
+ for i in range(config.num_feat_extract_layers - 1)
489
+ ]
490
+ elif config.feat_extract_norm == "layer":
491
+ conv_layers = [
492
+ Wav2Vec2ConformerLayerNormConvLayer(config, layer_id=i) for i in range(config.num_feat_extract_layers)
493
+ ]
494
+ else:
495
+ raise ValueError(
496
+ f"`config.feat_extract_norm` is {config.feat_extract_norm}, but has to be one of ['group', 'layer']"
497
+ )
498
+ self.conv_layers = nn.ModuleList(conv_layers)
499
+ self.gradient_checkpointing = False
500
+ self._requires_grad = True
501
+
502
+ def _freeze_parameters(self):
503
+ for param in self.parameters():
504
+ param.requires_grad = False
505
+ self._requires_grad = False
506
+
507
+ def forward(self, input_values):
508
+ hidden_states = input_values[:, None]
509
+
510
+ # make sure hidden_states require grad for gradient_checkpointing
511
+ if self._requires_grad and self.training:
512
+ hidden_states.requires_grad = True
513
+
514
+ for conv_layer in self.conv_layers:
515
+ if self._requires_grad and self.gradient_checkpointing and self.training:
516
+
517
+ def create_custom_forward(module):
518
+ def custom_forward(*inputs):
519
+ return module(*inputs)
520
+
521
+ return custom_forward
522
+
523
+ hidden_states = torch.utils.checkpoint.checkpoint(
524
+ create_custom_forward(conv_layer),
525
+ hidden_states,
526
+ )
527
+ else:
528
+ hidden_states = conv_layer(hidden_states)
529
+
530
+ return hidden_states
531
+
532
+
533
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeatureProjection with Wav2Vec2->Wav2Vec2Conformer
534
+ class Wav2Vec2ConformerFeatureProjection(nn.Module):
535
+ def __init__(self, config):
536
+ super().__init__()
537
+ self.layer_norm = nn.LayerNorm(config.conv_dim[-1], eps=config.layer_norm_eps)
538
+ self.projection = nn.Linear(config.conv_dim[-1], config.hidden_size)
539
+ self.dropout = nn.Dropout(config.feat_proj_dropout)
540
+
541
+ def forward(self, hidden_states):
542
+ # non-projected hidden states are needed for quantization
543
+ norm_hidden_states = self.layer_norm(hidden_states)
544
+ hidden_states = self.projection(norm_hidden_states)
545
+ hidden_states = self.dropout(hidden_states)
546
+ return hidden_states, norm_hidden_states
547
+
548
+
549
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2FeedForward with Wav2Vec2->Wav2Vec2Conformer
550
+ class Wav2Vec2ConformerFeedForward(nn.Module):
551
+ def __init__(self, config):
552
+ super().__init__()
553
+ self.intermediate_dropout = nn.Dropout(config.activation_dropout)
554
+
555
+ self.intermediate_dense = nn.Linear(config.hidden_size, config.intermediate_size)
556
+ if isinstance(config.hidden_act, str):
557
+ self.intermediate_act_fn = ACT2FN[config.hidden_act]
558
+ else:
559
+ self.intermediate_act_fn = config.hidden_act
560
+
561
+ self.output_dense = nn.Linear(config.intermediate_size, config.hidden_size)
562
+ self.output_dropout = nn.Dropout(config.hidden_dropout)
563
+
564
+ def forward(self, hidden_states):
565
+ hidden_states = self.intermediate_dense(hidden_states)
566
+ hidden_states = self.intermediate_act_fn(hidden_states)
567
+ hidden_states = self.intermediate_dropout(hidden_states)
568
+
569
+ hidden_states = self.output_dense(hidden_states)
570
+ hidden_states = self.output_dropout(hidden_states)
571
+ return hidden_states
572
+
573
+
574
+ class Wav2Vec2ConformerConvolutionModule(nn.Module):
575
+ """Convolution block used in the conformer block"""
576
+
577
+ def __init__(self, config):
578
+ super().__init__()
579
+ if (config.conv_depthwise_kernel_size - 1) % 2 == 1:
580
+ raise ValueError("`config.conv_depthwise_kernel_size` should be an odd number for 'SAME' padding")
581
+ self.layer_norm = nn.LayerNorm(config.hidden_size)
582
+ self.pointwise_conv1 = torch.nn.Conv1d(
583
+ config.hidden_size,
584
+ 2 * config.hidden_size,
585
+ kernel_size=1,
586
+ stride=1,
587
+ padding=0,
588
+ bias=False,
589
+ )
590
+ self.glu = torch.nn.GLU(dim=1)
591
+ self.depthwise_conv = torch.nn.Conv1d(
592
+ config.hidden_size,
593
+ config.hidden_size,
594
+ config.conv_depthwise_kernel_size,
595
+ stride=1,
596
+ padding=(config.conv_depthwise_kernel_size - 1) // 2,
597
+ groups=config.hidden_size,
598
+ bias=False,
599
+ )
600
+ self.batch_norm = torch.nn.BatchNorm1d(config.hidden_size)
601
+ self.activation = ACT2FN[config.hidden_act]
602
+ self.pointwise_conv2 = torch.nn.Conv1d(
603
+ config.hidden_size,
604
+ config.hidden_size,
605
+ kernel_size=1,
606
+ stride=1,
607
+ padding=0,
608
+ bias=False,
609
+ )
610
+ self.dropout = torch.nn.Dropout(config.conformer_conv_dropout)
611
+
612
+ def forward(self, hidden_states):
613
+ hidden_states = self.layer_norm(hidden_states)
614
+ # exchange the temporal dimension and the feature dimension
615
+ hidden_states = hidden_states.transpose(1, 2)
616
+
617
+ # GLU mechanism
618
+ # => (batch, 2*channel, dim)
619
+ hidden_states = self.pointwise_conv1(hidden_states)
620
+ # => (batch, channel, dim)
621
+ hidden_states = self.glu(hidden_states)
622
+
623
+ # 1D Depthwise Conv
624
+ hidden_states = self.depthwise_conv(hidden_states)
625
+ hidden_states = self.batch_norm(hidden_states)
626
+ hidden_states = self.activation(hidden_states)
627
+
628
+ hidden_states = self.pointwise_conv2(hidden_states)
629
+ hidden_states = self.dropout(hidden_states)
630
+ hidden_states = hidden_states.transpose(1, 2)
631
+ return hidden_states
632
+
633
+
634
+ class Wav2Vec2ConformerSelfAttention(nn.Module):
635
+ """Construct an Wav2Vec2ConformerSelfAttention object.
636
+ Can be enhanced with rotary or relative position embeddings.
637
+ """
638
+
639
+ def __init__(self, config):
640
+ super().__init__()
641
+
642
+ self.head_size = config.hidden_size // config.num_attention_heads
643
+ self.num_heads = config.num_attention_heads
644
+ self.position_embeddings_type = config.position_embeddings_type
645
+
646
+ self.linear_q = nn.Linear(config.hidden_size, config.hidden_size)
647
+ self.linear_k = nn.Linear(config.hidden_size, config.hidden_size)
648
+ self.linear_v = nn.Linear(config.hidden_size, config.hidden_size)
649
+ self.linear_out = nn.Linear(config.hidden_size, config.hidden_size)
650
+
651
+ self.dropout = nn.Dropout(p=config.attention_dropout)
652
+ self.dropout_p = config.attention_dropout
653
+
654
+ self.is_causal = config.is_causal
655
+
656
+ if self.position_embeddings_type == "relative":
657
+ # linear transformation for positional encoding
658
+ self.linear_pos = nn.Linear(config.hidden_size, config.hidden_size, bias=False)
659
+ # these two learnable biases are used in matrix c and matrix d
660
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
661
+ self.pos_bias_u = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
662
+ self.pos_bias_v = nn.Parameter(torch.zeros(self.num_heads, self.head_size))
663
+
664
+ def forward(
665
+ self,
666
+ hidden_states: torch.Tensor,
667
+ attention_mask: Optional[torch.Tensor] = None,
668
+ relative_position_embeddings: Optional[torch.Tensor] = None,
669
+ output_attentions: bool = False,
670
+ ) -> Tuple[torch.Tensor, Optional[torch.Tensor], Optional[Tuple[torch.Tensor]]]:
671
+ # self-attention mechanism
672
+ batch_size, sequence_length, hidden_size = hidden_states.size()
673
+
674
+ # make sure query/key states can be != value states
675
+ query_key_states = hidden_states
676
+ value_states = hidden_states
677
+
678
+ if self.position_embeddings_type == "rotary":
679
+ if relative_position_embeddings is None:
680
+ raise ValueError(
681
+ "`relative_position_embeddings` has to be defined when `self.position_embeddings_type == 'rotary'"
682
+ )
683
+ query_key_states = self._apply_rotary_embedding(query_key_states, relative_position_embeddings)
684
+
685
+ # project query_key_states and value_states
686
+ query = self.linear_q(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
687
+ key = self.linear_k(query_key_states).view(batch_size, -1, self.num_heads, self.head_size)
688
+ value = self.linear_v(value_states).view(batch_size, -1, self.num_heads, self.head_size)
689
+
690
+ # => (batch, head, time1, d_k)
691
+ query = query.transpose(1, 2)
692
+ key = key.transpose(1, 2)
693
+ value = value.transpose(1, 2)
694
+
695
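+ # note: the flash-attention SDPA backend is forced here and attention probabilities are never
+ # materialized, so `probs` below stays None even when `output_attentions=True` is requested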
+ with torch.backends.cuda.sdp_kernel(enable_math=False, enable_flash=True, enable_mem_efficient=False):
696
+ hidden_states = F.scaled_dot_product_attention(query, key, value, attn_mask=attention_mask, dropout_p=self.dropout_p if self.training else 0.0, is_causal=self.is_causal)
697
+ probs = None
698
+
699
+ # # apply attention_mask if necessary
700
+ # if attention_mask is not None:
701
+ # scores = scores + attention_mask
702
+
703
+ # # => (batch, head, time1, time2)
704
+ # probs = torch.softmax(scores, dim=-1)
705
+ # probs = self.dropout(probs)
706
+
707
+ # # => (batch, head, time1, d_k)
708
+ # hidden_states = torch.matmul(probs, value)
709
+
710
+ # => (batch, time1, hidden_size)
711
+ hidden_states = hidden_states.transpose(1, 2).reshape(batch_size, -1, self.num_heads * self.head_size)
712
+ hidden_states = self.linear_out(hidden_states)
713
+
714
+ return hidden_states, probs
715
+
716
+ def _apply_rotary_embedding(self, hidden_states, relative_position_embeddings):
717
+ batch_size, sequence_length, hidden_size = hidden_states.size()
718
+ hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads, self.head_size)
719
+
720
+ cos = relative_position_embeddings[0, :sequence_length, ...]
721
+ sin = relative_position_embeddings[1, :sequence_length, ...]
722
+
723
+ # rotate hidden_states with rotary embeddings
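+ # standard "rotate-half" formulation: each head is split into two halves (x1, x2) and recombined as x * cos + rotate_half(x) * sin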
724
+ hidden_states = hidden_states.transpose(0, 1)
725
+ rotated_states_begin = hidden_states[..., : self.head_size // 2]
726
+ rotated_states_end = hidden_states[..., self.head_size // 2 :]
727
+ rotated_states = torch.cat((-rotated_states_end, rotated_states_begin), dim=rotated_states_begin.ndim - 1)
728
+ hidden_states = (hidden_states * cos) + (rotated_states * sin)
729
+ hidden_states = hidden_states.transpose(0, 1)
730
+
731
+ hidden_states = hidden_states.view(batch_size, sequence_length, self.num_heads * self.head_size)
732
+
733
+ return hidden_states
734
+
735
+ def _apply_relative_embeddings(self, query, key, relative_position_embeddings):
736
+ # 1. project positional embeddings
737
+ # => (batch, head, 2*time1-1, d_k)
738
+ proj_relative_position_embeddings = self.linear_pos(relative_position_embeddings)
739
+ proj_relative_position_embeddings = proj_relative_position_embeddings.view(
740
+ relative_position_embeddings.size(0), -1, self.num_heads, self.head_size
741
+ )
742
+ proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(1, 2)
743
+ proj_relative_position_embeddings = proj_relative_position_embeddings.transpose(2, 3)
744
+
745
+ # 2. Add bias to query
746
+ # => (batch, head, time1, d_k)
747
+ query = query.transpose(1, 2)
748
+ q_with_bias_u = (query + self.pos_bias_u).transpose(1, 2)
749
+ q_with_bias_v = (query + self.pos_bias_v).transpose(1, 2)
750
+
751
+ # 3. attention score: first compute matrix a and matrix c
752
+ # as described in https://arxiv.org/abs/1901.02860 Section 3.3
753
+ # => (batch, head, time1, time2)
754
+ scores_ac = torch.matmul(q_with_bias_u, key.transpose(-2, -1))
755
+
756
+ # 4. then compute matrix b and matrix d
757
+ # => (batch, head, time1, 2*time1-1)
758
+ scores_bd = torch.matmul(q_with_bias_v, proj_relative_position_embeddings)
759
+
760
+ # 5. shift matrix b and matrix d
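+ # (Transformer-XL style relative shift: the pad / reshape / slice below realigns the
+ # (time1, 2*time1-1) relative scores so each query position indexes the scores for its own relative offsets)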
761
+ zero_pad = torch.zeros((*scores_bd.size()[:3], 1), device=scores_bd.device, dtype=scores_bd.dtype)
762
+ scores_bd_padded = torch.cat([zero_pad, scores_bd], dim=-1)
763
+ scores_bd_padded_shape = scores_bd.size()[:2] + (scores_bd.shape[3] + 1, scores_bd.shape[2])
764
+ scores_bd_padded = scores_bd_padded.view(*scores_bd_padded_shape)
765
+ scores_bd = scores_bd_padded[:, :, 1:].view_as(scores_bd)
766
+ scores_bd = scores_bd[:, :, :, : scores_bd.size(-1) // 2 + 1]
767
+
768
+ # 6. sum matrices
769
+ # => (batch, head, time1, time2)
770
+ scores = (scores_ac + scores_bd) / math.sqrt(self.head_size)
771
+
772
+ return scores
773
+
774
+
775
+ class Wav2Vec2ConformerEncoderLayer(nn.Module):
776
+ """Conformer block based on https://arxiv.org/abs/2005.08100."""
777
+
778
+ def __init__(self, config):
779
+ super().__init__()
780
+ embed_dim = config.hidden_size
781
+ dropout = config.attention_dropout
782
+
783
+ # Feed-forward 1
784
+ self.ffn1_layer_norm = nn.LayerNorm(embed_dim)
785
+ self.ffn1 = Wav2Vec2ConformerFeedForward(config)
786
+
787
+ # Self-Attention
788
+ self.self_attn_layer_norm = nn.LayerNorm(embed_dim)
789
+ self.self_attn_dropout = torch.nn.Dropout(dropout)
790
+ self.self_attn = Wav2Vec2ConformerSelfAttention(config)
791
+
792
+ # Conformer Convolution
793
+ self.conv_module = Wav2Vec2ConformerConvolutionModule(config)
794
+
795
+ # Feed-forward 2
796
+ self.ffn2_layer_norm = nn.LayerNorm(embed_dim)
797
+ self.ffn2 = Wav2Vec2ConformerFeedForward(config)
798
+ self.final_layer_norm = nn.LayerNorm(embed_dim)
799
+
800
+ def forward(
801
+ self,
802
+ hidden_states,
803
+ attention_mask: Optional[torch.Tensor] = None,
804
+ relative_position_embeddings: Optional[torch.Tensor] = None,
805
+ output_attentions: bool = False,
806
+ ):
807
+ hidden_states = hidden_states
808
+
809
+ # 1. Feed-Forward 1 layer
810
+ residual = hidden_states
811
+ hidden_states = self.ffn1_layer_norm(hidden_states)
812
+ hidden_states = self.ffn1(hidden_states)
813
+ hidden_states = hidden_states * 0.5 + residual
814
+ residual = hidden_states
815
+
816
+ # 2. Self-Attention layer
817
+ hidden_states = self.self_attn_layer_norm(hidden_states)
818
+ hidden_states, attn_weights = self.self_attn(
819
+ hidden_states=hidden_states,
820
+ attention_mask=attention_mask,
821
+ relative_position_embeddings=relative_position_embeddings,
822
+ output_attentions=output_attentions,
823
+ )
824
+ hidden_states = self.self_attn_dropout(hidden_states)
825
+ hidden_states = hidden_states + residual
826
+
827
+ # 3. Convolutional Layer
828
+ residual = hidden_states
829
+ hidden_states = self.conv_module(hidden_states)
830
+ hidden_states = residual + hidden_states
831
+
832
+ # 4. Feed-Forward 2 Layer
833
+ residual = hidden_states
834
+ hidden_states = self.ffn2_layer_norm(hidden_states)
835
+ hidden_states = self.ffn2(hidden_states)
836
+ hidden_states = hidden_states * 0.5 + residual
837
+ hidden_states = self.final_layer_norm(hidden_states)
838
+
839
+ return hidden_states, attn_weights
840
+
841
+
842
+ class Wav2Vec2ConformerEncoder(nn.Module):
843
+ def __init__(self, config, is_causal=False):
844
+ super().__init__()
845
+ config.is_causal = is_causal
846
+ self.config = config
847
+
848
+ if config.position_embeddings_type == "relative":
849
+ self.embed_positions = Wav2Vec2ConformerRelPositionalEmbedding(config)
850
+ elif config.position_embeddings_type == "rotary":
851
+ self.embed_positions = Wav2Vec2ConformerRotaryPositionalEmbedding(config)
852
+ else:
853
+ self.embed_positions = None
854
+
855
+ self.pos_conv_embed = Wav2Vec2ConformerPositionalConvEmbedding(config)
856
+ self.layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
857
+ self.dropout = nn.Dropout(config.hidden_dropout)
858
+ self.layers = nn.ModuleList([Wav2Vec2ConformerEncoderLayer(config) for _ in range(config.num_hidden_layers)])
859
+ self.gradient_checkpointing = False
860
+
861
+ def forward(
862
+ self,
863
+ hidden_states,
864
+ attention_mask=None,
865
+ output_attentions=False,
866
+ output_hidden_states=False,
867
+ return_dict=True,
868
+ ):
869
+ all_hidden_states = () if output_hidden_states else None
870
+ all_self_attentions = () if output_attentions else None
871
+
872
+ if attention_mask is not None:
873
+ # make sure padded tokens output 0
874
+ hidden_states[~attention_mask] = 0.0
875
+
876
+ # extend attention_mask
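+ # the boolean mask becomes an additive float mask: masked positions receive the dtype minimum
+ # so they contribute (numerically) nothing after the attention softmax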
877
+ attention_mask = 1.0 - attention_mask[:, None, None, :].to(dtype=hidden_states.dtype)
878
+ attention_mask = attention_mask * torch.finfo(hidden_states.dtype).min
879
+ attention_mask = attention_mask.expand(
880
+ attention_mask.shape[0], 1, attention_mask.shape[-1], attention_mask.shape[-1]
881
+ )
882
+
883
+ hidden_states = self.dropout(hidden_states)
884
+
885
+ if self.embed_positions is not None:
886
+ relative_position_embeddings = self.embed_positions(hidden_states)
887
+ else:
888
+ relative_position_embeddings = None
889
+
890
+ deepspeed_zero3_is_enabled = is_deepspeed_zero3_enabled()
891
+
892
+ for i, layer in enumerate(self.layers):
893
+ if output_hidden_states:
894
+ all_hidden_states = all_hidden_states + (hidden_states,)
895
+
896
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
897
+ dropout_probability = np.random.uniform(0, 1)
898
+
899
+ skip_the_layer = self.training and (dropout_probability < self.config.layerdrop)
900
+ if not skip_the_layer or deepspeed_zero3_is_enabled:
901
+ # under deepspeed zero3 all gpus must run in sync
902
+ if self.gradient_checkpointing and self.training:
903
+ # create gradient checkpointing function
904
+ def create_custom_forward(module):
905
+ def custom_forward(*inputs):
906
+ return module(*inputs, output_attentions)
907
+
908
+ return custom_forward
909
+
910
+ layer_outputs = torch.utils.checkpoint.checkpoint(
911
+ create_custom_forward(layer),
912
+ hidden_states,
913
+ attention_mask,
914
+ relative_position_embeddings,
915
+ )
916
+ else:
917
+ layer_outputs = layer(
918
+ hidden_states,
919
+ attention_mask=attention_mask,
920
+ relative_position_embeddings=relative_position_embeddings,
921
+ output_attentions=output_attentions,
922
+ )
923
+ hidden_states = layer_outputs[0]
924
+
925
+ if skip_the_layer:
926
+ layer_outputs = (None, None)
927
+
928
+ if output_attentions:
929
+ all_self_attentions = all_self_attentions + (layer_outputs[1],)
930
+
931
+ hidden_states = self.layer_norm(hidden_states)
932
+ if output_hidden_states:
933
+ all_hidden_states = all_hidden_states + (hidden_states,)
934
+
935
+ if not return_dict:
936
+ return tuple(v for v in [hidden_states, all_hidden_states, all_self_attentions] if v is not None)
937
+ return BaseModelOutput(
938
+ last_hidden_state=hidden_states,
939
+ hidden_states=all_hidden_states,
940
+ attentions=all_self_attentions,
941
+ )
942
+
943
+
944
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2GumbelVectorQuantizer with Wav2Vec2->Wav2Vec2Conformer
945
+ class Wav2Vec2ConformerGumbelVectorQuantizer(nn.Module):
946
+ """
947
+ Vector quantization using gumbel softmax. See `[CATEGORICAL REPARAMETERIZATION WITH
948
+ GUMBEL-SOFTMAX](https://arxiv.org/pdf/1611.01144.pdf)` for more information.
949
+ """
950
+
951
+ def __init__(self, config):
952
+ super().__init__()
953
+ self.num_groups = config.num_codevector_groups
954
+ self.num_vars = config.num_codevectors_per_group
955
+
956
+ if config.codevector_dim % self.num_groups != 0:
957
+ raise ValueError(
958
+ f"`config.codevector_dim {config.codevector_dim} must be divisible "
959
+ f"by `config.num_codevector_groups` {self.num_groups} for concatenation"
960
+ )
961
+
962
+ # storage for codebook variables (codewords)
963
+ self.codevectors = nn.Parameter(
964
+ torch.FloatTensor(1, self.num_groups * self.num_vars, config.codevector_dim // self.num_groups)
965
+ )
966
+ self.weight_proj = nn.Linear(config.conv_dim[-1], self.num_groups * self.num_vars)
967
+
968
+ # can be decayed for training
969
+ self.temperature = 2
970
+
971
+ @staticmethod
972
+ def _compute_perplexity(probs, mask=None):
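+ # perplexity of the marginal codevector distribution; higher values mean the codebook is used
+ # more uniformly (this quantity feeds the diversity loss during pre-training)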
973
+ if mask is not None:
974
+ mask_extended = mask.flatten()[:, None, None].expand(probs.shape)
975
+ probs = torch.where(mask_extended, probs, torch.zeros_like(probs))
976
+ marginal_probs = probs.sum(dim=0) / mask.sum()
977
+ else:
978
+ marginal_probs = probs.mean(dim=0)
979
+
980
+ perplexity = torch.exp(-torch.sum(marginal_probs * torch.log(marginal_probs + 1e-7), dim=-1)).sum()
981
+ return perplexity
982
+
983
+ def forward(self, hidden_states, mask_time_indices=None):
984
+ batch_size, sequence_length, hidden_size = hidden_states.shape
985
+
986
+ # project to codevector dim
987
+ hidden_states = self.weight_proj(hidden_states)
988
+ hidden_states = hidden_states.view(batch_size * sequence_length * self.num_groups, -1)
989
+
990
+ if self.training:
991
+ # sample code vector probs via gumbel in differentiable way
992
+ codevector_probs = nn.functional.gumbel_softmax(
993
+ hidden_states.float(), tau=self.temperature, hard=True
994
+ ).type_as(hidden_states)
995
+
996
+ # compute perplexity
997
+ codevector_soft_dist = torch.softmax(
998
+ hidden_states.view(batch_size * sequence_length, self.num_groups, -1).float(), dim=-1
999
+ )
1000
+ perplexity = self._compute_perplexity(codevector_soft_dist, mask_time_indices)
1001
+ else:
1002
+ # take argmax in non-differentiable way
1003
+ # compute hard codevector distribution (one hot)
1004
+ codevector_idx = hidden_states.argmax(dim=-1)
1005
+ codevector_probs = hidden_states.new_zeros(hidden_states.shape).scatter_(
1006
+ -1, codevector_idx.view(-1, 1), 1.0
1007
+ )
1008
+ codevector_probs = codevector_probs.view(batch_size * sequence_length, self.num_groups, -1)
1009
+
1010
+ perplexity = self._compute_perplexity(codevector_probs, mask_time_indices)
1011
+
1012
+ codevector_probs = codevector_probs.view(batch_size * sequence_length, -1)
1013
+ # use probs to retrieve codevectors
1014
+ codevectors_per_group = codevector_probs.unsqueeze(-1) * self.codevectors
1015
+ codevectors = codevectors_per_group.view(batch_size * sequence_length, self.num_groups, self.num_vars, -1)
1016
+ codevectors = codevectors.sum(-2).view(batch_size, sequence_length, -1)
1017
+
1018
+ return codevectors, perplexity
1019
+
1020
+
1021
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Adapter with Wav2Vec2->Wav2Vec2Conformer
1022
+ class Wav2Vec2ConformerAdapter(nn.Module):
1023
+ def __init__(self, config):
1024
+ super().__init__()
1025
+
1026
+ # feature dim might need to be down-projected
1027
+ if config.output_hidden_size != config.hidden_size:
1028
+ self.proj = nn.Linear(config.hidden_size, config.output_hidden_size)
1029
+ self.proj_layer_norm = nn.LayerNorm(config.output_hidden_size)
1030
+ else:
1031
+ self.proj = self.proj_layer_norm = None
1032
+
1033
+ self.layers = nn.ModuleList(Wav2Vec2ConformerAdapterLayer(config) for _ in range(config.num_adapter_layers))
1034
+ self.layerdrop = config.layerdrop
1035
+
1036
+ def forward(self, hidden_states):
1037
+ # down project hidden_states if necessary
1038
+ if self.proj is not None and self.proj_layer_norm is not None:
1039
+ hidden_states = self.proj(hidden_states)
1040
+ hidden_states = self.proj_layer_norm(hidden_states)
1041
+
1042
+ hidden_states = hidden_states.transpose(1, 2)
1043
+
1044
+ for layer in self.layers:
1045
+ layerdrop_prob = np.random.random()
1046
+ if not self.training or (layerdrop_prob > self.layerdrop):
1047
+ hidden_states = layer(hidden_states)
1048
+
1049
+ hidden_states = hidden_states.transpose(1, 2)
1050
+ return hidden_states
1051
+
1052
+
1053
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2AdapterLayer with Wav2Vec2->Wav2Vec2Conformer
1054
+ class Wav2Vec2ConformerAdapterLayer(nn.Module):
1055
+ def __init__(self, config):
1056
+ super().__init__()
1057
+ self.conv = nn.Conv1d(
1058
+ config.output_hidden_size,
1059
+ 2 * config.output_hidden_size,
1060
+ config.adapter_kernel_size,
1061
+ stride=config.adapter_stride,
1062
+ padding=1,
1063
+ )
1064
+
1065
+ def forward(self, hidden_states):
1066
+ hidden_states = self.conv(hidden_states)
1067
+ hidden_states = nn.functional.glu(hidden_states, dim=1)
1068
+
1069
+ return hidden_states
1070
+
1071
+
1072
+ class Wav2Vec2ConformerPreTrainedModel(PreTrainedModel):
1073
+ """
1074
+ An abstract class to handle weights initialization and a simple interface for downloading and loading pretrained
1075
+ models.
1076
+ """
1077
+
1078
+ config_class = Wav2Vec2ConformerConfig
1079
+ base_model_prefix = "wav2vec2_conformer"
1080
+ main_input_name = "input_values"
1081
+ _keys_to_ignore_on_load_missing = [r"position_ids"]
1082
+ supports_gradient_checkpointing = True
1083
+
1084
+ def _init_weights(self, module):
1085
+ """Initialize the weights"""
1086
+ # Wav2Vec2ForPreTraining last 2 linear layers need standard Linear init.
1087
+ if isinstance(module, Wav2Vec2ConformerForPreTraining):
1088
+ module.project_hid.reset_parameters()
1089
+ module.project_q.reset_parameters()
1090
+ module.project_hid._is_hf_initialized = True
1091
+ module.project_q._is_hf_initialized = True
1092
+ # gumbel softmax requires special init
1093
+ elif isinstance(module, Wav2Vec2ConformerGumbelVectorQuantizer):
1094
+ module.weight_proj.weight.data.normal_(mean=0.0, std=1)
1095
+ module.weight_proj.bias.data.zero_()
1096
+ nn.init.uniform_(module.codevectors)
1097
+ elif isinstance(module, Wav2Vec2ConformerSelfAttention):
1098
+ if hasattr(module, "pos_bias_u"):
1099
+ nn.init.xavier_uniform_(module.pos_bias_u)
1100
+ if hasattr(module, "pos_bias_v"):
1101
+ nn.init.xavier_uniform_(module.pos_bias_v)
1102
+ elif isinstance(module, Wav2Vec2ConformerPositionalConvEmbedding):
1103
+ nn.init.normal_(
1104
+ module.conv.weight,
1105
+ mean=0,
1106
+ std=2 * math.sqrt(1 / (module.conv.kernel_size[0] * module.conv.in_channels)),
1107
+ )
1108
+ nn.init.constant_(module.conv.bias, 0)
1109
+ elif isinstance(module, Wav2Vec2ConformerFeatureProjection):
1110
+ k = math.sqrt(1 / module.projection.in_features)
1111
+ nn.init.uniform_(module.projection.weight, a=-k, b=k)
1112
+ nn.init.uniform_(module.projection.bias, a=-k, b=k)
1113
+ elif isinstance(module, nn.Linear):
1114
+ module.weight.data.normal_(mean=0.0, std=self.config.initializer_range)
1115
+
1116
+ if module.bias is not None:
1117
+ module.bias.data.zero_()
1118
+ elif isinstance(module, (nn.LayerNorm, nn.GroupNorm)):
1119
+ module.bias.data.zero_()
1120
+ module.weight.data.fill_(1.0)
1121
+ elif isinstance(module, nn.Conv1d):
1122
+ nn.init.kaiming_normal_(module.weight)
1123
+
1124
+ if module.bias is not None:
1125
+ k = math.sqrt(module.groups / (module.in_channels * module.kernel_size[0]))
1126
+ nn.init.uniform_(module.bias, a=-k, b=k)
1127
+
1128
+ def _get_feat_extract_output_lengths(
1129
+ self, input_lengths: Union[torch.LongTensor, int], add_adapter: Optional[bool] = None
1130
+ ):
1131
+ """
1132
+ Computes the output length of the convolutional layers
1133
+ """
1134
+
1135
+ add_adapter = self.config.add_adapter if add_adapter is None else add_adapter
1136
+
1137
+ def _conv_out_length(input_length, kernel_size, stride):
1138
+ # 1D convolutional layer output length formula taken
1139
+ # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
1140
+ return torch.div(input_length - kernel_size, stride, rounding_mode="floor") + 1
1141
+
1142
+ for kernel_size, stride in zip(self.config.conv_kernel, self.config.conv_stride):
1143
+ input_lengths = _conv_out_length(input_lengths, kernel_size, stride)
1144
+
1145
+ if add_adapter:
1146
+ for _ in range(self.config.num_adapter_layers):
1147
+ input_lengths = _conv_out_length(input_lengths, 1, self.config.adapter_stride)
1148
+
1149
+ return input_lengths
1150
+
1151
+ def _get_feature_vector_attention_mask(
1152
+ self, feature_vector_length: int, attention_mask: torch.LongTensor, add_adapter=None
1153
+ ):
1154
+ # Effectively attention_mask.sum(-1), but not inplace to be able to run
1155
+ # on inference mode.
1156
+ non_padded_lengths = attention_mask.cumsum(dim=-1)[:, -1]
1157
+
1158
+ output_lengths = self._get_feat_extract_output_lengths(non_padded_lengths, add_adapter=add_adapter)
1159
+ output_lengths = output_lengths.to(torch.long)
1160
+
1161
+ batch_size = attention_mask.shape[0]
1162
+
1163
+ attention_mask = torch.zeros(
1164
+ (batch_size, feature_vector_length), dtype=attention_mask.dtype, device=attention_mask.device
1165
+ )
1166
+ # these two operations make sure that all values before the output length indices are attended to
1167
+ attention_mask[(torch.arange(attention_mask.shape[0], device=attention_mask.device), output_lengths - 1)] = 1
1168
+ attention_mask = attention_mask.flip([-1]).cumsum(-1).flip([-1]).bool()
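+ # (the flip / cumsum / flip spreads the single 1 placed at the last valid index back over all earlier positions)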
1169
+ return attention_mask
1170
+
1171
+ def _set_gradient_checkpointing(self, module, value=False):
1172
+ if isinstance(module, (Wav2Vec2ConformerEncoder, Wav2Vec2ConformerFeatureEncoder)):
1173
+ module.gradient_checkpointing = value
1174
+
1175
+
1176
+ WAV2VEC2_CONFORMER_START_DOCSTRING = r"""
1177
+ Wav2Vec2Conformer was proposed in [wav2vec 2.0: A Framework for Self-Supervised Learning of Speech
1178
+ Representations](https://arxiv.org/abs/2006.11477) by Alexei Baevski, Henry Zhou, Abdelrahman Mohamed, Michael
1179
+ Auli.
1180
+
1181
+ This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
1182
+ library implements for all its models (such as downloading or saving, etc.).
1183
+
1184
+ This model is a PyTorch [nn.Module](https://pytorch.org/docs/stable/nn.html#nn.Module) sub-class. Use it as a
1185
+ regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage and behavior.
1186
+
1187
+ Parameters:
1188
+ config ([`Wav2Vec2ConformerConfig`]): Model configuration class with all the parameters of the model.
1189
+ Initializing with a config file does not load the weights associated with the model, only the
1190
+ configuration. Check out the [`~PreTrainedModel.from_pretrained`] method to load the model weights.
1191
+ """
1192
+
1193
+
1194
+ WAV2VEC2_CONFORMER_INPUTS_DOCSTRING = r"""
1195
+ Args:
1196
+ input_values (`torch.FloatTensor` of shape `(batch_size, sequence_length)`):
1197
+ Float values of input raw speech waveform. Values can be obtained by loading a `.flac` or `.wav` audio file
1198
+ into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via the soundfile library (`pip install
1199
+ soundfile`). To prepare the array into `input_values`, the [`AutoProcessor`] should be used for padding and
1200
+ conversion into a tensor of type `torch.FloatTensor`. See [`Wav2Vec2Processor.__call__`] for details.
1201
+ attention_mask (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
1202
+ Mask to avoid performing convolution and attention on padding token indices. Mask values selected in `[0,
1203
+ 1]`:
1204
+
1205
+ - 1 for tokens that are **not masked**,
1206
+ - 0 for tokens that are **masked**.
1207
+
1208
+ [What are attention masks?](../glossary#attention-mask)
1209
+
1210
+ <Tip warning={true}>
1211
+
1212
+ `attention_mask` should only be passed if the corresponding processor has `config.return_attention_mask ==
1213
+ True`. For all models whose processor has `config.return_attention_mask == False`, such as
1214
+ [wav2vec2-conformer-rel-pos-large](https://huggingface.co/facebook/wav2vec2-conformer-rel-pos-large),
1215
+ `attention_mask` should **not** be passed to avoid degraded performance when doing batched inference. For
1216
+ such models `input_values` should simply be padded with 0 and passed without `attention_mask`. Be aware
1217
+ that these models also yield slightly different results depending on whether `input_values` is padded or
1218
+ not.
1219
+
1220
+ </Tip>
1221
+
1222
+ output_attentions (`bool`, *optional*):
1223
+ Whether or not to return the attentions tensors of all attention layers. See `attentions` under returned
1224
+ tensors for more detail.
1225
+ output_hidden_states (`bool`, *optional*):
1226
+ Whether or not to return the hidden states of all layers. See `hidden_states` under returned tensors for
1227
+ more detail.
1228
+ return_dict (`bool`, *optional*):
1229
+ Whether or not to return a [`~utils.ModelOutput`] instead of a plain tuple.
1230
+ """
1231
+
1232
+
1233
+ @add_start_docstrings(
1234
+ "The bare Wav2Vec2Conformer Model transformer outputting raw hidden-states without any specific head on top.",
1235
+ WAV2VEC2_CONFORMER_START_DOCSTRING,
1236
+ )
1237
+ class Wav2Vec2ConformerModel(Wav2Vec2ConformerPreTrainedModel):
1238
+ def __init__(self, config: Wav2Vec2ConformerConfig):
1239
+ super().__init__(config)
1240
+ self.config = config
1241
+ self.feature_extractor = Wav2Vec2ConformerFeatureEncoder(config)
1242
+ self.feature_projection = Wav2Vec2ConformerFeatureProjection(config)
1243
+
1244
+ # model only needs masking vector if mask prob is > 0.0
1245
+ if config.mask_time_prob > 0.0 or config.mask_feature_prob > 0.0:
1246
+ self.masked_spec_embed = nn.Parameter(torch.FloatTensor(config.hidden_size).uniform_())
1247
+
1248
+ self.encoder = Wav2Vec2ConformerEncoder(config)
1249
+
1250
+ self.adapter = Wav2Vec2ConformerAdapter(config) if config.add_adapter else None
1251
+
1252
+ # Initialize weights and apply final processing
1253
+ self.post_init()
1254
+
1255
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model.freeze_feature_encoder
1256
+ def freeze_feature_encoder(self):
1257
+ """
1258
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
1259
+ not be updated during training.
1260
+ """
1261
+ self.feature_extractor._freeze_parameters()
1262
+
1263
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model._mask_hidden_states
1264
+ def _mask_hidden_states(
1265
+ self,
1266
+ hidden_states: torch.FloatTensor,
1267
+ mask_time_indices: Optional[torch.FloatTensor] = None,
1268
+ attention_mask: Optional[torch.LongTensor] = None,
1269
+ ):
1270
+ """
1271
+ Masks extracted features along time axis and/or along feature axis according to
1272
+ [SpecAugment](https://arxiv.org/abs/1904.08779).
1273
+ """
1274
+
1275
+ # `config.apply_spec_augment` can set masking to False
1276
+ if not getattr(self.config, "apply_spec_augment", True):
1277
+ return hidden_states
1278
+
1279
+ # generate indices & apply SpecAugment along time axis
1280
+ batch_size, sequence_length, hidden_size = hidden_states.size()
1281
+
1282
+ if mask_time_indices is not None:
1283
+ # apply SpecAugment along time axis with given mask_time_indices
1284
+ hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
1285
+ elif self.config.mask_time_prob > 0 and self.training:
1286
+ mask_time_indices = _compute_mask_indices(
1287
+ (batch_size, sequence_length),
1288
+ mask_prob=self.config.mask_time_prob,
1289
+ mask_length=self.config.mask_time_length,
1290
+ attention_mask=attention_mask,
1291
+ min_masks=self.config.mask_time_min_masks,
1292
+ )
1293
+ mask_time_indices = torch.tensor(mask_time_indices, device=hidden_states.device, dtype=torch.bool)
1294
+ hidden_states[mask_time_indices] = self.masked_spec_embed.to(hidden_states.dtype)
1295
+
1296
+ if self.config.mask_feature_prob > 0 and self.training:
1297
+ # generate indices & apply SpecAugment along feature axis
1298
+ mask_feature_indices = _compute_mask_indices(
1299
+ (batch_size, hidden_size),
1300
+ mask_prob=self.config.mask_feature_prob,
1301
+ mask_length=self.config.mask_feature_length,
1302
+ min_masks=self.config.mask_feature_min_masks,
1303
+ )
1304
+ mask_feature_indices = torch.tensor(mask_feature_indices, device=hidden_states.device, dtype=torch.bool)
1305
+ mask_feature_indices = mask_feature_indices[:, None].expand(-1, sequence_length, -1)
1306
+ hidden_states[mask_feature_indices] = 0
1307
+
1308
+ return hidden_states
1309
+
1310
+ @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
1311
+ @add_code_sample_docstrings(
1312
+ checkpoint=_CHECKPOINT_FOR_DOC,
1313
+ output_type=Wav2Vec2BaseModelOutput,
1314
+ config_class=_CONFIG_FOR_DOC,
1315
+ modality="audio",
1316
+ expected_output=_EXPECTED_OUTPUT_SHAPE,
1317
+ )
1318
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2Model.forward with wav2vec2->wav2vec2_conformer
1319
+ def forward(
1320
+ self,
1321
+ input_values: Optional[torch.Tensor],
1322
+ attention_mask: Optional[torch.Tensor] = None,
1323
+ mask_time_indices: Optional[torch.FloatTensor] = None,
1324
+ output_attentions: Optional[bool] = None,
1325
+ output_hidden_states: Optional[bool] = None,
1326
+ return_dict: Optional[bool] = None,
1327
+ ) -> Union[Tuple, Wav2Vec2BaseModelOutput]:
1328
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
1329
+ output_hidden_states = (
1330
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
1331
+ )
1332
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1333
+
1334
+ extract_features = self.feature_extractor(input_values)
1335
+ extract_features = extract_features.transpose(1, 2)
1336
+
1337
+ if attention_mask is not None:
1338
+ # compute reduced attention_mask corresponding to feature vectors
1339
+ attention_mask = self._get_feature_vector_attention_mask(
1340
+ extract_features.shape[1], attention_mask, add_adapter=False
1341
+ )
1342
+
1343
+ hidden_states, extract_features = self.feature_projection(extract_features)
1344
+ hidden_states = self._mask_hidden_states(
1345
+ hidden_states, mask_time_indices=mask_time_indices, attention_mask=attention_mask
1346
+ )
1347
+
1348
+ encoder_outputs = self.encoder(
1349
+ hidden_states,
1350
+ attention_mask=attention_mask,
1351
+ output_attentions=output_attentions,
1352
+ output_hidden_states=output_hidden_states,
1353
+ return_dict=return_dict,
1354
+ )
1355
+
1356
+ hidden_states = encoder_outputs[0]
1357
+
1358
+ if self.adapter is not None:
1359
+ hidden_states = self.adapter(hidden_states)
1360
+
1361
+ if not return_dict:
1362
+ return (hidden_states, extract_features) + encoder_outputs[1:]
1363
+
1364
+ return Wav2Vec2BaseModelOutput(
1365
+ last_hidden_state=hidden_states,
1366
+ extract_features=extract_features,
1367
+ hidden_states=encoder_outputs.hidden_states,
1368
+ attentions=encoder_outputs.attentions,
1369
+ )
1370
+
1371
+
1372
+ @add_start_docstrings(
1373
+ """Wav2Vec2Conformer Model with a quantizer and `VQ` head on top.""", WAV2VEC2_CONFORMER_START_DOCSTRING
1374
+ )
1375
+ class Wav2Vec2ConformerForPreTraining(Wav2Vec2ConformerPreTrainedModel):
1376
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer
1377
+ def __init__(self, config: Wav2Vec2ConformerConfig):
1378
+ super().__init__(config)
1379
+ self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
1380
+ self.dropout_features = nn.Dropout(config.feat_quantizer_dropout)
1381
+
1382
+ self.quantizer = Wav2Vec2ConformerGumbelVectorQuantizer(config)
1383
+
1384
+ self.project_hid = nn.Linear(config.hidden_size, config.proj_codevector_dim)
1385
+ self.project_q = nn.Linear(config.codevector_dim, config.proj_codevector_dim)
1386
+
1387
+ # Initialize weights and apply final processing
1388
+ self.post_init()
1389
+
1390
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.set_gumbel_temperature
1391
+ def set_gumbel_temperature(self, temperature: int):
1392
+ """
1393
+ Set the Gumbel softmax temperature to a given value. Only necessary for training
1394
+ """
1395
+ self.quantizer.temperature = temperature
1396
+
1397
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
1398
+ def freeze_feature_encoder(self):
1399
+ """
1400
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
1401
+ not be updated during training.
1402
+ """
1403
+ self.wav2vec2_conformer.feature_extractor._freeze_parameters()
1404
+
1405
+ @staticmethod
1406
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.compute_contrastive_logits
1407
+ def compute_contrastive_logits(
1408
+ target_features: torch.FloatTensor,
1409
+ negative_features: torch.FloatTensor,
1410
+ predicted_features: torch.FloatTensor,
1411
+ temperature: int = 0.1,
1412
+ ):
1413
+ """
1414
+ Compute logits for the contrastive loss, using cosine similarity as the distance measure between
1415
+ `[positive_feature, negative_features]` and `[predicted_features]`. Additionally, temperature can be applied.
1416
+ """
1417
+ target_features = torch.cat([target_features, negative_features], dim=0)
1418
+
1419
+ logits = torch.cosine_similarity(predicted_features.float(), target_features.float(), dim=-1).type_as(
1420
+ target_features
1421
+ )
1422
+
1423
+ # apply temperature
1424
+ logits = logits / temperature
1425
+ return logits
1426
+
1427
+ @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
1428
+ @replace_return_docstrings(output_type=Wav2Vec2ConformerForPreTrainingOutput, config_class=_CONFIG_FOR_DOC)
1429
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForPreTraining.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,wav2vec2_conformer-base->wav2vec2-conformer-rel-pos-large
1430
+ def forward(
1431
+ self,
1432
+ input_values: Optional[torch.Tensor],
1433
+ attention_mask: Optional[torch.Tensor] = None,
1434
+ mask_time_indices: Optional[torch.BoolTensor] = None,
1435
+ sampled_negative_indices: Optional[torch.BoolTensor] = None,
1436
+ output_attentions: Optional[bool] = None,
1437
+ output_hidden_states: Optional[bool] = None,
1438
+ return_dict: Optional[bool] = None,
1439
+ ) -> Union[Tuple, Wav2Vec2ConformerForPreTrainingOutput]:
1440
+ r"""
1441
+ mask_time_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length)`, *optional*):
1442
+ Indices to mask extracted features for contrastive loss. When in training mode, model learns to predict
1443
+ masked extracted features in *config.proj_codevector_dim* space.
1444
+ sampled_negative_indices (`torch.BoolTensor` of shape `(batch_size, sequence_length, num_negatives)`, *optional*):
1445
+ Indices indicating which quantized target vectors are used as negative sampled vectors in contrastive loss.
1446
+ Required input for pre-training.
1447
+
1448
+ Returns:
1449
+
1450
+ Example:
1451
+
1452
+ ```python
1453
+ >>> import torch
1454
+ >>> from transformers import AutoFeatureExtractor, Wav2Vec2ConformerForPreTraining
1455
+ >>> from transformers.models.wav2vec2_conformer.modeling_wav2vec2_conformer import (
1456
+ ... _compute_mask_indices,
1457
+ ... _sample_negative_indices,
1458
+ ... )
1459
+ >>> from datasets import load_dataset
1460
+
1461
+ >>> feature_extractor = AutoFeatureExtractor.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large")
1462
+ >>> model = Wav2Vec2ConformerForPreTraining.from_pretrained("facebook/wav2vec2-conformer-rel-pos-large")
1463
+
1464
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
1465
+ >>> input_values = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt").input_values # Batch size 1
1466
+
1467
+ >>> # compute masked indices
1468
+ >>> batch_size, raw_sequence_length = input_values.shape
1469
+ >>> sequence_length = model._get_feat_extract_output_lengths(raw_sequence_length).item()
1470
+ >>> mask_time_indices = _compute_mask_indices(
1471
+ ... shape=(batch_size, sequence_length), mask_prob=0.2, mask_length=2
1472
+ ... )
1473
+ >>> sampled_negative_indices = _sample_negative_indices(
1474
+ ... features_shape=(batch_size, sequence_length),
1475
+ ... num_negatives=model.config.num_negatives,
1476
+ ... mask_time_indices=mask_time_indices,
1477
+ ... )
1478
+ >>> mask_time_indices = torch.tensor(data=mask_time_indices, device=input_values.device, dtype=torch.long)
1479
+ >>> sampled_negative_indices = torch.tensor(
1480
+ ... data=sampled_negative_indices, device=input_values.device, dtype=torch.long
1481
+ ... )
1482
+
1483
+ >>> with torch.no_grad():
1484
+ ... outputs = model(input_values, mask_time_indices=mask_time_indices)
1485
+
1486
+ >>> # compute cosine similarity between predicted (=projected_states) and target (=projected_quantized_states)
1487
+ >>> cosine_sim = torch.cosine_similarity(outputs.projected_states, outputs.projected_quantized_states, dim=-1)
1488
+
1489
+ >>> # show that cosine similarity is much higher than random
1490
+ >>> cosine_sim[mask_time_indices.to(torch.bool)].mean() > 0.5
1491
+ tensor(True)
1492
+
1493
+ >>> # for contrastive loss training model should be put into train mode
1494
+ >>> model = model.train()
1495
+ >>> loss = model(
1496
+ ... input_values, mask_time_indices=mask_time_indices, sampled_negative_indices=sampled_negative_indices
1497
+ ... ).loss
1498
+ ```"""
1499
+
1500
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1501
+
1502
+ if mask_time_indices is not None:
1503
+ mask_time_indices = mask_time_indices.to(torch.bool)
1504
+
1505
+ outputs = self.wav2vec2_conformer(
1506
+ input_values,
1507
+ attention_mask=attention_mask,
1508
+ output_attentions=output_attentions,
1509
+ output_hidden_states=output_hidden_states,
1510
+ mask_time_indices=mask_time_indices,
1511
+ return_dict=return_dict,
1512
+ )
1513
+
1514
+ # 1. project all transformed features (including masked) to final vq dim
1515
+ transformer_features = self.project_hid(outputs[0])
1516
+
1517
+ # 2. quantize all (unmasked) extracted features and project to final vq dim
1518
+ extract_features = self.dropout_features(outputs[1])
1519
+
1520
+ if attention_mask is not None:
1521
+ # compute reduced attention_mask corresponding to feature vectors
1522
+ attention_mask = self._get_feature_vector_attention_mask(
1523
+ extract_features.shape[1], attention_mask, add_adapter=False
1524
+ )
1525
+
1526
+ quantized_features, codevector_perplexity = self.quantizer(
1527
+ extract_features, mask_time_indices=mask_time_indices
1528
+ )
1529
+ quantized_features = self.project_q(quantized_features)
1530
+
1531
+ loss = contrastive_loss = diversity_loss = None
1532
+ if sampled_negative_indices is not None:
1533
+ batch_size, sequence_length, hidden_size = quantized_features.shape
1534
+
1535
+ # for training, we sample negatives
1536
+ # 3. sample K negatives (distractors) quantized states for contrastive loss
1537
+ # if attention_mask is passed, make sure that padded feature vectors cannot be sampled
1538
+ # sample negative quantized vectors BTC => (BxT)C
1539
+ negative_quantized_features = quantized_features.view(-1, hidden_size)[
1540
+ sampled_negative_indices.long().view(-1)
1541
+ ]
1542
+ negative_quantized_features = negative_quantized_features.view(
1543
+ batch_size, sequence_length, -1, hidden_size
1544
+ ).permute(2, 0, 1, 3)
1545
+
1546
+ # 4. compute logits, corresponding to `logits = sim(c_t, [q_t, \sim{q}_t]) / \kappa`
1547
+ # of equation (3) in https://arxiv.org/pdf/2006.11477.pdf
1548
+ logits = self.compute_contrastive_logits(
1549
+ quantized_features[None, :],
1550
+ negative_quantized_features,
1551
+ transformer_features,
1552
+ self.config.contrastive_logits_temperature,
1553
+ )
1554
+
1555
+ # 5. if a negative vector is identical to the positive (i.e. when codebook utilization is low),
1556
+ # its cosine similarity will be masked
1557
+ neg_is_pos = (quantized_features == negative_quantized_features).all(-1)
1558
+
1559
+ if neg_is_pos.any():
1560
+ logits[1:][neg_is_pos] = float("-inf")
1561
+
1562
+ # 6. compute contrastive loss \mathbf{L}_m = cross_entropy(logits) =
1563
+ # -log(exp(sim(c_t, q_t)/\kappa) / \sum_{\sim{q}} exp(sim(c_t, \sim{q})/\kappa))
1564
+ logits = logits.transpose(0, 2).reshape(-1, logits.size(0))
1565
+ target = ((1 - mask_time_indices.long()) * -100).transpose(0, 1).flatten()
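+ # masked (predicted) positions get target 0, i.e. the positive candidate at index 0 of the logits;
+ # unmasked positions are set to -100 and ignored by cross_entropy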
1566
+
1567
+ contrastive_loss = nn.functional.cross_entropy(logits.float(), target, reduction="sum")
1568
+ # 7. compute diversity loss: \mathbf{L}_d
1569
+ num_codevectors = self.config.num_codevectors_per_group * self.config.num_codevector_groups
1570
+ diversity_loss = ((num_codevectors - codevector_perplexity) / num_codevectors) * mask_time_indices.sum()
1571
+
1572
+ # 8. \mathbf{L} = \mathbf{L}_m + \alpha * \mathbf{L}_d
1573
+ loss = contrastive_loss + self.config.diversity_loss_weight * diversity_loss
1574
+
1575
+ if not return_dict:
1576
+ if loss is not None:
1577
+ return (loss, transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
1578
+ return (transformer_features, quantized_features, codevector_perplexity) + outputs[2:]
1579
+
1580
+ return Wav2Vec2ConformerForPreTrainingOutput(
1581
+ loss=loss,
1582
+ projected_states=transformer_features,
1583
+ projected_quantized_states=quantized_features,
1584
+ codevector_perplexity=codevector_perplexity,
1585
+ hidden_states=outputs.hidden_states,
1586
+ attentions=outputs.attentions,
1587
+ contrastive_loss=contrastive_loss,
1588
+ diversity_loss=diversity_loss,
1589
+ )
1590
+
1591
+
1592
+ @add_start_docstrings(
1593
+ """Wav2Vec2Conformer Model with a `language modeling` head on top for Connectionist Temporal Classification (CTC).""",
1594
+ WAV2VEC2_CONFORMER_START_DOCSTRING,
1595
+ )
1596
+ class Wav2Vec2ConformerForCTC(Wav2Vec2ConformerPreTrainedModel):
1597
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer
1598
+ def __init__(self, config):
1599
+ super().__init__(config)
1600
+
1601
+ self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
1602
+ self.dropout = nn.Dropout(config.final_dropout)
1603
+
1604
+ if config.vocab_size is None:
1605
+ raise ValueError(
1606
+ f"You are trying to instantiate {self.__class__} with a configuration that "
1607
+ "does not define the vocabulary size of the language model head. Please "
1608
+ "instantiate the model as follows: `Wav2Vec2ConformerForCTC.from_pretrained(..., vocab_size=vocab_size)`. "
1609
+ "or define `vocab_size` of your model's configuration."
1610
+ )
1611
+ output_hidden_size = (
1612
+ config.output_hidden_size if hasattr(config, "add_adapter") and config.add_adapter else config.hidden_size
1613
+ )
1614
+ self.lm_head = nn.Linear(output_hidden_size, config.vocab_size)
1615
+
1616
+ # Initialize weights and apply final processing
1617
+ self.post_init()
1618
+
1619
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
1620
+ def freeze_feature_encoder(self):
1621
+ """
1622
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
1623
+ not be updated during training.
1624
+ """
1625
+ self.wav2vec2_conformer.feature_extractor._freeze_parameters()
1626
+
1627
+ @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
1628
+ @add_code_sample_docstrings(
1629
+ checkpoint=_CHECKPOINT_FOR_DOC,
1630
+ output_type=CausalLMOutput,
1631
+ config_class=_CONFIG_FOR_DOC,
1632
+ expected_output=_CTC_EXPECTED_OUTPUT,
1633
+ expected_loss=_CTC_EXPECTED_LOSS,
1634
+ )
1635
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForCTC.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer
1636
+ def forward(
1637
+ self,
1638
+ input_values: Optional[torch.Tensor],
1639
+ attention_mask: Optional[torch.Tensor] = None,
1640
+ output_attentions: Optional[bool] = None,
1641
+ output_hidden_states: Optional[bool] = None,
1642
+ return_dict: Optional[bool] = None,
1643
+ labels: Optional[torch.Tensor] = None,
1644
+ ) -> Union[Tuple, CausalLMOutput]:
1645
+ r"""
1646
+ labels (`torch.LongTensor` of shape `(batch_size, target_length)`, *optional*):
1647
+ Labels for connectionist temporal classification. Note that `target_length` has to be smaller than or equal to
1648
+ the sequence length of the output logits. Indices are selected in `[-100, 0, ..., config.vocab_size - 1]`.
1649
+ All labels set to `-100` are ignored (masked), the loss is only computed for labels in `[0, ...,
1650
+ config.vocab_size - 1]`.
1651
+ """
1652
+
1653
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1654
+
1655
+ outputs = self.wav2vec2_conformer(
1656
+ input_values,
1657
+ attention_mask=attention_mask,
1658
+ output_attentions=output_attentions,
1659
+ output_hidden_states=output_hidden_states,
1660
+ return_dict=return_dict,
1661
+ )
1662
+
1663
+ hidden_states = outputs[0]
1664
+ hidden_states = self.dropout(hidden_states)
1665
+
1666
+ logits = self.lm_head(hidden_states)
1667
+
1668
+ loss = None
1669
+ if labels is not None:
1670
+ if labels.max() >= self.config.vocab_size:
1671
+ raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
1672
+
1673
+ # retrieve loss input_lengths from attention_mask
1674
+ attention_mask = (
1675
+ attention_mask if attention_mask is not None else torch.ones_like(input_values, dtype=torch.long)
1676
+ )
1677
+ input_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(-1)).to(torch.long)
1678
+
1679
+ # assuming that padded tokens are filled with -100
1680
+ # when not being attended to
1681
+ labels_mask = labels >= 0
1682
+ target_lengths = labels_mask.sum(-1)
1683
+ flattened_targets = labels.masked_select(labels_mask)
1684
+
1685
+ # ctc_loss doesn't support fp16
1686
+ log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
1687
+
1688
+ with torch.backends.cudnn.flags(enabled=False):
1689
+ loss = nn.functional.ctc_loss(
1690
+ log_probs,
1691
+ flattened_targets,
1692
+ input_lengths,
1693
+ target_lengths,
1694
+ blank=self.config.pad_token_id,
1695
+ reduction=self.config.ctc_loss_reduction,
1696
+ zero_infinity=self.config.ctc_zero_infinity,
1697
+ )
1698
+
1699
+ if not return_dict:
1700
+ output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
1701
+ return ((loss,) + output) if loss is not None else output
1702
+
1703
+ return CausalLMOutput(
1704
+ loss=loss, logits=logits, hidden_states=outputs.hidden_states, attentions=outputs.attentions
1705
+ )
1706
+
1707
+
1708
+ @add_start_docstrings(
1709
+ """
1710
+ Wav2Vec2Conformer Model with a sequence classification head on top (a linear layer over the pooled output) for
1711
+ tasks like SUPERB Keyword Spotting.
1712
+ """,
1713
+ WAV2VEC2_CONFORMER_START_DOCSTRING,
1714
+ )
1715
+ class Wav2Vec2ConformerForSequenceClassification(Wav2Vec2ConformerPreTrainedModel):
1716
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer
1717
+ def __init__(self, config):
1718
+ super().__init__(config)
1719
+
1720
+ if hasattr(config, "add_adapter") and config.add_adapter:
1721
+ raise ValueError(
1722
+ "Sequence classification does not support the use of Wav2Vec2Conformer adapters (config.add_adapter=True)"
1723
+ )
1724
+ self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
1725
+ num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
1726
+ if config.use_weighted_layer_sum:
1727
+ self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
1728
+ self.projector = nn.Linear(config.hidden_size, config.classifier_proj_size)
1729
+ self.classifier = nn.Linear(config.classifier_proj_size, config.num_labels)
1730
+
1731
+ # Initialize weights and apply final processing
1732
+ self.post_init()
1733
+
1734
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
1735
+ def freeze_feature_encoder(self):
1736
+ """
1737
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
1738
+ not be updated during training.
1739
+ """
1740
+ self.wav2vec2_conformer.feature_extractor._freeze_parameters()
1741
+
1742
+ def freeze_base_model(self):
1743
+ """
1744
+ Calling this function will disable the gradient computation for the base model so that its parameters will not
1745
+ be updated during training. Only the classification head will be updated.
1746
+ """
1747
+ for param in self.wav2vec2_conformer.parameters():
1748
+ param.requires_grad = False
1749
+
1750
+ @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
1751
+ @add_code_sample_docstrings(
1752
+ checkpoint=_CHECKPOINT_FOR_DOC,
1753
+ output_type=SequenceClassifierOutput,
1754
+ config_class=_CONFIG_FOR_DOC,
1755
+ modality="audio",
1756
+ )
1757
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForSequenceClassification.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,WAV_2_VEC_2->WAV2VEC2_CONFORMER
1758
+ def forward(
1759
+ self,
1760
+ input_values: Optional[torch.Tensor],
1761
+ attention_mask: Optional[torch.Tensor] = None,
1762
+ output_attentions: Optional[bool] = None,
1763
+ output_hidden_states: Optional[bool] = None,
1764
+ return_dict: Optional[bool] = None,
1765
+ labels: Optional[torch.Tensor] = None,
1766
+ ) -> Union[Tuple, SequenceClassifierOutput]:
1767
+ r"""
1768
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1769
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1770
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1771
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1772
+ """
1773
+
1774
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1775
+ output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
1776
+
1777
+ outputs = self.wav2vec2_conformer(
1778
+ input_values,
1779
+ attention_mask=attention_mask,
1780
+ output_attentions=output_attentions,
1781
+ output_hidden_states=output_hidden_states,
1782
+ return_dict=return_dict,
1783
+ )
1784
+
1785
+ if self.config.use_weighted_layer_sum:
1786
+ hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
1787
+ hidden_states = torch.stack(hidden_states, dim=1)
1788
+ norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
1789
+ hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
1790
+ else:
1791
+ hidden_states = outputs[0]
1792
+
1793
+ hidden_states = self.projector(hidden_states)
1794
+ if attention_mask is None:
1795
+ pooled_output = hidden_states.mean(dim=1)
1796
+ else:
1797
+ padding_mask = self._get_feature_vector_attention_mask(hidden_states.shape[1], attention_mask)
1798
+ hidden_states[~padding_mask] = 0.0
1799
+ pooled_output = hidden_states.sum(dim=1) / padding_mask.sum(dim=1).view(-1, 1)
1800
+
1801
+ logits = self.classifier(pooled_output)
1802
+
1803
+ loss = None
1804
+ if labels is not None:
1805
+ loss_fct = CrossEntropyLoss()
1806
+ loss = loss_fct(logits.view(-1, self.config.num_labels), labels.view(-1))
1807
+
1808
+ if not return_dict:
1809
+ output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
1810
+ return ((loss,) + output) if loss is not None else output
1811
+
1812
+ return SequenceClassifierOutput(
1813
+ loss=loss,
1814
+ logits=logits,
1815
+ hidden_states=outputs.hidden_states,
1816
+ attentions=outputs.attentions,
1817
+ )
1818
+
1819
+
1820
+ @add_start_docstrings(
1821
+ """
1822
+ Wav2Vec2Conformer Model with a frame classification head on top for tasks like Speaker Diarization.
1823
+ """,
1824
+ WAV2VEC2_CONFORMER_START_DOCSTRING,
1825
+ )
1826
+ class Wav2Vec2ConformerForAudioFrameClassification(Wav2Vec2ConformerPreTrainedModel):
1827
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.__init__ with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,WAV_2_VEC_2->WAV2VEC2_CONFORMER
1828
+ def __init__(self, config):
1829
+ super().__init__(config)
1830
+
1831
+ if hasattr(config, "add_adapter") and config.add_adapter:
1832
+ raise ValueError(
1833
+ "Audio frame classification does not support the use of Wav2Vec2Conformer adapters (config.add_adapter=True)"
1834
+ )
1835
+ self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
1836
+ num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
1837
+ if config.use_weighted_layer_sum:
1838
+ self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
1839
+ self.classifier = nn.Linear(config.hidden_size, config.num_labels)
1840
+ self.num_labels = config.num_labels
1841
+
1842
+ self.init_weights()
1843
+
1844
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
1845
+ def freeze_feature_encoder(self):
1846
+ """
1847
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
1848
+ not be updated during training.
1849
+ """
1850
+ self.wav2vec2_conformer.feature_extractor._freeze_parameters()
1851
+
1852
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.freeze_base_model with wav2vec2->wav2vec2_conformer
1853
+ def freeze_base_model(self):
1854
+ """
1855
+ Calling this function will disable the gradient computation for the base model so that its parameters will not
1856
+ be updated during training. Only the classification head will be updated.
1857
+ """
1858
+ for param in self.wav2vec2_conformer.parameters():
1859
+ param.requires_grad = False
1860
+
1861
+ @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
1862
+ @add_code_sample_docstrings(
1863
+ checkpoint=_CHECKPOINT_FOR_DOC,
1864
+ output_type=TokenClassifierOutput,
1865
+ config_class=_CONFIG_FOR_DOC,
1866
+ modality="audio",
1867
+ )
1868
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForAudioFrameClassification.forward with wav2vec2->wav2vec2_conformer
1869
+ def forward(
1870
+ self,
1871
+ input_values: Optional[torch.Tensor],
1872
+ attention_mask: Optional[torch.Tensor] = None,
1873
+ labels: Optional[torch.Tensor] = None,
1874
+ output_attentions: Optional[bool] = None,
1875
+ output_hidden_states: Optional[bool] = None,
1876
+ return_dict: Optional[bool] = None,
1877
+ ) -> Union[Tuple, TokenClassifierOutput]:
1878
+ r"""
1879
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
1880
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
1881
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
1882
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
1883
+ """
1884
+
1885
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
1886
+ output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
1887
+
1888
+ outputs = self.wav2vec2_conformer(
1889
+ input_values,
1890
+ attention_mask=attention_mask,
1891
+ output_attentions=output_attentions,
1892
+ output_hidden_states=output_hidden_states,
1893
+ return_dict=return_dict,
1894
+ )
1895
+
1896
+ if self.config.use_weighted_layer_sum:
1897
+ hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
1898
+ hidden_states = torch.stack(hidden_states, dim=1)
1899
+ norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
1900
+ hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
1901
+ else:
1902
+ hidden_states = outputs[0]
1903
+
1904
+ logits = self.classifier(hidden_states)
1905
+
1906
+ loss = None
1907
+ if labels is not None:
1908
+ loss_fct = CrossEntropyLoss()
1909
+ loss = loss_fct(logits.view(-1, self.num_labels), torch.argmax(labels.view(-1, self.num_labels), axis=1))
1910
+
1911
+ if not return_dict:
1912
+ output = (logits,) + outputs[_HIDDEN_STATES_START_POSITION:]
1913
+ return output
1914
+
1915
+ return TokenClassifierOutput(
1916
+ loss=loss,
1917
+ logits=logits,
1918
+ hidden_states=outputs.hidden_states,
1919
+ attentions=outputs.attentions,
1920
+ )
1921
+
1922
+
1923
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.AMSoftmaxLoss
1924
+ class AMSoftmaxLoss(nn.Module):
1925
+ def __init__(self, input_dim, num_labels, scale=30.0, margin=0.4):
1926
+ super(AMSoftmaxLoss, self).__init__()
1927
+ self.scale = scale
1928
+ self.margin = margin
1929
+ self.num_labels = num_labels
1930
+ self.weight = nn.Parameter(torch.randn(input_dim, num_labels), requires_grad=True)
1931
+ self.loss = nn.CrossEntropyLoss()
1932
+
1933
+ def forward(self, hidden_states, labels):
1934
+ labels = labels.flatten()
1935
+ weight = nn.functional.normalize(self.weight, dim=0)
1936
+ hidden_states = nn.functional.normalize(hidden_states, dim=1)
1937
+ cos_theta = torch.mm(hidden_states, weight)
1938
+ psi = cos_theta - self.margin
1939
+
1940
+ onehot = nn.functional.one_hot(labels, self.num_labels)
1941
+ logits = self.scale * torch.where(onehot.bool(), psi, cos_theta)
1942
+ loss = self.loss(logits, labels)
1943
+
1944
+ return loss
1945
+
1946
+
1947
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.TDNNLayer
1948
+ class TDNNLayer(nn.Module):
1949
+ def __init__(self, config, layer_id=0):
1950
+ super().__init__()
1951
+ self.in_conv_dim = config.tdnn_dim[layer_id - 1] if layer_id > 0 else config.tdnn_dim[layer_id]
1952
+ self.out_conv_dim = config.tdnn_dim[layer_id]
1953
+ self.kernel_size = config.tdnn_kernel[layer_id]
1954
+ self.dilation = config.tdnn_dilation[layer_id]
1955
+
1956
+ self.kernel = nn.Linear(self.in_conv_dim * self.kernel_size, self.out_conv_dim)
1957
+ self.activation = nn.ReLU()
1958
+
1959
+ def forward(self, hidden_states):
1960
+ hidden_states = hidden_states.unsqueeze(1)
1961
+ hidden_states = nn.functional.unfold(
1962
+ hidden_states,
1963
+ (self.kernel_size, self.in_conv_dim),
1964
+ stride=(1, self.in_conv_dim),
1965
+ dilation=(self.dilation, 1),
1966
+ )
1967
+ hidden_states = hidden_states.transpose(1, 2)
1968
+ hidden_states = self.kernel(hidden_states)
1969
+
1970
+ hidden_states = self.activation(hidden_states)
1971
+ return hidden_states
1972
+
1973
+
1974
+ @add_start_docstrings(
1975
+ """
1976
+ Wav2Vec2Conformer Model with an XVector feature extraction head on top for tasks like Speaker Verification.
1977
+ """,
1978
+ WAV2VEC2_CONFORMER_START_DOCSTRING,
1979
+ )
1980
+ class Wav2Vec2ConformerForXVector(Wav2Vec2ConformerPreTrainedModel):
1981
+ def __init__(self, config):
1982
+ super().__init__(config)
1983
+
1984
+ self.wav2vec2_conformer = Wav2Vec2ConformerModel(config)
1985
+ num_layers = config.num_hidden_layers + 1 # transformer layers + input embeddings
1986
+ if config.use_weighted_layer_sum:
1987
+ self.layer_weights = nn.Parameter(torch.ones(num_layers) / num_layers)
1988
+ self.projector = nn.Linear(config.hidden_size, config.tdnn_dim[0])
1989
+
1990
+ tdnn_layers = [TDNNLayer(config, i) for i in range(len(config.tdnn_dim))]
1991
+ self.tdnn = nn.ModuleList(tdnn_layers)
1992
+
1993
+ self.feature_extractor = nn.Linear(config.tdnn_dim[-1] * 2, config.xvector_output_dim)
1994
+ self.classifier = nn.Linear(config.xvector_output_dim, config.xvector_output_dim)
1995
+
1996
+ self.objective = AMSoftmaxLoss(config.xvector_output_dim, config.num_labels)
1997
+
1998
+ self.init_weights()
1999
+
2000
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector.freeze_feature_encoder with wav2vec2->wav2vec2_conformer
2001
+ def freeze_feature_encoder(self):
2002
+ """
2003
+ Calling this function will disable the gradient computation for the feature encoder so that its parameters will
2004
+ not be updated during training.
2005
+ """
2006
+ self.wav2vec2_conformer.feature_extractor._freeze_parameters()
2007
+
2008
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector.freeze_base_model with wav2vec2->wav2vec2_conformer
2009
+ def freeze_base_model(self):
2010
+ """
2011
+ Calling this function will disable the gradient computation for the base model so that its parameters will not
2012
+ be updated during training. Only the classification head will be updated.
2013
+ """
2014
+ for param in self.wav2vec2_conformer.parameters():
2015
+ param.requires_grad = False
2016
+
2017
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector._get_tdnn_output_lengths with wav2vec2->wav2vec2_conformer
2018
+ def _get_tdnn_output_lengths(self, input_lengths: Union[torch.LongTensor, int]):
2019
+ """
2020
+ Computes the output length of the TDNN layers
2021
+ """
2022
+
2023
+ def _conv_out_length(input_length, kernel_size, stride):
2024
+ # 1D convolutional layer output length formula taken
2025
+ # from https://pytorch.org/docs/stable/generated/torch.nn.Conv1d.html
2026
+ return (input_length - kernel_size) // stride + 1
2027
+
2028
+ for kernel_size in self.config.tdnn_kernel:
2029
+ input_lengths = _conv_out_length(input_lengths, kernel_size, 1)
2030
+
2031
+ return input_lengths
2032
+
2033
+ @add_start_docstrings_to_model_forward(WAV2VEC2_CONFORMER_INPUTS_DOCSTRING)
2034
+ @add_code_sample_docstrings(
2035
+ checkpoint=_CHECKPOINT_FOR_DOC,
2036
+ output_type=XVectorOutput,
2037
+ config_class=_CONFIG_FOR_DOC,
2038
+ modality="audio",
2039
+ )
2040
+ # Copied from transformers.models.wav2vec2.modeling_wav2vec2.Wav2Vec2ForXVector.forward with Wav2Vec2->Wav2Vec2Conformer,wav2vec2->wav2vec2_conformer,WAV_2_VEC_2->WAV2VEC2_CONFORMER
2041
+ def forward(
2042
+ self,
2043
+ input_values: Optional[torch.Tensor],
2044
+ attention_mask: Optional[torch.Tensor] = None,
2045
+ output_attentions: Optional[bool] = None,
2046
+ output_hidden_states: Optional[bool] = None,
2047
+ return_dict: Optional[bool] = None,
2048
+ labels: Optional[torch.Tensor] = None,
2049
+ ) -> Union[Tuple, XVectorOutput]:
2050
+ r"""
2051
+ labels (`torch.LongTensor` of shape `(batch_size,)`, *optional*):
2052
+ Labels for computing the sequence classification/regression loss. Indices should be in `[0, ...,
2053
+ config.num_labels - 1]`. If `config.num_labels == 1` a regression loss is computed (Mean-Square loss), If
2054
+ `config.num_labels > 1` a classification loss is computed (Cross-Entropy).
2055
+ """
2056
+
2057
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
2058
+ output_hidden_states = True if self.config.use_weighted_layer_sum else output_hidden_states
2059
+
2060
+ outputs = self.wav2vec2_conformer(
2061
+ input_values,
2062
+ attention_mask=attention_mask,
2063
+ output_attentions=output_attentions,
2064
+ output_hidden_states=output_hidden_states,
2065
+ return_dict=return_dict,
2066
+ )
2067
+
2068
+ if self.config.use_weighted_layer_sum:
2069
+ hidden_states = outputs[_HIDDEN_STATES_START_POSITION]
2070
+ hidden_states = torch.stack(hidden_states, dim=1)
2071
+ norm_weights = nn.functional.softmax(self.layer_weights, dim=-1)
2072
+ hidden_states = (hidden_states * norm_weights.view(-1, 1, 1)).sum(dim=1)
2073
+ else:
2074
+ hidden_states = outputs[0]
2075
+
2076
+ hidden_states = self.projector(hidden_states)
2077
+
2078
+ for tdnn_layer in self.tdnn:
2079
+ hidden_states = tdnn_layer(hidden_states)
2080
+
2081
+ # Statistic Pooling
2082
+ if attention_mask is None:
2083
+ mean_features = hidden_states.mean(dim=1)
2084
+ std_features = hidden_states.std(dim=1)
2085
+ else:
2086
+ feat_extract_output_lengths = self._get_feat_extract_output_lengths(attention_mask.sum(dim=1))
2087
+ tdnn_output_lengths = self._get_tdnn_output_lengths(feat_extract_output_lengths)
2088
+ mean_features = []
2089
+ std_features = []
2090
+ for i, length in enumerate(tdnn_output_lengths):
2091
+ mean_features.append(hidden_states[i, :length].mean(dim=0))
2092
+ std_features.append(hidden_states[i, :length].std(dim=0))
2093
+ mean_features = torch.stack(mean_features)
2094
+ std_features = torch.stack(std_features)
2095
+ statistic_pooling = torch.cat([mean_features, std_features], dim=-1)
2096
+
2097
+ output_embeddings = self.feature_extractor(statistic_pooling)
2098
+ logits = self.classifier(output_embeddings)
2099
+
2100
+ loss = None
2101
+ if labels is not None:
2102
+ loss = self.objective(logits, labels)
2103
+
2104
+ if not return_dict:
2105
+ output = (logits, output_embeddings) + outputs[_HIDDEN_STATES_START_POSITION:]
2106
+ return ((loss,) + output) if loss is not None else output
2107
+
2108
+ return XVectorOutput(
2109
+ loss=loss,
2110
+ logits=logits,
2111
+ embeddings=output_embeddings,
2112
+ hidden_states=outputs.hidden_states,
2113
+ attentions=outputs.attentions,
2114
+ )
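For orientation, a minimal sketch of how the frame-classification head added above could be driven end to end. The checkpoint path is a placeholder, the class is assumed to be importable from this module, and a stock Wav2Vec2FeatureExtractor stands in for whatever preprocessing this repo actually uses.

# Hedged sketch: frame-level classification with the Wav2Vec2Conformer head above.
# "path/to/checkpoint" is a placeholder, not a published model.
import numpy as np
import torch
from transformers import Wav2Vec2FeatureExtractor

feature_extractor = Wav2Vec2FeatureExtractor(sampling_rate=16000, return_attention_mask=True)
model = Wav2Vec2ConformerForAudioFrameClassification.from_pretrained("path/to/checkpoint")
model.eval()

audio = np.zeros(16000 * 5, dtype=np.float32)  # 5 s of dummy audio
inputs = feature_extractor(audio, sampling_rate=16000, return_tensors="pt")

with torch.no_grad():
    logits = model(**inputs).logits    # (batch, frames, num_labels)
frame_ids = logits.argmax(dim=-1)      # one class id per ~20 ms frame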
musicfm/modules/random_quantizer.py ADDED
@@ -0,0 +1,83 @@
1
+ # MIT License
2
+ #
3
+ # Copyright 2023 ByteDance Inc.
4
+ #
5
+ # Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the “Software”),
6
+ # to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense,
7
+ # and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions:
8
+ #
9
+ # The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software.
10
+ #
11
+ # THE SOFTWARE IS PROVIDED “AS IS”, WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
12
+ # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
13
+ # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS
14
+ # IN THE SOFTWARE.
15
+
16
+ import torch
17
+ from torch import nn, einsum
18
+ from einops import rearrange
19
+
20
+
21
+ class RandomProjectionQuantizer(nn.Module):
22
+ """
23
+ Random projection and codebook lookup module
24
+
25
+ Some code is borrowed from:
26
+ https://github.com/lucidrains/vector-quantize-pytorch/blob/master/vector_quantize_pytorch/random_projection_quantizer.py
27
+ Normalization here uses a pre-computed global mean & variance instead of the layer norm used in the original.
28
+ """
29
+
30
+ def __init__(
31
+ self,
32
+ input_dim,
33
+ codebook_dim,
34
+ codebook_size,
35
+ seed=142,
36
+ ):
37
+ super().__init__()
38
+
39
+ # random seed
40
+ torch.manual_seed(seed)
41
+
42
+ # randomly initialized projection
43
+ random_projection = torch.empty(input_dim, codebook_dim)
44
+ nn.init.xavier_normal_(random_projection)
45
+ self.register_buffer("random_projection", random_projection)
46
+
47
+ # randomly initialized codebook
48
+ codebook = torch.empty(codebook_size, codebook_dim)
49
+ nn.init.normal_(codebook)
50
+ self.register_buffer("codebook", codebook)
51
+
52
+ def codebook_lookup(self, x):
53
+ # reshape
54
+ b = x.shape[0]
55
+ x = rearrange(x, "b n e -> (b n) e")
56
+
57
+ # L2 normalization
58
+ normalized_x = nn.functional.normalize(x, dim=1, p=2)
59
+ normalized_codebook = nn.functional.normalize(self.codebook, dim=1, p=2)
60
+
61
+ # compute distances
62
+ distances = torch.cdist(normalized_codebook, normalized_x)
63
+
64
+ # get nearest
65
+ nearest_indices = torch.argmin(distances, dim=0)
66
+
67
+ # reshape
68
+ xq = rearrange(nearest_indices, "(b n) -> b n", b=b)
69
+
70
+ return xq
71
+
72
+ @torch.no_grad()
73
+ def forward(self, x):
74
+ # always eval
75
+ self.eval()
76
+
77
+ # random projection [batch, length, input_dim] -> [batch, length, codebook_dim]
78
+ x = einsum("b n d, d e -> b n e", x, self.random_projection)
79
+
80
+ # codebook lookup
81
+ xq = self.codebook_lookup(x)
82
+
83
+ return xq
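A short, self-contained sketch of what the quantizer above produces: each frame's feature vector is projected through the fixed random matrix and assigned the index of its nearest codebook entry (cosine distance via L2 normalization). The dimensions below are illustrative, not the values the pretrained model was trained with.

# Hedged sketch: turning continuous features into discrete tokens.
import torch

quantizer = RandomProjectionQuantizer(input_dim=1024, codebook_dim=16, codebook_size=4096)

features = torch.randn(2, 250, 1024)   # (batch, frames, input_dim) -- dummy values
tokens = quantizer(features)           # (batch, frames), integer indices in [0, 4096)

print(tokens.shape)                    # torch.Size([2, 250])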
postprocessing/functional.py ADDED
@@ -0,0 +1,71 @@
1
+ # This file contains code adapted from the following sources:
2
+ # [MIT license] https://github.com/mir-aidj/all-in-one/blob/main/src/allin1/postprocessing/functional.py
3
+
4
+ import numpy as np
5
+ import torch
6
+ from .helpers import (
7
+ local_maxima,
8
+ peak_picking,
9
+ # event_frames_to_time,
10
+ )
11
+ from dataset.label2id import LABEL_TO_ID, ID_TO_LABEL
12
+ from dataset.custom_types import MsaInfo
13
+
14
+
15
+ def event_frames_to_time(frame_rates, boundary: np.ndarray):
16
+ boundary = np.array(boundary)
17
+ boundary_times = boundary / frame_rates
18
+ return boundary_times
19
+
20
+
21
+ def postprocess_functional_structure(
22
+ logits,
23
+ config,
24
+ ):
25
+ # pdb.set_trace()
26
+ boundary_logits = logits["boundary_logits"]
27
+ function_logits = logits["function_logits"]
28
+
29
+ assert boundary_logits.shape[0] == 1 and function_logits.shape[0] == 1, (
30
+ "Only batch size 1 is supported"
31
+ )
32
+ raw_prob_sections = torch.sigmoid(boundary_logits[0])
33
+ raw_prob_functions = torch.softmax(function_logits[0].transpose(0, 1), dim=0)
34
+
35
+ # filter_size=4 * cfg.min_hops_per_beat + 1
36
+ prob_sections, _ = local_maxima(
37
+ raw_prob_sections, filter_size=config.local_maxima_filter_size
38
+ )
39
+ prob_sections = prob_sections.cpu().numpy()
40
+
41
+ prob_functions = raw_prob_functions.cpu().numpy()
42
+
43
+ boundary_candidates = peak_picking(
44
+ boundary_activation=prob_sections,
45
+ window_past=int(12 * config.frame_rates),  # was originally fps
46
+ window_future=int(12 * config.frame_rates),
47
+ )
48
+ boundary = boundary_candidates > 0.0
49
+
50
+ duration = len(prob_sections) / config.frame_rates
51
+ pred_boundary_times = event_frames_to_time(
52
+ frame_rates=config.frame_rates, boundary=np.flatnonzero(boundary)
53
+ )
54
+ if pred_boundary_times[0] != 0:
55
+ pred_boundary_times = np.insert(pred_boundary_times, 0, 0)
56
+ if pred_boundary_times[-1] != duration:
57
+ pred_boundary_times = np.append(pred_boundary_times, duration)
58
+ pred_boundaries = np.stack([pred_boundary_times[:-1], pred_boundary_times[1:]]).T
59
+
60
+ pred_boundary_indices = np.flatnonzero(boundary)
61
+ pred_boundary_indices = pred_boundary_indices[pred_boundary_indices > 0]
62
+ prob_segment_function = np.split(prob_functions, pred_boundary_indices, axis=1)
63
+ pred_labels = [p.mean(axis=1).argmax().item() for p in prob_segment_function]
64
+
65
+ segments: MsaInfo = []
66
+ for (start, end), label in zip(pred_boundaries, pred_labels):
67
+ segment = (float(start), str(ID_TO_LABEL[label]))
68
+ segments.append(segment)
69
+
70
+ segments.append((float(pred_boundary_times[-1]), "end"))
71
+ return segments
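The function above reduces per-frame boundary/function logits to an MsaInfo list of (start_time, label) tuples, closed by an "end" marker at the track duration. A hedged sketch of the call pattern, using dummy logits and a SimpleNamespace in place of the real config; only the two fields the function reads are supplied, and the frame rate value is an assumption.

# Hedged sketch: post-processing dummy model outputs into segments.
import torch
from types import SimpleNamespace

num_frames, num_classes = 2500, 128
logits = {
    "boundary_logits": torch.randn(1, num_frames),              # batch size must be 1
    "function_logits": torch.randn(1, num_frames, num_classes),
}
config = SimpleNamespace(local_maxima_filter_size=3, frame_rates=8.333)

msa = postprocess_functional_structure(logits, config)
# e.g. [(0.0, "intro"), (14.4, "verse"), ..., (300.0, "end")] -- labels are arbitrary here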
postprocessing/helpers.py ADDED
@@ -0,0 +1,101 @@
1
+ # This file contains code adapted from the following sources:
2
+ # [MIT license] https://github.com/mir-aidj/all-in-one/blob/main/src/allin1/postprocessing/helpers.py
3
+
4
+ import numpy as np
5
+ import torch.nn.functional as F
6
+ import torch
7
+ import librosa
8
+ from typing import Union
9
+ from scipy.signal import argrelextrema
10
+ from scipy.interpolate import interp1d
11
+ from numpy.lib.stride_tricks import sliding_window_view
12
+ from numpy.typing import NDArray
13
+
14
+
15
+ def local_maxima(tensor, filter_size=41):
16
+ assert len(tensor.shape) in (1, 2), "Input tensor should have 1 or 2 dimensions"
17
+ assert filter_size % 2 == 1, "Filter size should be an odd number"
18
+
19
+ original_shape = tensor.shape
20
+ if len(original_shape) == 1:
21
+ tensor = tensor.unsqueeze(0)
22
+
23
+ # Pad the input array with the minimum value
24
+ padding = filter_size // 2
25
+ padded_arr = F.pad(tensor, (padding, padding), mode="constant", value=-torch.inf)
26
+
27
+ # Create a rolling window view of the padded array
28
+ rolling_view = padded_arr.unfold(1, filter_size, 1)
29
+
30
+ # Find the indices of the local maxima
31
+ center = filter_size // 2
32
+ local_maxima_mask = torch.eq(
33
+ rolling_view[:, :, center], torch.max(rolling_view, dim=-1).values
34
+ )
35
+ local_maxima_indices = local_maxima_mask.nonzero()
36
+
37
+ # Initialize a new PyTorch tensor with zeros and the same shape as the input tensor
38
+ output_arr = torch.zeros_like(tensor)
39
+
40
+ # Set the local maxima values in the output tensor
41
+ output_arr[local_maxima_mask] = tensor[local_maxima_mask]
42
+
43
+ output_arr = output_arr.reshape(original_shape)
44
+
45
+ return output_arr, local_maxima_indices
46
+
47
+
48
+ def local_maxima_numpy(arr, order=20):
49
+ is_batch = len(arr.shape) == 2
50
+ if is_batch:
51
+ return np.stack([local_maxima_numpy(x, order) for x in arr])
52
+
53
+ # Define a comparison function for argrelextrema to find local maxima
54
+ compare_func = np.greater
55
+
56
+ # Find the indices of the local maxima
57
+ local_maxima_indices = argrelextrema(arr, compare_func, order=order)
58
+
59
+ # Initialize a new numpy array with zeros and the same shape as the input array
60
+ output_arr = np.zeros_like(arr)
61
+
62
+ # Set the local maxima values in the output array
63
+ output_arr[local_maxima_indices] = arr[local_maxima_indices]
64
+
65
+ return output_arr
66
+
67
+
68
+ def peak_picking(boundary_activation, window_past=12, window_future=6):
69
+ # Find local maxima using a sliding window
70
+ window_size = window_past + window_future
71
+ assert window_size % 2 == 0, "window_past + window_future must be even"
72
+ window_size += 1
73
+
74
+ # Pad boundary_activation
75
+ boundary_activation_padded = np.pad(
76
+ boundary_activation, (window_past, window_future), mode="constant"
77
+ )
78
+ max_filter = sliding_window_view(boundary_activation_padded, window_size)
79
+ local_maxima = (boundary_activation == np.max(max_filter, axis=-1)) & (
80
+ boundary_activation > 0
81
+ )
82
+
83
+ # Compute strength values by subtracting the mean of the past and future windows
84
+ past_window_filter = sliding_window_view(
85
+ boundary_activation_padded[: -(window_future + 1)], window_past
86
+ )
87
+ future_window_filter = sliding_window_view(
88
+ boundary_activation_padded[window_past + 1 :], window_future
89
+ )
90
+ past_mean = np.mean(past_window_filter, axis=-1)
91
+ future_mean = np.mean(future_window_filter, axis=-1)
92
+ strength_values = boundary_activation - ((past_mean + future_mean) / 2)
93
+
94
+ # Get boundary candidates and their corresponding strength values
95
+ boundary_candidates = np.flatnonzero(local_maxima)
96
+ strength_values = strength_values[boundary_candidates]
97
+
98
+ strength_activations = np.zeros_like(boundary_activation)
99
+ strength_activations[boundary_candidates] = strength_values
100
+
101
+ return strength_activations
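To make the peak picking concrete, a small sketch on a synthetic activation curve, assuming the helpers above are importable. The window sizes are arbitrary (their sum just has to be even, as the assertion requires), and the threshold is chosen by eye for this toy signal.

# Hedged sketch: peak_picking on a toy boundary-activation curve.
import numpy as np

rng = np.random.default_rng(0)
activation = 0.05 * rng.random(100)       # low-level noise
activation[[20, 50, 80]] = 1.0            # three clear "boundaries"

strengths = peak_picking(activation, window_past=12, window_future=6)
picked = np.flatnonzero(strengths > 0.5)  # keep only strong peaks
print(picked)                             # expected: [20 50 80]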