Lakoc committed
Commit 64c2cbc · verified · 1 Parent(s): 8f90ab1

Upload DiCoWForConditionalGeneration

Files changed (15)
  1. FDDT.py +75 -0
  2. README.md +199 -0
  3. SCBs.py +411 -0
  4. coattention.py +120 -0
  5. config.json +86 -0
  6. config.py +103 -0
  7. contrastive_loss.py +190 -0
  8. decoding.py +397 -0
  9. encoder.py +328 -0
  10. generation.py +1808 -0
  11. generation_config.json +12 -0
  12. layers.py +99 -0
  13. model.safetensors +3 -0
  14. modeling_dicow.py +450 -0
  15. utils.py +96 -0
FDDT.py ADDED
@@ -0,0 +1,75 @@
from typing import Optional

import torch
from torch import nn

from .layers import CustomDiagonalLinear, CustomLinear
from .SCBs import SpeakerCommunicationBlock


class FDDT(nn.Module):
    def __init__(self, config, d_model, non_target_rate=0.01, is_diagonal=False, bias_only=False,
                 use_silence=True, use_target=True, use_overlap=True, use_non_target=True,
                 use_interaction=False):
        super().__init__()
        if use_target:
            self.target_linear = (
                nn.Parameter(torch.zeros(d_model)) if bias_only
                else CustomDiagonalLinear(d_model, bias=True, init_eye_val=1.0) if is_diagonal
                else CustomLinear(d_model, d_model, bias=True, init_eye_val=1.0)
            )
        if use_non_target:
            self.non_target_linear = (
                nn.Parameter(torch.zeros(d_model)) if bias_only
                else CustomDiagonalLinear(d_model, bias=True, init_eye_val=non_target_rate) if is_diagonal
                else CustomLinear(d_model, d_model, bias=True, init_eye_val=non_target_rate)
            )
        if use_overlap:
            self.overlap_linear = (
                nn.Parameter(torch.zeros(d_model)) if bias_only
                else CustomDiagonalLinear(d_model, bias=True, init_eye_val=1.0) if is_diagonal
                else CustomLinear(d_model, d_model, bias=True, init_eye_val=1.0)
            )
        if use_silence:
            self.silence_linear = (
                nn.Parameter(torch.zeros(d_model)) if bias_only
                else CustomDiagonalLinear(d_model, bias=True, init_eye_val=non_target_rate) if is_diagonal
                else CustomLinear(d_model, d_model, bias=True, init_eye_val=non_target_rate)
            )

        if use_interaction:
            self.scb = SpeakerCommunicationBlock(config)

        self.use_silence = use_silence
        self.use_target = use_target
        self.use_overlap = use_overlap
        self.use_non_target = use_non_target
        self.use_interaction = use_interaction
        self.bias_only = bias_only

    @staticmethod
    def mask_out_non_interaction_signal(hidden_states, mask):
        mask = torch.round(mask).bool()
        masked_hidden_states = hidden_states * mask
        return masked_hidden_states

    def forward(self, hidden_states, stno_mask):
        stno_mask = stno_mask.to(hidden_states.device)[..., None]
        if self.bias_only:
            if self.use_silence:
                hidden_states += stno_mask[:, 0, ...] * self.silence_linear
            if self.use_target:
                hidden_states += stno_mask[:, 1, ...] * self.target_linear
            if self.use_non_target:
                hidden_states += stno_mask[:, 2, ...] * self.non_target_linear
            if self.use_overlap:
                hidden_states += stno_mask[:, 3, ...] * self.overlap_linear
        else:
            orig_hidden_states = hidden_states
            hidden_states = (
                (self.silence_linear(orig_hidden_states) if self.use_silence else orig_hidden_states) * stno_mask[:, 0, :]
                + (self.target_linear(orig_hidden_states) if self.use_target else orig_hidden_states) * stno_mask[:, 1, :]
                + (self.non_target_linear(orig_hidden_states) if self.use_non_target else orig_hidden_states) * stno_mask[:, 2, :]
                + (self.overlap_linear(orig_hidden_states) if self.use_overlap else orig_hidden_states) * stno_mask[:, 3, :]
            )
        if self.use_interaction:
            hidden_states = self.scb(hidden_states)
        return hidden_states
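
For orientation, a minimal self-contained sketch of what the bias-only FDDT path computes (an illustrative re-implementation under stated assumptions, not an import of the class above): each STNO class (silence, target, non-target, overlap) contributes a learned additive offset, weighted by its per-frame posterior.

import torch
from torch import nn

class BiasOnlyFDDT(nn.Module):
    def __init__(self, d_model):
        super().__init__()
        # one learned bias vector per STNO class: silence, target, non-target, overlap
        self.biases = nn.Parameter(torch.zeros(4, d_model))

    def forward(self, hidden_states, stno_mask):
        # hidden_states: (B, T, D); stno_mask: (B, 4, T) soft class posteriors
        return hidden_states + torch.einsum("bct,cd->btd", stno_mask, self.biases)

x = torch.randn(2, 10, 16)
stno = torch.softmax(torch.randn(2, 4, 10), dim=1)
print(BiasOnlyFDDT(16)(x, stno).shape)  # torch.Size([2, 10, 16])
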
README.md ADDED
@@ -0,0 +1,199 @@
---
library_name: transformers
tags: []
---

# Model Card for Model ID

<!-- Provide a quick summary of what the model is/does. -->

## Model Details

### Model Description

<!-- Provide a longer summary of what this model is. -->

This is the model card of a 🤗 transformers model that has been pushed to the Hub. This model card has been automatically generated.

- **Developed by:** [More Information Needed]
- **Funded by [optional]:** [More Information Needed]
- **Shared by [optional]:** [More Information Needed]
- **Model type:** [More Information Needed]
- **Language(s) (NLP):** [More Information Needed]
- **License:** [More Information Needed]
- **Finetuned from model [optional]:** [More Information Needed]

### Model Sources [optional]

<!-- Provide the basic links for the model. -->

- **Repository:** [More Information Needed]
- **Paper [optional]:** [More Information Needed]
- **Demo [optional]:** [More Information Needed]

## Uses

<!-- Address questions around how the model is intended to be used, including the foreseeable users of the model and those affected by the model. -->

### Direct Use

<!-- This section is for the model use without fine-tuning or plugging into a larger ecosystem/app. -->

[More Information Needed]

### Downstream Use [optional]

<!-- This section is for the model use when fine-tuned for a task, or when plugged into a larger ecosystem/app -->

[More Information Needed]

### Out-of-Scope Use

<!-- This section addresses misuse, malicious use, and uses that the model will not work well for. -->

[More Information Needed]

## Bias, Risks, and Limitations

<!-- This section is meant to convey both technical and sociotechnical limitations. -->

[More Information Needed]

### Recommendations

<!-- This section is meant to convey recommendations with respect to the bias, risk, and technical limitations. -->

Users (both direct and downstream) should be made aware of the risks, biases, and limitations of the model. More information needed for further recommendations.

## How to Get Started with the Model

Use the code below to get started with the model.

[More Information Needed]

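Until the official snippet is filled in, the sketch below shows one plausible way to load the checkpoint, based on the `auto_map` entries in this repository's `config.json`; the model id is a placeholder, not a confirmed identifier.

```python
from transformers import AutoConfig, AutoModelForSpeechSeq2Seq

model_id = "<this-repo-id-or-local-path>"  # placeholder
config = AutoConfig.from_pretrained(model_id, trust_remote_code=True)               # resolves config.DiCoWConfig
model = AutoModelForSpeechSeq2Seq.from_pretrained(model_id, trust_remote_code=True)  # resolves DiCoWForConditionalGeneration
```
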
## Training Details

### Training Data

<!-- This should link to a Dataset Card, perhaps with a short stub of information on what the training data is all about as well as documentation related to data pre-processing or additional filtering. -->

[More Information Needed]

### Training Procedure

<!-- This relates heavily to the Technical Specifications. Content here should link to that section when it is relevant to the training procedure. -->

#### Preprocessing [optional]

[More Information Needed]

#### Training Hyperparameters

- **Training regime:** [More Information Needed] <!--fp32, fp16 mixed precision, bf16 mixed precision, bf16 non-mixed precision, fp16 non-mixed precision, fp8 mixed precision -->

#### Speeds, Sizes, Times [optional]

<!-- This section provides information about throughput, start/end time, checkpoint size if relevant, etc. -->

[More Information Needed]

## Evaluation

<!-- This section describes the evaluation protocols and provides the results. -->

### Testing Data, Factors & Metrics

#### Testing Data

<!-- This should link to a Dataset Card if possible. -->

[More Information Needed]

#### Factors

<!-- These are the things the evaluation is disaggregating by, e.g., subpopulations or domains. -->

[More Information Needed]

#### Metrics

<!-- These are the evaluation metrics being used, ideally with a description of why. -->

[More Information Needed]

### Results

[More Information Needed]

#### Summary

## Model Examination [optional]

<!-- Relevant interpretability work for the model goes here -->

[More Information Needed]

## Environmental Impact

<!-- Total emissions (in grams of CO2eq) and additional considerations, such as electricity usage, go here. Edit the suggested text below accordingly -->

Carbon emissions can be estimated using the [Machine Learning Impact calculator](https://mlco2.github.io/impact#compute) presented in [Lacoste et al. (2019)](https://arxiv.org/abs/1910.09700).

- **Hardware Type:** [More Information Needed]
- **Hours used:** [More Information Needed]
- **Cloud Provider:** [More Information Needed]
- **Compute Region:** [More Information Needed]
- **Carbon Emitted:** [More Information Needed]

## Technical Specifications [optional]

### Model Architecture and Objective

[More Information Needed]

### Compute Infrastructure

[More Information Needed]

#### Hardware

[More Information Needed]

#### Software

[More Information Needed]

## Citation [optional]

<!-- If there is a paper or blog post introducing the model, the APA and Bibtex information for that should go in this section. -->

**BibTeX:**

[More Information Needed]

**APA:**

[More Information Needed]

## Glossary [optional]

<!-- If relevant, include terms and calculations in this section that can help readers understand the model or model card. -->

[More Information Needed]

## More Information [optional]

[More Information Needed]

## Model Card Authors [optional]

[More Information Needed]

## Model Card Contact

[More Information Needed]
SCBs.py ADDED
@@ -0,0 +1,411 @@
import torch
from torch import nn
from transformers import WhisperConfig
from transformers.activations import ACT2FN
from transformers.models.whisper.modeling_whisper import WHISPER_ATTENTION_CLASSES
import torch.nn.functional as F

from .coattention import CoAttention
from .layers import CustomLinear, CustomDiagonalLinear, Gate, CustomLinearInitialized


class LowRankApproxSelectFirst(nn.Module):
    def __init__(self, d_in, d_out, rank):
        super().__init__()
        self.d_in = d_in
        self.d_out = d_out
        self.rank = rank
        self.proj_in = nn.Linear(d_in, rank)
        self.proj_out = nn.Linear(rank, d_out)

    def forward(self, x):
        return self.proj_out(self.proj_in(x))

    def _init_weights(self):
        # Create low-rank approximation of the identity projection from first d_out of input
        eye = torch.eye(self.d_out, self.d_in)  # (d_out x d_in)

        # Low-rank SVD of eye matrix
        U, S, Vh = torch.linalg.svd(eye, full_matrices=False)  # U: (d_out x d_out), Vh: (d_in x d_in)

        U_k = U[:, :self.rank]      # (d_out x rank)
        S_k = S[:self.rank]         # (rank,)
        V_k = Vh[:self.rank, :]     # (rank x d_in)

        A = V_k                     # (rank x d_in)
        B = U_k @ torch.diag(S_k)   # (d_out x rank)

        # Set weights
        self.proj_in.weight.data.copy_(A)
        self.proj_in.bias.data.zero_()
        self.proj_out.weight.data.copy_(B)
        self.proj_out.bias.data.zero_()


class TACBlock(nn.Module):
    def __init__(self, config: WhisperConfig, d_int_factor: float = 1, num_speakers=2):
        super().__init__()
        d = config.d_model
        d_prime = int(d * d_int_factor)
        self.num_speakers = num_speakers
        self.proj_in_1 = nn.Linear(d, d_prime, bias=True)
        self.proj_in_2 = nn.Linear(d, d_prime, bias=True)
        self.proj_int = nn.Linear(d_prime, d_prime, bias=True)
        self.proj_out_1 = nn.Linear(d + d_prime, d, bias=True)
        self.proj_out_2 = nn.Linear(d + d_prime, d, bias=True)
        self.activation_fn = ACT2FN[config.activation_function]
        self.norms = nn.ModuleList([nn.LayerNorm(d) for _ in range(self.num_speakers)])
        self.gate = Gate(self.num_speakers, 0.05)

    def forward(self, hidden_states: torch.FloatTensor) -> torch.FloatTensor:
        # hidden_states: (B, self.num_speakers, T, F)
        x_proj = torch.stack(
            [self.activation_fn(self.proj_in_1(hidden_states[:, 0])),
             self.activation_fn(self.proj_in_2(hidden_states[:, 1]))],
            dim=1)  # (B, 2, T, d')
        x_mean = x_proj.mean(dim=1, keepdim=True)  # (B, 1, T, d')
        z = self.activation_fn(self.proj_int(x_mean))  # (B, 1, T, d')

        z_expand = z.expand(-1, self.num_speakers, -1, -1)  # (B, self.num_speakers, T, d')
        x_cat = torch.cat([hidden_states, z_expand], dim=-1)  # (B, self.num_speakers, T, d + d')
        x_out = torch.stack(
            [self.norms[0](self.proj_out_1(x_cat[:, 0])),
             self.norms[1](self.proj_out_2(x_cat[:, 1]))],
            dim=1)  # (B, self.num_speakers, T, d)
        return hidden_states + self.gate(x_out, dim=1)


class CrossAttentionBlock(nn.Module):
    def __init__(self, config: WhisperConfig):
        super().__init__()
        self.embed_dim = config.d_model

        self.num_speakers = getattr(config, "mt_num_speakers", 2)
        if self.num_speakers != 2:
            raise ValueError("CrossAttentionBlock supports only 2 speakers.")

        # Separate attention block per speaker
        self.attn_blocks = nn.ModuleList([
            WHISPER_ATTENTION_CLASSES[config._attn_implementation](
                embed_dim=self.embed_dim,
                num_heads=config.encoder_attention_heads,
                dropout=config.attention_dropout,
                config=config,
            )
            for _ in range(self.num_speakers)
        ])

        self.norms = nn.ModuleList([nn.LayerNorm(self.embed_dim) for _ in range(self.num_speakers)])
        self.gate = Gate(self.num_speakers, 0.01)

    def forward(self, hidden_states):
        # hidden_states: (B, 2, T, F)
        outputs = []
        for s in range(self.num_speakers):
            q = hidden_states[:, s]  # (B, T, F)
            other_s = 1 - s
            kv = hidden_states[:, other_s]  # (B, T, F)

            attn_out, _, _ = self.attn_blocks[s](hidden_states=q, key_value_states=kv)  # (B, T, F)
            outputs.append(self.norms[s](attn_out[:, None, :, :]))
        outputs = torch.concat(outputs, dim=1)
        outputs_modulated = self.gate(outputs, dim=1) + hidden_states
        return outputs_modulated


# class CrossAttentionEnrollBlock(nn.Module):
#     def __init__(self, config, layer_norm_eps: float = 1e-5):
#         super().__init__()
#         self.embed_dim = config.d_model
#         self.ffn_dim = config.encoder_ffn_dim
#
#         self.cross_attn = WHISPER_ATTENTION_CLASSES[config._attn_implementation](
#             embed_dim=self.embed_dim,
#             num_heads=config.encoder_attention_heads,
#             dropout=config.attention_dropout,
#             config=config,
#         )
#
#         # Layer normalization (pre-norm style)
#         self.norm_attn = nn.LayerNorm(self.embed_dim, eps=layer_norm_eps)
#         self.norm_ffn = nn.LayerNorm(self.embed_dim * 2, eps=layer_norm_eps)
#
#         # Feed-forward network
#         self.ffn = nn.Sequential(
#             nn.Linear(self.embed_dim * 2, self.ffn_dim),
#             ACT2FN[config.activation_function],
#             nn.Dropout(config.dropout if hasattr(config, 'dropout') else 0.1),
#             nn.Linear(self.ffn_dim, self.embed_dim),
#             nn.Dropout(config.dropout if hasattr(config, 'dropout') else 0.1)
#         )
#
#     def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
#         """
#         Args:
#             hidden_states: (B, 2, T, F) - batch, channels, time, features
#         Returns:
#             Updated hidden states of same shape
#         """
#         q_channel = hidden_states[:, 0]  # (B, T, F)
#         kv_channel = hidden_states[:, 1]  # (B, T, F)
#
#         # Cross-attention with residual connection
#         q_normed = self.norm_attn(q_channel)
#         attn_output = self.cross_attn(
#             hidden_states=q_normed,
#             key_value_states=kv_channel,
#             output_attentions=False
#         )[0]
#
#         q_after_attn = torch.cat([attn_output, q_normed], dim=-1)
#
#         # Feed-forward with residual connection
#         q_normed_ffn = self.norm_ffn(q_after_attn)
#
#         ffn_output = self.ffn(q_normed_ffn)
#         updated_q = q_after_attn + ffn_output
#
#         # Return stacked result (only query channel is updated)
#         return torch.stack([updated_q, kv_channel], dim=1)


def first_init_fun(module):
    # Zero out all weights initially
    # module.weight.data.zero_()
    torch.nn.init.xavier_uniform_(module.weight, gain=0.1)

    # Create identity mapping for second half of input (q_normed part)
    # Input: [cross_attn_output, q_normed] -> map q_normed to first embed_dim outputs
    module.weight.data[:module.weight.shape[1] // 2, module.weight.shape[1] // 2:] += torch.eye(module.weight.shape[1] // 2)
    # module.weight.data[:module.weight.shape[1]//2, module.weight.shape[1]//2:] = torch.eye(module.weight.shape[1]//2)

    # Zero bias
    module.bias.data.zero_()


def second_init_fun(module):
    # module.weight.data.zero_()
    torch.nn.init.xavier_uniform_(module.weight, gain=0.1)

    # Create identity mapping from first embed_dim inputs to output
    module.weight.data[:, :module.weight.shape[0]] += torch.eye(module.weight.shape[0])

    # Zero bias for second linear
    module.bias.data.zero_()


# Cross attention block that can easily learn to ignore cross attention initially
class CrossAttentionEnrollBlockNew(nn.Module):
    def __init__(self, config, layer_norm_eps: float = 1e-5):
        super().__init__()
        self.embed_dim = config.d_model
        self.ffn_dim = config.encoder_ffn_dim

        self.cross_attn = WHISPER_ATTENTION_CLASSES[config._attn_implementation](
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            config=config,
        )

        # Layer normalization (pre-norm style)
        # self.norm_attn = nn.LayerNorm(self.embed_dim, eps=layer_norm_eps)
        self.cross_gate = nn.Parameter(torch.zeros(1))
        # Feed-forward network that maps concat space back to single channel
        self.ffn = nn.Sequential(
            CustomLinearInitialized(self.embed_dim * 2, self.ffn_dim, init_fun=first_init_fun),
            ACT2FN[config.activation_function],
            nn.Dropout(config.dropout if hasattr(config, 'dropout') else 0.1),
            CustomLinearInitialized(self.ffn_dim, self.embed_dim, init_fun=second_init_fun),
            nn.Dropout(config.dropout if hasattr(config, 'dropout') else 0.1)
        )

    def forward(self, hidden_states: torch.Tensor) -> torch.Tensor:
        """
        Args:
            hidden_states: (B, 2, T, F) - batch, channels, time, features
        Returns:
            Updated hidden states of same shape
        """
        q_channel = hidden_states[:, 0]  # (B, T, F)
        kv_channel = hidden_states[:, 1]  # (B, T, F)

        # Cross-attention
        attn_output = self.cross_attn(
            hidden_states=q_channel,
            key_value_states=kv_channel,
            output_attentions=False
        )[0]

        # Concatenate attention output with original normalized query
        q_concat = torch.cat([attn_output, q_channel], dim=-1)  # (B, T, 2*F)

        # Feed-forward processing (no normalization to preserve initialization)
        # updated_q = self.ffn(q_concat)  # (B, T, F)
        updated_q = q_channel + torch.tanh(self.cross_gate) * self.ffn(q_concat)

        # Return stacked result (only query channel is updated)
        return torch.stack([updated_q, kv_channel], dim=1)


class CrossAttentionEnrollBlock(nn.Module):
    def __init__(self, config: WhisperConfig):
        super().__init__()
        self.embed_dim = config.d_model

        # Separate attention block per speaker
        self.attn_block = WHISPER_ATTENTION_CLASSES[config._attn_implementation](
            embed_dim=self.embed_dim,
            num_heads=config.encoder_attention_heads,
            dropout=config.attention_dropout,
            config=config,
        )

        self.norm = nn.LayerNorm(self.embed_dim)
        self.gate = Gate(1, 0.1)

    def forward(self, hidden_states):
        q = hidden_states[:, 0]   # (B, T, F)
        kv = hidden_states[:, 1]  # (B, T, F)
        attn_out, _, _ = self.attn_block(hidden_states=q, key_value_states=kv)  # (B, T, F)
        out = self.norm(attn_out)

        # Create updated first channel
        updated_q = self.gate(out[:, None, :, :], dim=1)[:, 0] + q

        # Concatenate along the channel dimension
        result = torch.stack([updated_q, kv], dim=1)
        return result


class CompetitiveCrossAttentionBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.embed_dim = config.d_model
        self.num_heads = config.encoder_attention_heads
        self.head_dim = self.embed_dim // self.num_heads
        assert (
            self.head_dim * self.num_heads == self.embed_dim
        ), "embed_dim must be divisible by num_heads"

        self.num_speakers = getattr(config, "mt_num_speakers", 2)
        if self.num_speakers != 2:
            raise ValueError("CompetitiveCrossAttentionBlock supports only 2 speakers.")

        # Separate projections for Q, K, V
        self.q_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.k_proj = nn.Linear(self.embed_dim, self.embed_dim)
        self.v_proj = nn.Linear(self.embed_dim, self.embed_dim)

        self.out_proj = nn.Linear(self.embed_dim, self.embed_dim)

        self.norms = nn.ModuleList([nn.LayerNorm(self.embed_dim) for _ in range(self.num_speakers)])
        self.eps = 1e-6
        self.gate = Gate(self.num_speakers, 0.01)

    def _shape(self, tensor, seq_len, batch_size):
        # reshape into (B, num_heads, T, head_dim)
        return tensor.view(batch_size, seq_len, self.num_heads, self.head_dim).transpose(1, 2)

    def forward(self, hidden_states):
        # hidden_states: (B, 2, T, F)
        B, _, T, _ = hidden_states.shape

        h1, h2 = hidden_states[:, 0], hidden_states[:, 1]  # (B, T, F)

        # Project Q, K, V
        Q1 = self.q_proj(h1)  # (B, T, F)
        K2 = self.k_proj(h2)
        V2 = self.v_proj(h2)

        Q2 = self.q_proj(h2)
        K1 = self.k_proj(h1)
        V1 = self.v_proj(h1)

        # Reshape for multi-head attention
        Q1 = self._shape(Q1, T, B)  # (B, heads, T, head_dim)
        K2 = self._shape(K2, T, B)
        V2 = self._shape(V2, T, B)

        Q2 = self._shape(Q2, T, B)
        K1 = self._shape(K1, T, B)
        V1 = self._shape(V1, T, B)

        # Scaled dot-product attention logits
        scale = 1 / (self.head_dim ** 0.5)
        L_1to2 = torch.matmul(Q1, K2.transpose(-1, -2)) * scale  # (B, heads, T, T)
        L_2to1 = torch.matmul(Q2, K1.transpose(-1, -2)) * scale  # (B, heads, T, T)

        # Softmax over last dim (keys)
        S_1to2 = F.softmax(L_1to2, dim=-1)
        S_2to1 = F.softmax(L_2to1, dim=-1)

        # Competitive normalization (soft exclusivity)
        M_joint = S_1to2 + S_2to1 + self.eps
        A_1to2 = S_1to2 / M_joint
        A_2to1 = S_2to1 / M_joint

        # Weighted sum of values
        H1_attn = torch.matmul(A_1to2, V2)  # (B, heads, T, head_dim)
        H2_attn = torch.matmul(A_2to1, V1)

        # Concatenate heads back
        H1_attn = H1_attn.transpose(1, 2).contiguous().view(B, T, self.embed_dim)  # (B, T, F)
        H2_attn = H2_attn.transpose(1, 2).contiguous().view(B, T, self.embed_dim)

        # Output projection
        H1_attn = self.norms[0](self.out_proj(H1_attn))
        H2_attn = self.norms[1](self.out_proj(H2_attn))

        # Residuals
        out = hidden_states + self.gate(torch.concat([H1_attn[:, None, :, :], H2_attn[:, None, :, :]], dim=1), dim=1)

        return out  # (B, 2, T, F)


class CoAttentionWrapper(nn.Module):
    def __init__(self, config, num_speakers=2):
        super().__init__()
        self.coa = CoAttention(embed_dim=config.d_model, single_dim=config.d_model // 2,
                               multi_dim=config.d_model // 4, n_heads=config.encoder_attention_heads,
                               attn_dropout=config.attention_dropout)
        self.gate = Gate(num_speakers, 0.01)

    def forward(self, coa_input: torch.Tensor) -> torch.Tensor:
        # coa_input: (B, 2, T, F)
        hidden_states = coa_input.permute(-2, 0, 1, -1)
        hidden_states = self.coa(hidden_states)
        out = coa_input + self.gate(hidden_states.permute(1, 2, 0, -1), dim=1)
        return out


class SpeakerCommunicationBlock(nn.Module):
    def __init__(self, config):
        super().__init__()
        self.num_speakers = getattr(config, "mt_num_speakers", 2)
        self.embed_dim = config.d_model
        self.scb_method = config.scb_method
        self.config = config

        if self.scb_method == "tac":
            self.method = TACBlock(config)
        elif self.scb_method == "cross_attention":
            self.method = CrossAttentionBlock(config)
        elif self.scb_method == "cross_attention_enroll":
            self.method = CrossAttentionEnrollBlock(config)
        elif self.scb_method == "cross_attention_enroll_new":
            self.method = CrossAttentionEnrollBlockNew(config)
        elif self.scb_method == "competitive_cross_attention":
            self.method = CompetitiveCrossAttentionBlock(config)
        elif self.scb_method == "co_attention":
            self.method = CoAttentionWrapper(config)
        elif self.scb_method == "identity":
            self.method = (
                nn.Parameter(torch.zeros(self.embed_dim)) if config.fddt_bias_only
                else CustomDiagonalLinear(self.embed_dim, bias=True, init_eye_val=1.0) if config.fddt_is_diagonal
                else CustomLinear(self.embed_dim, self.embed_dim, bias=True, init_eye_val=1.0)
            )
        else:
            raise ValueError(f"Unsupported scb_method: {self.scb_method}")

    def forward(self, x):
        # x: (B, T, F)
        B, T, F = x.shape
        S = self.num_speakers

        # Reshape to (B//S, S, T, F)
        x_reshaped = x.view(B // S, S, T, F)

        # Call the selected method
        out = self.method(x_reshaped)

        # Reshape back (B, T, F)
        out_merged = out.view(B, T, F)
        return out_merged
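
A small runnable illustration of the batch layout `SpeakerCommunicationBlock.forward` assumes: the per-speaker streams of one recording are stacked contiguously along the batch axis, so `(B, T, F)` with `B = n_recordings * S` folds into `(n_recordings, S, T, F)` before the communication method runs and is folded back afterwards.

import torch

B, S, T, F = 4, 2, 6, 8               # two recordings, two speakers each
x = torch.randn(B, T, F)
x_folded = x.view(B // S, S, T, F)    # (2, 2, 6, 8): dim 1 indexes the speaker
assert torch.equal(x_folded[0, 1], x[1])       # speaker 1 of recording 0 is batch row 1
assert torch.equal(x_folded.view(B, T, F), x)  # the inverse reshape restores the original layout
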
coattention.py ADDED
@@ -0,0 +1,120 @@
import torch
from torch import nn


class MultiHeadCoAttention(nn.Module):
    def __init__(self, multi_dim, single_dim, num_heads):
        assert multi_dim % num_heads == 0, 'multi_dim must be divisible by num_heads'
        assert single_dim % num_heads == 0, 'single_dim must be divisible by num_heads'
        super().__init__()
        self.q_proj = nn.Linear(single_dim, single_dim)
        self.k_proj = nn.Linear(single_dim, single_dim)
        self.multi_v_proj = nn.Linear(multi_dim, multi_dim)     # D'
        self.single_v_proj = nn.Linear(single_dim, single_dim)  # D

        self.multi_out_proj = nn.Linear(multi_dim, multi_dim)     # D'
        self.single_out_proj = nn.Linear(single_dim, single_dim)  # D

        self.multi_dim = multi_dim
        self.single_dim = single_dim
        self.num_heads = num_heads

    def forward(self, query, key, multi_value, single_value):
        # query, key: (T,B,1,D); multi_value: (T,B,ch,D')
        # single_value: (T,B,1,D)
        query = torch.transpose(query, 0, 1)  # (B,T,1,D)
        key = torch.transpose(key, 0, 1)  # (B,T,1,D)
        multi_value = torch.permute(multi_value, (1, 2, 0, 3))  # (B,ch,T,D') ... [32, 4, 150, 64]
        single_value = torch.permute(single_value, (1, 2, 0, 3))  # (B,1,T,D) ... [32, 1, 150, 256]

        q = torch.split(self.q_proj(query), self.single_dim // self.num_heads, dim=-1)  # seq: (B,T,1,D/h)
        q = torch.stack(q, dim=1)  # (B,h,T,1,D/h)

        k = torch.split(self.k_proj(key), self.single_dim // self.num_heads, dim=-1)  # seq: (B,T,1,D/h)
        k = torch.stack(k, dim=1)  # (B,h,T,1,D/h)

        multi_v = torch.split(self.multi_v_proj(multi_value), self.multi_dim // self.num_heads,
                              dim=-1)  # seq: (B,ch,T,D'/h)
        multi_v = torch.stack(multi_v, dim=1)  # (B, h, ch, T, D'/h) ... [32, 8, 4, 150, 8]

        single_v = torch.split(self.single_v_proj(single_value), self.single_dim // self.num_heads,
                               dim=-1)  # seq: (B,1,T,D/h)
        single_v = torch.stack(single_v, dim=1)  # (B,h,1,T,D/h) ... [32, 8, 1, 150, 32]

        q = q.view(*q.shape[:-2], -1)  # (B, h, T, D/h)
        k = k.view(*k.shape[:-2], -1)  # (B, h, T, D/h)
        normalizer = torch.sqrt(torch.Tensor([float(q.shape[-1])]).to(q.device))

        sim_mat = torch.matmul(q, torch.transpose(k, -2, -1)) / normalizer  # (B, h, T, T)
        att_mat = torch.unsqueeze(nn.functional.softmax(sim_mat, dim=-1), 2)  # (B, h, 1, T, T)

        # co-attention
        multi_result = torch.matmul(att_mat, multi_v)  # (B, h, ch, T, D'/h)
        single_result = torch.matmul(att_mat, single_v)  # (B, h, 1, T, D/h)

        multi_result = torch.permute(multi_result, (3, 0, 2, 1, 4))  # (T, B, ch, h, D'/h)
        single_result = torch.permute(single_result, (3, 0, 2, 1, 4))  # (T, B, 1, h, D/h)
        multi_result = torch.reshape(multi_result, multi_result.shape[:-2] + (-1,))  # (T, B, ch, D')
        single_result = torch.reshape(single_result, single_result.shape[:-2] + (-1,))  # (T, B, 1, D)

        multi_result = self.multi_out_proj(multi_result)
        single_result = self.single_out_proj(single_result)
        return multi_result, single_result


class CoAttention(nn.Module):
    def __init__(self, embed_dim=768, single_dim=256, multi_dim=64, n_heads=8, attn_dropout=0.,
                 init_mult=1e-2):  # , pre_norm=True):
        super().__init__()
        self.init_mult = init_mult

        self.in_single_proj = nn.Linear(embed_dim, single_dim)  # single_dim == D
        self.in_single_ln = nn.LayerNorm(single_dim)

        self.in_multi_proj = nn.Linear(embed_dim, multi_dim)  # multi_dim == D'
        self.in_multi_ln = nn.LayerNorm(multi_dim)

        self.mca = MultiHeadCoAttention(multi_dim, single_dim, n_heads)
        self.mca_multi_out_ln = nn.LayerNorm(multi_dim)
        self.mca_single_out_ln = nn.LayerNorm(single_dim)

        # default MHA input: (seq, batch, feature)
        self.cross_frame_mha = nn.MultiheadAttention(single_dim, n_heads, dropout=attn_dropout, bias=True,
                                                     kdim=None, vdim=None)
        self.mha_ln = nn.LayerNorm(single_dim)

        self.cat_proj = nn.Linear(single_dim + multi_dim, embed_dim)

        self.miso = False

    def scale_weights(self):
        self.cat_proj.bias.data *= 0.
        self.cat_proj.weight.data *= self.init_mult

    def forward(self, x):
        # x: (T,B,ch,F); e.g. (150, 32, 4, 768)
        frames, B, chans, feat_dim = x.shape

        single_x = torch.mean(x, dim=2)  # (T,B,F)
        single_x = self.in_single_ln(self.in_single_proj(single_x)).unsqueeze(dim=-2)  # (T,B,1,D)

        multi_x = self.in_multi_ln(self.in_multi_proj(x))  # (T,B,ch,D')

        # MCA
        multi_mca, single_mca = self.mca(single_x, single_x, multi_x, single_x)  # (T,B,ch,D'), (T,B,1,D)
        single_x = single_x + single_mca
        multi_x = multi_x + multi_mca
        multi_x = self.mca_multi_out_ln(multi_x)  # (T,B,ch,D')
        single_x = torch.squeeze(self.mca_single_out_ln(single_x), -2)  # (T,B,D)

        # MHA
        single_mha, _ = self.cross_frame_mha(single_x, single_x, single_x, need_weights=False)  # (T, B, D)
        single_x = self.mha_ln(single_mha + single_x)

        # join representations
        single_x = single_x.unsqueeze(-2)  # (T,B,1,D)
        single_x_tile = torch.tile(single_x, (1, 1, chans, 1))  # (T,B,ch,D)
        cat_x = torch.cat([single_x_tile, multi_x], dim=-1)  # (T,B,ch,D+D')
        out = self.cat_proj(cat_x)  # (T,B,ch,F)

        return out
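
A shape-level usage sketch (assuming `coattention.py` above is importable on its own, which its imports allow): the module consumes `(T, B, ch, F)` and returns a tensor of the same shape.

import torch
from coattention import CoAttention  # assumes the file above is on the Python path

coa = CoAttention(embed_dim=768, single_dim=256, multi_dim=64, n_heads=8)
x = torch.randn(150, 4, 2, 768)  # (T, B, ch, F): frames, batch, speaker channels, features
print(coa(x).shape)              # torch.Size([150, 4, 2, 768])
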
config.json ADDED
@@ -0,0 +1,86 @@
{
  "_name_or_path": "/mnt/matylda5/ipoloka/ASRU_models/se_dicow",
  "activation_dropout": 0.0,
  "activation_function": "gelu",
  "additional_layer": false,
  "additional_self_attention_layer": true,
  "apply_fddt_to_n_layers": -1,
  "apply_spec_augment": false,
  "architectures": [
    "DiCoWForConditionalGeneration"
  ],
  "attend_to_enrollment": false,
  "attention_dropout": 0.0,
  "auto_map": {
    "AutoConfig": "config.DiCoWConfig",
    "AutoModelForSpeechSeq2Seq": "modeling_dicow.DiCoWForConditionalGeneration"
  },
  "begin_suppress_tokens": [
    220,
    50256
  ],
  "blank_token_id": null,
  "bos_token_id": 50257,
  "classifier_proj_size": 256,
  "contrastive_loss_weight": 0,
  "ctc_loss_reduction": "mean",
  "ctc_weight": 0.3,
  "ctc_zero_infinity": false,
  "d_model": 1280,
  "decoder_attention_heads": 20,
  "decoder_ffn_dim": 5120,
  "decoder_layerdrop": 0.0,
  "decoder_layers": 4,
  "decoder_start_token_id": 50258,
  "dropout": 0.0,
  "encoder_attention_heads": 20,
  "encoder_ffn_dim": 5120,
  "encoder_layerdrop": 0.0,
  "encoder_layers": 32,
  "eos_token_id": 50257,
  "fddt_bias_only": false,
  "fddt_init": "disparagement",
  "fddt_is_diagonal": true,
  "fddt_use_non_target": true,
  "fddt_use_overlap": true,
  "fddt_use_silence": true,
  "fddt_use_target": true,
  "final_dropout": 0.0,
  "forced_decoder_ids": null,
  "init_std": 0.02,
  "is_encoder_decoder": true,
  "is_mt": true,
  "mask_feature_length": 10,
  "mask_feature_min_masks": 0,
  "mask_feature_prob": 0.0,
  "mask_time_length": 10,
  "mask_time_min_masks": 2,
  "mask_time_prob": 0.05,
  "max_source_positions": 1500,
  "max_target_positions": 448,
  "median_filter_width": 7,
  "model_type": "DiCoW",
  "mt_num_speakers": 2,
  "n_soft_prompts": 16,
  "non_target_fddt_value": 0.5,
  "num_hidden_layers": 32,
  "num_mel_bins": 128,
  "num_speakers": null,
  "pad_token_id": 50257,
  "remove_timestamps_from_ctc": true,
  "scale_embedding": false,
  "scb_layers": 8,
  "scb_method": "cross_attention_enroll_new",
  "sid_loss_weight": 0,
  "spk_embedding_extraction_layer": -1,
  "sub_sample": true,
  "torch_dtype": "float32",
  "transformers_version": "4.42.0",
  "use_cache": true,
  "use_enrollment_network": false,
  "use_fddt": true,
  "use_initial_fddt": true,
  "use_weighted_layer_sum": false,
  "uses_enrollments": true,
  "vocab_size": 51866
}
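
To sanity-check the DiCoW-specific settings of this checkpoint without loading any weights, the config can be read directly; the keys below all appear in the file above (the path assumes a local checkout of this repository).

import json

with open("config.json") as f:
    cfg = json.load(f)
print(cfg["model_type"], cfg["scb_method"], cfg["mt_num_speakers"], cfg["ctc_weight"])
# DiCoW cross_attention_enroll_new 2 0.3
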
config.py ADDED
@@ -0,0 +1,103 @@
from dataclasses import dataclass
from typing import Optional

import torch
from transformers import WhisperConfig
from transformers.modeling_outputs import Seq2SeqLMOutput, BaseModelOutput, Seq2SeqModelOutput


@dataclass
class Seq2SeqLMOutputLosses(Seq2SeqLMOutput):
    enc_loss: Optional[torch.FloatTensor] = None
    dec_loss: Optional[torch.FloatTensor] = None
    encoder_logits: Optional[torch.FloatTensor] = None


@dataclass
class BaseModelOutputLogit(BaseModelOutput):
    logits: Optional[torch.FloatTensor] = None


@dataclass
class Seq2SeqModelOutputLogit(Seq2SeqModelOutput):
    encoder_logits: Optional[torch.FloatTensor] = None


class DiCoWConfig(WhisperConfig):
    """Configuration class for DiCoW, a modified version of `WhisperConfig` from the `transformers`
    library. The corresponding model has been modified to support CTC loss computation in the
    forward pass and multi-talker extensions (FDDT, speaker communication blocks)."""
    model_type = "DiCoW"

    def __init__(
        self,
        ctc_loss_reduction: str = "mean",
        final_dropout: float = 0.0,
        ctc_zero_infinity: bool = False,
        ctc_weight: float = 0.0,
        blank_token_id: Optional[int] = None,
        additional_layer: bool = False,
        additional_self_attention_layer: bool = False,
        sub_sample: bool = False,
        use_fddt: bool = True,
        fddt_is_diagonal: bool = True,
        fddt_bias_only: bool = False,
        fddt_use_silence: bool = True,
        fddt_use_target: bool = True,
        fddt_use_overlap: bool = True,
        fddt_use_non_target: bool = True,
        remove_timestamps_from_ctc: bool = False,
        apply_fddt_to_n_layers: int = -1,
        fddt_init: str = 'non-disturbing',  # random, non-disturbing, disparagement
        n_soft_prompts: int = 16,
        mt_num_speakers: int = 1,
        is_mt: bool = False,
        non_target_fddt_value: float = 0.0,
        use_initial_fddt: bool = False,
        scb_method: str = None,
        scb_layers: int = -1,
        contrastive_loss_weight: float = 0.0,
        use_enrollment_network: bool = False,
        spk_embedding_extraction_layer: int = -1,
        num_speakers: int = -1,
        sid_loss_weight: float = 0.0,
        attend_to_enrollment: bool = False,
        uses_enrollments: bool = False,
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.ctc_loss_reduction = ctc_loss_reduction
        self.final_dropout = final_dropout
        self.ctc_zero_infinity = ctc_zero_infinity
        self.ctc_weight = ctc_weight
        self.blank_token_id = blank_token_id
        self.additional_layer = additional_layer
        self.additional_self_attention_layer = additional_self_attention_layer
        self.sub_sample = sub_sample
        self.use_fddt = use_fddt
        self.fddt_is_diagonal = fddt_is_diagonal
        self.fddt_bias_only = fddt_bias_only
        self.fddt_use_silence = fddt_use_silence
        self.fddt_use_target = fddt_use_target
        self.fddt_use_overlap = fddt_use_overlap
        self.fddt_use_non_target = fddt_use_non_target
        self.remove_timestamps_from_ctc = remove_timestamps_from_ctc
        self.apply_fddt_to_n_layers = apply_fddt_to_n_layers
        self.fddt_init = fddt_init
        self.n_soft_prompts = n_soft_prompts
        self.mt_num_speakers = mt_num_speakers
        self.non_target_fddt_value = non_target_fddt_value
        self.use_initial_fddt = use_initial_fddt
        self.scb_method = scb_method
        self.scb_layers = scb_layers
        self.contrastive_loss_weight = contrastive_loss_weight
        self.is_mt = is_mt
        self.use_enrollment_network = use_enrollment_network
        self.spk_embedding_extraction_layer = spk_embedding_extraction_layer
        self.num_speakers = num_speakers
        self.sid_loss_weight = sid_loss_weight
        self.attend_to_enrollment = attend_to_enrollment
        self.uses_enrollments = uses_enrollments


_HIDDEN_STATES_START_POSITION = 2
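
A minimal construction sketch (assuming `config.py` above is importable as a top-level module, which is an assumption about the local setup): DiCoW-specific flags are explicit keyword arguments, while standard Whisper options pass through `**kwargs` to `WhisperConfig`.

from config import DiCoWConfig  # assumes config.py above is on the Python path

cfg = DiCoWConfig(ctc_weight=0.3, mt_num_speakers=2, is_mt=True, d_model=1280, encoder_layers=32)
print(cfg.model_type, cfg.ctc_weight, cfg.mt_num_speakers)  # DiCoW 0.3 2
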
contrastive_loss.py ADDED
@@ -0,0 +1,190 @@
import torch
import torch.nn as nn
import torch.nn.functional as F
from typing import Optional
from torch import Tensor


class ContrastiveLoss(nn.Module):
    def __init__(self, temperature=.25, distance_metric='cosine'):
        super(ContrastiveLoss, self).__init__()
        self.temperature = temperature
        self.distance_metric = distance_metric

    def compute_similarity(self, embeddings):
        if self.distance_metric == 'cosine':
            embeddings = F.normalize(embeddings, p=2, dim=-1)  # [B, 2T, D]
            sim = torch.matmul(embeddings, embeddings.transpose(-1, -2))  # [B, 2T, 2T]
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")
        return sim / self.temperature

    def compute_cross_similarity(self, embeddings1, embeddings2):
        """Compute similarity between two different embedding sets"""
        if self.distance_metric == 'cosine':
            embeddings1 = F.normalize(embeddings1, p=2, dim=-1)  # [B, 2T, D]
            embeddings2 = F.normalize(embeddings2, p=2, dim=-1)  # [B, 2T, D]
            sim = torch.matmul(embeddings1, embeddings2.transpose(-1, -2))  # [B, 2T, 2T]
        else:
            raise ValueError(f"Unsupported distance metric: {self.distance_metric}")
        return sim / self.temperature

    def pairwise_and_no_diag(self, m):
        m_i = m.unsqueeze(2)  # [B, T, 1]
        m_j = m.unsqueeze(1)  # [B, 1, T]
        out = m_i & m_j  # [B, T, T]
        diag = torch.eye(m.size(1), dtype=torch.bool, device=m.device).unsqueeze(0)
        return out & ~diag

    def forward(self, embeddings, anchors, enrollment_embeddings: Optional[Tensor] = None,
                enrollment_embeddings_mask: Optional[Tensor] = None):
        """
        Args:
            embeddings: [B, 2T, D] - main embeddings
            anchors: [B, 2T] - boolean mask indicating anchor positions
            enrollment_embeddings: Optional[B, 2T, D] - enrollment embeddings for positive pairs
            enrollment_embeddings_mask: Optional[B, 2T] - boolean mask for valid enrollment positions
        Returns:
            Scalar contrastive loss
        """
        # Use enrollment embeddings if provided
        if enrollment_embeddings is not None and enrollment_embeddings_mask is not None:
            return self._forward_with_enrollment(embeddings, anchors, enrollment_embeddings, enrollment_embeddings_mask)
        else:
            # Fall back to original behavior
            return self._forward_original(embeddings, anchors)

    def _forward_with_enrollment(self, embeddings, anchors, enrollment_embeddings, enrollment_embeddings_mask):
        """Forward pass using enrollment embeddings as positives"""
        B, two_T, D = embeddings.shape
        T = two_T // 2

        # Compute similarity between main embeddings and enrollment embeddings
        cross_sim = self.compute_cross_similarity(embeddings, enrollment_embeddings)  # [B, 2T, 2T]

        # Compute similarity within main embeddings for negatives
        self_sim = self.compute_similarity(embeddings)  # [B, 2T, 2T]

        # Split anchor mask
        m1 = anchors[:, :T]  # [B, T]
        m2 = anchors[:, T:]  # [B, T]

        # Split enrollment mask
        enroll_m1 = enrollment_embeddings_mask[:, :T]  # [B, T]
        enroll_m2 = enrollment_embeddings_mask[:, T:]  # [B, T]

        # Create positive mask: anchor positions can match with corresponding enrollment positions
        # First speaker (positions 0:T) matches with enrollment first speaker (positions 0:T)
        pos_mask_1to1 = m1.unsqueeze(2) & enroll_m1.unsqueeze(1)  # [B, T, T]
        # Second speaker (positions T:2T) matches with enrollment second speaker (positions T:2T)
        pos_mask_2to2 = m2.unsqueeze(2) & enroll_m2.unsqueeze(1)  # [B, T, T]

        # Build full positive mask
        pos_mask = torch.cat([
            torch.cat([pos_mask_1to1, torch.zeros_like(pos_mask_1to1)], dim=2),  # [B, T, 2T]
            torch.cat([torch.zeros_like(pos_mask_2to2), pos_mask_2to2], dim=2)   # [B, T, 2T]
        ], dim=1)  # [B, 2T, 2T]

        # Create negative mask: cross-speaker pairs within main embeddings
        cross = m1.unsqueeze(2) & m2.unsqueeze(1)  # [B, T, T]
        neg_mask = torch.cat([
            torch.cat([torch.zeros_like(cross), cross], dim=2),                  # [B, T, 2T]
            torch.cat([cross.transpose(1, 2), torch.zeros_like(cross)], dim=2)   # [B, T, 2T]
        ], dim=1)  # [B, 2T, 2T]

        # Exclude self-pairs in negative mask
        identity_mask = torch.eye(two_T, dtype=torch.bool, device=embeddings.device).unsqueeze(0)  # [1, 2T, 2T]
        neg_mask &= ~identity_mask

        # Also exclude self-pairs in positive mask (diagonal elements)
        pos_mask &= ~identity_mask

        # Compute contrastive loss
        if pos_mask.any():
            # Get positive similarities from cross-similarity matrix
            pos_sim = cross_sim[pos_mask]  # [num_pos_pairs]
            pos_exp = torch.exp(pos_sim)   # [num_pos_pairs]

            # Compute negative exponentials from self-similarity matrix
            exp_self_sim = torch.exp(self_sim)  # [B, 2T, 2T]
            neg_exp_sum = torch.sum(exp_self_sim * neg_mask.float(), dim=2)  # [B, 2T]

            # Get the negative sums corresponding to each positive pair
            pos_indices = torch.nonzero(pos_mask, as_tuple=False)  # [num_pos_pairs, 3]
            batch_idx = pos_indices[:, 0]  # [num_pos_pairs]
            row_idx = pos_indices[:, 1]    # [num_pos_pairs]

            # Get negative sums for each positive pair's anchor
            neg_sums_for_pos = neg_exp_sum[batch_idx, row_idx]  # [num_pos_pairs]

            # Compute denominators: exp(pos) + sum(exp(neg)) for each positive pair
            denominators = pos_exp + neg_sums_for_pos  # [num_pos_pairs]

            # InfoNCE loss: -log(exp(pos) / denominator)
            loss = -torch.log(pos_exp / denominators)
            total_loss = loss.mean()
        else:
            # No positive pairs found, return zero loss
            total_loss = torch.tensor(0.0, device=embeddings.device, requires_grad=True)

        return total_loss

    def _forward_original(self, embeddings, pos_indicator_mask):
        """Original forward pass for backward compatibility"""
        B, two_T, D = embeddings.shape
        T = two_T // 2
        sim = self.compute_similarity(embeddings)  # [B, 2T, 2T]

        # Split input mask
        m1 = pos_indicator_mask[:, :T]  # [B, T]
        m2 = pos_indicator_mask[:, T:]  # [B, T]

        # Positive mask (same speaker pairs, diagonal excluded)
        pos_block1 = self.pairwise_and_no_diag(m1)  # [B, T, T]
        pos_block2 = self.pairwise_and_no_diag(m2)  # [B, T, T]
        pos_mask = torch.cat([
            torch.cat([pos_block1, torch.zeros_like(pos_block1)], dim=2),  # [B, T, 2T]
            torch.cat([torch.zeros_like(pos_block2), pos_block2], dim=2)   # [B, T, 2T]
        ], dim=1)  # [B, 2T, 2T]

        # Negative mask (cross-speaker pairs where both are active)
        cross = m1.unsqueeze(2) & m2.unsqueeze(1)  # [B, T, T]
        neg_mask = torch.cat([
            torch.cat([torch.zeros_like(cross), cross], dim=2),                  # [B, T, 2T]
            torch.cat([cross.transpose(1, 2), torch.zeros_like(cross)], dim=2)   # [B, T, 2T]
        ], dim=1)  # [B, 2T, 2T]

        # Identity mask (exclude [i, i] self-pairs)
        identity_mask = torch.eye(two_T, dtype=torch.bool, device=embeddings.device).unsqueeze(0)  # [1, 2T, 2T]
        pos_mask &= ~identity_mask
        neg_mask &= ~identity_mask

        # Fully vectorized InfoNCE computation
        if pos_mask.any():
            # Compute exp(similarities) for numerical stability
            exp_sim = torch.exp(sim)  # [B, 2T, 2T]

            # Get positive similarities
            pos_sim = sim[pos_mask]      # [num_pos_pairs]
            pos_exp = torch.exp(pos_sim)  # [num_pos_pairs]

            # For each position, average the exponentials of its negatives (scaled by 10)
            neg_exp_avg = 10 * torch.mean(exp_sim * neg_mask.float(), dim=2)  # [B, 2T]

            # Get the negative sums corresponding to each positive pair
            pos_indices = torch.nonzero(pos_mask, as_tuple=False)  # [num_pos_pairs, 3]
            batch_idx = pos_indices[:, 0]  # [num_pos_pairs]
            row_idx = pos_indices[:, 1]    # [num_pos_pairs]

            # Get negative sums for each positive pair's anchor
            neg_avgs_for_pos = neg_exp_avg[batch_idx, row_idx]  # [num_pos_pairs]

            # Compute denominators: exp(pos) + sum(exp(neg)) for each positive pair
            denominators = pos_exp + neg_avgs_for_pos  # [num_pos_pairs]

            # InfoNCE loss: -log(exp(pos) / denominator)
            loss = -torch.log(pos_exp / denominators)
            total_loss = loss.mean()
        else:
            # No positive pairs found, return zero loss
            total_loss = torch.tensor(0.0, device=embeddings.device, requires_grad=True)
        return total_loss
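
A runnable usage sketch for the original (no-enrollment) path, assuming `contrastive_loss.py` above is on the Python path: embeddings of the two speakers are concatenated along time into `(B, 2T, D)`, and the boolean mask marks the frames that act as anchors.

import torch
from contrastive_loss import ContrastiveLoss  # assumes the file above is on the Python path

B, T, D = 2, 5, 16
emb = torch.randn(B, 2 * T, D, requires_grad=True)  # speaker 1 frames followed by speaker 2 frames
anchors = torch.zeros(B, 2 * T, dtype=torch.bool)
anchors[:, :3] = True         # a few active frames for speaker 1
anchors[:, T:T + 3] = True    # and for speaker 2
loss = ContrastiveLoss(temperature=0.25)(emb, anchors)
loss.backward()
print(float(loss))
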
decoding.py ADDED
@@ -0,0 +1,397 @@
1
+ # pylint: skip-file
2
+ # Copied from: https://github.com/espnet/espnet/blob/master/espnet/nets/ctc_prefix_score.py
3
+ import itertools as it
4
+ from typing import List
5
+
6
+ import pandas as pd
7
+ import torch
8
+ from transformers import LogitsProcessor, PreTrainedTokenizer
9
+
10
+
11
+ class CTCPrefixScore(object):
12
+ """Compute CTC label sequence scores
13
+
14
+ which is based on Algorithm 2 in WATANABE et al.
15
+ "HYBRID CTC/ATTENTION ARCHITECTURE FOR END-TO-END SPEECH RECOGNITION,"
16
+ but extended to efficiently compute the label probabilities for multiple
17
+ hypotheses simultaneously
18
+ See also Seki et al. "Vectorized Beam Search for CTC-Attention-Based
19
+ Speech Recognition," In INTERSPEECH (pp. 3825-3829), 2019.
20
+ """
21
+
22
+ def __init__(self, x, blank, eos):
23
+ self.logzero = -1e10
24
+ self.blank = blank
25
+ self.eos = eos
26
+ self.input_length = x.shape[1]
27
+ self.batch_size = x.shape[0]
28
+ self.x = x
29
+ self.device = x.device
30
+
31
+ # Preallocate `r` and `xs` tensors
32
+ # `num_labels` will be set dynamically in __call__ but preallocated with maximum capacity
33
+ self.max_num_labels = x.shape[2] # Set to a max value that can be dynamically resized
34
+ self.r = torch.full((self.batch_size, self.input_length, 2, self.max_num_labels), self.logzero,
35
+ device=self.device)
36
+ self.xs = torch.full((self.batch_size, self.input_length, self.max_num_labels), self.logzero,
37
+ device=self.device)
38
+
39
+ def initial_state(self):
40
+ """Obtain an initial CTC state."""
41
+ # Create initial CTC state tensor and use in-place operations to fill
42
+ r = torch.full((self.batch_size, self.input_length, 2), self.logzero, device=self.device)
43
+ r[..., 1] = torch.cumsum(self.x[..., self.blank], dim=1)
44
+ s = torch.zeros((self.batch_size, 1), device=self.device)
45
+
46
+ return r, s
47
+
48
+ def _resize_tensors(self, number_of_current_samples, num_labels):
49
+ if self.r.shape[0] != number_of_current_samples:
50
+ self.r = self.r[:number_of_current_samples, ...]
51
+ self.xs = self.xs[:number_of_current_samples, ...]
52
+
53
+ if self.r.shape[3] != num_labels:
54
+ self.r = self.r[:, :, :, :num_labels].fill_(self.logzero)
55
+ self.xs = self.xs[:, :, :num_labels].fill_(self.logzero)
56
+ else:
57
+ self.r.fill_(self.logzero)
58
+ self.xs.fill_(self.logzero)
59
+
60
+ def _initialize_r(self, decoded_len):
61
+ mask = (decoded_len == 0)
62
+ self.r[mask, 0, 0, :] = self.xs[mask, 0]
63
+
64
+ def _compute_log_phi(self, r_sum, cs, last, decoded_len, r_prev):
65
+ # Expand r_sum for num_labels and initialize log_phi
66
+ log_phi = r_sum[..., None].expand(-1, -1, cs.shape[1])
67
+
68
+ # Create mask for cases where `decoded_len > 0` and to identify where `c == last[i]` for all `i`
69
+ non_zero_mask = (decoded_len > 0)
70
+ label_match_mask = (cs == last.unsqueeze(1))
71
+
72
+ # Update log_phi where both `decoded_len > 0` and `c == last[i]`
73
+ log_phi = torch.where((non_zero_mask.unsqueeze(1) & label_match_mask)[:, None, :], r_prev[..., 1:2], log_phi)
74
+ return log_phi
75
+
76
+ def _compute_log_psi(self, decoded_len, log_phi, x_current):
77
+ """This function computes forward probabilities log(r_t^n(h)), log(r_t^b(h)),
78
+ and log prefix probabilities log(psi) for all labels in the batch.
79
+
80
+ :param decoded_len: tensor of shape (batch_size,) containing the length of the decoded sequence
81
+ :param log_phi: tensor of shape (batch_size, input_length, num_labels) containing the forward probabilities
82
+ :param x_current: tensor of shape (batch_size, input_length, num_labels) containing the input frame
83
+
84
+ :return log_psi: tensor of shape (batch_size,num_labels) containing the log prefix probabilities
85
+ """
86
+ B, T, V = log_phi.shape
87
+ start = torch.clamp(decoded_len, min=1) # Ensure start is at least 1 to avoid out-of-bounds
88
+
89
+ # Initialize log_psi with the start position of r[:, start - 1, 0, :]
90
+ log_psi = self.r[torch.arange(B), start - 1, 0, :]
91
+
92
+ # Mask for handling sequence lengths based on decoded_len
93
+ mask_t = torch.arange(1, T, device=decoded_len.device).expand(B, T - 1) >= decoded_len.unsqueeze(1)
94
+
95
+ # Accumulate log_psi only up to the last valid time step for each sequence
96
+ log_psi = torch.logaddexp(log_psi, torch.logsumexp(
97
+ torch.where(mask_t.unsqueeze(-1), log_phi[:, :-1] + self.xs[:, 1:], self.logzero), dim=1))
98
+
99
+ start = torch.clamp(decoded_len, 1)
100
+
101
+ # TODO: Vectorize this loop by compute suffix xs and multiplying with log_phi
102
+ # xs = self.xs[:,1:,:].clone()
103
+ # xs_cum = torch.cumsum(xs, dim=1)
104
+ # xs_cum_expanded = xs_cum.unsqueeze(1).repeat(1, T-1, 1, 1)
105
+ # xs_u = (xs_cum_expanded - torch.nn.functional.pad(xs_cum[:,:-1,:], (0,0,1,0), value=0).unsqueeze(2).repeat(1, 1,T-1,1)).permute(0,2,1,3)
106
+ #
107
+ # phis_new = log_phi[:,:-1].clone()
108
+ # phis_new[:, 0] = torch.logaddexp(phis_new[:, 0], self.r[:, 0, 0, :])
109
+ # phis_new = phis_new.unsqueeze(1).repeat(1, T-1, 1, 1)
110
+ # causal_mask = torch.ones((T-1,T-1), dtype=torch.bool, device=self.device).tril().unsqueeze(0).unsqueeze(-1).repeat(B,1,1,1)
111
+ # mask = causal_mask & mask_t.unsqueeze(2).unsqueeze(-1)
112
+ # r_zero = torch.logsumexp(torch.where(mask, xs_u + phis_new, self.logzero), dim=2)
113
+ # self.r[:,1:,0] = r_zero
114
+
115
+ for t in range(start.min(), self.input_length):
116
+ should_decode = decoded_len <= t
117
+ self.r[:, t, 0] = torch.logaddexp(self.r[:, t - 1, 0],
118
+ log_phi[:, t - 1]) + self.xs[:, t]
119
+ self.r[:, t, 1] = (
120
+ torch.logaddexp(self.r[:, t - 1, 0], self.r[:, t - 1, 1]) + x_current[:, t, self.blank][:, None]
121
+ )
122
+ if ~should_decode.any():
123
+ self.r[:, t] = torch.where(should_decode.unsqueeze(-1).unsqueeze(-1), self.r[:, t], self.logzero)
124
+
125
+ return log_psi
126
+
127
+ def _update_log_psi_with_eos(self, log_psi, cs, r_sum):
128
+ # Update log_psi for eos positions
129
+ eos_mask = (cs == self.eos)
130
+ log_psi[eos_mask] = r_sum[:, -1].unsqueeze(1).expand_as(log_psi)[eos_mask]
131
+
132
+ # Exclude blank probabilities if eos is not the blank
133
+ if self.eos != self.blank:
134
+ blank_mask = (cs == self.blank)
135
+ log_psi[blank_mask] = self.logzero
136
+ return log_psi
137
+
138
+ def __call__(self, y, cs, decoded_len, samples_to_be_decoded, r_prev):
139
+ """Compute CTC prefix scores for next labels
140
+
141
+ :param y : prefix label sequence
142
+ :param cs : array of next labels
143
+ :param r_prev: previous CTC state
144
+ :return ctc_scores, ctc_states
145
+ """
146
+ # initialize CTC states
147
+ # output_length = y.shape[1] - 1 # ignore sos
148
+ # new CTC states are prepared as a frame x (n or b) x n_labels tensor
149
+ # that corresponds to r_t^n(h) and r_t^b(h).
150
+
151
+ # Dynamically resize r and xs to match num_labels if necessary
152
+ num_labels = cs.shape[1]
153
+ number_of_current_samples = cs.shape[0]
154
+ self._resize_tensors(number_of_current_samples, num_labels)
155
+
156
+ # Create a view of the current input frame
157
+ x_current = self.x[samples_to_be_decoded]
158
+ self.xs = torch.gather(x_current, 2, cs.unsqueeze(1).expand(-1, self.input_length, -1))
159
+
160
+ # Initialize r for the first frame
161
+ self._initialize_r(decoded_len)
162
+
163
+ # prepare forward probabilities for the last label
164
+ r_sum = torch.logaddexp(r_prev[:, :, 0], r_prev[:, :, 1]) # log(r_t^n(g) + r_t^b(g))
165
+ last = y[:, -1]
166
+
167
+ # precompute log_phi
168
+ log_phi = self._compute_log_phi(r_sum, cs, last, decoded_len, r_prev)
169
+
170
+ # compute forward probabilities log(r_t^n(h)), log(r_t^b(h)),
171
+ # and log prefix probabilities log(psi)
172
+ log_psi = self._compute_log_psi(decoded_len, log_phi, x_current)
173
+
174
+ # get P(...eos|X) that ends with the prefix itself
175
+ log_psi = self._update_log_psi_with_eos(log_psi, cs, r_sum)
176
+
177
+ # return the log prefix probabilities and the CTC states (self.r); the caller later
178
+ # slices the states per selected label (see CTCRescorerLogitsProcessor.update_state)
179
+ return log_psi, self.r
180
+
181
+
182
+ class CTCRescorerLogitsProcessor(LogitsProcessor):
183
+ def __init__(
184
+ self,
185
+ encoder_logits: torch.FloatTensor,
186
+ encoder_output_lens: torch.Tensor,
187
+ blank_token_id: int,
188
+ pad_token_id: int,
189
+ eos_token_id: int,
190
+ bos_token_id: int,
191
+ tokenizer: PreTrainedTokenizer,
192
+ ctc_margin: int,
193
+ ctc_weight: float,
194
+ num_beams: int,
195
+ debug: bool = False,
196
+ ctc_tokens_to_score: int = 500
197
+ ):
198
+ super().__init__()
199
+ same_logits = torch.tensor(list((tokenizer.upper_cased_tokens.items())))
200
+
201
+ logits = torch.nn.functional.log_softmax(encoder_logits, dim=-1)
202
+ logits[..., same_logits[:, 1]] = logits[..., same_logits[:, 0]]
203
+
204
+ self.logits = logits
205
+
206
+ self.ctc_prefix_scorer = CTCPrefixScore(
207
+ self.logits,
208
+ blank_token_id,
209
+ eos_token_id,
210
+ )
211
+ self.batch_size = logits.shape[0]
212
+ self.input_length = logits.shape[1]
213
+ self.num_tokens = logits.shape[2]
214
+ self.device = logits.device
215
+ self.ctc_weight = ctc_weight
216
+ self.num_beams = num_beams
217
+ self.ctc_state_prev, self.ctc_score_prev = self.ctc_prefix_scorer.initial_state()
218
+ self.eos_token_id = eos_token_id
219
+ self.bos_token_id = bos_token_id
220
+ self.tokenizer = tokenizer
221
+ self.pad_token_id = pad_token_id
222
+ self.blank_token_id = blank_token_id
223
+ self.debug = False
224
+ self.first_timestamp_token_id = tokenizer.get_vocab()["<|0.00|>"]
225
+ self.tmp_ctc_scores = torch.empty((self.batch_size, self.num_tokens - 1), device=self.device)
226
+ self.tmp_ctc_states = torch.empty((self.batch_size, self.num_tokens - 1, self.input_length, 2),
227
+ device=self.device)
228
+ self.ctc_tokens_to_score = ctc_tokens_to_score
229
+
230
+ def analyze_predictions(self,
231
+ scores, ctc_scores, next_token_scores, input_ids, k=10):
232
+ print("\n" + "#" * 100)
233
+
234
+ batch_size = input_ids.shape[0]
235
+
236
+ best_att_ids = scores.topk(k=k, dim=1)
237
+ ctc_scores[:, self.first_timestamp_token_id:] = self.ctc_prefix_scorer.logzero
238
+ best_ctc_ids = ctc_scores.topk(k=k, dim=1)
239
+ best_ids = next_token_scores.topk(k=k, dim=1)
240
+
241
+ decoded_prefixes = self.tokenizer.batch_decode(
242
+ input_ids, decode_with_timestamps=True, skip_special_tokens=False
243
+ )
244
+
245
+ def prepare_and_decode(best_ids_tensor):
246
+ new_tensor = torch.zeros((batch_size, k * 2), dtype=torch.long)
247
+ new_tensor[:, 0::2] = best_ids_tensor.indices
248
+ new_tensor[:, 1::2] = self.tokenizer.vocab['#']
249
+
250
+ # Flatten to (batch_size * k, 2)
251
+ flat_tensor = new_tensor.view(-1, 2)
252
+ decoded = self.tokenizer.batch_decode(
253
+ flat_tensor, decode_with_timestamps=True, skip_special_tokens=False
254
+ )
255
+ # Reshape back to (batch_size, k)
256
+ decoded = [(decoded[i * k:(i + 1) * k]) for i in range(batch_size)]
257
+ return decoded
258
+
259
+ decoded_att = prepare_and_decode(best_att_ids)
260
+ decoded_ctc = prepare_and_decode(best_ctc_ids)
261
+ decoded_next = prepare_and_decode(best_ids)
262
+
263
+ for idx in range(batch_size):
264
+ print("-" * 80)
265
+ print(f"HYPOTHESIS {idx}")
266
+ print("\nPREFIX:")
267
+ print(decoded_prefixes[idx])
268
+
269
+ def print_with_pandas(tokens, scores, title):
270
+ df = pd.DataFrame([tokens, [f"{s.item():.2f}" for s in scores]])
271
+ df.index = [f"{title}", "Score"]
272
+ print(f"\n{title}:")
273
+ print(df.to_string(index=True, header=False))
274
+
275
+ print_with_pandas(decoded_att[idx], best_att_ids.values[idx], "ATT_TOKENS")
276
+ print_with_pandas(decoded_ctc[idx], best_ctc_ids.values[idx], "CTC_TOKENS")
277
+ print_with_pandas(decoded_next[idx], best_ids.values[idx], "NEXT_TOKENS")
278
+
279
+ print(f"\nCTC_EOS: {ctc_scores[idx, self.tokenizer.eos_token_id].item():.2f}")
280
+ print()
281
+
282
+ print("#" * 100)
283
+
284
+ def update_state(self, best_ids, beam_idx):
285
+ mask = best_ids < self.first_timestamp_token_id
286
+ self.ctc_state_prev = torch.where(mask.unsqueeze(-1).unsqueeze(-1),
287
+ self.tmp_ctc_states[beam_idx, best_ids],
288
+ self.ctc_state_prev[beam_idx])
289
+ self.ctc_score_prev = torch.where(mask.unsqueeze(-1),
290
+ self.tmp_ctc_scores[beam_idx, best_ids].unsqueeze(-1),
291
+ self.ctc_score_prev[beam_idx])
292
+
293
+ def __call__(self, input_ids_orig: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
294
+ input_ids = input_ids_orig.clone()
295
+
296
+ # Remove prefix from CTC scoring
297
+ if (input_ids[:, 0] != self.bos_token_id).any():
298
+ input_ids = torch.stack(
299
+ [row[(row == self.bos_token_id).nonzero(as_tuple=True)[0].item():] for row in input_ids])
300
+
301
+ # Remove task/lang/timestamp tokens from input_ids
302
+ input_prefix_len = len(self.tokenizer.prefix_tokens)
303
+ if input_prefix_len > 1:
304
+ input_ids = input_ids[:, input_prefix_len - 1:]
305
+
306
+ # Set up the first token to be the blank token (sos)
307
+ input_ids[:, 0] = self.blank_token_id
308
+
309
+ # If the last token in input_ids is a timestamp, replace it with the last non-timestamp token (which could even be the first token)
310
+ decoded_len = torch.logical_and(input_ids <= self.first_timestamp_token_id,
311
+ input_ids != self.blank_token_id).sum(dim=1)
312
+ mask = torch.logical_and(input_ids[:, -1] >= self.first_timestamp_token_id,
313
+ input_ids[:, -1] != self.blank_token_id)
314
+ last_non_timestamp_token = torch.gather(input_ids, 1,
315
+ torch.logical_or(input_ids < self.first_timestamp_token_id,
316
+ input_ids == self.blank_token_id).sum(dim=1,
317
+ keepdim=True) - 1)
318
+ input_ids[mask, -1] = last_non_timestamp_token[mask, 0]
319
+
320
+ # If there is no eos token in the last position, we need to continue decoding
321
+ to_be_decoded = input_ids[:, -1] != self.eos_token_id
322
+ self.tmp_ctc_scores[:] = self.ctc_prefix_scorer.logzero
323
+
324
+ input_ids_local = input_ids[to_be_decoded]
325
+ ids_to_score = torch.topk(scores[:, :self.first_timestamp_token_id], k=self.ctc_tokens_to_score).indices
326
+
327
+ # Always score the EOS token; if it is not among the top-k candidates, place it at the last position
328
+ is_eos_present = (ids_to_score == self.eos_token_id).any(dim=1)
329
+ ids_to_score[~is_eos_present, self.ctc_tokens_to_score - 1] = self.eos_token_id
330
+
331
+ decoded_len_local = decoded_len[to_be_decoded]
332
+
333
+ ctc_scores_local, ctc_states_local = self.ctc_prefix_scorer(input_ids_local, ids_to_score[to_be_decoded],
334
+ decoded_len_local, to_be_decoded,
335
+ self.ctc_state_prev[to_be_decoded])
336
+
337
+ # As the CTC scorer might run on a subset of samples, we need to scatter the results back to the original batch
338
+ self.tmp_ctc_scores[to_be_decoded] = (self.tmp_ctc_scores[to_be_decoded]
339
+ .scatter(1, ids_to_score[to_be_decoded], ctc_scores_local))
340
+ self.tmp_ctc_states[to_be_decoded] = (self.tmp_ctc_states[to_be_decoded].permute(0, 2, 3, 1)
341
+ .scatter(3, ids_to_score[to_be_decoded].unsqueeze(1).unsqueeze(1)
342
+ .repeat(1, *ctc_states_local.shape[1:3], 1), ctc_states_local)
343
+ .permute(0, 3, 1, 2))
344
+
345
+ # Set the CTC score for the timestamp tokens to the maximum to prefer them over the rest
346
+ self.tmp_ctc_scores[:, self.first_timestamp_token_id:] = self.tmp_ctc_scores.max(dim=1).values[:, None]
347
+ ctc_scores = self.tmp_ctc_scores - self.ctc_score_prev
348
+
349
+ next_token_scores = (1 - self.ctc_weight) * scores + self.ctc_weight * ctc_scores
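+ # Both terms are log-scores (ctc_scores holds prefix-probability increments), so this is a
+ # log-linear interpolation of the attention decoder and the CTC prefix scorer; `scores` is
+ # assumed to already be log-normalized (e.g. via LogSoftmaxProcessor below).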
350
+
351
+ if self.debug:
352
+ self.analyze_predictions(scores, ctc_scores, next_token_scores, input_ids_orig)
353
+
354
+ return next_token_scores
355
+
356
+
357
+ class LogSoftmaxProcessor(LogitsProcessor):
358
+ def __init__(
359
+ self,
360
+ ):
361
+ super().__init__()
362
+
363
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
364
+ scores = torch.nn.functional.log_softmax(scores, dim=-1)
365
+ return scores
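+ # A minimal usage sketch (assumption): run this processor before CTCRescorerLogitsProcessor
+ # so that the `scores` arriving there are log-probabilities and therefore on the same scale
+ # as the CTC prefix scores being mixed in.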
366
+
367
+
368
+ class GreedyCTCDecoder(torch.nn.Module):
369
+ def __init__(self, tokenizer, blank=0):
370
+ super().__init__()
371
+ self.blank = blank
372
+ self.tokenizer = tokenizer
373
+
374
+ def forward(self, emission: torch.Tensor) -> torch.Tensor:
375
+ """Given a sequence emission over labels, get the best path
376
+ Args:
377
+ emission (Tensor): Logit tensors. Shape `[num_seq, num_label]`.
378
+
379
+ Returns:
380
+ torch.Tensor: Padded tensor of decoded token indices (best path with repeats and blanks removed)
381
+ """
382
+ indices = torch.argmax(emission, dim=-1) # [num_seq,]
383
+ indices = [torch.unique_consecutive(index, dim=-1) for index in indices]
384
+ indices = [index[index != self.blank] for index in indices]
385
+ indices = torch.nn.utils.rnn.pad_sequence(indices, batch_first=True,
386
+ padding_value=self.tokenizer.pad_token_id)
387
+ indices[indices >= len(self.tokenizer)] = self.tokenizer.unk_token_id
388
+ return indices
389
+
390
+
391
+ def ctc_greedy_decode(logits: torch.Tensor, blank, pad_token_id) -> torch.Tensor:
392
+ idxs = torch.argmax(logits, dim=-1)
393
+ for i, prediction in enumerate(idxs):
394
+ deduplicated = [k for k, g in it.groupby(prediction) if k != blank]
395
+ idxs[i, : len(deduplicated)] = torch.tensor(deduplicated)
396
+ idxs[i, len(deduplicated):] = pad_token_id
397
+ return idxs
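+ # Worked example (hypothetical values, blank=0, pad_token_id=-1):
+ # argmax ids: [3, 3, 0, 3, 2, 2, 0]
+ # after groupby de-duplication and blank removal: [3, 3, 2]
+ # returned row: [3, 3, 2, -1, -1, -1, -1]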
encoder.py ADDED
@@ -0,0 +1,328 @@
1
+ import torch
2
+ from torch import nn
3
+ from transformers.modeling_outputs import CausalLMOutput, BaseModelOutput
4
+ from transformers.models.whisper.modeling_whisper import WhisperEncoder, WhisperEncoderLayer, WHISPER_ATTENTION_CLASSES
5
+
6
+ from .FDDT import FDDT
7
+ from .config import DiCoWConfig
8
+ from .SCBs import SpeakerCommunicationBlock
9
+
10
+
11
+ class DiCoWEncoder(WhisperEncoder):
12
+ config_class = DiCoWConfig
13
+
14
+ def __init__(self, config: DiCoWConfig):
15
+ super().__init__(config)
16
+ self.ctc_weight = config.ctc_weight
17
+ if config.additional_layer and self.ctc_weight > 0.0:
18
+ self.additional_layer = WhisperEncoderLayer(config)
19
+ if config.additional_self_attention_layer and self.ctc_weight > 0.0:
20
+ self.additional_self_attention_layer = WHISPER_ATTENTION_CLASSES[config._attn_implementation](
21
+ embed_dim=config.d_model,
22
+ num_heads=config.encoder_attention_heads,
23
+ dropout=config.attention_dropout,
24
+ config=config,
25
+ )
26
+ if config.sub_sample and self.ctc_weight > 0.0:
27
+ self.subsample_conv1 = nn.Conv1d(
28
+ in_channels=config.d_model,
29
+ out_channels=config.d_model,
30
+ kernel_size=3,
31
+ stride=2,
32
+ padding=1,
33
+ bias=False,
34
+ )
35
+ self.subsample_conv2 = nn.Conv1d(
36
+ in_channels=config.d_model,
37
+ out_channels=config.d_model,
38
+ kernel_size=3,
39
+ stride=2,
40
+ padding=1,
41
+ bias=False,
42
+ )
43
+ if self.ctc_weight > 0.0:
44
+ self.lm_head = nn.Linear(config.d_model, config.vocab_size + 1, bias=False)
45
+ self.final_dropout = nn.Dropout(config.final_dropout)
46
+ if config.use_fddt:
47
+ num_fddts = self.config.apply_fddt_to_n_layers if self.config.apply_fddt_to_n_layers != -1 else len(
48
+ self.layers)
49
+ self.initial_fddt = FDDT(config,
50
+ d_model=config.d_model,
51
+ non_target_rate=config.non_target_fddt_value,
52
+ is_diagonal=config.fddt_is_diagonal,
53
+ bias_only=config.fddt_bias_only,
54
+ use_silence=config.fddt_use_silence,
55
+ use_target=config.fddt_use_target,
56
+ use_overlap=config.fddt_use_overlap,
57
+ use_non_target=config.fddt_use_non_target,
58
+ use_interaction=False,
59
+ )
60
+ num_scbs = (self.config.scb_layers if self.config.scb_layers != -1 else len(
61
+ self.layers)) if self.config.is_mt else 0
62
+ self.fddts = nn.ModuleList([
63
+ FDDT(config,
64
+ d_model=config.d_model,
65
+ non_target_rate=1.0,
66
+ is_diagonal=config.fddt_is_diagonal,
67
+ bias_only=config.fddt_bias_only,
68
+ use_silence=config.fddt_use_silence,
69
+ use_target=config.fddt_use_target,
70
+ use_overlap=config.fddt_use_overlap,
71
+ use_non_target=config.fddt_use_non_target,
72
+ use_interaction=i < num_scbs,
73
+ )
74
+ for i in range(num_fddts)
75
+ ])
76
+ self.first_task_token = self.config.vocab_size - 30 * 50 - 1 - 6  # 30 s of timestamps at 50 Hz (1500 tokens), -1 to reach <|0.00|>, and -6 for the task tokens
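+ # e.g. with a hypothetical vocab_size of 51866 this evaluates to 51866 - 1500 - 1 - 6 = 50359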
77
+ self.post_init()
78
+
79
+ def encode_enrollment(
80
+ self,
81
+ input_features,
82
+ num_layers_to_apply,
83
+ head_mask=None,
84
+ stno_mask=None,
85
+ ):
86
+ # For MT-ASR the input has shape (B X S) x F x T
87
+ # we can use torch.view(B, S, F, -1) to obtain
88
+ # new tensor with speaker dim
89
+ expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0]
90
+ if input_features.shape[-1] != expected_seq_length:
91
+ if input_features.shape[-1] > expected_seq_length:
92
+ return CausalLMOutput(
93
+ logits=None,
94
+ hidden_states=None,
95
+ attentions=None,
96
+ )
97
+ else:
98
+ raise ValueError(
99
+ f"Whisper expects the mel input features to be of length {expected_seq_length}, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
100
+ )
101
+
102
+ inputs_embeds = nn.functional.gelu(self.conv1(input_features))
103
+ inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
104
+
105
+ inputs_embeds = inputs_embeds.permute(0, 2, 1)
106
+ embed_pos = self.embed_positions.weight
107
+
108
+ if self.config.use_fddt:
109
+ inputs_embeds = self.initial_fddt(inputs_embeds, stno_mask)
110
+
111
+ hidden_states = inputs_embeds + embed_pos
112
+
113
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
114
+
115
+ # check if head_mask has a correct number of layers specified if desired
116
+ if head_mask is not None:
117
+ assert head_mask.size()[0] == (
118
+ len(self.layers)
119
+ ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
120
+
121
+ for idx, encoder_layer in enumerate(self.layers[:num_layers_to_apply]):
122
+ if self.config.use_fddt and idx < len(self.fddts):
123
+ hidden_states = self.fddts[idx](hidden_states, stno_mask)
124
+ if self.gradient_checkpointing and self.training:
125
+ layer_outputs = self._gradient_checkpointing_func(
126
+ encoder_layer.__call__,
127
+ hidden_states,
128
+ None,
129
+ (head_mask[idx] if head_mask is not None else None),
130
+ )
131
+ else:
132
+ layer_outputs = encoder_layer(
133
+ hidden_states,
134
+ None,
135
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
136
+ )
137
+
138
+ hidden_states = layer_outputs[0]
139
+
140
+ return hidden_states
141
+
142
+ @classmethod
143
+ def _load_pretrained_model(
144
+ cls,
145
+ model,
146
+ state_dict,
147
+ loaded_keys,
148
+ resolved_archive_file,
149
+ pretrained_model_name_or_path,
150
+ **kwargs
151
+ ):
152
+ for key in list(state_dict.keys()):
153
+ if key.startswith("encoder."):
154
+ state_dict[key[8:]] = state_dict.pop(key)
155
+ loaded_keys.remove(key)
156
+ loaded_keys.append(key[8:])
157
+ output = super()._load_pretrained_model(
158
+ model,
159
+ state_dict,
160
+ loaded_keys,
161
+ resolved_archive_file,
162
+ pretrained_model_name_or_path,
163
+ **kwargs
164
+ )
165
+ return output
166
+
167
+ def get_loss(self, logits, labels):
168
+ if labels.max() >= self.config.vocab_size:
169
+ raise ValueError(f"Label values must be <= vocab_size: {self.config.vocab_size}")
170
+ if self.config.remove_timestamps_from_ctc:
171
+ labels = torch.nn.utils.rnn.pad_sequence([label[label < self.first_task_token] for label in labels],
172
+ padding_value=-100).T
173
+ input_lengths = torch.full((logits.shape[0],), fill_value=logits.shape[1],
174
+ device=logits.device)
175
+
176
+ # assuming that padded tokens are filled with -100
177
+ # when not being attended to
178
+ labels_mask = labels >= 0
179
+ target_lengths = labels_mask.sum(-1)
180
+ # flattened_targets = labels_enc.masked_select(labels_mask)
181
+
182
+ # ctc_loss doesn't support fp16
183
+ log_probs = nn.functional.log_softmax(logits, dim=-1, dtype=torch.float32).transpose(0, 1)
184
+
185
+ with torch.backends.cudnn.flags(enabled=True):
186
+ ctc_loss = nn.functional.ctc_loss(
187
+ log_probs,
188
+ labels,
189
+ input_lengths,
190
+ target_lengths,
191
+ blank=logits.shape[-1] - 1,
192
+ reduction=self.config.ctc_loss_reduction,
193
+ zero_infinity=True,
194
+ )
195
+ return ctc_loss
196
+
197
+ def forward(
198
+ self,
199
+ input_features,
200
+ attention_mask=None,
201
+ head_mask=None,
202
+ output_attentions=None,
203
+ output_hidden_states=None,
204
+ return_dict=None,
205
+ stno_mask=None,
206
+ per_group_sizes=None
207
+ ):
208
+ # For MT-ASR the input has shape (B X S) x F x T
209
+ # we can use torch.view(B, S, F, -1) to obtain
210
+ # new tensor with speaker dim
211
+ expected_seq_length = self.config.max_source_positions * self.conv1.stride[0] * self.conv2.stride[0]
212
+ if input_features.shape[-1] != expected_seq_length:
213
+ if input_features.shape[-1] > expected_seq_length:
214
+ return CausalLMOutput(
215
+ logits=None,
216
+ hidden_states=None,
217
+ attentions=None,
218
+ )
219
+ else:
220
+ raise ValueError(
221
+ f"Whisper expects the mel input features to be of length {expected_seq_length}, but found {input_features.shape[-1]}. Make sure to pad the input mel features to {expected_seq_length}."
222
+ )
223
+
224
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
225
+ output_hidden_states = (
226
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
227
+ )
228
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
229
+ inputs_embeds = nn.functional.gelu(self.conv1(input_features))
230
+ inputs_embeds = nn.functional.gelu(self.conv2(inputs_embeds))
231
+
232
+ inputs_embeds = inputs_embeds.permute(0, 2, 1)
233
+ embed_pos = self.embed_positions.weight
234
+
235
+ if self.config.use_fddt:
236
+ inputs_embeds = self.initial_fddt(inputs_embeds, stno_mask)
237
+
238
+ hidden_states = inputs_embeds + embed_pos
239
+
240
+ hidden_states = nn.functional.dropout(hidden_states, p=self.dropout, training=self.training)
241
+
242
+ encoder_states = () if output_hidden_states else None
243
+ all_attentions = () if output_attentions else None
244
+
245
+ # check if head_mask has a correct number of layers specified if desired
246
+ if head_mask is not None:
247
+ assert head_mask.size()[0] == (
248
+ len(self.layers)
249
+ ), f"The head_mask should be specified for {len(self.layers)} layers, but it is for {head_mask.size()[0]}."
250
+
251
+ for idx, encoder_layer in enumerate(self.layers):
252
+ if output_hidden_states:
253
+ encoder_states = encoder_states + (hidden_states,)
254
+ # add LayerDrop (see https://arxiv.org/abs/1909.11556 for description)
255
+ to_drop = False
256
+ if self.training:
257
+ dropout_probability = torch.rand([])
258
+ if dropout_probability < self.layerdrop: # skip the layer
259
+ to_drop = True
260
+
261
+ if self.config.use_fddt and idx < len(self.fddts):
262
+ hidden_states = self.fddts[idx](hidden_states, stno_mask)
263
+
264
+ if to_drop:
265
+ layer_outputs = (None, None)
266
+ else:
267
+ if self.gradient_checkpointing and self.training:
268
+ layer_outputs = self._gradient_checkpointing_func(
269
+ encoder_layer.__call__,
270
+ hidden_states,
271
+ None,
272
+ (head_mask[idx] if head_mask is not None else None),
273
+ output_attentions,
274
+ )
275
+ else:
276
+ layer_outputs = encoder_layer(
277
+ hidden_states,
278
+ None,
279
+ layer_head_mask=(head_mask[idx] if head_mask is not None else None),
280
+ output_attentions=output_attentions,
281
+ )
282
+
283
+ hidden_states = layer_outputs[0]
284
+
285
+ if output_attentions:
286
+ all_attentions = all_attentions + (layer_outputs[1],)
287
+
288
+ hidden_states = self.layer_norm(hidden_states)
289
+ if output_hidden_states:
290
+ encoder_states = encoder_states + (hidden_states,)
291
+
292
+ if not return_dict:
293
+ outputs = tuple(v for v in [hidden_states, encoder_states, all_attentions] if v is not None)
294
+ else:
295
+ outputs = BaseModelOutput(
296
+ last_hidden_state=hidden_states, hidden_states=encoder_states, attentions=all_attentions
297
+ )
298
+
299
+ if hasattr(self, "additional_layer"):
300
+ inter_output, = self.additional_layer(
301
+ outputs.last_hidden_state,
302
+ attention_mask=None,
303
+ output_attentions=output_attentions,
304
+ layer_head_mask=None,
305
+ )
306
+ elif hasattr(self, "additional_self_attention_layer"):
307
+ inter_output, _, __ = self.additional_self_attention_layer(
308
+ outputs.last_hidden_state,
309
+ attention_mask=None,
310
+ output_attentions=output_attentions,
311
+ layer_head_mask=None,
312
+ )
313
+ else:
314
+ inter_output = outputs.last_hidden_state
315
+
316
+ inter_output = self.final_dropout(inter_output)
317
+ if hasattr(self, "subsample_conv2"):
318
+ inter_output = self.subsample_conv2(self.subsample_conv1(inter_output.transpose(1, 2))).transpose(1, 2)
319
+ if self.ctc_weight > 0.0:
320
+ logits = self.lm_head(inter_output)
321
+ else:
322
+ logits = None
323
+
324
+ return CausalLMOutput(
325
+ logits=logits,
326
+ hidden_states=outputs.hidden_states,
327
+ attentions=outputs.attentions,
328
+ )
generation.py ADDED
@@ -0,0 +1,1808 @@
1
+ import copy
2
+ from typing import Any, Callable, Dict, List, Optional, Tuple, Union
3
+ from typing import Iterator
4
+ import warnings
5
+
6
+ import numpy as np
7
+ import torch
8
+ import torch.utils.checkpoint
10
+ from torch import nn
11
+ from torch.nn.utils.rnn import pad_sequence
12
+
13
+ from decimal import Decimal, ROUND_HALF_UP
14
+
15
+ from transformers import LogitsProcessorList, SuppressTokensLogitsProcessor, \
16
+ SuppressTokensAtBeginLogitsProcessor
17
+ from transformers.generation.configuration_utils import GenerationConfig
18
+ from transformers.generation.configuration_utils import GenerationMode
19
+ from transformers.generation.logits_process import (
20
+ LogitsProcessorList,
21
+ SuppressTokensAtBeginLogitsProcessor,
22
+ SuppressTokensLogitsProcessor, )
23
+ from transformers.generation.logits_process import WhisperNoSpeechDetection
24
+ from transformers.generation.stopping_criteria import (
25
+ StoppingCriteriaList,
26
+ )
27
+ from transformers.generation.utils import GenerateBeamOutput, BeamScorer, GenerateBeamDecoderOnlyOutput, \
28
+ stack_model_outputs, GenerateBeamEncoderDecoderOutput, _split_model_inputs, GenerateNonBeamOutput, \
29
+ GenerateEncoderDecoderOutput, GenerateDecoderOnlyOutput
30
+ from transformers.modeling_outputs import BaseModelOutput
31
+ from transformers.models.whisper.modeling_whisper import (
32
+ WhisperForConditionalGeneration,
33
+ )
34
+ from transformers.models.whisper.generation_whisper import _get_attr_from_logit_processors, _pad_to_max_length
35
+ from transformers.models.whisper.tokenization_whisper import TASK_IDS, TO_LANGUAGE_CODE
36
+ from transformers.utils import logging
37
+
38
+ from .utils import WhisperTimeStampLogitsProcessorCustom
39
+ from .decoding import CTCRescorerLogitsProcessor, LogSoftmaxProcessor
40
+
41
+ logging.set_verbosity_debug()
42
+ logger = logging.get_logger("transformers")
43
+
44
+
45
+ class DiCoWGenerationMixin(WhisperForConditionalGeneration):
46
+ def _prepare_encoder_decoder_kwargs_for_generation(
47
+ self, inputs_tensor: torch.Tensor, model_kwargs, model_input_name, generation_config,
48
+ ) -> Dict[str, Any]:
49
+ # self.encoder_output_lens = self._get_feat_extract_output_lengths(
50
+ # model_kwargs['attention_mask_enc'].sum(dim=1)
51
+ # ).int()
52
+ generation_config.output_hidden_states = True
53
+
54
+ # pylint: disable=no-member
55
+ model_kwargs = super()._prepare_encoder_decoder_kwargs_for_generation(
56
+ inputs_tensor, model_kwargs, model_input_name, generation_config
57
+ )
58
+ if "is_valid" in model_kwargs:
59
+ for key in ['decoder_input_ids', 'stno_mask', 'labels', 'upp_labels', 'attention_mask', 'attention_mask_enc']:
60
+ if key in model_kwargs:
61
+ model_kwargs[key] = model_kwargs[key][model_kwargs['is_valid']]
62
+ model_kwargs['encoder_outputs']['logits'] = model_kwargs['encoder_outputs']['logits'][model_kwargs['is_valid']]
63
+ hidden_states = []
64
+ for layer in range(len(model_kwargs['encoder_outputs']['hidden_states'])):
65
+ hidden_states.append(model_kwargs['encoder_outputs']['hidden_states'][layer][model_kwargs['is_valid']])
66
+ model_kwargs['encoder_outputs']['hidden_states'] = tuple(hidden_states)
67
+ model_kwargs.pop("is_valid")
68
+ self.encoder_logits = model_kwargs["encoder_outputs"].logits
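+ # The encoder CTC logits are cached on the instance; they appear to be consumed later
+ # when the CTC rescoring logits processor is set up (see CTCRescorerLogitsProcessor).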
69
+
70
+ return model_kwargs
71
+
72
+ def _prepare_decoder_input_ids_for_generation(
73
+ self,
74
+ batch_size: int,
75
+ model_input_name: str,
76
+ model_kwargs: Dict[str, torch.Tensor],
77
+ decoder_start_token_id: torch.Tensor,
78
+ device: torch.device = None,
79
+ ) -> Tuple[torch.LongTensor, Dict[str, torch.Tensor]]:
80
+ batch_size = model_kwargs['decoder_input_ids'].shape[0]
81
+ out = super()._prepare_decoder_input_ids_for_generation(
82
+ batch_size,
83
+ model_input_name,
84
+ model_kwargs,
85
+ decoder_start_token_id,
86
+ device,
87
+ )
88
+ return out
89
+
90
+ @staticmethod
91
+ def _expand_inputs_for_generation(
92
+ expand_size: int = 1,
93
+ is_encoder_decoder: bool = False,
94
+ input_ids: Optional[torch.LongTensor] = None,
95
+ **model_kwargs,
96
+ ) -> Tuple[torch.LongTensor, Dict[str, Any]]:
97
+ """Expands tensors from [batch_size, ...] to [batch_size * expand_size, ...]"""
98
+
99
+ def _expand_dict_for_generation(dict_to_expand):
100
+ for key in dict_to_expand:
101
+ if dict_to_expand[key] is not None and isinstance(dict_to_expand[key], torch.Tensor) and key != "loss":
102
+ dict_to_expand[key] = dict_to_expand[key].repeat_interleave(expand_size, dim=0)
103
+ return dict_to_expand
104
+
105
+ if input_ids is not None:
106
+ input_ids = input_ids.repeat_interleave(expand_size, dim=0)
107
+
108
+ model_kwargs = _expand_dict_for_generation(model_kwargs)
109
+
110
+ if is_encoder_decoder:
111
+ if model_kwargs.get("encoder_outputs") is None:
112
+ raise ValueError("If `is_encoder_decoder` is True, make sure that `encoder_outputs` is defined.")
113
+ model_kwargs["encoder_outputs"] = _expand_dict_for_generation(model_kwargs["encoder_outputs"])
114
+ if "hidden_states" in model_kwargs["encoder_outputs"]:
115
+ model_kwargs["encoder_outputs"]["hidden_states"] = tuple(
116
+ hidden_state.repeat_interleave(expand_size, dim=0) for hidden_state in
117
+ model_kwargs["encoder_outputs"]["hidden_states"]
118
+ )
119
+
120
+ return input_ids, model_kwargs
121
+
122
+ def generate(
123
+ self,
124
+ input_features: Optional[torch.Tensor] = None,
125
+ generation_config: Optional[GenerationConfig] = None,
126
+ logits_processor: Optional[LogitsProcessorList] = None,
127
+ stopping_criteria: Optional[StoppingCriteriaList] = None,
128
+ prefix_allowed_tokens_fn: Optional[Callable[[int, torch.Tensor], List[int]]] = None,
129
+ synced_gpus: bool = False,
130
+ return_timestamps: Optional[bool] = None,
131
+ task: Optional[str] = None,
132
+ language: Optional[str] = None,
133
+ is_multilingual: Optional[bool] = None,
134
+ prompt_ids: Optional[torch.Tensor] = None,
135
+ prompt_condition_type: Optional[str] = None, # first-segment, all-segments
136
+ condition_on_prev_tokens: Optional[bool] = None,
137
+ temperature: Optional[Union[float, Tuple[float, ...]]] = None,
138
+ compression_ratio_threshold: Optional[float] = None,
139
+ logprob_threshold: Optional[float] = None,
140
+ no_speech_threshold: Optional[float] = None,
141
+ num_segment_frames: Optional[int] = None,
142
+ attention_mask: Optional[torch.Tensor] = None,
143
+ time_precision: float = 0.02,
144
+ return_token_timestamps: Optional[bool] = None,
145
+ return_segments: bool = False,
146
+ return_dict_in_generate: Optional[bool] = None,
147
+ assistant_model: Optional["PreTrainedModel"] = None,
148
+ **kwargs,
149
+ ):
150
+ if condition_on_prev_tokens:
151
+ raise NotImplementedError("Current version does not support conditioning")
152
+
153
+ gen_c, _ = self._prepare_generation_config(generation_config, **kwargs)
154
+ gen_mode = gen_c.get_generation_mode(assistant_model)
155
+
156
+ if gen_mode not in [GenerationMode.GREEDY_SEARCH, GenerationMode.BEAM_SEARCH]:
157
+ raise ValueError(
158
+ f"Provided generation mode {gen_mode} is not supported"
159
+ f" for WhisperForConditionalGeneration with joint CTC decoding")
160
+
161
+ if "stno_mask" in kwargs:
162
+ self.stno_mask = kwargs["stno_mask"]
163
+ if "encoder_outputs" in kwargs:
164
+ self.encoder_logits = kwargs["encoder_outputs"].logits
165
+ # pylint: disable=no-member
166
+ # 0. deprecate old inputs
167
+ if "inputs" in kwargs:
168
+ input_features = kwargs.pop("inputs")
169
+ warnings.warn(
170
+ "The input name `inputs` is deprecated. Please make sure to use `input_features` instead.",
171
+ FutureWarning,
172
+ )
173
+
174
+ # 1. prepare generation config
175
+ generation_config, kwargs = self._prepare_generation_config(generation_config, **kwargs)
176
+
177
+ # 2. set global generate variables
178
+ input_stride = self.model.encoder.conv1.stride[0] * self.model.encoder.conv2.stride[0]
179
+ num_segment_frames = input_stride * self.config.max_source_positions
180
+ batch_size, total_input_frames = self._retrieve_total_input_frames(
181
+ input_features=input_features, input_stride=input_stride, kwargs=kwargs
182
+ )
183
+ is_shortform = total_input_frames <= num_segment_frames
184
+
185
+ if is_shortform:
186
+ # warn user of ignored inputs
187
+ self._maybe_warn_unused_inputs(
188
+ condition_on_prev_tokens=condition_on_prev_tokens,
189
+ temperature=temperature,
190
+ compression_ratio_threshold=compression_ratio_threshold,
191
+ logprob_threshold=logprob_threshold,
192
+ no_speech_threshold=no_speech_threshold,
193
+ total_input_frames=total_input_frames,
194
+ )
195
+
196
+ # 3. Make sure generation config is correctly set
197
+ # Make sure the generation config is correctly set depending on whether timestamps are to be returned or not
198
+ self._set_return_outputs(
199
+ return_dict_in_generate=return_dict_in_generate,
200
+ return_token_timestamps=return_token_timestamps,
201
+ is_shortform=is_shortform,
202
+ logprob_threshold=logprob_threshold,
203
+ generation_config=generation_config,
204
+ )
205
+ self._set_return_timestamps(
206
+ return_timestamps=return_timestamps, is_shortform=is_shortform, generation_config=generation_config
207
+ )
208
+ self._set_language_and_task(
209
+ language=language, task=task, is_multilingual=is_multilingual, generation_config=generation_config
210
+ )
211
+ self._set_num_frames(
212
+ return_token_timestamps=return_token_timestamps, generation_config=generation_config, kwargs=kwargs
213
+ )
214
+ self._set_thresholds_and_condition(
215
+ generation_config=generation_config,
216
+ logprob_threshold=logprob_threshold,
217
+ compression_ratio_threshold=compression_ratio_threshold,
218
+ no_speech_threshold=no_speech_threshold,
219
+ condition_on_prev_tokens=condition_on_prev_tokens,
220
+ )
221
+ self._set_prompt_condition_type(
222
+ generation_config=generation_config,
223
+ prompt_condition_type=prompt_condition_type,
224
+ )
225
+
226
+ # pass self.config for backward compatibility
227
+ init_tokens = self._retrieve_init_tokens(
228
+ input_features,
229
+ batch_size=batch_size,
230
+ generation_config=generation_config,
231
+ config=self.config,
232
+ num_segment_frames=num_segment_frames,
233
+ kwargs=kwargs,
234
+ )
235
+ # passing `decoder_input_ids` is deprecated - the only exception is for assisted generation
236
+ # where the input ids are handled explicitly by the generate method
237
+ self._check_decoder_input_ids(kwargs=kwargs)
238
+
239
+ # 3. Retrieve logits processors
240
+ device = kwargs["encoder_outputs"][0].device if "encoder_outputs" in kwargs else input_features.device
241
+ begin_index = init_tokens.shape[1]
242
+ logits_processor = self._retrieve_logit_processors(
243
+ generation_config=generation_config,
244
+ logits_processor=logits_processor,
245
+ begin_index=begin_index, # begin index is index of first generated decoder token
246
+ is_shortform=is_shortform,
247
+ num_beams=kwargs.get("num_beams", 1),
248
+ device=device,
249
+ )
250
+
251
+ # 5. If we're in shortform mode, simply generate the whole input at once and return the output
252
+ if is_shortform:
253
+ if temperature is not None:
254
+ generation_config.temperature = temperature
255
+
256
+ decoder_input_ids = kwargs.pop("decoder_input_ids", None)
257
+ if decoder_input_ids is None:
258
+ decoder_input_ids = init_tokens
259
+
260
+ if prompt_ids is not None:
261
+ decoder_input_ids = torch.cat(
262
+ [prompt_ids[None].repeat(decoder_input_ids.shape[0], 1), decoder_input_ids], dim=-1
263
+ )
264
+
265
+ max_new_tokens = generation_config.max_new_tokens if generation_config.max_new_tokens is not None else 0
266
+ if max_new_tokens + decoder_input_ids.shape[-1] > self.config.max_target_positions:
267
+ raise ValueError(
268
+ f"The length of `decoder_input_ids` equal `prompt_ids` plus special start tokens is {decoder_input_ids.shape[-1]}, and the `max_new_tokens` "
269
+ f"is {max_new_tokens}. Thus, the combined length of "
270
+ f"`decoder_input_ids` and `max_new_tokens` is: {max_new_tokens + decoder_input_ids.shape[-1]}. This exceeds the "
271
+ f"`max_target_positions` of the Whisper model: {self.config.max_target_positions}. "
272
+ "You should either reduce the length of your prompt, or reduce the value of `max_new_tokens`, "
273
+ f"so that their combined length is less than {self.config.max_target_positions}."
274
+ )
275
+
276
+ outputs = super().generate(
277
+ input_features,
278
+ generation_config=generation_config,
279
+ logits_processor=logits_processor,
280
+ stopping_criteria=stopping_criteria,
281
+ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
282
+ synced_gpus=synced_gpus,
283
+ decoder_input_ids=decoder_input_ids,
284
+ **kwargs,
285
+ )
286
+
287
+ if generation_config.return_token_timestamps and hasattr(generation_config, "alignment_heads"):
288
+ outputs["token_timestamps"] = self._extract_token_timestamps(
289
+ outputs, generation_config.alignment_heads, num_frames=generation_config.num_frames
290
+ )
291
+
292
+ # print("\n".join(self.tokenizer.batch_decode(outputs,skip_special_tokens=True, decode_with_timestamps=True)))
293
+ return outputs
294
+
295
+ # 6. Else we're in longform mode which is more complex.
296
+ # We need to chunk the audio input depending on when the model generates timestamp tokens
297
+
298
+ # 6.1 Set and retrieve global longform generation variables
299
+ self._set_condition_on_prev_tokens(
300
+ condition_on_prev_tokens=condition_on_prev_tokens, generation_config=generation_config
301
+ )
302
+
303
+ timestamp_begin = generation_config.no_timestamps_token_id + 1
304
+ temperatures = [temperature] if not isinstance(temperature, (list, tuple)) else temperature
305
+ temperature = temperatures[0]
306
+ batch_size = input_features.shape[0]
307
+
308
+ max_frames, seek = self._retrieve_max_frames_and_seek(
309
+ batch_size=batch_size, attention_mask=attention_mask, total_input_frames=total_input_frames
310
+ )
311
+
312
+ # 6.2 Prepare running variables and lists for generation
313
+ cur_bsz = batch_size
314
+ current_segments = self._prepare_segments(
315
+ prompt_ids=prompt_ids,
316
+ batch_size=batch_size,
317
+ generation_config=generation_config,
318
+ )
319
+
320
+ batch_idx_map = list(range(batch_size))
321
+ do_condition_on_prev_tokens = [condition_on_prev_tokens for _ in range(batch_size)]
322
+
323
+ # 6.2 Transcribe audio until we reach the end of all input audios
324
+ while (seek < max_frames).any():
325
+ # 6.3 NOTE: When in longform transcription mode and batch size > 1 we need to dynamically reduce the batch size during the loop
326
+ # in case one audio finished earlier than another one. Thus, we need to keep a table of "previous-index-2-current-index" in order
327
+ # to know which original audio is being decoded
328
+ # Set updated index map, duration of previously decoded chunks and number of max frames of current decoding chunk
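+ # (illustration: if the second of three audios finishes first, batch_idx_map shrinks
+ # from [0, 1, 2] to [0, 2] and cur_bsz drops from 3 to 2)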
329
+ input_features, cur_bsz, batch_idx_map = self._maybe_reduce_batch(
330
+ input_features=input_features,
331
+ seek=seek,
332
+ max_frames=max_frames,
333
+ cur_bsz=cur_bsz,
334
+ batch_idx_map=batch_idx_map,
335
+ )
336
+ time_offset = seek * time_precision / input_stride
337
+ seek_num_frames = (max_frames - seek).clamp(max=num_segment_frames)
338
+
339
+ # 6.4 cut out next 30s segment from input features
340
+ segment_input = self._get_input_segment(
341
+ input_features=input_features,
342
+ seek=seek,
343
+ seek_num_frames=seek_num_frames,
344
+ num_segment_frames=num_segment_frames,
345
+ cur_bsz=cur_bsz,
346
+ batch_idx_map=batch_idx_map,
347
+ )
348
+
349
+ # 6.5 prepare decoder input ids
350
+ suppress_tokens = _get_attr_from_logit_processors(
351
+ logits_processor, SuppressTokensLogitsProcessor, "suppress_tokens"
352
+ )
353
+ decoder_input_ids, kwargs = self._prepare_decoder_input_ids(
354
+ cur_bsz=cur_bsz,
355
+ init_tokens=init_tokens,
356
+ current_segments=current_segments,
357
+ batch_idx_map=batch_idx_map,
358
+ do_condition_on_prev_tokens=do_condition_on_prev_tokens,
359
+ prompt_ids=prompt_ids,
360
+ generation_config=generation_config,
361
+ config=self.config,
362
+ device=segment_input.device,
363
+ suppress_tokens=suppress_tokens,
364
+ kwargs=kwargs,
365
+ )
366
+
367
+ # 6.6 set max new tokens or max length
368
+ self._set_max_new_tokens_and_length(
369
+ config=self.config,
370
+ decoder_input_ids=decoder_input_ids,
371
+ generation_config=generation_config,
372
+ )
373
+
374
+ # 6.7 Set current `begin_index` for all logit processors
375
+ for proc in logits_processor:
376
+ if hasattr(proc, "set_begin_index"):
377
+ proc.set_begin_index(decoder_input_ids.shape[-1])
378
+
379
+ # 6.8 Run generate with fallback
380
+ seek_sequences, seek_outputs, should_skip, do_condition_on_prev_tokens = self.generate_with_fallback(
381
+ segment_input=segment_input,
382
+ decoder_input_ids=decoder_input_ids,
383
+ cur_bsz=cur_bsz,
384
+ batch_idx_map=batch_idx_map,
385
+ seek=seek,
386
+ num_segment_frames=num_segment_frames,
387
+ max_frames=max_frames,
388
+ temperatures=temperatures,
389
+ generation_config=generation_config,
390
+ logits_processor=logits_processor,
391
+ stopping_criteria=stopping_criteria,
392
+ prefix_allowed_tokens_fn=prefix_allowed_tokens_fn,
393
+ synced_gpus=synced_gpus,
394
+ return_token_timestamps=return_token_timestamps,
395
+ do_condition_on_prev_tokens=do_condition_on_prev_tokens,
396
+ kwargs=kwargs,
397
+ )
398
+
399
+ # 6.9 In every generated sequence, split by timestamp tokens and extract segments
400
+ if not self.config.is_mt or self.config.mt_num_speakers == 1:
401
+ for i, seek_sequence in enumerate(seek_sequences):
402
+ prev_i = batch_idx_map[i]
403
+
404
+ if should_skip[i]:
405
+ seek[prev_i] += seek_num_frames[prev_i]
406
+ continue
407
+
408
+ segments, segment_offset = self._retrieve_segment(
409
+ seek_sequence=seek_sequence,
410
+ seek_outputs=seek_outputs,
411
+ time_offset=time_offset,
412
+ timestamp_begin=timestamp_begin,
413
+ seek_num_frames=seek_num_frames,
414
+ time_precision=time_precision,
415
+ input_stride=input_stride,
416
+ prev_idx=prev_i,
417
+ idx=i,
418
+ return_token_timestamps=return_token_timestamps,
419
+ )
420
+
421
+ current_segments[prev_i] += segments
422
+ seek[prev_i] += segment_offset
423
+ else:
424
+ # We have to keep all speakers synchronized, so we find a common seek across the per-speaker instances
425
+ for j, seek_seqs in enumerate(
426
+ [seek_sequences[i * self.config.mt_num_speakers:(i + 1) * self.config.mt_num_speakers] for i in
427
+ range(len(seek_sequences) // self.config.mt_num_speakers)]):
428
+ indexes = [j * self.config.mt_num_speakers + i for i in range(self.config.mt_num_speakers)]
429
+ prev_ids = [batch_idx_map[i] for i in indexes]
430
+
431
+ if all([should_skip[i] for i in indexes]):
432
+ for i, prev_i in zip(indexes, prev_ids):
433
+ seek[prev_i] += seek_num_frames[prev_i]
434
+ continue
435
+
436
+ segments, segment_offset = self._retrieve_segment_mt(
437
+ seek_sequences=seek_seqs,
438
+ seek_outputs=seek_outputs,
439
+ time_offset=time_offset,
440
+ timestamp_begin=timestamp_begin,
441
+ seek_num_frames=seek_num_frames,
442
+ time_precision=time_precision,
443
+ input_stride=input_stride,
444
+ prev_ids=prev_ids,
445
+ ids=indexes,
446
+ return_token_timestamps=return_token_timestamps,
447
+ )
448
+ if self.config.uses_enrollments:
449
+ segment_offset[1:] = [torch.tensor(0)] * len(segment_offset[1:])
450
+ else:
451
+ segment_offset[1:] = [segment_offset[0]] * len(segment_offset[1:])
452
+
453
+ for prev_i, i in zip(prev_ids, range(self.config.mt_num_speakers)):
454
+ current_segments[prev_i] += segments[i]
455
+ seek[prev_i] += segment_offset[i]
456
+
457
+ if self.config.uses_enrollments:
458
+ if seek[prev_ids[0]] >= max_frames[prev_ids[0]]:
459
+ seek[prev_ids[1]] = max_frames[prev_ids[1]]
460
+
461
+
462
+ # 7. Once all segments are added to the list of all segments, called `current_segments`, we extract the predicted
463
+ # output tokens from the list of dicts. If we use batch size > 1, we make sure to pad the output
464
+ final_segments = (
465
+ [x[1:] for x in current_segments]
466
+ if (prompt_ids is not None and generation_config.prompt_condition_type == "first-segment")
467
+ else current_segments
468
+ )
469
+ if "is_valid" in kwargs:
470
+ final_segments = [seg for idx, seg in enumerate(final_segments) if kwargs['is_valid'][idx]]
471
+ sequences = _pad_to_max_length(
472
+ final_segments, generation_config.pad_token_id, device=self.device, padding="right"
473
+ )
474
+
475
+ # 8. If we return all segments, the predicted output sequences are put under `"sequences"`.
476
+ output = {"sequences": sequences, "segments": final_segments}
477
+
478
+ self.encoder_logits = None
479
+
480
+ if isinstance(output, dict):
481
+ output = self._fix_timestamps_from_segmentation(output)
482
+
483
+ return output
484
+
485
+ @staticmethod
486
+ def _find_common_seek(sequences, seeks):
487
+ """
488
+ Finds the minimum seek that does not overlap with other sequences,
489
+ and falls back to other segment boundaries if needed. Assumes:
490
+ - 'seeks' is a list of per-sequence seek values (ints or 0-dim tensors),
491
+ - seek values are in timestamp * 100 format (e.g., 125.5 s -> 12550).
492
+ """
493
+
494
+ def is_valid_seek(seek_time, exclude_seq_idx):
495
+ for idx, seq in enumerate(sequences):
496
+ if idx == exclude_seq_idx:
497
+ continue
498
+ for segment in seq:
499
+ start = getattr(segment, 'start', segment['start'])
500
+ end = getattr(segment, 'end', segment['end'])
501
+ if seek_time < start:
502
+ break # Segments are sorted by end
503
+ if start < seek_time < end:
504
+ return False
505
+ return True
506
+
507
+ # Step 1: Find minimum seek
508
+ # if all seek values are the same, return it immediately
509
+ seeks = [s if isinstance(s, int) else s.item() for s in seeks]
510
+ if len(set(seeks)) == 1:
511
+ return seeks[0]
512
+
513
+ min_seek_val = min(seeks)
514
+ min_seek_idx = seeks.index(min_seek_val)
515
+ min_seek_real = min_seek_val / 100
516
+
517
+ if is_valid_seek(min_seek_real, min_seek_idx):
518
+ return min_seek_val
519
+
520
+ # Step 2: Try fallback seeks taken from the segment boundaries of all sequences
521
+ fallback_seeks = set()
522
+ for idx, seq in enumerate(sequences):
523
+ for segment in seq:
524
+ start = getattr(segment, 'start', segment['start'])
525
+ if isinstance(start, torch.Tensor):
526
+ start = start.item()
527
+ candidate = round(start, 2)
528
+ fallback_seeks.add((candidate, idx, True))
529
+ end = getattr(segment, 'end', segment['end'])
530
+ if isinstance(end, torch.Tensor):
531
+ end = end.item()
532
+ if end < min_seek_real:
533
+ candidate = round(end, 2)
534
+ fallback_seeks.add((candidate, idx, True))
535
+
536
+ valid_fallbacks = [
537
+ (int(s * 100), idx, is_start) for s, idx, is_start in fallback_seeks
538
+ if is_valid_seek(s, min_seek_idx)
539
+ ]
540
+
541
+ if valid_fallbacks:
542
+ return max(valid_fallbacks)[0]  # return the seek value, consistent with the other return paths
543
+
544
+ # Step 3: Nothing valid
545
+ return 0
546
+
547
+ @staticmethod
548
+ def remove_segments_after_seek(sequences, seek, eps=100):
549
+ """
550
+ Keep only segments that finish before given timestamp.
551
+
552
+ Args:
553
+ sequences: List of lists, each containing segments (dict or object with 'start' and 'end').
554
+ seek: Integer seek timestamp (e.g., timestamp * 100).
555
+
556
+ Returns:
557
+ None. Modifies the sequences in-place.
558
+ """
559
+ return [[seg for seg in seq if (getattr(seg, 'end', seg['end']) * 100 <= seek + eps)] for seq in sequences]
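+ # Note: seek is in timestamp * 100 units, so the default eps=100 lets a segment overshoot
+ # the seek position by up to 1 second and still be kept.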
560
+
561
+ @staticmethod
562
+ def _retrieve_segment_wo_seek(
563
+ seek_sequence,
564
+ seek_outputs,
565
+ time_offset,
566
+ timestamp_begin,
567
+ seek_num_frames,
568
+ time_precision,
569
+ input_stride,
570
+ prev_idx,
571
+ idx,
572
+ return_token_timestamps,
573
+ ):
574
+ # find the predicted "end of segment" predictions of Whisper
575
+ # "end of segment" predictions occur whenever Whisper predicts a timestamp token
576
+ timestamp_tokens: torch.Tensor = seek_sequence.ge(timestamp_begin)
577
+ single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True]
578
+ timestamp_segment_indices = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0]
579
+ timestamp_segment_indices.add_(1)
580
+ token_timestamps = seek_outputs[idx]["token_timestamps"] if return_token_timestamps else []
581
+
582
+ # If Whisper predicted an "end of segment" via a timestamp token, let's go over each
583
+ # "end of segment" prediction and slice the decoding into segments accordingly
584
+ if len(timestamp_segment_indices) > 0:
585
+ # if the output contains two consecutive timestamp tokens
586
+ slices = timestamp_segment_indices.tolist()
587
+ segments = []
588
+ if single_timestamp_ending:
589
+ slices.append(len(seek_sequence))
590
+
591
+ last_slice = 0
592
+ # Add each segment to list of all segments
593
+ for current_slice in slices:
594
+ sliced_tokens = seek_sequence[last_slice:current_slice]
595
+ start_timestamp_pos = sliced_tokens[0].item() - timestamp_begin
596
+ end_timestamp_pos = sliced_tokens[-1].item() - timestamp_begin
597
+ segments.append(
598
+ {
599
+ "start": time_offset[prev_idx] + start_timestamp_pos * time_precision,
600
+ "end": time_offset[prev_idx] + end_timestamp_pos * time_precision,
601
+ "tokens": sliced_tokens,
602
+ "result": seek_outputs[idx],
603
+ }
604
+ )
605
+ if return_token_timestamps:
606
+ segments[-1]["token_timestamps"] = (
607
+ token_timestamps[last_slice:current_slice] + time_offset[prev_idx]
608
+ )
609
+ last_slice = current_slice
610
+
611
+ if not single_timestamp_ending:
612
+ # generate all predictions after the last predicted "end of segment" and seek by 30s
613
+ sliced_tokens = seek_sequence[last_slice:]
614
+ start_timestamp_pos = sliced_tokens[0].item() - timestamp_begin
615
+ end_timestamp_pos = seek_num_frames[prev_idx] // 2
616
+ segments.append(
617
+ {
618
+ "start": time_offset[prev_idx] + start_timestamp_pos * time_precision,
619
+ "end": time_offset[prev_idx] + end_timestamp_pos * time_precision,
620
+ "tokens": sliced_tokens,
621
+ "result": seek_outputs[idx],
622
+ }
623
+ )
624
+ segment_offset = seek_num_frames[prev_idx]
625
+ else:
626
+ # If whisper does not predict any "end of segment" token, then
627
+ # the whole decoding is considered a segment and we add it to the list of segments
628
+ timestamps = seek_sequence[timestamp_tokens.nonzero().flatten()]
629
+ start_timestamp_pos = 0.0
630
+ last_timestamp_pos = seek_num_frames[prev_idx] // 2
631
+
632
+ if timestamps.numel() > 1:
633
+ start_timestamp_pos = timestamps[-2].item() - timestamp_begin
634
+ last_timestamp_pos = timestamps[-1].item() - timestamp_begin
635
+ elif timestamps.numel() == 1:
636
+ # no consecutive timestamps but it has a timestamp; use the last one.
637
+ start_timestamp_pos = timestamps[-1].item() - timestamp_begin
638
+ segments = [
639
+ {
640
+ "start": time_offset[prev_idx] + start_timestamp_pos * time_precision,
641
+ "end": time_offset[prev_idx] + last_timestamp_pos * time_precision,
642
+ "tokens": seek_sequence,
643
+ "result": seek_outputs[idx],
644
+ }
645
+ ]
646
+
647
+ segment_offset = seek_num_frames[prev_idx]
648
+
649
+ return segments, segment_offset
650
+
651
+ def _retrieve_segment_mt(
652
+ self,
653
+ seek_sequences,
654
+ seek_outputs,
655
+ time_offset,
656
+ timestamp_begin,
657
+ seek_num_frames,
658
+ time_precision,
659
+ input_stride,
660
+ prev_ids,
661
+ ids,
662
+ return_token_timestamps,
663
+ ):
664
+ sequences, seeks = [], []
665
+ for sequence, prev_id, idx in zip(seek_sequences, prev_ids, ids):
666
+ seq, seek = self._retrieve_segment(
667
+ seek_sequence=sequence,
668
+ seek_outputs=seek_outputs,
669
+ time_offset=time_offset,
670
+ timestamp_begin=timestamp_begin,
671
+ seek_num_frames=seek_num_frames,
672
+ time_precision=time_precision,
673
+ input_stride=input_stride,
674
+ prev_idx=prev_id,
675
+ idx=idx,
676
+ return_token_timestamps=return_token_timestamps,
677
+ )
678
+ sequences.append(seq)
679
+ seeks.append(seek)
680
+ return sequences, seeks
681
+
682
+ def _beam_search(
683
+ self,
684
+ input_ids: torch.LongTensor,
685
+ beam_scorer: BeamScorer,
686
+ logits_processor: LogitsProcessorList,
687
+ stopping_criteria: StoppingCriteriaList,
688
+ generation_config: GenerationConfig,
689
+ synced_gpus: bool,
690
+ logits_warper: Optional[LogitsProcessorList] = None,
691
+ **model_kwargs,
692
+ ) -> Union[GenerateBeamOutput, torch.LongTensor]:
693
+ r"""
694
+ Generates sequences of token ids for models with a language modeling head using **beam search decoding** and
695
+ can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
696
+
697
+ Parameters:
698
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
699
+ The sequence used as a prompt for the generation.
700
+ beam_scorer (`BeamScorer`):
701
+ A derived instance of [`BeamScorer`] that defines how beam hypotheses are constructed, stored and
702
+ sorted during generation. For more information, the documentation of [`BeamScorer`] should be read.
703
+ logits_processor (`LogitsProcessorList`):
704
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
705
+ used to modify the prediction scores of the language modeling head applied at each generation step.
706
+ stopping_criteria (`StoppingCriteriaList`):
707
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
708
+ used to tell if the generation loop should stop.
709
+ generation_config ([`~generation.GenerationConfig`]):
710
+ The generation configuration to be used as parametrization of the decoding method.
711
+ synced_gpus (`bool`):
712
+ Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
713
+ logits_warper (`LogitsProcessorList`, *optional*):
714
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
715
+ to warp the prediction score distribution of the language modeling head applied before multinomial
716
+ sampling at each generation step. Only required with sampling strategies (i.e. `do_sample` is set in
717
+ `generation_config`)
718
+ model_kwargs:
719
+ Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
720
+ an encoder-decoder model the kwargs should include `encoder_outputs`.
721
+
722
+ Return:
723
+ [`generation.GenerateBeamDecoderOnlyOutput`], [`~generation.GenerateBeamEncoderDecoderOutput`] or
724
+ `torch.LongTensor`: A `torch.LongTensor` containing the generated tokens (default behaviour) or a
725
+ [`~generation.GenerateBeamDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
726
+ `return_dict_in_generate=True` or a [`~generation.GenerateBeamEncoderDecoderOutput`] if
727
+ `model.config.is_encoder_decoder=True`.
728
+ """
729
+ # init values
730
+ pad_token_id = generation_config.pad_token_id
731
+ eos_token_id = generation_config.eos_token_id
732
+ output_attentions = generation_config.output_attentions
733
+ output_hidden_states = generation_config.output_hidden_states
734
+ output_scores = generation_config.output_scores
735
+ output_logits = generation_config.output_logits
736
+ return_dict_in_generate = generation_config.return_dict_in_generate
737
+ sequential = generation_config.low_memory
738
+ do_sample = generation_config.do_sample
739
+ if do_sample is True and not isinstance(logits_warper, LogitsProcessorList):
740
+ raise ValueError(
741
+ "`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is "
742
+ f"{logits_warper})."
743
+ )
744
+
745
+ beam_scorer._beam_hyps = beam_scorer._beam_hyps[:self.encoder_logits.shape[0]]
746
+
747
+ batch_size = len(beam_scorer._beam_hyps)
748
+ num_beams = beam_scorer.num_beams
749
+
750
+ batch_beam_size, cur_len = input_ids.shape
751
+ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
752
+
753
+ if num_beams * batch_size != batch_beam_size:
754
+ raise ValueError(
755
+ f"Batch dimension of `input_ids` should be {num_beams * batch_size}, but is {batch_beam_size}."
756
+ )
757
+
758
+ # init attention / hidden states / scores tuples
759
+ scores = () if (return_dict_in_generate and output_scores) else None
760
+ raw_logits = () if (return_dict_in_generate and output_logits) else None
761
+ beam_indices = (
762
+ tuple(() for _ in range(batch_beam_size)) if (return_dict_in_generate and output_scores) else None
763
+ )
764
+ decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
765
+ cross_attentions = () if (return_dict_in_generate and output_attentions) else None
766
+ decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
767
+
768
+ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
769
+ if return_dict_in_generate and self.config.is_encoder_decoder:
770
+ encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
771
+ encoder_hidden_states = (
772
+ model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
773
+ )
774
+
775
+ # initialise score of first beam with 0 and the rest with -1e9. This makes sure that only tokens
776
+ # of the first beam are considered to avoid sampling the exact same tokens across all beams.
777
+ beam_scores = torch.zeros((batch_size, num_beams), dtype=torch.float, device=input_ids.device)
778
+ beam_scores[:, 1:] = -1e9
779
+ beam_scores = beam_scores.view((batch_size * num_beams,))
780
+
781
+ this_peer_finished = False
782
+
783
+ decoder_prompt_len = input_ids.shape[-1] # record the prompt length of decoder
784
+
785
+ while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
786
+ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
787
+
788
+ # if sequential is True, split the input to batches of batch_size and run sequentially
789
+ if sequential:
790
+ if any(
791
+ model_name in self.__class__.__name__.lower()
792
+ for model_name in [
793
+ "fsmt",
794
+ "reformer",
795
+ "bloom",
796
+ "ctrl",
797
+ "gpt_bigcode",
798
+ "transo_xl",
799
+ "xlnet",
800
+ "cpm",
801
+ "jamba",
802
+ ]
803
+ ):
804
+ raise RuntimeError(
805
+ f"Currently generation for {self.__class__.__name__} is not supported "
806
+ f"for `low_memory beam_search`. Please open an issue on GitHub if you need this feature."
807
+ )
808
+
809
+ inputs_per_sub_batches = _split_model_inputs(
810
+ model_inputs, split_size=batch_size, full_batch_size=batch_beam_size
811
+ )
812
+ outputs_per_sub_batch = [
813
+ self(
814
+ **inputs_per_sub_batch,
815
+ return_dict=True,
816
+ output_attentions=output_attentions,
817
+ output_hidden_states=output_hidden_states,
818
+ )
819
+ for inputs_per_sub_batch in inputs_per_sub_batches
820
+ ]
821
+
822
+ outputs = stack_model_outputs(outputs_per_sub_batch)
823
+
824
+ else: # Unchanged original behavior
825
+ outputs = self(
826
+ **model_inputs,
827
+ return_dict=True,
828
+ output_attentions=output_attentions,
829
+ output_hidden_states=output_hidden_states,
830
+ )
831
+
832
+ if synced_gpus and this_peer_finished:
833
+ cur_len = cur_len + 1
834
+ continue # don't waste resources running the code we don't need
835
+
836
+ next_token_logits = outputs.logits[:, -1, :]
837
+ next_token_scores = nn.functional.log_softmax(
838
+ next_token_logits, dim=-1
839
+ ) # (batch_size * num_beams, vocab_size)
840
+
841
+ next_token_scores_processed = logits_processor(input_ids, next_token_scores)
842
+ if do_sample:
843
+ next_token_scores_processed = logits_warper(input_ids, next_token_scores_processed)
844
+ next_token_scores = next_token_scores_processed + beam_scores[:, None].expand_as(
845
+ next_token_scores_processed
846
+ )
847
+
848
+ # Store scores, attentions and hidden_states when required
849
+ if return_dict_in_generate:
850
+ if output_scores:
851
+ scores += (next_token_scores_processed,)
852
+ if output_logits:
853
+ raw_logits += (next_token_logits,)
854
+ if output_attentions:
855
+ decoder_attentions += (
856
+ (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
857
+ )
858
+ if self.config.is_encoder_decoder:
859
+ cross_attentions += (outputs.cross_attentions,)
860
+ if output_hidden_states:
861
+ decoder_hidden_states += (
862
+ (outputs.decoder_hidden_states,)
863
+ if self.config.is_encoder_decoder
864
+ else (outputs.hidden_states,)
865
+ )
866
+
867
+ # reshape for beam search
868
+ vocab_size = next_token_scores.shape[-1]
869
+ next_token_scores = next_token_scores.view(batch_size, num_beams * vocab_size)
870
+
871
+ # Beam token selection: pick 1 + eos_token_id.shape[0] next tokens for each beam so we have at least 1
872
+ # non eos token per beam.
873
+ n_eos_tokens = eos_token_id.shape[0] if eos_token_id is not None else 0
874
+ n_tokens_to_keep = max(2, 1 + n_eos_tokens) * num_beams
875
+ if do_sample:
876
+ probs = nn.functional.softmax(next_token_scores, dim=-1)
877
+ next_tokens = torch.multinomial(probs, num_samples=n_tokens_to_keep)
878
+ next_token_scores = torch.gather(next_token_scores, -1, next_tokens)
879
+ next_token_scores, _indices = torch.sort(next_token_scores, descending=True, dim=1)
880
+ next_tokens = torch.gather(next_tokens, -1, _indices)
881
+ else:
882
+ next_token_scores, next_tokens = torch.topk(
883
+ next_token_scores, n_tokens_to_keep, dim=1, largest=True, sorted=True
884
+ )
885
+
886
+ next_indices = torch.div(next_tokens, vocab_size, rounding_mode="floor")
887
+ next_tokens = next_tokens % vocab_size
888
+
889
+ # stateless
890
+ beam_outputs = beam_scorer.process(
891
+ input_ids,
892
+ next_token_scores,
893
+ next_tokens,
894
+ next_indices,
895
+ pad_token_id=pad_token_id,
896
+ eos_token_id=eos_token_id,
897
+ beam_indices=beam_indices,
898
+ decoder_prompt_len=decoder_prompt_len,
899
+ )
900
+
901
+ beam_scores = beam_outputs["next_beam_scores"]
902
+ beam_next_tokens = beam_outputs["next_beam_tokens"]
903
+ beam_idx = beam_outputs["next_beam_indices"]
904
+
905
+ # Based on the beam idx and next tokens reshuffle the ctc prev states and scores
906
+ if hasattr(self, "ctc_rescorer"):
907
+ self.ctc_rescorer.update_state(beam_next_tokens, beam_idx)
908
+ input_ids = torch.cat([input_ids[beam_idx, :], beam_next_tokens.unsqueeze(-1)], dim=-1)
909
+
910
+ model_kwargs = self._update_model_kwargs_for_generation(
911
+ outputs,
912
+ model_kwargs,
913
+ is_encoder_decoder=self.config.is_encoder_decoder,
914
+ )
915
+ if model_kwargs.get("past_key_values", None) is not None:
916
+ model_kwargs["past_key_values"] = self._temporary_reorder_cache(
917
+ model_kwargs["past_key_values"], beam_idx
918
+ )
919
+
920
+ if return_dict_in_generate and output_scores:
921
+ beam_indices = tuple((beam_indices[beam_idx[i]] + (beam_idx[i],) for i in range(len(beam_indices))))
922
+
923
+ # increase cur_len
924
+ cur_len = cur_len + 1
925
+
926
+ if beam_scorer.is_done or all(stopping_criteria(input_ids, scores)):
927
+ this_peer_finished = True
928
+
929
+ sequence_outputs = beam_scorer.finalize(
930
+ input_ids,
931
+ beam_scores,
932
+ next_tokens,
933
+ next_indices,
934
+ pad_token_id=pad_token_id,
935
+ eos_token_id=eos_token_id,
936
+ max_length=stopping_criteria.max_length,
937
+ beam_indices=beam_indices,
938
+ decoder_prompt_len=decoder_prompt_len,
939
+ )
940
+
941
+ if return_dict_in_generate:
942
+ if not output_scores:
943
+ sequence_outputs["sequence_scores"] = None
944
+
945
+ if self.config.is_encoder_decoder:
946
+ return GenerateBeamEncoderDecoderOutput(
947
+ sequences=sequence_outputs["sequences"],
948
+ sequences_scores=sequence_outputs["sequence_scores"],
949
+ scores=scores,
950
+ logits=raw_logits,
951
+ beam_indices=sequence_outputs["beam_indices"],
952
+ encoder_attentions=encoder_attentions,
953
+ encoder_hidden_states=encoder_hidden_states,
954
+ decoder_attentions=decoder_attentions,
955
+ cross_attentions=cross_attentions,
956
+ decoder_hidden_states=decoder_hidden_states,
957
+ past_key_values=model_kwargs.get("past_key_values"),
958
+ )
959
+ else:
960
+ return GenerateBeamDecoderOnlyOutput(
961
+ sequences=sequence_outputs["sequences"],
962
+ sequences_scores=sequence_outputs["sequence_scores"],
963
+ scores=scores,
964
+ logits=raw_logits,
965
+ beam_indices=sequence_outputs["beam_indices"],
966
+ attentions=decoder_attentions,
967
+ hidden_states=decoder_hidden_states,
968
+ past_key_values=model_kwargs.get("past_key_values"),
969
+ )
970
+ else:
971
+ return sequence_outputs["sequences"]
972
+
973
+ def _sample(
974
+ self,
975
+ input_ids: torch.LongTensor,
976
+ logits_processor: LogitsProcessorList,
977
+ stopping_criteria: StoppingCriteriaList,
978
+ generation_config: GenerationConfig,
979
+ synced_gpus: bool,
980
+ streamer: Optional["BaseStreamer"],
981
+ logits_warper: Optional[LogitsProcessorList] = None,
982
+ **model_kwargs,
983
+ ) -> Union[GenerateNonBeamOutput, torch.LongTensor]:
984
+ r"""
985
+ Generates sequences of token ids for models with a language modeling head using **multinomial sampling** and
986
+ can be used for text-decoder, text-to-text, speech-to-text, and vision-to-text models.
987
+
988
+ Parameters:
989
+ input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
990
+ The sequence used as a prompt for the generation.
991
+ logits_processor (`LogitsProcessorList`):
992
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsProcessor`]
993
+ used to modify the prediction scores of the language modeling head applied at each generation step.
994
+ stopping_criteria (`StoppingCriteriaList`):
995
+ An instance of [`StoppingCriteriaList`]. List of instances of class derived from [`StoppingCriteria`]
996
+ used to tell if the generation loop should stop.
997
+ generation_config ([`~generation.GenerationConfig`]):
998
+ The generation configuration to be used as parametrization of the decoding method.
999
+ synced_gpus (`bool`):
1000
+ Whether to continue running the while loop until max_length (needed for ZeRO stage 3)
1001
+ streamer (`BaseStreamer`, *optional*):
1002
+ Streamer object that will be used to stream the generated sequences. Generated tokens are passed
1003
+ through `streamer.put(token_ids)` and the streamer is responsible for any further processing.
1004
+ logits_warper (`LogitsProcessorList`, *optional*):
1005
+ An instance of [`LogitsProcessorList`]. List of instances of class derived from [`LogitsWarper`] used
1006
+ to warp the prediction score distribution of the language modeling head applied before multinomial
1007
+ sampling at each generation step. Only required with sampling strategies (i.e. `do_sample` is set in
1008
+ `generation_config`)
1009
+ model_kwargs:
1010
+ Additional model specific kwargs will be forwarded to the `forward` function of the model. If model is
1011
+ an encoder-decoder model the kwargs should include `encoder_outputs`.
1012
+
1013
+ Return:
1014
+ [`~generation.GenerateDecoderOnlyOutput`], [`~generation.GenerateEncoderDecoderOutput`] or `torch.LongTensor`:
1015
+ A `torch.LongTensor` containing the generated tokens (default behaviour) or a
1016
+ [`~generation.GenerateDecoderOnlyOutput`] if `model.config.is_encoder_decoder=False` and
1017
+ `return_dict_in_generate=True` or a [`~generation.GenerateEncoderDecoderOutput`] if
1018
+ `model.config.is_encoder_decoder=True`.
1019
+ """
1020
+ # init values
1021
+ pad_token_id = generation_config.pad_token_id
1022
+ output_attentions = generation_config.output_attentions
1023
+ output_hidden_states = generation_config.output_hidden_states
1024
+ output_scores = generation_config.output_scores
1025
+ output_logits = generation_config.output_logits
1026
+ return_dict_in_generate = generation_config.return_dict_in_generate
1027
+ has_eos_stopping_criteria = any(hasattr(criteria, "eos_token_id") for criteria in stopping_criteria)
1028
+ do_sample = generation_config.do_sample
1029
+ if do_sample is True and not isinstance(logits_warper, LogitsProcessorList):
1030
+ raise ValueError(
1031
+ "`do_sample` is set to `True`, `logits_warper` must be a `LogitsProcessorList` instance (it is "
1032
+ f"{logits_warper})."
1033
+ )
1034
+
1035
+ # init attention / hidden states / scores tuples
1036
+ scores = () if (return_dict_in_generate and output_scores) else None
1037
+ raw_logits = () if (return_dict_in_generate and output_logits) else None
1038
+ decoder_attentions = () if (return_dict_in_generate and output_attentions) else None
1039
+ cross_attentions = () if (return_dict_in_generate and output_attentions) else None
1040
+ decoder_hidden_states = () if (return_dict_in_generate and output_hidden_states) else None
1041
+
1042
+ # if model is an encoder-decoder, retrieve encoder attention weights and hidden states
1043
+ if return_dict_in_generate and self.config.is_encoder_decoder:
1044
+ encoder_attentions = model_kwargs["encoder_outputs"].get("attentions") if output_attentions else None
1045
+ encoder_hidden_states = (
1046
+ model_kwargs["encoder_outputs"].get("hidden_states") if output_hidden_states else None
1047
+ )
1048
+
1049
+ # keep track of which sequences are already finished
1050
+ batch_size = input_ids.shape[0]
1051
+ this_peer_finished = False
1052
+ unfinished_sequences = torch.ones(batch_size, dtype=torch.long, device=input_ids.device)
1053
+ model_kwargs = self._get_initial_cache_position(input_ids, model_kwargs)
1054
+
1055
+ while self._has_unfinished_sequences(this_peer_finished, synced_gpus, device=input_ids.device):
1056
+ # prepare model inputs
1057
+ model_inputs = self.prepare_inputs_for_generation(input_ids, **model_kwargs)
1058
+
1059
+ # forward pass to get next token
1060
+ outputs = self(
1061
+ **model_inputs,
1062
+ return_dict=True,
1063
+ output_attentions=output_attentions,
1064
+ output_hidden_states=output_hidden_states,
1065
+ )
1066
+
1067
+ if synced_gpus and this_peer_finished:
1068
+ continue # don't waste resources running the code we don't need
1069
+
1070
+ next_token_logits = outputs.logits[:, -1, :]
1071
+
1072
+ # pre-process distribution
1073
+ next_token_scores = logits_processor(input_ids, next_token_logits)
1074
+ if do_sample:
1075
+ next_token_scores = logits_warper(input_ids, next_token_scores)
1076
+
1077
+ # Store scores, attentions and hidden_states when required
1078
+ if return_dict_in_generate:
1079
+ if output_scores:
1080
+ scores += (next_token_scores,)
1081
+ if output_logits:
1082
+ raw_logits += (next_token_logits,)
1083
+ if output_attentions:
1084
+ decoder_attentions += (
1085
+ (outputs.decoder_attentions,) if self.config.is_encoder_decoder else (outputs.attentions,)
1086
+ )
1087
+ if self.config.is_encoder_decoder:
1088
+ cross_attentions += (outputs.cross_attentions,)
1089
+
1090
+ if output_hidden_states:
1091
+ decoder_hidden_states += (
1092
+ (outputs.decoder_hidden_states,)
1093
+ if self.config.is_encoder_decoder
1094
+ else (outputs.hidden_states,)
1095
+ )
1096
+
1097
+ # token selection
1098
+ if do_sample:
1099
+ probs = nn.functional.softmax(next_token_scores, dim=-1)
1100
+ next_tokens = torch.multinomial(probs, num_samples=1).squeeze(1)
1101
+ else:
1102
+ next_tokens = torch.argmax(next_token_scores, dim=-1)
1103
+
1104
+ # finished sentences should have their next token be a padding token
1105
+ if has_eos_stopping_criteria:
1106
+ next_tokens = next_tokens * unfinished_sequences + pad_token_id * (1 - unfinished_sequences)
1107
+
1108
+ # Based on the next tokens select the ctc prev states and scores
1109
+ if hasattr(self, "ctc_rescorer"):
1110
+ self.ctc_rescorer.update_state(next_tokens, torch.arange(next_tokens.shape[0]))
1111
+
1112
+ # update generated ids, model inputs, and length for next step
1113
+ input_ids = torch.cat([input_ids, next_tokens[:, None]], dim=-1)
1114
+ if streamer is not None:
1115
+ streamer.put(next_tokens.cpu())
1116
+ model_kwargs = self._update_model_kwargs_for_generation(
1117
+ outputs,
1118
+ model_kwargs,
1119
+ is_encoder_decoder=self.config.is_encoder_decoder,
1120
+ )
1121
+
1122
+ unfinished_sequences = unfinished_sequences & ~stopping_criteria(input_ids, scores)
1123
+ this_peer_finished = unfinished_sequences.max() == 0
1124
+
1125
+ if streamer is not None:
1126
+ streamer.end()
1127
+
1128
+ if return_dict_in_generate:
1129
+ if self.config.is_encoder_decoder:
1130
+ return GenerateEncoderDecoderOutput(
1131
+ sequences=input_ids,
1132
+ scores=scores,
1133
+ logits=raw_logits,
1134
+ encoder_attentions=encoder_attentions,
1135
+ encoder_hidden_states=encoder_hidden_states,
1136
+ decoder_attentions=decoder_attentions,
1137
+ cross_attentions=cross_attentions,
1138
+ decoder_hidden_states=decoder_hidden_states,
1139
+ past_key_values=model_kwargs.get("past_key_values"),
1140
+ )
1141
+ else:
1142
+ return GenerateDecoderOnlyOutput(
1143
+ sequences=input_ids,
1144
+ scores=scores,
1145
+ logits=raw_logits,
1146
+ attentions=decoder_attentions,
1147
+ hidden_states=decoder_hidden_states,
1148
+ past_key_values=model_kwargs.get("past_key_values"),
1149
+ )
1150
+ else:
1151
+ return input_ids
1152
+
1153
+ def prepare_kwargs_for_generate(self,
1154
+ segment_input,
1155
+ cur_bsz,
1156
+ batch_idx_map,
1157
+ seek,
1158
+ num_segment_frames,
1159
+ max_frames,
1160
+ kwargs):
1161
+ kwargs["attention_mask_enc"] = torch.ones(cur_bsz, segment_input.size(-1), device=segment_input.device)
1162
+ seek_vad = seek // 2
1163
+ num_frames_vad = num_segment_frames // 2
1164
+ max_frames_vad = max_frames // 2
1165
+ seek_num_frames = (max_frames_vad - seek_vad).clamp(max=num_frames_vad)
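+ # Note: the STNO/VAD mask runs at half the mel-frame rate (the encoder downsamples time by 2),
+ # so seek positions and frame counts are halved here; e.g. a 30 s window of 3000 mel frames
+ # corresponds to 1500 mask frames.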
1166
+
1167
+ stno_masks = []
1168
+ for i in range(cur_bsz):
1169
+ prev_i = batch_idx_map[i]
1170
+ segment_input_slice = kwargs["stno_mask"][prev_i: prev_i + 1, :,
1171
+ seek_vad[prev_i]: seek_vad[prev_i] + seek_num_frames[prev_i]]
1172
+
1173
+ if segment_input_slice.shape[-1] < num_frames_vad:
1174
+ orig_len = segment_input_slice.shape[-1]
1175
+ # pad the mask slice to the full window length (num_frames_vad, i.e. half of 3000) if necessary
1176
+ segment_input_slice = torch.nn.functional.pad(
1177
+ segment_input_slice, pad=(0, num_frames_vad - orig_len)
1178
+ )
1179
+ # mark the padded positions as silence (set the first STNO channel to 1)
1180
+ segment_input_slice[0, 0, orig_len:] = 1.0
1181
+
1182
+ stno_masks.append(segment_input_slice)
1183
+ kwargs["stno_mask"] = torch.cat(stno_masks, dim=0)
1184
+ self.stno_mask_seek = kwargs["stno_mask"]
1185
+
1186
+ if "per_group_sizes" in kwargs:
1187
+ group_sizes = kwargs["per_group_sizes"].clone()
1188
+ group_sizes[:] = 0
1189
+ cumulative_group_sizes = (
1190
+ kwargs["per_group_sizes"].max().repeat(kwargs["per_group_sizes"].shape[0])).cumsum(dim=0)
1191
+ for i in batch_idx_map:
1192
+ group_idx = (cumulative_group_sizes > i).nonzero().min()
1193
+ group_sizes[group_idx] += 1
1194
+ kwargs["per_group_sizes"] = group_sizes
1195
+
1196
+ if self.vad_seek_callback is not None:
1197
+ self.vad_seek_callback(kwargs["stno_mask"])
1198
+ if "is_valid" in kwargs:
1199
+ kwargs['is_valid'] = kwargs["is_valid"][batch_idx_map]
1200
+ kwargs['labels'] = kwargs["labels"][batch_idx_map]
1201
+ kwargs['upp_labels'] = kwargs["upp_labels"][batch_idx_map]
1202
+ return kwargs
1203
+
1204
+ def generate_with_fallback(
1205
+ self,
1206
+ segment_input,
1207
+ decoder_input_ids,
1208
+ cur_bsz,
1209
+ batch_idx_map,
1210
+ seek,
1211
+ num_segment_frames,
1212
+ max_frames,
1213
+ temperatures,
1214
+ generation_config,
1215
+ logits_processor,
1216
+ stopping_criteria,
1217
+ prefix_allowed_tokens_fn,
1218
+ synced_gpus,
1219
+ return_token_timestamps,
1220
+ do_condition_on_prev_tokens,
1221
+ kwargs,
1222
+ ):
1223
+ kwargs = copy.copy(kwargs)
1224
+ kwargs = self.prepare_kwargs_for_generate(segment_input, cur_bsz, batch_idx_map, seek, num_segment_frames,
1225
+ max_frames, kwargs)
1226
+ seek_sequences, seek_outputs, should_skip, do_condition_on_prev_tokens = super().generate_with_fallback(
1227
+ segment_input,
1228
+ decoder_input_ids,
1229
+ cur_bsz,
1230
+ batch_idx_map,
1231
+ seek,
1232
+ num_segment_frames,
1233
+ max_frames,
1234
+ temperatures,
1235
+ generation_config,
1236
+ logits_processor,
1237
+ stopping_criteria,
1238
+ prefix_allowed_tokens_fn,
1239
+ synced_gpus,
1240
+ return_token_timestamps,
1241
+ do_condition_on_prev_tokens,
1242
+ kwargs,
1243
+ )
1244
+ self.stno_mask_seek = None
1245
+
1246
+ if "is_valid" in kwargs:
1247
+ seek_sequences_tmp = [torch.tensor([])] * len(seek_sequences)
1248
+ seek_outputs_tmp = [torch.tensor([])] * len(seek_sequences)
1249
+ should_skip_tmp = [False] * len(seek_sequences)
1250
+ do_condition_on_prev_tokens_tmp = [None] * len(seek_sequences)
1251
+
1252
+ non_valid_inc = 0
1253
+ for idx, is_valid in enumerate(kwargs["is_valid"]):
1254
+ if is_valid:
1255
+ seek_sequences_tmp[idx] = seek_sequences[non_valid_inc]
1256
+ seek_outputs_tmp[idx] = seek_outputs[non_valid_inc]
1257
+ should_skip_tmp[idx] = should_skip[non_valid_inc]
1258
+ do_condition_on_prev_tokens_tmp[idx] = do_condition_on_prev_tokens[non_valid_inc]
1259
+ non_valid_inc += 1
1260
+ seek_sequences = seek_sequences_tmp
1261
+ seek_outputs = seek_outputs_tmp
1262
+ should_skip = should_skip_tmp
1263
+ do_condition_on_prev_tokens = do_condition_on_prev_tokens_tmp
1264
+
1265
+
1266
+ # for i, seq in enumerate(seek_outputs):
1267
+ # print(f"Sequence {i} {self.safe_tokenizer_decode(kwargs['labels'][batch_idx_map[i]])}: {self.tokenizer.decode(seq, decode_with_timestamps=True)}")
1268
+ # print("-"*50)
1269
+
1270
+ return seek_sequences, seek_outputs, should_skip, do_condition_on_prev_tokens
1271
+
1272
+ def _retrieve_init_tokens(self, input_features, batch_size, generation_config, config, num_segment_frames, kwargs):
1273
+ def replace_or_add(lst: List[int], num: int, itr: Iterator[int]):
1274
+ """short function to replace num with a itr in lst"""
1275
+ found = any(i in lst for i in itr)
1276
+ if found:
1277
+ lst = [num if i in itr else i for i in lst]
1278
+ else:
1279
+ lst.append(num)
1280
+ return lst
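+ # Illustrative behaviour: replace_or_add([50258, 50259], 50360, {50259, 50260}) -> [50258, 50360]
+ # (replacement), while replace_or_add([50258], 50360, {50259, 50260}) -> [50258, 50360] (append).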
1281
+
1282
+ def language_to_id(language: str) -> int:
1283
+ language = language.lower()
1284
+ if language in generation_config.lang_to_id.keys():
1285
+ language_token = language
1286
+ elif language in TO_LANGUAGE_CODE.keys():
1287
+ language_token = f"<|{TO_LANGUAGE_CODE[language]}|>"
1288
+ elif language in TO_LANGUAGE_CODE.values():
1289
+ language_token = f"<|{language}|>"
1290
+ else:
1291
+ is_language_code = len(language) == 2
1292
+ raise ValueError(
1293
+ f"Unsupported language: {language}. Language should be one of:"
1294
+ f" {list(TO_LANGUAGE_CODE.values()) if is_language_code else list(TO_LANGUAGE_CODE.keys())}."
1295
+ )
1296
+ if language_token not in generation_config.lang_to_id:
1297
+ raise ValueError(
1298
+ f"{language_token} is not supported by this specific model as it is not in the `generation_config.lang_to_id`."
1299
+ "(You should just add it to the generation config)"
1300
+ )
1301
+
1302
+ return generation_config.lang_to_id[language_token]
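+ # e.g. both language_to_id("english") and language_to_id("en") resolve to the id of the
+ # "<|en|>" token, provided "<|en|>" is present in generation_config.lang_to_id.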
1303
+
1304
+ task = getattr(generation_config, "task", None)
1305
+ language = getattr(generation_config, "language", None)
1306
+
1307
+ forced_decoder_ids = generation_config.forced_decoder_ids
1308
+ if forced_decoder_ids is not None:
1309
+ if language is None and task is None and forced_decoder_ids[0][1] is None:
1310
+ logger.warning_once(
1311
+ "Due to a bug fix in https://github.com/huggingface/transformers/pull/28687 transcription using a multilingual Whisper will default to language detection followed by transcription instead of translation to English."
1312
+ "This might be a breaking change for your use case. If you want to instead always translate your audio to English, make sure to pass `language='en'`."
1313
+ )
1314
+ elif hasattr(config, "forced_decoder_ids") and config.forced_decoder_ids is not None:
1315
+ forced_decoder_ids = config.forced_decoder_ids
1316
+
1317
+ elif forced_decoder_ids is not None and language is not None:
1318
+ logger.info(
1319
+ f"You have passed language={language}, but also have set `forced_decoder_ids` to {forced_decoder_ids} which creates a conflict. `forced_decoder_ids` will be ignored in favor of language={language}."
1320
+ )
1321
+ forced_decoder_ids = None
1322
+
1323
+ init_tokens = [generation_config.decoder_start_token_id]
1324
+
1325
+ # Update init_tokens with languages
1326
+ lang_ids = None
1327
+
1328
+ if forced_decoder_ids is not None:
1329
+ return forced_decoder_ids
1330
+
1331
+ # from v4.39 the forced decoder ids are always None in favour of decoder input ids
1332
+ generation_config.forced_decoder_ids = None
1333
+
1334
+ is_lang_id_undefined = len(init_tokens) <= 1 or (len(init_tokens) > 1 and init_tokens[1] is None)
1335
+
1336
+ # Make sure language is a list of strings of the correct length
1337
+ if isinstance(language, (list, tuple)):
1338
+ if any(l is None for l in language):
1339
+ raise TypeError(
1340
+ "Expected `language` to be `None`, a single string (e.g. `'en'`), or a list of strings with length equal to the batch size (e.g. `('en', 'fr')` for a batch size of 2). Got a list containing `None`."
1341
+ )
1342
+ if len(language) != batch_size:
1343
+ raise ValueError(
1344
+ "When passing a list of languages, the length of the list must match the batch size. "
1345
+ f"Expected length of {batch_size}, but got {len(language)} languages."
1346
+ )
1347
+ languages = language
1348
+ elif language is None:
1349
+ # Language will be detected for each item in batch
1350
+ languages = [None] * batch_size
1351
+ else:
1352
+ languages = [language] # Use a length-1 list now, broadcast later
1353
+
1354
+ # Separate init_tokens for each language
1355
+ init_tokens = [copy.copy(init_tokens) for _ in languages]
1356
+
1357
+ if language is not None and lang_ids is not None:
1358
+ lang_ids = [language_to_id(l) for l in languages]
1359
+ elif hasattr(generation_config, "lang_to_id") and is_lang_id_undefined:
1360
+ # language is not defined or intentionally set to `None` to trigger language detection
1361
+ lang_ids = self.detect_language(
1362
+ input_features=input_features,
1363
+ encoder_outputs=kwargs.get("encoder_outputs", None),
1364
+ generation_config=generation_config,
1365
+ num_segment_frames=num_segment_frames,
1366
+ ).tolist()
1367
+ if lang_ids is not None:
1368
+ # append or replace lang_ids to init_tokens
1369
+ for i in range(len(init_tokens)):
1370
+ if len(init_tokens[i]) > 1:
1371
+ init_tokens[i][1] = lang_ids[i]
1372
+ else:
1373
+ init_tokens[i].append(lang_ids[i])
1374
+ del languages
1375
+
1376
+ # Update init_tokens with task
1377
+ for i in range(len(init_tokens)):
1378
+ if task is not None:
1379
+ if task in TASK_IDS:
1380
+ init_tokens[i].append(generation_config.task_to_id[generation_config.task])
1381
+ task_id = generation_config.task_to_id[generation_config.task]
1382
+
1383
+ # if task is defined it'll overwrite task ids that might have already been defined via the generation_config
1384
+ replace_or_add(init_tokens[i], task_id, generation_config.task_to_id.values())
1385
+ else:
1386
+ raise ValueError(f"The `{task}`task is not supported. The task should be one of `{TASK_IDS}`")
1387
+ elif language is not None and hasattr(generation_config, "task_to_id"):
1388
+ # if language is defined, but no task id is in `init_tokens`, default to transcribe
1389
+ if not any(ti in init_tokens[i] for ti in generation_config.task_to_id.values()):
1390
+ init_tokens[i].append(generation_config.task_to_id["transcribe"])
1391
+
1392
+ # let's make sure we don't pass `None` tokens as prompt tokens
1393
+ init_tokens[i] = [t for t in init_tokens[i] if t is not None]
1394
+
1395
+ return torch.as_tensor(init_tokens, dtype=torch.long, device=self.device).expand(batch_size, -1)
1396
+
1397
+ def detect_language(
1398
+ self,
1399
+ input_features: Optional[torch.FloatTensor] = None,
1400
+ encoder_outputs: Optional[Union[torch.FloatTensor, BaseModelOutput]] = None,
1401
+ generation_config: Optional[GenerationConfig] = None,
1402
+ num_segment_frames: int = 3000,
1403
+ ) -> torch.Tensor:
1404
+ """
1405
+ Detects language from log-mel input features or encoder_outputs
1406
+
1407
+ Parameters:
1408
+ input_features (`torch.Tensor` of shape `(batch_size, feature_size, sequence_length)`, *optional*):
1409
+ Float values of log-mel features extracted from the raw speech waveform. The raw speech waveform can be obtained by
1410
+ loading a `.flac` or `.wav` audio file into an array of type `List[float]` or a `numpy.ndarray`, *e.g.* via
1411
+ the soundfile library (`pip install soundfile`). To prepare the array into `input_features`, the
1412
+ [`AutoFeatureExtractor`] should be used for extracting the mel features, padding and conversion into a
1413
+ tensor of type `torch.FloatTensor`. See [`~WhisperFeatureExtractor.__call__`] for details.
1414
+ encoder_outputs (`tuple(tuple(torch.FloatTensor)`, *optional*):
1415
+ Tuple consists of (`last_hidden_state`, *optional*: `hidden_states`, *optional*: `attentions`)
1416
+ `last_hidden_state` of shape `(batch_size, sequence_length, hidden_size)`, *optional*) is a sequence of
1417
+ hidden-states at the output of the last layer of the encoder. Used in the cross-attention of the decoder.
1418
+ generation_config (`~generation.GenerationConfig`, *optional*):
1419
+ The generation configuration to be used as base parametrization for the generation call. `**kwargs`
1420
+ passed to generate matching the attributes of `generation_config` will override them. If
1421
+ `generation_config` is not provided, the default will be used, which had the following loading
1422
+ priority: 1) from the `generation_config.json` model file, if it exists; 2) from the model
1423
+ configuration. Please note that unspecified parameters will inherit [`~generation.GenerationConfig`]'s
1424
+ default values, whose documentation should be checked to parameterize generation.
1425
+ num_segment_frames (`int`, defaults to 3000):
1426
+ The number of log-mel frames the model expects
1427
+
1428
+ Return:
1429
+ A `torch.LongTensor` representing the detected language ids.
1430
+ """
1431
+ if input_features is None and encoder_outputs is None:
1432
+ raise ValueError("You have to specify either `input_features` or `encoder_outputs`")
1433
+ elif input_features is not None and encoder_outputs is not None:
1434
+ raise ValueError("Make sure to specificy only one of `input_features` or `encoder_outputs` - not both!")
1435
+ elif input_features is not None:
1436
+ inputs = {"input_features": input_features[:, :, :num_segment_frames]}
1437
+ batch_size = input_features.shape[0]
1438
+ elif encoder_outputs is not None:
1439
+ inputs = {"encoder_outputs": encoder_outputs}
1440
+ batch_size = (
1441
+ encoder_outputs[0].shape[0] if isinstance(encoder_outputs, BaseModelOutput) else encoder_outputs[0]
1442
+ )
1443
+
1444
+ generation_config = generation_config or self.generation_config
1445
+ decoder_input_ids = (
1446
+ torch.ones((batch_size, 1), device=self.device, dtype=torch.long)
1447
+ * generation_config.decoder_start_token_id
1448
+ )
1449
+
1450
+ with torch.no_grad():
1451
+ logits = self(**inputs, decoder_input_ids=decoder_input_ids,
1452
+ stno_mask=self.stno_mask[:, :, :num_segment_frames // 2]).logits[:, -1]
1453
+
1454
+ non_lang_mask = torch.ones_like(logits[0], dtype=torch.bool)
1455
+ non_lang_mask[list(generation_config.lang_to_id.values())] = False
1456
+
1457
+ logits[:, non_lang_mask] = -np.inf
1458
+
1459
+ lang_ids = logits.argmax(-1)
1460
+
1461
+ return lang_ids
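+ # Usage sketch (illustrative): lang_ids = self.detect_language(input_features=feats) yields one
+ # language-token id per batch item (e.g. the id of "<|en|>"); all non-language logits are
+ # masked to -inf before taking the argmax.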
1462
+
1463
+ def _get_logits_processor(
1464
+ self,
1465
+ generation_config: GenerationConfig,
1466
+ input_ids_seq_length: int,
1467
+ encoder_input_ids: torch.LongTensor,
1468
+ prefix_allowed_tokens_fn: Callable[[int, torch.Tensor], List[int]],
1469
+ logits_processor: Optional[LogitsProcessorList],
1470
+ device: str = None,
1471
+ model_kwargs: Optional[Dict[str, Any]] = None,
1472
+ negative_prompt_ids: Optional[torch.Tensor] = None,
1473
+ negative_prompt_attention_mask: Optional[torch.Tensor] = None,
1474
+ ) -> LogitsProcessorList:
1475
+ # pylint: disable=no-member
1476
+ gen_config_copy = copy.deepcopy(generation_config)
1477
+ gen_config_copy.forced_decoder_ids = None
1478
+ processors = super()._get_logits_processor(
1479
+ gen_config_copy,
1480
+ input_ids_seq_length,
1481
+ encoder_input_ids,
1482
+ prefix_allowed_tokens_fn,
1483
+ logits_processor,
1484
+ device,
1485
+ model_kwargs,
1486
+ negative_prompt_ids,
1487
+ negative_prompt_attention_mask,
1488
+ )
1489
+ if hasattr(generation_config, "ctc_weight") and generation_config.ctc_weight > 0:
1490
+ enc_logits = self.encoder_logits
1491
+ if generation_config.num_beams <= 1:
1492
+ processors.append(LogSoftmaxProcessor())
1493
+ else:
1494
+ enc_logits = enc_logits.repeat_interleave(generation_config.num_beams, dim=0)
1495
+ self.ctc_rescorer = CTCRescorerLogitsProcessor(
1496
+ enc_logits,
1497
+ torch.full((enc_logits.shape[0],), fill_value=enc_logits.shape[1],
1498
+ device=enc_logits.device),
1499
+ enc_logits.shape[-1] - 1,
1500
+ generation_config.pad_token_id.item(),
1501
+ generation_config.eos_token_id.item(),
1502
+ generation_config.decoder_start_token_id.item(),
1503
+ self.tokenizer,
1504
+ generation_config.ctc_margin,
1505
+ generation_config.ctc_weight,
1506
+ generation_config.num_beams,
1507
+ False,
1508
+ )
1509
+ processors.append(self.ctc_rescorer)
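+ # Note: with num_beams > 1 the encoder CTC logits were repeated per beam above, so the
+ # rescorer can keep one CTC prefix state per beam hypothesis.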
1510
+ return processors
1511
+
1512
+ def _retrieve_logit_processors(self, generation_config, logits_processor, begin_index, is_shortform, num_beams,
1513
+ device):
1514
+ if generation_config.return_timestamps is True:
1515
+ timestamp_processor = WhisperTimeStampLogitsProcessorCustom(generation_config, begin_index=begin_index)
1516
+ logits_processor = (
1517
+ [timestamp_processor] if logits_processor is None else [timestamp_processor] + logits_processor
1518
+ )
1519
+
1520
+ if generation_config.suppress_tokens is not None:
1521
+ suppress_tokens_processor = SuppressTokensLogitsProcessor(generation_config.suppress_tokens, device=device)
1522
+ logits_processor = (
1523
+ [suppress_tokens_processor]
1524
+ if logits_processor is None
1525
+ else [suppress_tokens_processor] + logits_processor
1526
+ )
1527
+ generation_config.suppress_tokens = None
1528
+
1529
+ if generation_config.begin_suppress_tokens is not None:
1530
+ begin_suppress_processor = SuppressTokensAtBeginLogitsProcessor(
1531
+ generation_config.begin_suppress_tokens, begin_index=begin_index, device=device
1532
+ )
1533
+ logits_processor = (
1534
+ [begin_suppress_processor]
1535
+ if logits_processor is None
1536
+ else [begin_suppress_processor] + logits_processor
1537
+ )
1538
+ generation_config.begin_suppress_tokens = None
1539
+
1540
+ if generation_config.no_speech_threshold is not None and not is_shortform:
1541
+ no_speech_detector = WhisperNoSpeechDetection(
1542
+ no_speech_token=generation_config.no_timestamps_token_id - 1,
1543
+ begin_index=begin_index,
1544
+ scores_is_logprobs=num_beams > 1,
1545
+ )
1546
+ logits_processor = (
1547
+ [no_speech_detector] if logits_processor is None else [no_speech_detector] + logits_processor
1548
+ )
1549
+ no_speech_detector.set_model(self)
1550
+
1551
+ return logits_processor
1552
+
1553
+ @staticmethod
1554
+ def round_to_nearest_0_02(x):
1555
+ d = Decimal(str(x)) # Use str(x) to preserve input precision
1556
+ step = Decimal('0.02')
1557
+ # Divide, round, multiply back
1558
+ rounded = (d / step).to_integral_value(rounding=ROUND_HALF_UP) * step
1559
+ return rounded
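+ # Illustrative examples: round_to_nearest_0_02(1.234) -> Decimal('1.24'),
+ # round_to_nearest_0_02(0.01) -> Decimal('0.02') (ties round up via ROUND_HALF_UP).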
1560
+
1561
+ def _fix_timestamps_from_segmentation(self, sequences):
1562
+ """
1563
+ Adjusts token sequences with global timestamps to fit within Whisper's 0–30s timestamp token range.
1564
+
1565
+ This function modifies the input sequences by inserting appropriate timestamp tokens and
1566
+ offset corrections to ensure the decoded token order is correct, without splitting any segment.
1567
+ It aligns all timestamps to 0.02-second precision, inserts placeholder segments to bridge
1568
+ time gaps between 30-second windows, and maintains segment continuity during encoding.
1569
+
1570
+ Args:
1571
+ sequences (dict): A dictionary containing:
1572
+ - 'segments': A list of segment lists, each segment being a dict with 'start', 'end', and 'tokens'.
1573
+ - 'sequences': A tensor used to determine device for padding.
1574
+
1575
+ Returns:
1576
+ torch.Tensor: A batch of padded token sequences with corrected timestamp alignment.
1577
+ """
1578
+ # Get the token ID for the "<|0.00|>" timestamp used to detect dummy segments
1579
+ first_timestamp_token = self.tokenizer.get_vocab()["<|0.00|>"]
1580
+ results = []
1581
+
1582
+ # Filter out segments that are either empty or consist only of the "<|0.00|>" token
1583
+ for idx, sequence_segs in enumerate(sequences['segments']):
1584
+ sequences['segments'][idx] = [
1585
+ seg for seg in sequence_segs
1586
+ if len(seg['tokens']) > 0 and (len(seg['tokens']) != 1 or seg['tokens'][0] != first_timestamp_token)
1587
+ ]
1588
+
1589
+ # Iterate over each group of segments (e.g., one per utterance)
1590
+ for idx, sequence_segs in enumerate(sequences['segments']):
1591
+ result = []
1592
+ prev_segment_end_time = None
1593
+ correction = Decimal(0.0)
1594
+
1595
+ for i, seg in enumerate(sequence_segs):
1596
+ # Round start and end times to nearest 0.02 seconds
1597
+ start_time = self.round_to_nearest_0_02(seg['start'].item())
1598
+ end_time = self.round_to_nearest_0_02(seg['end'].item())
1599
+ tokens = seg['tokens']
1600
+
1601
+ # Determine which 30s window this segment falls into
1602
+ current_block = (start_time + correction) // 30
1603
+
1604
+ if prev_segment_end_time is not None:
1605
+ # If not the first segment, calculate difference in 30s windows
1606
+ prev_block = prev_segment_end_time // 30
1607
+ num_dummies = current_block - prev_block - 1
1608
+
1609
+ # Insert (30, [], 30) marker if we're moving to a new block
1610
+ if current_block > prev_block:
1611
+ result.append((30, [], 30))
1612
+
1613
+ # Insert dummy segments to bridge skipped 30s blocks
1614
+ for _ in range(int(num_dummies)):
1615
+ result.append((0, [], 30))
1616
+ else:
1617
+ # For the first segment, add dummy blocks if it starts after 30s
1618
+ for _ in range(int(start_time // 30)):
1619
+ result.append((0, [], 30))
1620
+
1621
+ # Determine whether segment fits in one block or wraps to the next
1622
+ if (start_time + correction) // 30 == (end_time + correction) // 30:
1623
+ # Segment fits within a single 30s window
1624
+ result.append(((start_time + correction) % 30, tokens, (end_time + correction) % 30))
1625
+ else:
1626
+ # Segment would wrap across a 30s boundary
1627
+ new_seg_start = (correction + start_time) % 30
1628
+ new_seg_end = end_time - start_time
1629
+
1630
+ if new_seg_end >= new_seg_start:
1631
+ # Seek back to the beginning of the segment window
1632
+ result.append((new_seg_start, [], new_seg_start))
1633
+ result.append((0, tokens, new_seg_end))
1634
+ # Apply correction to align future timestamps to new 30s block
1635
+ correction = self.round_to_nearest_0_02(-(start_time % 30))
1636
+ else:
1637
+ # Otherwise, just insert with adjusted times
1638
+ result.append((new_seg_start, tokens, new_seg_end))
1639
+ correction = self.round_to_nearest_0_02(30 - (start_time % 30))
1640
+ # print(f'Processed segment {i}, result: {self.tokenizer.decode(self.tokenizer("".join([f"<|{seg[0]:.2f}|>{self.tokenizer.decode(seg[1])}<|{seg[2]:.2f}|>" for seg in result]))["input_ids"], decode_with_timestamps=True)[-250:]}')
1641
+ # Update the previous segment's end time for next iteration
1642
+ prev_segment_end_time = end_time + correction
1643
+
1644
+ # Convert result segments into a token sequence with proper timestamp formatting
1645
+ encoded = self.tokenizer(
1646
+ "".join([f"<|{seg[0]:.2f}|>{self.tokenizer.decode(seg[1])}<|{seg[2]:.2f}|>" for seg in result])
1647
+ )['input_ids']
1648
+ results.append(encoded)
1649
+
1650
+ # Pad all sequences to the same length for batching
1651
+ sequences = pad_sequence(
1652
+ [torch.tensor(res, device=sequences['sequences'].device) for res in results],
1653
+ batch_first=True,
1654
+ padding_value=self.tokenizer.pad_token_id
1655
+ )
1656
+ return sequences
1657
+
1658
+ @staticmethod
1659
+ def _retrieve_segment(
1660
+ seek_sequence,
1661
+ seek_outputs,
1662
+ time_offset,
1663
+ timestamp_begin,
1664
+ seek_num_frames,
1665
+ time_precision,
1666
+ input_stride,
1667
+ prev_idx,
1668
+ idx,
1669
+ return_token_timestamps,
1670
+ ):
1671
+ # find the predicted "end of segment" predictions of Whisper
1672
+ # "end of segment" predictions occur whenever Whisper predicts a timestamp token
1673
+ timestamp_tokens: torch.Tensor = seek_sequence.ge(timestamp_begin)
1674
+ single_timestamp_ending = timestamp_tokens[-2:].tolist() == [False, True]
1675
+ timestamp_segment_indices = torch.where(timestamp_tokens[:-1] & timestamp_tokens[1:])[0]
1676
+ timestamp_segment_indices.add_(1)
1677
+ token_timestamps = seek_outputs[idx]["token_timestamps"] if return_token_timestamps else []
1678
+
1679
+ # If whisper predicted a "end of segment" via a timestep token, let's go ever each
1680
+ # "end of segment" prediction and slice the decoding into segments accordingly
1681
+ if len(timestamp_segment_indices) > 0:
1682
+ # if the output contains two consecutive timestamp tokens
1683
+ slices = timestamp_segment_indices.tolist()
1684
+ segments = []
1685
+ if single_timestamp_ending:
1686
+ slices.append(len(seek_sequence))
1687
+
1688
+ last_slice = 0
1689
+ # Add each segment to list of all segments
1690
+ for current_slice in slices:
1691
+ sliced_tokens = seek_sequence[last_slice:current_slice]
1692
+ start_timestamp_pos = sliced_tokens[0].item() - timestamp_begin
1693
+ end_timestamp_pos = sliced_tokens[-1].item() - timestamp_begin
1694
+ segments.append(
1695
+ {
1696
+ "start": time_offset[prev_idx] + start_timestamp_pos * time_precision,
1697
+ "end": time_offset[prev_idx] + end_timestamp_pos * time_precision,
1698
+ "tokens": sliced_tokens,
1699
+ "result": seek_outputs[idx],
1700
+ }
1701
+ )
1702
+ if return_token_timestamps:
1703
+ segments[-1]["token_timestamps"] = (
1704
+ token_timestamps[last_slice:current_slice] + time_offset[prev_idx]
1705
+ )
1706
+ last_slice = current_slice
1707
+
1708
+ if single_timestamp_ending:
1709
+ # single timestamp at the end means no speech after the last timestamp.
1710
+ segment_offset = seek_num_frames[prev_idx]
1711
+ else:
1712
+ # otherwise, ignore the unfinished segment and seek to the last timestamp
1713
+ # here we throw away all predictions after the last predicted "end of segment"
1714
+ # since we are cutting right in the middle of an audio
1715
+ last_timestamp_pos = seek_sequence[last_slice - 1].item() - timestamp_begin
1716
+ segment_offset = last_timestamp_pos * input_stride
1717
+ else:
1718
+ # If whisper does not predict any "end of segment" token, then
1719
+ # the whole decoding is considered a segment and we add it to the list of segments
1720
+ timestamps = seek_sequence[timestamp_tokens.nonzero().flatten()]
1721
+ start_timestamp_pos = 0.0
1722
+ last_timestamp_pos = seek_num_frames[prev_idx] // 2
1723
+ skip = False
1724
+ segment_offset = seek_num_frames[prev_idx]
1725
+
1726
+ if timestamps.numel() > 1:
1727
+ start_timestamp_pos = timestamps[-2].item() - timestamp_begin
1728
+ last_timestamp_pos = timestamps[-1].item() - timestamp_begin
1729
+ elif timestamps.numel() == 1:
1730
+ # no consecutive timestamps but it has a timestamp; use the last one.
1731
+ start_timestamp_pos = timestamps[-1].item() - timestamp_begin
1732
+ if start_timestamp_pos > 200:
1733
+ # segment does not fit into decoding window, so we need to rollback
1734
+ segment_offset = start_timestamp_pos * input_stride - 100 # timestamp might be inaccurate
1735
+ skip = True
1736
+ else:
1737
+ # empty sequence, or sequence w/o timestamps
1738
+ skip = True
1739
+
1740
+ if skip:
1741
+ segments = []
1742
+ else:
1743
+ segments = [
1744
+ {
1745
+ "start": time_offset[prev_idx] + start_timestamp_pos * time_precision,
1746
+ "end": time_offset[prev_idx] + last_timestamp_pos * time_precision,
1747
+ "tokens": seek_sequence,
1748
+ "result": seek_outputs[idx],
1749
+ }
1750
+ ]
1751
+ if return_token_timestamps:
1752
+ segments[-1]["token_timestamps"] = token_timestamps + time_offset[prev_idx]
1753
+ segment_offset = seek_num_frames[prev_idx]
1754
+
1755
+ if segment_offset <= 0:
1756
+ msg = f"Timestamps: {timestamps}, Segments: {segments}"
1757
+ raise ValueError(f"Segment offset: {segment_offset} <= 0. This should not happen!\n{msg}")
1758
+
1759
+ return segments, segment_offset
1760
+
1761
+ def _postprocess_outputs(self, seek_outputs, decoder_input_ids, return_token_timestamps, generation_config):
1762
+ # remove all previously passed decoder input ids
1763
+ if isinstance(seek_outputs, torch.Tensor):
1764
+ seek_outputs = seek_outputs[:, decoder_input_ids.shape[-1]:]
1765
+ seek_outputs = torch.hstack((
1766
+ seek_outputs,
1767
+ torch.full((seek_outputs.shape[0], 1),
1768
+ fill_value=generation_config.pad_token_id,
1769
+ dtype=seek_outputs.dtype,
1770
+ device=seek_outputs.device
1771
+ )
1772
+ ))
1773
+ # first_eos = (seek_outputs == generation_config.eos_token_id).int().argmax(dim=1)
1774
+ # biggest_timestamp = generation_config.no_timestamps_token_id + 1 + 30 * 50
1775
+
1776
+ # empty_transcriptions = first_eos == 0
1777
+ # seek_outputs[empty_transcriptions, 0] = generation_config.no_timestamps_token_id + 1 # 0.00 timestamp
1778
+ # seek_outputs[empty_transcriptions, 1] = biggest_timestamp # 30.00 timestamp
1779
+ # seek_outputs[empty_transcriptions, 2] = generation_config.eos_token_id # 30.00 timestamp
1780
+
1781
+ return seek_outputs, seek_outputs
1782
+
1783
+ if return_token_timestamps and hasattr(generation_config, "alignment_heads"):
1784
+ num_frames = getattr(generation_config, "num_frames", None)
1785
+ seek_outputs["token_timestamps"] = self._extract_token_timestamps(
1786
+ seek_outputs, generation_config.alignment_heads, num_frames=num_frames
1787
+ )
1788
+ seek_outputs["token_timestamps"] = seek_outputs["token_timestamps"][:, decoder_input_ids.shape[-1]:]
1789
+
1790
+ seek_outputs["sequences"] = seek_outputs["sequences"][:, decoder_input_ids.shape[-1]:]
1791
+
1792
+ def split_by_batch_index(values, key, batch_idx):
1793
+ if key == "scores":
1794
+ return [v[batch_idx].cpu() for v in values]
1795
+ elif key == "past_key_values":
1796
+ # we don't save `past_key_values` as this is too costly
1797
+ return None
1798
+ elif isinstance(values[batch_idx], tuple) and torch.is_tensor(values[batch_idx][0]):
1799
+ return tuple(tuple(w[batch_idx][None].cpu() for w in v) for v in values)
1800
+ return values[batch_idx].cpu()
1801
+
1802
+ sequence_tokens = seek_outputs["sequences"]
1803
+ seek_outputs = [
1804
+ {k: split_by_batch_index(v, k, i) for k, v in seek_outputs.items()}
1805
+ for i in range(sequence_tokens.shape[0])
1806
+ ]
1807
+
1808
+ return sequence_tokens, seek_outputs
generation_config.json ADDED
@@ -0,0 +1,12 @@
1
+ {
2
+ "_from_model_config": true,
3
+ "begin_suppress_tokens": [
4
+ 220,
5
+ 50256
6
+ ],
7
+ "bos_token_id": 50257,
8
+ "decoder_start_token_id": 50258,
9
+ "eos_token_id": 50257,
10
+ "pad_token_id": 50257,
11
+ "transformers_version": "4.42.0"
12
+ }
layers.py ADDED
@@ -0,0 +1,99 @@
1
+ import torch
2
+ from torch import nn
3
+ import torch.nn.functional as F
4
+
5
+
6
+ class CustomLinear(nn.Linear):
7
+ def __init__(self, *args, init_eye_val=0.0, is_diagonal=False, **kwargs):
8
+ super().__init__(*args, **kwargs)
9
+ self.init_eye_val = init_eye_val
10
+
11
+ class CustomLinearInitialized(nn.Linear):
12
+ def __init__(self, in_features: int, out_features: int, bias: bool = True,
13
+ device=None, dtype=None, init_fun=None) -> None:
14
+ super().__init__(in_features, out_features, bias, device, dtype)
15
+ self.init_fun = init_fun
16
+
17
+ class CustomDiagonalLinear(nn.Module):
18
+ def __init__(self, d_model, bias=True, init_eye_val=0.0):
19
+ super().__init__()
20
+ self.init_eye_val = init_eye_val
21
+ self.weight = nn.Parameter(torch.full((d_model,), init_eye_val))
22
+ self.bias = nn.Parameter(torch.zeros(d_model)) if bias else None
23
+
24
+ def forward(self, input):
25
+ out = input * self.weight
26
+ if self.bias is not None:
27
+ out += self.bias
28
+ return out
29
+
30
+ class Gate(nn.Module):
31
+ def __init__(self, items, init_val=0.0):
32
+ super().__init__()
33
+ self.init_val = init_val
34
+ self.gate = nn.Parameter(torch.full((items,), init_val))
35
+
36
+ def forward(self, input, dim):
37
+ if input.ndim != 4:
38
+ raise ValueError('input must be a 4D tensor')
39
+ shape = [1] * 4
40
+ shape[dim] = -1
41
+ return input * self.gate.view(*shape)
42
+
43
+
44
+ class AttentivePoolingClassifier(nn.Module):
45
+ def __init__(self, d_model, num_classes, hidden_dim=128):
46
+ """
47
+ Attentive Pooling Classifier
48
+
49
+ Args:
50
+ d_model: Input feature dimension (D)
51
+ num_classes: Number of output classes (V)
52
+ hidden_dim: Hidden dimension for attention mechanism
53
+ """
54
+ super(AttentivePoolingClassifier, self).__init__()
55
+
56
+ # Attention mechanism for pooling [B,T,D] -> [B,D]
57
+ self.attention_projection = nn.Linear(d_model, hidden_dim)
58
+ self.attention_weights = nn.Linear(hidden_dim, 1)
59
+
60
+ # Classifier [B,D] -> [B,V]
61
+ self.classifier = nn.Sequential(
62
+ nn.Linear(d_model, hidden_dim),
63
+ nn.ReLU(),
64
+ nn.Dropout(0.1),
65
+ nn.Linear(hidden_dim, num_classes)
66
+ )
67
+
68
+ def forward(self, x, apply_stop_gradient=True):
69
+ """
70
+ Forward pass
71
+
72
+ Args:
73
+ x: Input tensor of shape [B, T, D]
74
+ apply_stop_gradient: Whether to apply stop gradient
75
+
76
+ Returns:
77
+ logits: Output logits [B, V]
78
+ attention_weights: Attention weights [B, T]
79
+ pooled_features: Pooled features [B, D]
80
+ """
81
+ # Apply stop gradient if specified
82
+ if apply_stop_gradient:
83
+ x = x.detach()
84
+
85
+ # Compute attention weights
86
+ # x: [B, T, D] -> [B, T, hidden_dim]
87
+ att_proj = torch.tanh(self.attention_projection(x))
88
+
89
+ # att_proj: [B, T, hidden_dim] -> [B, T, 1] -> [B, T]
90
+ attention_scores = self.attention_weights(att_proj).squeeze(-1)
91
+ attention_weights = F.softmax(attention_scores, dim=-1)
92
+
93
+ # Apply attentive pooling: [B, T, D] * [B, T, 1] -> [B, D]
94
+ pooled_features = torch.sum(x * attention_weights.unsqueeze(-1), dim=1)
95
+
96
+ # Classification
97
+ logits = self.classifier(pooled_features)
98
+
99
+ return logits
model.safetensors ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:67601a48c8342a5e8aa5e4542892906703d203fd8ce8fb5009860b72dffe4adc
3
+ size 4672829976
modeling_dicow.py ADDED
@@ -0,0 +1,450 @@
1
+ from typing import Optional, Tuple, Union
2
+
3
+ import torch
4
+ import torch.nn as nn
5
+ from torch.nn import CrossEntropyLoss
6
+ import torch.utils.checkpoint
8
+ from transformers.modeling_outputs import Seq2SeqLMOutput
9
+ from transformers.models.speech_encoder_decoder.modeling_speech_encoder_decoder import (
10
+ shift_tokens_right,
11
+ )
12
+ from transformers.models.whisper.modeling_whisper import (
13
+ WhisperEncoder,
14
+ )
15
+ from transformers.models.whisper.modeling_whisper import (
16
+ WhisperForConditionalGeneration,
17
+ shift_tokens_right,
18
+ WhisperModel,
19
+ )
20
+ from transformers.models.whisper.modeling_whisper import sinusoids
21
+ from transformers.utils import logging
22
+
23
+ from .config import Seq2SeqLMOutputLosses, Seq2SeqModelOutputLogit, DiCoWConfig
24
+ from .encoder import DiCoWEncoder
25
+ from .FDDT import FDDT
26
+ from .layers import CustomLinear, CustomDiagonalLinear, Gate, AttentivePoolingClassifier, CustomLinearInitialized
27
+ from .generation import DiCoWGenerationMixin
28
+ from .contrastive_loss import ContrastiveLoss
29
+ import wandb
30
+
31
+ logging.set_verbosity_debug()
32
+ logger = logging.get_logger("transformers")
33
+
34
+
35
+ class DiCoW(WhisperModel):
36
+ def __init__(self, config: DiCoWConfig):
37
+ super().__init__(config)
38
+ self.encoder = DiCoWEncoder(config)
39
+
40
+ def forward(
41
+ self,
42
+ input_features: Optional[torch.FloatTensor] = None,
43
+ attention_mask: Optional[torch.LongTensor] = None,
44
+ decoder_input_ids: Optional[torch.LongTensor] = None,
45
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
46
+ head_mask: Optional[torch.Tensor] = None,
47
+ decoder_head_mask: Optional[torch.Tensor] = None,
48
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
49
+ encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
50
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
51
+ decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None,
52
+ decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None,
53
+ use_cache: Optional[bool] = None,
54
+ output_attentions: Optional[bool] = None,
55
+ output_hidden_states: Optional[bool] = None,
56
+ return_dict: Optional[bool] = None,
57
+ stno_mask: Optional[torch.FloatTensor] = None,
58
+ per_group_sizes: Optional[torch.LongTensor] = None,
59
+ ) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutputLosses]:
60
+ r"""
61
+ Returns:
62
+
63
+ Example:
64
+ ```python
65
+ >>> import torch
66
+ >>> from transformers import AutoFeatureExtractor, WhisperModel
67
+ >>> from datasets import load_dataset
68
+
69
+ >>> model = WhisperModel.from_pretrained("openai/whisper-base")
70
+ >>> feature_extractor = AutoFeatureExtractor.from_pretrained("openai/whisper-base")
71
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
72
+ >>> inputs = feature_extractor(ds[0]["audio"]["array"], return_tensors="pt")
73
+ >>> input_features = inputs.input_features
74
+ >>> decoder_input_ids = torch.tensor([[1, 1]]) * model.config.decoder_start_token_id
75
+ >>> last_hidden_state = model(input_features, decoder_input_ids=decoder_input_ids).last_hidden_state
76
+ >>> list(last_hidden_state.shape)
77
+ [1, 2, 512]
78
+ ```"""
79
+ output_attentions = output_attentions if output_attentions is not None else self.config.output_attentions
80
+ output_hidden_states = (
81
+ output_hidden_states if output_hidden_states is not None else self.config.output_hidden_states
82
+ )
83
+ use_cache = use_cache if use_cache is not None else self.config.use_cache
84
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
85
+
86
+ if encoder_outputs is None:
87
+ input_features = self._mask_input_features(input_features, attention_mask=attention_mask)
88
+
89
+ encoder_outputs = self.encoder(
90
+ input_features,
91
+ output_attentions=output_attentions,
92
+ output_hidden_states=True,
93
+ head_mask=head_mask,
94
+ return_dict=return_dict,
95
+ stno_mask=stno_mask,
96
+ per_group_sizes=per_group_sizes
97
+ )
98
+ # If the user passed a tuple for encoder_outputs, we wrap it in a BaseModelOutput when return_dict=True
99
+ # elif return_dict and not isinstance(encoder_outputs, BaseModelOutput):
100
+ # raise ValueError("encoder_outputs should be of type BaseModelOutput when return_dict=True.")
101
+
102
+ # decoder outputs consists of (dec_features, past_key_value, dec_hidden, dec_attn)
103
+ decoder_outputs = self.decoder(
104
+ input_ids=decoder_input_ids,
105
+ attention_mask=decoder_attention_mask,
106
+ encoder_hidden_states=encoder_outputs.hidden_states[-1],
107
+ head_mask=decoder_head_mask,
108
+ cross_attn_head_mask=cross_attn_head_mask,
109
+ past_key_values=past_key_values,
110
+ inputs_embeds=decoder_inputs_embeds,
111
+ position_ids=decoder_position_ids,
112
+ use_cache=use_cache,
113
+ output_attentions=output_attentions,
114
+ output_hidden_states=output_hidden_states,
115
+ return_dict=return_dict,
116
+ )
117
+
118
+ if not return_dict:
119
+ return decoder_outputs + encoder_outputs
120
+
121
+ return Seq2SeqModelOutputLogit(
122
+ last_hidden_state=decoder_outputs.last_hidden_state,
123
+ past_key_values=decoder_outputs.past_key_values,
124
+ decoder_hidden_states=decoder_outputs.hidden_states,
125
+ decoder_attentions=decoder_outputs.attentions,
126
+ cross_attentions=decoder_outputs.cross_attentions,
127
+ encoder_last_hidden_state=encoder_outputs.hidden_states[-1],
128
+ encoder_hidden_states=encoder_outputs.hidden_states,
129
+ encoder_attentions=encoder_outputs.attentions,
130
+ encoder_logits=encoder_outputs.logits,
131
+ )
132
+
133
+
134
+ class DiCoWForConditionalGeneration(DiCoWGenerationMixin, WhisperForConditionalGeneration):
135
+ config_class = DiCoWConfig
136
+
137
+ def __init__(self, config: DiCoWConfig):
138
+ super().__init__(config)
139
+ self.model = DiCoW(config)
140
+ self.encoder_logits = None
141
+ self.tokenizer = None
142
+ self.vad_seek_callback = None
143
+ self.stno_mask = None
144
+ self.stno_mask_seek = None
145
+ self.use_enrollment_network = config.use_enrollment_network
146
+ if self.config.contrastive_loss_weight > 0.0:
147
+ self.contrastive_loss_fct = ContrastiveLoss(distance_metric="cosine")
148
+ self.sid_classifier = nn.Linear(config.d_model, config.num_speakers)
149
+ # self.sid_classifier = AttentivePoolingClassifier(config.d_model, config.num_speakers, config.d_model // 4)
150
+ self.embedding_projector = nn.Linear(config.d_model, config.d_model)
151
+
152
+ # We need this setter as we can't pass a function/method as a config argument.
153
+ # JSON serialization fails at that point.
154
+ def set_vad_seek_callback(self, vad_seek_callback):
155
+ self.vad_seek_callback = vad_seek_callback
156
+
157
+ def set_tokenizer(self, tokenizer):
158
+ self.tokenizer = tokenizer
159
+
160
+ def _init_weights(self, module):
161
+ std = self.config.init_std
162
+ fddt_init = self.config.fddt_init
163
+ if isinstance(module, CustomLinearInitialized):
164
+ module.init_fun(module)
165
+ elif isinstance(module, CustomLinear):
166
+ with torch.no_grad():
167
+ if fddt_init == 'random':
168
+ module.weight.data.normal_(mean=0.0, std=std)
169
+ if module.bias is not None:
170
+ module.bias.data.normal_(mean=0.0, std=std)
171
+ elif fddt_init == 'non-disturbing':
172
+ module.weight.data = torch.eye(*module.weight.shape).data
173
+ if module.bias is not None:
174
+ module.bias.data.zero_()
175
+ elif fddt_init == 'disparagement':
176
+ eye = torch.eye(*module.weight.shape)
177
+ eye *= module.init_eye_val
178
+ module.weight.data = eye.data
179
+ if module.bias is not None:
180
+ module.bias.data.zero_()
181
+ elif isinstance(module, CustomDiagonalLinear):
182
+ with torch.no_grad():
183
+ if fddt_init == 'random':
184
+ module.weight.data.normal_(mean=0.0, std=std)
185
+ if module.bias is not None:
186
+ module.bias.data.normal_(mean=0.0, std=std)
187
+ elif fddt_init == 'non-disturbing':
188
+ module.weight.data = torch.ones_like(module.weight.data).data
189
+ if module.bias is not None:
190
+ module.bias.data.zero_()
191
+ elif fddt_init == 'disparagement':
192
+ module.weight.data = module.init_eye_val * torch.ones_like(module.weight.data).data
193
+ if module.bias is not None:
194
+ module.bias.data.zero_()
195
+ elif isinstance(module, FDDT):
196
+ if module.bias_only:
197
+ if fddt_init == 'random':
198
+ module.target_linear.data.normal_(mean=0.0, std=std)
199
+ module.non_target_linear.data.normal_(mean=0.0, std=std)
200
+ module.overlap_linear.data.normal_(mean=0.0, std=std)
201
+ module.silence_linear.data.normal_(mean=0.0, std=std)
202
+ module.scb.data.normal_(mean=0.0, std=std)
203
+ else:
204
+ module.target_linear.data.zero_()
205
+ module.non_target_linear.data.zero_()
206
+ module.overlap_linear.data.zero_()
207
+ module.silence_linear.data.zero_()
208
+ module.scb.data.zero_()
209
+ elif isinstance(module, (nn.Linear, nn.Conv1d)):
210
+ module.weight.data.normal_(mean=0.0, std=std)
211
+ if module.bias is not None:
212
+ module.bias.data.zero_()
213
+ elif isinstance(module, nn.Embedding):
214
+ module.weight.data.normal_(mean=0.0, std=std)
215
+ if module.padding_idx is not None:
216
+ module.weight.data[module.padding_idx].zero_()
217
+ elif isinstance(module, WhisperEncoder):
218
+ with torch.no_grad():
219
+ embed_positions = module.embed_positions.weight
220
+ embed_positions.copy_(sinusoids(*embed_positions.shape))
221
+ elif isinstance(module, nn.LayerNorm):
222
+ module.reset_parameters()
223
+ elif isinstance(module, nn.MultiheadAttention):
224
+ module._reset_parameters()
225
+ elif isinstance(module, nn.ConvTranspose1d):
226
+ module.reset_parameters()
227
+ elif isinstance(module, Gate):
228
+ module.gate.data = module.init_val * torch.ones_like(module.gate.data).data
229
+
230
+ def forward(
231
+ self,
232
+ input_features: Optional[torch.FloatTensor] = None,
233
+ stno_mask: Optional[torch.FloatTensor] = None,
234
+ per_group_sizes: Optional[torch.LongTensor] = None,
235
+ attention_mask_enc: Optional[torch.LongTensor] = None,
236
+ attention_mask: Optional[torch.LongTensor] = None,
237
+ decoder_input_ids: Optional[torch.LongTensor] = None,
238
+ decoder_attention_mask: Optional[torch.LongTensor] = None,
239
+ head_mask: Optional[torch.Tensor] = None,
240
+ decoder_head_mask: Optional[torch.Tensor] = None,
241
+ cross_attn_head_mask: Optional[torch.Tensor] = None,
242
+ encoder_outputs: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
243
+ past_key_values: Optional[Tuple[Tuple[torch.FloatTensor]]] = None,
244
+ decoder_inputs_embeds: Optional[Tuple[torch.FloatTensor]] = None,
245
+ decoder_position_ids: Optional[Tuple[torch.LongTensor]] = None,
246
+ labels: Optional[torch.LongTensor] = None,
247
+ upp_labels: Optional[torch.LongTensor] = None,
248
+ use_cache: Optional[bool] = None,
249
+ output_attentions: Optional[bool] = None,
250
+ output_hidden_states: Optional[bool] = None,
251
+ return_dict: Optional[bool] = None,
252
+ is_valid: Optional[bool] = None,
253
+ spk_id: Optional[torch.LongTensor] = None,
254
+ ) -> Union[Tuple[torch.Tensor], Seq2SeqLMOutput]:
255
+ r"""
256
+ labels (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
257
+ Labels for computing the language modeling loss. Indices should either be in `[0, ..., config.vocab_size]`
258
+ or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored (masked), the loss is
259
+ only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
260
+
261
+ Returns:
262
+
263
+ Example:
264
+
265
+ ```python
266
+ >>> import torch
267
+ >>> from transformers import AutoProcessor, WhisperForConditionalGeneration
268
+ >>> from datasets import load_dataset
269
+
270
+ >>> processor = AutoProcessor.from_pretrained("openai/whisper-tiny.en")
271
+ >>> model = WhisperForConditionalGeneration.from_pretrained("openai/whisper-tiny.en")
272
+
273
+ >>> ds = load_dataset("hf-internal-testing/librispeech_asr_dummy", "clean", split="validation")
274
+
275
+ >>> inputs = processor(ds[0]["audio"]["array"], return_tensors="pt")
276
+ >>> input_features = inputs.input_features
277
+
278
+ >>> generated_ids = model.generate(inputs=input_features)
279
+
280
+ >>> transcription = processor.batch_decode(generated_ids, skip_special_tokens=True)[0]
281
+ >>> transcription
282
+ ' Mr. Quilter is the apostle of the middle classes, and we are glad to welcome his gospel.'
283
+ ```"""
284
+ stno_mask_orig = stno_mask
285
+ enrollments_processed = None
286
+ enroll_stno_mask_reshape = None
287
+ enrollments_enc = None
288
+ if self.training and self.use_enrollment_network:
289
+ attention_mask = attention_mask[::2, ...]
290
+
291
+ enroll_input = input_features[1::2, ...]
292
+ input_features = input_features[::2, ...]
293
+
294
+ is_valid = is_valid[::2, ...]
295
+ enroll_stno_mask = stno_mask[1::2, ...]
296
+ stno_mask = stno_mask[::2, ...]
297
+
298
+ labels = labels[::2, ...]
299
+ upp_labels = upp_labels[::2, ...]
300
+ enrollments_enc = self.model.encoder.encode_enrollment(
301
+ input_features=enroll_input,
302
+ num_layers_to_apply=self.config.spk_embedding_extraction_layer,
303
+ head_mask=head_mask,
304
+ stno_mask=enroll_stno_mask,
305
+ )
306
+ enroll_stno_mask_reshape = (
307
+ ((enroll_stno_mask[:, 1, :] + enroll_stno_mask[:, 3, :]) > 0.5)
308
+ .view(-1, self.config.mt_num_speakers, enroll_stno_mask.shape[2])
309
+ .flatten(1, 2)
310
+ )
311
+ enrollments_processed = enrollments_enc.view(-1, self.config.mt_num_speakers, enrollments_enc.shape[1],
312
+ enrollments_enc.shape[2]).flatten(1, 2)
313
+
314
+ return_dict = return_dict if return_dict is not None else self.config.use_return_dict
315
+
316
+ if labels is not None:
317
+ if decoder_input_ids is None and decoder_inputs_embeds is None:
318
+ decoder_input_ids = shift_tokens_right(
319
+ labels, self.config.pad_token_id, self.config.decoder_start_token_id
320
+ )
321
+
322
+ outputs = self.model(
323
+ input_features,
324
+ attention_mask=attention_mask,
325
+ decoder_input_ids=decoder_input_ids,
326
+ encoder_outputs=encoder_outputs,
327
+ decoder_attention_mask=decoder_attention_mask,
328
+ head_mask=head_mask,
329
+ decoder_head_mask=decoder_head_mask,
330
+ cross_attn_head_mask=cross_attn_head_mask,
331
+ past_key_values=past_key_values,
332
+ decoder_inputs_embeds=decoder_inputs_embeds,
333
+ decoder_position_ids=decoder_position_ids,
334
+ use_cache=use_cache,
335
+ output_attentions=output_attentions,
336
+ output_hidden_states=output_hidden_states,
337
+ return_dict=return_dict,
338
+ stno_mask=stno_mask,
339
+ per_group_sizes=per_group_sizes
340
+ )
341
+
342
+ dec_lm_logits = self.proj_out(outputs.last_hidden_state)
343
+ enc_lm_logits = outputs.encoder_logits
344
+
345
+ loss = None
346
+ ctc_loss = 0
347
+
348
+ # drop fake (padding) examples from labels and logits using the validity mask
349
+ if is_valid is not None:
350
+ if self.config.ctc_weight > 0.0:
351
+ enc_lm_logits = enc_lm_logits[is_valid]
352
+ dec_lm_logits = dec_lm_logits[is_valid]
353
+ labels = labels[is_valid]
354
+ upp_labels = upp_labels[is_valid]
355
+ if labels is not None and self.config.ctc_weight > 0.0:
356
+ enc_labels = labels.clone()
357
+ for token in self.tokenizer.prefix_tokens:
358
+ if (enc_labels[:, 0] == token).all():
359
+ enc_labels = enc_labels[:, 1:]
360
+ enc_labels[enc_labels == self.config.eos_token_id] = -100
361
+
362
+ ctc_loss = self.get_encoder().get_loss(enc_lm_logits, enc_labels)
363
+
364
+ if labels is not None:
365
+ loss_fct = CrossEntropyLoss(reduction='none')
366
+ # move labels to correct device to enable PP
367
+ labels = labels.to(dec_lm_logits.device)
368
+ dec_loss1 = loss_fct(dec_lm_logits.view(-1, self.config.vocab_size), labels.reshape(-1))
369
+ dec_loss2 = loss_fct(dec_lm_logits.view(-1, self.config.vocab_size), upp_labels.reshape(-1))
370
+ dec_loss = torch.hstack((dec_loss1[..., None], dec_loss2[..., None])).min(dim=-1).values.mean()
371
+ if wandb.run is not None:
372
+ wandb.log({"dec_loss": dec_loss})
373
+ wandb.log({"ctc_loss": ctc_loss})
374
+ loss = (1 - self.config.ctc_weight) * dec_loss + self.config.ctc_weight * ctc_loss
375
+
376
+ if hasattr(self, "contrastive_loss_fct"):
377
+ stno_per_spk_pair = stno_mask.view(-1, self.config.mt_num_speakers, stno_mask.shape[1],
378
+ stno_mask.shape[2])
379
+ anchors = ((stno_per_spk_pair[:, :, 1, :] + stno_per_spk_pair[:, :, 3, :]) > 0.5).flatten(1)
380
+ intermediate_states = (
381
+ outputs.encoder_hidden_states[self.config.spk_embedding_extraction_layer]
382
+ .view(
383
+ -1, self.config.mt_num_speakers, stno_mask.shape[2],
384
+ outputs.encoder_hidden_states[self.config.spk_embedding_extraction_layer].shape[-1],
385
+ )
386
+ .flatten(1, 2)
387
+ )
388
+ valid_pairs = is_valid.view((-1, self.config.mt_num_speakers)).all(dim=-1)
389
+
390
+ contrastive_loss = self.contrastive_loss_fct(
391
+ self.embedding_projector(intermediate_states[valid_pairs]),
392
+ anchors[valid_pairs],
393
+ self.embedding_projector(enrollments_processed[valid_pairs]) if enrollments_processed is not None else None,
394
+ enroll_stno_mask_reshape[valid_pairs] if enroll_stno_mask_reshape is not None else None
395
+ )
396
+ if wandb.run is not None:
397
+ wandb.log({"contrastive_loss": contrastive_loss})
398
+ loss += self.config.contrastive_loss_weight * contrastive_loss
399
+
400
+ embeds = outputs.encoder_hidden_states[self.config.spk_embedding_extraction_layer]
401
+ all_embeds = torch.empty((embeds.shape[0] * 2, embeds.shape[1], embeds.shape[2]), dtype=embeds.dtype,
402
+ device=embeds.device)
403
+ all_embeds[::2] = embeds
404
+ all_embeds[1::2] = enrollments_enc
405
+ spk_logits = self.sid_classifier(self.embedding_projector(all_embeds))
406
+ spk_id_mask = (stno_mask_orig[:, 1] + stno_mask_orig[:, 3]) > 0.5
407
+ spk_loss_fun = CrossEntropyLoss(reduction='mean')
408
+ spk_labels = spk_id[:, None].repeat((1, spk_logits.shape[1]))[spk_id_mask]
409
+ spk_loss = spk_loss_fun(spk_logits[spk_id_mask], spk_labels)
410
+ if wandb.run is not None:
411
+ spk_id_acc = (torch.argmax(spk_logits[spk_id_mask], dim=-1) == spk_labels).sum() / len(spk_labels[spk_labels != -100])
412
+ wandb.log({"spk_loss": spk_loss, "spk_id_acc": spk_id_acc})
413
+ loss += spk_loss
414
+
415
+ if not return_dict:
416
+ output = (dec_lm_logits,) + outputs[1:]
417
+ return ((loss,) + output) if loss is not None else output
418
+
419
+ return Seq2SeqLMOutputLosses(
420
+ loss=loss,
421
+ logits=dec_lm_logits,
422
+ past_key_values=outputs.past_key_values,
423
+ decoder_hidden_states=outputs.decoder_hidden_states,
424
+ decoder_attentions=outputs.decoder_attentions,
425
+ cross_attentions=outputs.cross_attentions,
426
+ encoder_last_hidden_state=outputs.encoder_last_hidden_state,
427
+ encoder_hidden_states=outputs.encoder_hidden_states,
428
+ encoder_attentions=outputs.encoder_attentions,
429
+ encoder_logits=enc_lm_logits,
430
+ )
431
+
432
+ def _get_feat_extract_output_lengths(self, attention_mask: torch.Tensor) -> torch.Tensor:
433
+ return (self.model.encoder._get_feat_extract_output_lengths(attention_mask) / 4).ceil()
434
+
435
+ def freeze_except(self, prefixes_to_preheat):
436
+ for name, param in self.named_parameters():
437
+ param.requires_grad = False
438
+ for prefix in prefixes_to_preheat:
439
+ if name.startswith(prefix):
440
+ param.requires_grad = True
441
+
442
+ def suppress_interactions(self):
443
+ """This method suppress final projection in CoAttention blocks to let the original information flow through"""
444
+ for name, param in self.named_parameters():
445
+ if "interaction" in name and "cat_proj" in name:
446
+ with torch.no_grad():
447
+ if "bias" in name:
448
+ param[:] = 0.
449
+ else:
450
+ param[:] *= 0.001
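
For readers skimming the diff, the sketch below shows how `DiCoWForConditionalGeneration` is typically driven once the files above are on the Hub. This is an illustrative sketch only, not part of the uploaded files: the repository id is a placeholder, loading via `AutoModelForSpeechSeq2Seq` assumes `config.json` registers the custom classes for `trust_remote_code`, and the STNO mask layout, (batch, 4, encoder_frames) with silence/target/non-target/overlap channels, is inferred from the mask indexing used throughout `modeling_dicow.py`.

```python
import torch
from transformers import AutoModelForSpeechSeq2Seq

repo_id = "<this-repo-id>"  # placeholder -- replace with the actual Hub id
# Assumes config.json provides an auto_map entry for the DiCoW classes.
model = AutoModelForSpeechSeq2Seq.from_pretrained(repo_id, trust_remote_code=True)
model.eval()

# One Whisper-style 30 s log-mel window: (batch, num_mel_bins, 3000 mel frames),
# which the encoder downsamples to 1500 frames.
input_features = torch.zeros(1, model.config.num_mel_bins, 3000)

# STNO mask for the target speaker: (batch, 4, encoder_frames); the channel order
# is assumed to be silence / target / non-target / overlap.
stno_mask = torch.zeros(1, 4, 1500)
stno_mask[:, 1] = 1.0  # treat the whole window as target-speaker speech

decoder_input_ids = torch.tensor([[model.config.decoder_start_token_id]])
with torch.no_grad():
    out = model(
        input_features=input_features,
        stno_mask=stno_mask,
        per_group_sizes=torch.tensor([1]),  # one speaker in a single group
        decoder_input_ids=decoder_input_ids,
    )
print(out.logits.shape)  # expected: (1, 1, vocab_size)
```

Long-form decoding goes through `model.generate`, which `DiCoWGenerationMixin` (generation.py, not shown here) overrides; judging by the `stno_mask`/`stno_mask_seek` attributes above, it threads the same per-speaker STNO information through chunked decoding.
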
utils.py ADDED
@@ -0,0 +1,96 @@
1
+ from typing import Optional
2
+
3
+ import torch
4
+ from transformers import WhisperTimeStampLogitsProcessor
5
+
6
+
7
+ def remove_fake_elements(inputs, per_group_sizes):
8
+ max_spks = per_group_sizes.max()
9
+ number_of_groups = per_group_sizes.shape[0]
10
+ outputs = []
11
+ inputs = inputs.view(number_of_groups, max_spks, *inputs.shape[1:])
12
+ for i, group_size in enumerate(per_group_sizes):
13
+ outputs.append(inputs[i, :group_size])
14
+ outputs = torch.cat(outputs, dim=0)
15
+ return outputs
16
+
17
+
18
+ class WhisperTimeStampLogitsProcessorCustom(WhisperTimeStampLogitsProcessor):
19
+ def __init__(
20
+ self, generate_config, begin_index: Optional[int] = None,
21
+ _detect_timestamp_from_logprob: Optional[bool] = None
22
+ ): # support for the kwargs
23
+ self.no_timestamps_token_id = generate_config.no_timestamps_token_id
24
+ self.timestamp_begin = generate_config.no_timestamps_token_id + 1
25
+ self.eos_token_id = generate_config.eos_token_id or generate_config.bos_token_id
26
+
27
+ # this variable is mostly just used for testing
28
+ self._detect_timestamp_from_logprob = (
29
+ _detect_timestamp_from_logprob
30
+ if _detect_timestamp_from_logprob is not None
31
+ else getattr(generate_config, "_detect_timestamp_from_logprob", True)
32
+ )
33
+
34
+ num_forced_ids = (
35
+ len(generate_config.forced_decoder_ids) if generate_config.forced_decoder_ids is not None else 0
36
+ )
37
+ self.begin_index = begin_index or (num_forced_ids + 1)
38
+
39
+ self.max_initial_timestamp_index = getattr(generate_config, "max_initial_timestamp_index", None)
40
+ self.min_initial_timestamp_index = getattr(generate_config, "min_initial_timestamp_index", None)
41
+ # TODO(Patrick): Make sure that official models have max_initial_timestamp_index set to 50
42
+ # self.max_initial_timestamp_index = 50
43
+
44
+ def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor) -> torch.FloatTensor:
45
+ # suppress <|notimestamps|> which is handled by without_timestamps
46
+ scores_processed = scores.clone()
47
+ scores_processed[:, self.no_timestamps_token_id] = -float("inf")
48
+
49
+ # timestamps have to appear in pairs, except directly before eos_token; mask logits accordingly
50
+ for k in range(input_ids.shape[0]):
51
+ sampled_tokens = input_ids[k, self.begin_index:]
52
+ seq = list(sampled_tokens.tolist())
53
+
54
+ last_was_timestamp = len(seq) >= 1 and seq[-1] >= self.timestamp_begin
55
+ penultimate_was_timestamp = len(seq) < 2 or seq[-2] >= self.timestamp_begin
56
+
57
+ if last_was_timestamp:
58
+ if penultimate_was_timestamp: # has to be non-timestamp
59
+ scores_processed[k, self.timestamp_begin:] = -float("inf")
60
+ else: # cannot be normal text tokens
61
+ scores_processed[k, : self.eos_token_id] = -float("inf")
62
+
63
+ timestamps = sampled_tokens[sampled_tokens.ge(self.timestamp_begin)]
64
+ if timestamps.numel() > 0:
65
+ # `timestamps` shouldn't decrease; forbid timestamp tokens smaller than the last
66
+ # The following lines of code are copied from: https://github.com/openai/whisper/pull/914/files#r1137085090
67
+ if last_was_timestamp and not penultimate_was_timestamp:
68
+ timestamp_last = timestamps[-1]
69
+ else:
70
+ # Avoid to emit <|0.00|> again
71
+ timestamp_last = timestamps[-1] + 1
72
+
73
+ scores_processed[k, self.timestamp_begin: timestamp_last] = -float("inf")
74
+
75
+ # apply the `max_initial_timestamp` option
76
+ if input_ids.shape[1] == self.begin_index:
77
+ eos_scores = scores_processed[:, self.eos_token_id].clone()
78
+ scores_processed[:, : self.timestamp_begin] = -float("inf")
79
+ scores_processed[:, self.eos_token_id] = eos_scores
80
+
81
+ if self.max_initial_timestamp_index is not None:
82
+ last_allowed = self.timestamp_begin + self.max_initial_timestamp_index
83
+ scores_processed[:, last_allowed + 1:] = -float("inf")
84
+ if self.min_initial_timestamp_index is not None:
85
+ first_allowed = self.timestamp_begin + self.min_initial_timestamp_index
86
+ scores_processed[:, self.timestamp_begin:first_allowed] = -float("inf")
87
+
88
+ # if sum of probability over timestamps is above any other token, sample timestamp
89
+ logprobs = torch.nn.functional.log_softmax(scores_processed.float(), dim=-1)
90
+ for k in range(input_ids.shape[0]):
91
+ timestamp_logprob = logprobs[k, self.timestamp_begin:].logsumexp(dim=-1)
92
+ max_text_token_logprob = logprobs[k, : self.timestamp_begin].max()
93
+ if timestamp_logprob > max_text_token_logprob and self._detect_timestamp_from_logprob:
94
+ scores_processed[k, : self.timestamp_begin] = -float("inf")
95
+
96
+ return scores_processed
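
To make the padding convention concrete, here is a tiny, self-contained usage sketch for `remove_fake_elements` (the import path is hypothetical; the function itself is defined above). Batches are laid out as `number_of_groups * max_spks` rows, and `per_group_sizes` states how many of those rows are real in each group:

```python
import torch

from utils import remove_fake_elements  # hypothetical path; adjust to how the package is imported

# Two groups padded to max_spks = 3 speakers each; the second group has only 2 real speakers.
per_group_sizes = torch.tensor([3, 2])
inputs = torch.arange(6, dtype=torch.float32).unsqueeze(-1)  # (2 * 3, 1)

trimmed = remove_fake_elements(inputs, per_group_sizes)
print(trimmed.squeeze(-1))  # tensor([0., 1., 2., 3., 4.]) -- padded row 5 is dropped
```

Compared with the stock `WhisperTimeStampLogitsProcessor`, the custom subclass above mainly adds the optional `min_initial_timestamp_index`, which forbids initial timestamp tokens earlier than a given index in the same way `max_initial_timestamp_index` caps late ones.
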