BUT-FIT
/

SE_DiCoW

Automatic Speech Recognition

speaker-diarization

meeting-transcription

target-speaker-asr

Model card Files Files and versions

Lakoc committed on Sep 30

Commit

1f6e305

·

verified ·

1 Parent(s): 9151753

Update layers.py

Files changed (1) hide show

layers.py +0 -58

layers.py CHANGED Viewed

@@ -39,61 +39,3 @@ class Gate(nn.Module):
         shape = [1] * 4
         shape[dim] = -1
         return input * self.gate.view(*shape)
-class AttentivePoolingClassifier(nn.Module):
-    def __init__(self, d_model, num_classes, hidden_dim=128):
-        """
-        Attentive Pooling Classifier
-        Args:
-            d_model: Input feature dimension (D)
-            num_classes: Number of output classes (V)
-            hidden_dim: Hidden dimension for attention mechanism
-        """
-        super(AttentivePoolingClassifier, self).__init__()
-        # Attention mechanism for pooling [B,T,D] -> [B,D]
-        self.attention_projection = nn.Linear(d_model, hidden_dim)
-        self.attention_weights = nn.Linear(hidden_dim, 1)
-        # Classifier [B,D] -> [B,V]
-        self.classifier = nn.Sequential(
-            nn.Linear(d_model, hidden_dim),
-            nn.ReLU(),
-            nn.Dropout(0.1),
-            nn.Linear(hidden_dim, num_classes)
-        )
-    def forward(self, x, apply_stop_gradient=True):
-        """
-        Forward pass
-        Args:
-            x: Input tensor of shape [B, T, D]
-            apply_stop_gradient: Whether to apply stop gradient
-        Returns:
-            logits: Output logits [B, V]
-            attention_weights: Attention weights [B, T]
-            pooled_features: Pooled features [B, D]
-        """
-        # Apply stop gradient if specified
-        if apply_stop_gradient:
-            x = x.detach()
-        # Compute attention weights
-        # x: [B, T, D] -> [B, T, hidden_dim]
-        att_proj = torch.tanh(self.attention_projection(x))
-        # att_proj: [B, T, hidden_dim] -> [B, T, 1] -> [B, T]
-        attention_scores = self.attention_weights(att_proj).squeeze(-1)
-        attention_weights = F.softmax(attention_scores, dim=-1)
-        # Apply attentive pooling: [B, T, D] * [B, T, 1] -> [B, D]
-        pooled_features = torch.sum(x * attention_weights.unsqueeze(-1), dim=1)
-        # Classification
-        logits = self.classifier(pooled_features)
-        return logits

         shape = [1] * 4
         shape[dim] = -1
         return input * self.gate.view(*shape)