Completely overhauled the attention implementation. Uses the existing Gemma-3 attention implementation rather than a custom monkey-patched implementation. (#10)
Commit: efb4d2d4f654499b929a467d423403d3830628e7
Co-authored-by: Pulipaka Prem Sidharth <[email protected]>
- modeling_gemma3_punctuation.py +155 -190
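As a rough usage sketch (not part of this commit, and the checkpoint path below is a placeholder): once modeling_gemma3_punctuation.py is importable, the "cadence_punctuation" config/model pair it registers resolves through the Auto classes.

    # Hypothetical usage; importing the module runs the AutoConfig/AutoModel
    # registration performed at the bottom of the file.
    from transformers import AutoConfig, AutoModel
    import modeling_gemma3_punctuation  # noqa: F401

    checkpoint = "path/to/cadence-punctuation-checkpoint"  # placeholder path
    config = AutoConfig.from_pretrained(checkpoint)
    model = AutoModel.from_pretrained(checkpoint, config=config)
    model.eval()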
modeling_gemma3_punctuation.py  CHANGED
@@ -1,22 +1,21 @@
 """
-
+Change the attention of Gemma3 to be bidirectional.
 """
 
 import torch
 import torch.nn as nn
-from typing import Optional,
-import
+from typing import Optional, List, Dict, Any
+from functools import partial
 
 from transformers import PretrainedConfig, PreTrainedModel
-from transformers import Gemma3ForCausalLM
+from transformers import Gemma3ForCausalLM, Gemma3TextConfig
 from transformers.models.gemma3.modeling_gemma3 import (
     Gemma3Attention,
-
-
-
-    Cache,
-    FlashAttentionKwargs,
+    Gemma3DecoderLayer,
+    Gemma3TextModel,
+
 )
+
 from transformers.modeling_outputs import TokenClassifierOutput
 from transformers.utils import logging
 
@@ -27,7 +26,6 @@ class Gemma3PunctuationConfig(PretrainedConfig):
     """
     Configuration class for Gemma3 punctuation model.
     """
-
     model_type = "cadence_punctuation"
 
     def __init__(
@@ -43,171 +41,141 @@ class Gemma3PunctuationConfig(PretrainedConfig):
         super().__init__(**kwargs)
 
 
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-
-        attn_weights = attn_weights + mask_slice
-
-    attn_weights = nn.functional.softmax(attn_weights, dim=-1, dtype=torch.float32).to(query.dtype)
-    is_training = getattr(module, "training", False)
-    attn_weights = nn.functional.dropout(attn_weights, p=dropout, training=is_training)
-    attn_output = torch.matmul(attn_weights, value_states)
-    attn_output = attn_output.transpose(1, 2).contiguous()
-    return attn_output, attn_weights
-
-
-def modified_gemma3_attention_forward_non_causal(
-    self: Gemma3Attention,
-    hidden_states: torch.Tensor,
-    position_embeddings: torch.Tensor,
-    attention_mask: Optional[torch.Tensor],
-    past_key_value: Optional[Cache] = None,
-    cache_position: Optional[torch.LongTensor] = None,
-    **kwargs: Any,
-) -> tuple[torch.Tensor, Optional[torch.Tensor], Optional[Cache]]:
-    """Modified Gemma3 attention forward for non-causal behavior."""
-    bsz, q_len, _ = hidden_states.size()
-    input_shape = hidden_states.shape[:-1]
-    hidden_shape = (*input_shape, -1, self.head_dim)
-
-    query_states = self.q_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-    key_states = self.k_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-    value_states = self.v_proj(hidden_states).view(hidden_shape).transpose(1, 2)
-
-    query_states = self.q_norm(query_states)
-    key_states = self.k_norm(key_states)
-    cos, sin = position_embeddings
-    query_states, key_states = apply_rotary_pos_emb(query_states, key_states, cos, sin)
-
-    if past_key_value is not None:
-        cache_kwargs = {
-            "sin": sin,
-            "cos": cos,
-            "cache_position": cache_position,
-            "sliding_window": self.sliding_window
-        }
-        key_states, value_states = past_key_value.update(
-            key_states, value_states, self.layer_idx, cache_kwargs
+# ============ Token Classification Model Components ============
+
+class NonCausalGemma3Attention(Gemma3Attention):
+    """Gemma3Attention configured for non-causal token classification."""
+    def __init__(self, config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.is_causal = False
+        self.sliding_window = None
+
+
+class NonCausalGemma3DecoderLayer(Gemma3DecoderLayer):
+    """Decoder layer with non-causal attention for token classification."""
+    def __init__(self, config, layer_idx: int):
+        super().__init__(config, layer_idx)
+        self.self_attn = NonCausalGemma3Attention(config, layer_idx)
+
+
+class Gemma3TokenClassificationModel(Gemma3TextModel):
+    """Gemma3 base model configured for token classification."""
+    _no_split_modules = ["NonCausalGemma3DecoderLayer"]
+
+    def __init__(self, config):
+        super().__init__(config)
+        if getattr(config, 'use_non_causal_attention', True):
+            # Replace layers with non-causal versions
+            self.layers = nn.ModuleList(
+                [
+                    NonCausalGemma3DecoderLayer(config, layer_idx)
+                    for layer_idx in range(config.num_hidden_layers)
+                ]
+            )
+
+    def _update_causal_mask(
+        self,
+        attention_mask: torch.Tensor,
+        input_tensor: torch.Tensor,
+        cache_position: torch.Tensor,
+        past_key_values = None,
+        output_attentions: bool = False,
+    ):
+        """Override to create bidirectional attention mask (no causal masking)."""
+        if self.config._attn_implementation == "flash_attention_2":
+            if attention_mask is not None and 0.0 in attention_mask:
+                return attention_mask
+            return None
+
+        past_seen_tokens = (
+            past_key_values.get_seq_length() if past_key_values is not None else 0
         )
+        using_static_cache = isinstance(past_key_values, type(None)) is False and hasattr(past_key_values, 'get_max_length')
 
-
-
-
-
-
-
-
-
-
-
-
-
-    if effective_attn_implementation == "eager":
-        attention_interface = non_causal_eager_attention_forward_with_padding
-    elif effective_attn_implementation == "sdpa":
-        attention_interface = ALL_ATTENTION_FUNCTIONS.get("sdpa", non_causal_eager_attention_forward_with_padding)
-    elif effective_attn_implementation == "flash_attention_2":
-        attention_interface = ALL_ATTENTION_FUNCTIONS.get("flash_attention_2", non_causal_eager_attention_forward_with_padding)
-    else:
-        attention_interface = non_causal_eager_attention_forward_with_padding
-
-    final_attention_mask = padding_only_mask
-    if final_attention_mask is not None:
-        final_attention_mask = final_attention_mask.to(query_states.device)
-
-    # Prepare kwargs for attention interface
-    attn_specific_kwargs: Dict[str, Any] = {}
-    if attention_interface == non_causal_eager_attention_forward_with_padding:
-        attn_specific_kwargs = {
-            "dropout": 0.0,
-            "scaling": self.scaling,
-            "softcap": getattr(self, "softcap", None)
-        }
-    elif effective_attn_implementation == "sdpa":
-        attn_specific_kwargs = {"is_causal": use_causal_flag}
-        if output_attentions:
-            attn_specific_kwargs["output_attentions"] = True
-    elif effective_attn_implementation == "flash_attention_2":
-        attn_specific_kwargs = {
-            "causal": use_causal_flag,
-            "softcap": getattr(self, "softcap", None),
-            "dropout": 0.0
-        }
-        if output_attentions:
-            attn_specific_kwargs["output_attentions"] = True
-
-    attn_output, attn_weights = attention_interface(
-        self, query_states, key_states, value_states, final_attention_mask, **attn_specific_kwargs
-    )
+        dtype, device = input_tensor.dtype, input_tensor.device
+        min_dtype = torch.finfo(dtype).min
+        sequence_length = input_tensor.shape[1]
+
+        if using_static_cache:
+            target_length = past_key_values.get_max_length()
+        else:
+            target_length = (
+                attention_mask.shape[-1]
+                if isinstance(attention_mask, torch.Tensor)
+                else past_seen_tokens + sequence_length + 1
+            )
 
-
-
-
-
-
-
+        if attention_mask is not None and attention_mask.dim() == 4:
+            # in this case we assume that the mask comes already in inverted form and requires no inversion or slicing
+            if attention_mask.max() != 0:
+                raise ValueError(
+                    "Custom 4D attention mask should be passed in inverted form with max==0`"
+                )
+            causal_mask = attention_mask
+        else:
+            # KEY CHANGE: Start with zeros (attend to all) instead of min_dtype (mask all)
+            causal_mask = torch.zeros(
+                (sequence_length, target_length), dtype=dtype, device=device
+            )
+            # REMOVED: Causal masking lines that would make it lower triangular
+            # if sequence_length != 1:
+            #     causal_mask = torch.triu(causal_mask, diagonal=1)
+
+            causal_mask *= torch.arange(
+                target_length, device=device
+            ) > cache_position.reshape(-1, 1)
+            causal_mask = causal_mask[None, None, :, :].expand(
+                input_tensor.shape[0], 1, -1, -1
+            )
+
+            if attention_mask is not None:
+                causal_mask = causal_mask.clone()  # copy to contiguous memory for in-place edit
+                mask_length = attention_mask.shape[-1]
+                padding_mask = (
+                    causal_mask[:, :, :, :mask_length]
+                    + attention_mask[:, None, None, :]
+                )
+                padding_mask = padding_mask == 0
+                causal_mask[:, :, :, :mask_length] = causal_mask[
+                    :, :, :, :mask_length
+                ].masked_fill(padding_mask, min_dtype)
+
+        # Handle SDPA-specific optimizations if needed
+        if (
+            self.config._attn_implementation == "sdpa"
+            and attention_mask is not None
+            and attention_mask.device.type == "cuda"
+            and not output_attentions
+        ):
+            try:
+                from transformers.modeling_attn_mask_utils import AttentionMaskConverter
+                causal_mask = AttentionMaskConverter._unmask_unattended(
+                    causal_mask, min_dtype
+                )
+            except ImportError:
+                pass  # Fallback for older transformers versions
+
+        return causal_mask
 
 
 class Gemma3ForTokenClassification(Gemma3ForCausalLM):
     """
     Gemma3 model for token classification (punctuation prediction).
-
+    Uses class-based architecture without monkey patching.
     """
 
     config_class = Gemma3PunctuationConfig
 
     def __init__(self, config):
-        # Initialize
+        # Initialize with base Gemma3ForCausalLM structure
         super().__init__(config)
         self.num_labels = config.num_labels
 
+        # Replace the base model with token classification version
+        if getattr(config, 'use_non_causal_attention', True):
+            self.model = Gemma3TokenClassificationModel(config)
+
         # Replace the lm_head with classification head
-        # Don't create a separate classifier - just replace lm_head directly
         classifier_dropout_prob = getattr(config, 'classifier_dropout_prob', 0.0)
         self.lm_head = nn.Sequential(
             nn.Dropout(classifier_dropout_prob),
@@ -219,32 +187,6 @@ class Gemma3ForTokenClassification(Gemma3ForCausalLM):
 
         # Initialize weights for the new head
        self.post_init()
-
-        # Apply non-causal attention patching if requested
-        if getattr(config, 'use_non_causal_attention', True):
-            self._patch_attention_layers()
-
-    def _patch_attention_layers(self):
-        """Patch attention layers to use non-causal attention."""
-        count = 0
-
-        # The model structure is self.model.layers (inherited from Gemma3ForCausalLM)
-        if hasattr(self, 'model') and hasattr(self.model, 'layers'):
-            target_layers = self.model.layers
-        else:
-            logger.warning("Could not find model.layers for attention patching")
-            return
-
-        for idx, layer in enumerate(target_layers):
-            if hasattr(layer, 'self_attn') and isinstance(layer.self_attn, Gemma3Attention):
-                layer.self_attn.layer_idx = idx
-                layer.self_attn.forward = types.MethodType(
-                    modified_gemma3_attention_forward_non_causal,
-                    layer.self_attn
-                )
-                count += 1
-
-        logger.info(f"Patched {count} attention layers for non-causal attention")
 
     def forward(
         self,
@@ -260,12 +202,10 @@ class Gemma3ForTokenClassification(Gemma3ForCausalLM):
         return_dict: Optional[bool] = None,
         cache_position: Optional[torch.LongTensor] = None,
     ) -> TokenClassifierOutput:
-        """
-        Forward pass for token classification.
-        """
+        """Forward pass for token classification."""
         return_dict = return_dict if return_dict is not None else self.config.use_return_dict
 
-        #
+        # Get hidden states from the model
        outputs = self.model(
             input_ids=input_ids,
             attention_mask=attention_mask,
@@ -302,7 +242,32 @@ class Gemma3ForTokenClassification(Gemma3ForCausalLM):
         )
 
 
-#
+# ============ Model Registration ============
+
 from transformers import AutoConfig, AutoModel
+
+# Register the punctuation config and model
 AutoConfig.register("cadence_punctuation", Gemma3PunctuationConfig)
-AutoModel.register(Gemma3PunctuationConfig, Gemma3ForTokenClassification)
+AutoModel.register(Gemma3PunctuationConfig, Gemma3ForTokenClassification)
+
+
+# ============ Utility Functions ============
+
+
+def create_token_classification_model(config: Gemma3PunctuationConfig):
+    """Create a token classification model with non-causal attention."""
+    return Gemma3ForTokenClassification(config)
+
+
+def load_from_pretrained_with_config_detection(model_path: str, **kwargs):
+    """
+    Load model and auto-detect whether it's for token classification or bidirectional tasks
+    based on the config.
+    """
+    from transformers import AutoConfig
+
+    config = AutoConfig.from_pretrained(model_path)
+
+    if hasattr(config, 'model_type') and config.model_type == "cadence_punctuation":
+        # Token classification model
+        return Gemma3ForTokenClassification.from_pretrained(model_path, config=config, **kwargs)