Update recast1B_llama/modeling_recast_llama.py
recast1B_llama/modeling_recast_llama.py
CHANGED
@@ -32,35 +32,33 @@ class MLPTemplateBank(nn.Module):
         self.coef_shape = (coef_rows, coef_columns)

         assert coef_columns is not None, "coef_columns must not be None"
+
         # Ensure divisibility for proper reshaping
+        assert (
+            self.hidden_size * self.intermediate_size
+        ) % coef_rows == 0, f"hidden_size * intermediate_size ({self.hidden_size * self.intermediate_size}) must be divisible by coef_rows ({coef_rows})"
+
         template_size = self.hidden_size * self.intermediate_size // coef_rows
+
+        self.up_templates = nn.Parameter(torch.randn(coef_columns, template_size))
+        self.gate_templates = nn.Parameter(torch.randn(coef_columns, template_size))
+
         # Better initialization
         nn.init.xavier_uniform_(self.up_templates)
         nn.init.xavier_uniform_(self.gate_templates)

     def forward(self, up_coeffs, gate_coeffs):
         # Compute chunked weights
+        up_chunks = torch.matmul(up_coeffs, self.up_templates)
         gate_chunks = torch.matmul(gate_coeffs, self.gate_templates)
+
         # Reshape to final weight matrices
         up_weights = up_chunks.reshape(self.intermediate_size, self.hidden_size)
         gate_weights = gate_chunks.reshape(self.intermediate_size, self.hidden_size)
+
         return up_weights, gate_weights

+
 class SharedLlamaMLP(nn.Module):
     def __init__(self, config, bank):
         super().__init__()
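The hunk above is the core of the banked MLP: each layer keeps a small (coef_rows, coef_columns) coefficient matrix, and the full up/gate projection is rebuilt on the fly as coefficients @ templates followed by a reshape, which is why hidden_size * intermediate_size must divide evenly by coef_rows. A minimal shape check of that reconstruction, using toy sizes rather than the real RECAST1B config:

import torch

# Toy sizes chosen only for illustration; the real config values differ.
hidden_size, intermediate_size = 64, 256
coef_rows, coef_columns = 16, 8

assert (hidden_size * intermediate_size) % coef_rows == 0
template_size = hidden_size * intermediate_size // coef_rows    # 1024

templates = torch.randn(coef_columns, template_size)    # shared bank parameter
coeffs = torch.randn(coef_rows, coef_columns)            # per-layer coefficients

chunks = coeffs @ templates                               # (coef_rows, template_size)
weight = chunks.reshape(intermediate_size, hidden_size)   # full projection matrix
print(weight.shape)                                       # torch.Size([256, 64])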
@@ -68,7 +66,9 @@ class SharedLlamaMLP(nn.Module):
         self.bank = bank
         self.hidden_size = config.hidden_size
         self.intermediate_size = config.intermediate_size
+        self.down_proj = nn.Linear(
+            config.intermediate_size, config.hidden_size, bias=False
+        )

         # Initialize coefficients with proper shapes
         self.up_coefficients = nn.Parameter(torch.randn(bank.coef_shape))
@@ -90,31 +90,37 @@ class SharedLlamaMLP(nn.Module):
     def forward(self, x):
         # Generate weights using template bank
         up_weights, gate_weights = self.bank(
+            self.up_coefficients, self.gate_coefficients  # Fixed order
         )
+
         # Apply SwiGLU: SiLU(gate * x) * up * x
+        hidden_states = self.act_fn(
+            F.linear(x, gate_weights, self.gate_bias)
+        ) * F.linear(x, up_weights, self.up_bias)
         output = self.down_proj(hidden_states)

         return output

+
 class AttTemplateBank(nn.Module):
     def __init__(self, config, coef_rows, coef_columns):
         super().__init__()
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = config.hidden_size // config.num_attention_heads
+        self.num_key_value_heads = getattr(
+            config, "num_key_value_heads", config.num_attention_heads
+        )
         self.kv_dim = self.num_key_value_heads * self.head_dim
         self.coef_shape = (coef_rows, coef_columns)

         # Ensure divisibility
+        assert (
+            self.hidden_size * self.hidden_size
+        ) % coef_rows == 0, "Q projection size must be divisible by coef_rows"
+        assert (
+            self.kv_dim * self.hidden_size
+        ) % coef_rows == 0, "K/V projection size must be divisible by coef_rows"

         # Create templates for Q, K, V
         self.q_templates = nn.Parameter(
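SharedLlamaMLP.forward applies the SwiGLU pattern with the two bank-generated matrices, using F.linear with an explicit weight instead of stored nn.Linear layers for up/gate. A self-contained sketch of that data flow, with toy sizes and None standing in for the gate_bias/up_bias parameters defined elsewhere in the file:

import torch
import torch.nn as nn
import torch.nn.functional as F

hidden_size, intermediate_size = 64, 256
x = torch.randn(2, 10, hidden_size)                     # (batch, seq, hidden)
gate_w = torch.randn(intermediate_size, hidden_size)    # as produced by the bank
up_w = torch.randn(intermediate_size, hidden_size)
down_proj = nn.Linear(intermediate_size, hidden_size, bias=False)

# SwiGLU: SiLU(x W_gate^T) * (x W_up^T), then the ordinary down projection
hidden = F.silu(F.linear(x, gate_w, None)) * F.linear(x, up_w, None)
out = down_proj(hidden)
print(out.shape)                                        # torch.Size([2, 10, 64])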
@@ -144,9 +150,15 @@ class AttTemplateBank(nn.Module):
         v_weights = v_chunks.reshape(self.kv_dim, self.hidden_size)

         return q_weights, k_weights, v_weights
+
+
 class SharedLlamaAttention(nn.Module):
+    def __init__(
+        self,
+        config,
+        layer_idx: Optional[int] = None,
+        bank: Optional[AttTemplateBank] = None,
+    ):
         super().__init__()
         self.config = config
         self.bank = bank
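The new constructor signature takes the bank as an argument rather than building one, so a single AttTemplateBank can be handed to every decoder layer while each layer keeps only its own coefficient tensors. A toy illustration of that sharing pattern (simplified stand-in classes, not the ones in this file):

import torch
import torch.nn as nn

class ToyBank(nn.Module):
    def __init__(self, coef_rows=8, coef_columns=4, template_size=128):
        super().__init__()
        self.coef_shape = (coef_rows, coef_columns)
        self.templates = nn.Parameter(torch.randn(coef_columns, template_size))

class ToyAttention(nn.Module):
    def __init__(self, bank):
        super().__init__()
        self.bank = bank  # shared module, not a copy
        self.q_coefficients = nn.Parameter(torch.randn(bank.coef_shape))

bank = ToyBank()
layers = [ToyAttention(bank) for _ in range(4)]

# Deduplicated parameter count: templates (4*128 = 512) stored once,
# plus 8*4 = 32 coefficients per layer -> 512 + 4*32 = 640.
unique = {p for layer in layers for p in layer.parameters()}
print(sum(p.numel() for p in unique))  # 640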
@@ -155,15 +167,21 @@ class SharedLlamaAttention(nn.Module):
         self.hidden_size = config.hidden_size
         self.num_heads = config.num_attention_heads
         self.head_dim = self.hidden_size // self.num_heads
+        self.num_key_value_heads = getattr(
+            config, "num_key_value_heads", config.num_attention_heads
+        )
         self.num_key_value_groups = self.num_heads // self.num_key_value_heads
         self.max_position_embeddings = config.max_position_embeddings
+        self.rope_theta = getattr(config, "rope_theta", 10000.0)
         self.is_causal = True
+
+        self.o_proj = nn.Linear(
+            self.hidden_size,
+            self.hidden_size,
+            bias=getattr(config, "attention_bias", False),
+        )
         self.rotary_emb = LlamaRotaryEmbedding(config=self.config)
+
         # Initialize coefficients with proper shapes
         self.q_coefficients = nn.Parameter(torch.randn(bank.coef_shape))
         self.k_coefficients = nn.Parameter(torch.randn(bank.coef_shape))
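The key/value head handling above is standard grouped-query attention bookkeeping: K/V projections output kv_dim = num_key_value_heads * head_dim features, and each K/V head is later repeated num_key_value_groups times to match the query heads. A quick worked example with illustrative head counts, not necessarily the checkpoint's:

hidden_size = 2048
num_attention_heads = 32
num_key_value_heads = 8           # illustrative; the model reads this from the config

head_dim = hidden_size // num_attention_heads              # 64
kv_dim = num_key_value_heads * head_dim                    # 512 -> K/V weights are (512, 2048)
num_key_value_groups = num_attention_heads // num_key_value_heads
print(head_dim, kv_dim, num_key_value_groups)              # 64 512 4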
@@ -187,50 +205,64 @@ class SharedLlamaAttention(nn.Module):
         **kwargs,
     ):
         bsz, q_len, _ = hidden_states.size()
+
         # Generate weights using template bank
+        q_weights, k_weights, v_weights = self.bank(
+            self.q_coefficients, self.k_coefficients, self.v_coefficients
         )

         # Apply projections
         query_states = F.linear(hidden_states, q_weights)
         key_states = F.linear(hidden_states, k_weights)
         value_states = F.linear(hidden_states, v_weights)
+
         # Reshape for multi-head attention
+        query_states = query_states.view(
+            bsz, q_len, self.num_heads, self.head_dim
+        ).transpose(1, 2)
+        key_states = key_states.view(
+            bsz, q_len, self.num_key_value_heads, self.head_dim
+        ).transpose(1, 2)
+        value_states = value_states.view(
+            bsz, q_len, self.num_key_value_heads, self.head_dim
+        ).transpose(1, 2)
+
         # Apply rotary embeddings
         if position_embeddings is None:
             cos, sin = self.rotary_emb(value_states, position_ids)
         else:
             cos, sin = position_embeddings
+        query_states, key_states = apply_rotary_pos_emb(
+            query_states, key_states, cos, sin
+        )
+
         # Handle past key values
         if past_key_value is not None:
             cache_kwargs = {"sin": sin, "cos": cos, "cache_position": cache_position}
+            key_states, value_states = past_key_value.update(
+                key_states, value_states, self.layer_idx, cache_kwargs
+            )
+
         # Repeat key/value for grouped query attention
         key_states = repeat_kv(key_states, self.num_key_value_groups)
         value_states = repeat_kv(value_states, self.num_key_value_groups)

         # Compute attention
+        attn_weights = torch.matmul(
+            query_states, key_states.transpose(2, 3)
+        ) / math.sqrt(self.head_dim)

         if attention_mask is not None:
             causal_mask = attention_mask[:, :, :, : key_states.shape[-2]]
             attn_weights = attn_weights + causal_mask

         # Apply softmax and dropout
+        attn_weights = nn.functional.softmax(
+            attn_weights, dim=-1, dtype=torch.float32
+        ).to(query_states.dtype)
+        attn_weights = nn.functional.dropout(
+            attn_weights, p=self.attention_dropout, training=self.training
+        )
         attn_output = torch.matmul(attn_weights, value_states)

         if attn_output.size() != (bsz, self.num_heads, q_len, self.head_dim):
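The forward pass above is the eager attention path: repeat K/V heads for grouped-query attention, scale QK^T by sqrt(head_dim), add the causal mask, and run softmax in float32 before casting back. A self-contained sketch of that math with toy shapes; the local repeat_kv mirrors the transformers helper of the same name:

import math
import torch
import torch.nn.functional as F

def repeat_kv(x: torch.Tensor, n_rep: int) -> torch.Tensor:
    # (batch, num_kv_heads, seq, head_dim) -> (batch, num_kv_heads * n_rep, seq, head_dim)
    b, kv_heads, seq, hd = x.shape
    if n_rep == 1:
        return x
    return x[:, :, None, :, :].expand(b, kv_heads, n_rep, seq, hd).reshape(b, kv_heads * n_rep, seq, hd)

bsz, q_len, num_heads, num_kv_heads, head_dim = 1, 5, 8, 2, 16
q = torch.randn(bsz, num_heads, q_len, head_dim)
k = torch.randn(bsz, num_kv_heads, q_len, head_dim)
v = torch.randn(bsz, num_kv_heads, q_len, head_dim)

k = repeat_kv(k, num_heads // num_kv_heads)
v = repeat_kv(v, num_heads // num_kv_heads)
attn = torch.matmul(q, k.transpose(2, 3)) / math.sqrt(head_dim)
attn = F.softmax(attn, dim=-1, dtype=torch.float32).to(q.dtype)
out = torch.matmul(attn, v)
print(out.shape)  # torch.Size([1, 8, 5, 16])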
@@ -242,10 +274,10 @@ class SharedLlamaAttention(nn.Module):
         attn_output = attn_output.transpose(1, 2).contiguous()
         attn_output = attn_output.reshape(bsz, q_len, -1)
         attn_output = self.o_proj(attn_output)
+
         if not output_attentions:
             attn_weights = None
+
         return attn_output, attn_weights, past_key_value

@@ -269,6 +301,8 @@ class RECAST1B_llamaModel(PreTrainedModel):
     config_class = RECAST1B_llama
     base_model_prefix = "llama"
     supports_gradient_checkpointing = True
+    _no_split_modules = ["LlamaDecoderLayer"]  # Add this
+    _skip_keys_device_placement = "past_key_values"  # Add this

     def __init__(self, config):
         super().__init__(config)
@@ -641,6 +675,8 @@ class RECAST1B_LlamaForCausalLM(PreTrainedModel, GenerationMixin):
     config_class = RECAST1B_llama
     base_model_prefix = "llama"
     supports_gradient_checkpointing = True
+    _no_split_modules = ["LlamaDecoderLayer"]  # Add this
+    _skip_keys_device_placement = "past_key_values"  # Add this

     def __init__(self, config):
         super().__init__(config)
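The two class attributes added to both model classes matter mainly for sharded loading: with device_map="auto", accelerate uses _no_split_modules to keep each LlamaDecoderLayer on a single device and _skip_keys_device_placement to leave past_key_values out of device placement. A usage sketch; the checkpoint id below is a placeholder, not a confirmed repo name:

from transformers import AutoModelForCausalLM

model = AutoModelForCausalLM.from_pretrained(
    "your-org/recast1b-llama",       # placeholder repo id
    device_map="auto",               # relies on _no_split_modules for sharding
    trust_remote_code=True,          # loads the custom modeling_recast_llama.py
)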