leejunhyeok committed on
Commit 91c40ce · verified · 1 Parent(s): f362538
Files changed (1)
  1. modeling_motif.py +0 -40
modeling_motif.py CHANGED
@@ -98,18 +98,15 @@ ALL_LAYERNORM_LAYERS.append(MotifRMSNorm)
 class MotifRotaryEmbeddingWithCache(nn.Module):
     """
     Rotary positional embedding module with caching for efficiency.
-
     Args:
         dim (int): Dimensionality of the embedding.
         max_position_embeddings (int): Maximum sequence length for caching. Default is 2048.
         base (int): Base for computing inverse frequency. Default is 10000.
        device (torch.device, optional): Device for tensor storage.
-
     Methods:
         forward(x, seq_len=None):
             Computes cosine and sine embeddings for input sequence length.
             Automatically updates cache if `seq_len` exceeds cached length.
-
     Attributes:
         inv_freq (torch.Tensor): Inverse frequency tensor for position encoding.
         cos_cached (torch.Tensor): Cached cosine embeddings.
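The caching pattern this docstring describes is standard for rotary embeddings: precompute cos/sin tables up to `max_position_embeddings` and regrow them only when a longer sequence arrives. A minimal sketch of that pattern (illustrative only, not the module's actual code; the class and helper names below are made up):

```python
import torch
import torch.nn as nn


class RotaryEmbeddingWithCacheSketch(nn.Module):
    """Illustrative RoPE module with a cos/sin cache (not the actual MotifRotaryEmbeddingWithCache)."""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        # inverse frequencies 1 / base^(2i/dim) for the even channel indices
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, device=device).float() / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self._build_cache(max_position_embeddings, device=device)

    def _build_cache(self, seq_len, device=None):
        t = torch.arange(seq_len, device=device, dtype=self.inv_freq.dtype)
        freqs = torch.outer(t, self.inv_freq)
        emb = torch.cat((freqs, freqs), dim=-1)
        self.max_seq_len_cached = seq_len
        self.register_buffer("cos_cached", emb.cos(), persistent=False)
        self.register_buffer("sin_cached", emb.sin(), persistent=False)

    def forward(self, x, seq_len=None):
        # regrow the cache only when a longer sequence is seen
        if seq_len is not None and seq_len > self.max_seq_len_cached:
            self._build_cache(seq_len, device=x.device)
        return (self.cos_cached[:seq_len].to(dtype=x.dtype),
                self.sin_cached[:seq_len].to(dtype=x.dtype))
```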
@@ -241,10 +238,8 @@ class MotifRotaryEmbedding(nn.Module):
 def rotate_half(x):
     """
     Rotates half of the dimensions of the input tensor using torch.roll and in-place negation.
-
     Args:
         x (torch.Tensor): The input tensor.
-
     Returns:
         torch.Tensor: A tensor where the latter half of the dimensions are negated
             and moved before the first half.
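For reference, a small sketch of the behavior described here (negate the latter half of the last dimension, then roll it in front of the first half), which is equivalent to the more common `torch.cat((-x2, x1), dim=-1)` formulation. This is an illustration, not the file's implementation:

```python
import torch


def rotate_half_sketch(x: torch.Tensor) -> torch.Tensor:
    # Illustration only: [x1, x2] -> [-x2, x1]
    half = x.shape[-1] // 2
    x = x.clone()                    # avoid mutating the caller's tensor
    x[..., half:] = -x[..., half:]   # [x1, x2] -> [x1, -x2]
    return torch.roll(x, shifts=half, dims=-1)
```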
@@ -259,7 +254,6 @@ def rotate_half(x):
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, fused_rope=True):
     """
     Applies rotary position embeddings to the input tensors.
-
     Args:
         q (torch.Tensor): Query tensor of shape (B, NH, S, D_KV).
         k (torch.Tensor): Key tensor of shape (B, NH, S, D_KV).
@@ -270,7 +264,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, fus
         fused_rope (bool, optional): If True, applies fused rotary embeddings using
             `moreh_ops.apply_rotary_emb`. If False, computes rotary embeddings manually.
             Defaults to True.
-
     Returns:
         Tuple[torch.Tensor, torch.Tensor]: Returns transformed query and key tensors after applying rotary embeddings.
     """
@@ -322,26 +315,21 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 class MotifAttention(nn.Module):
     """
     Differential Attention (DiffAttention) module.
-
     Implements the Differential Attention from
     "DIFFERENTIAL TRANSFORMER" (https://arxiv.org/pdf/2410.05258).
-
     Overview
         Standard transformers often over-allocate attention to irrelevant context.
         DiffAttention addresses this by computing attention as the difference between
         two separate softmax attention maps, effectively canceling noise and promoting
         sparse, structured attention patterns.
-
     Reference Implementation
         https://github.com/microsoft/unilm/tree/master/Diff-Transformer
-
     Args
         The differential attention mechanism computes attention as the difference of two softmax attention scores, weighted by a learnable scalar λ.
         λ is re-parameterized as λ = exp(λ_q1 · λ_k1) − exp(λ_q2 · λ_k2) + λ_init.
         - lambda_q1, lambda_q2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for query transformations.
         - lambda_k1, lambda_k2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for key transformations.
         - lambda_init (float): A constant used for initializing λ, typically set as λ_init = 0.8 − 0.6 × exp(−0.3 × (layer_index − 1)).
-
     """
 
     def __init__(self, config: MotifConfig, layer_idx: Optional[int] = None):
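As a hedged illustration of the λ reparameterization and the two-map subtraction described in this docstring (function names, shapes, and the split into two query/key sets are assumptions, not the module's actual code):

```python
import math

import torch
import torch.nn.functional as F


def lambda_init_fn(layer_idx: int) -> float:
    # λ_init = 0.8 − 0.6 · exp(−0.3 · (layer_index − 1))
    return 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))


def diff_attention_sketch(q1, k1, q2, k2, v,
                          lambda_q1, lambda_k1, lambda_q2, lambda_k2,
                          lambda_init: float) -> torch.Tensor:
    # λ = exp(λ_q1 · λ_k1) − exp(λ_q2 · λ_k2) + λ_init
    lam = (torch.exp(torch.sum(lambda_q1 * lambda_k1))
           - torch.exp(torch.sum(lambda_q2 * lambda_k2))
           + lambda_init)
    scale = q1.shape[-1] ** -0.5
    a1 = F.softmax(q1 @ k1.transpose(-1, -2) * scale, dim=-1)
    a2 = F.softmax(q2 @ k2.transpose(-1, -2) * scale, dim=-1)
    # attention is the *difference* of the two softmax maps, weighted by λ
    return (a1 - lam * a2) @ v
```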
@@ -964,11 +952,9 @@ MOTIF_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its models (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
-
     This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
     Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matters related to general usage
     and behavior.
-
     Parameters:
         config ([`MotifConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -1049,51 +1035,39 @@ MOTIF_INPUTS_DOCSTRING = r"""
     input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
         Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
         it.
-
         Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
         [`PreTrainedTokenizer.__call__`] for details.
-
         [What are input IDs?](../glossary#input-ids)
     attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
         Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
         - 1 for tokens that are **not masked**,
         - 0 for tokens that are **masked**.
-
         [What are attention masks?](../glossary#attention-mask)
-
         Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
         [`PreTrainedTokenizer.__call__`] for details.
-
         If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
         `past_key_values`).
-
         If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
         and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
         information on the default strategy.
-
         - 1 indicates the head is **not masked**,
         - 0 indicates the head is **masked**.
     position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
         Indices of positions of each input sequence token in the position embeddings. Selected in the range `[0,
         config.n_positions - 1]`.
-
         [What are position IDs?](../glossary#position-ids)
     past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
         Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
         blocks) that can be used to speed up sequential decoding. This typically consists of the `past_key_values`
         returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
         Two formats are allowed:
         - a [`~cache_utils.Cache`] instance, see our
           [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
         - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
           shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`. This is also known as the legacy
           cache format.
-
         The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
         legacy cache format will be returned.
-
         If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
         have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
         of shape `(batch_size, sequence_length)`.
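To make the two accepted `past_key_values` formats described above concrete, here is a small sketch, assuming a `transformers` version that exposes `DynamicCache` and its `from_legacy_cache` helper:

```python
import torch
from transformers import DynamicCache

batch_size, num_heads, seq_len, head_dim, n_layers = 1, 8, 4, 64, 2

# Legacy format: a tuple of length config.n_layers, each element a (key, value) pair of
# tensors shaped (batch_size, num_heads, sequence_length, embed_size_per_head).
legacy_cache = tuple(
    (torch.zeros(batch_size, num_heads, seq_len, head_dim),
     torch.zeros(batch_size, num_heads, seq_len, head_dim))
    for _ in range(n_layers)
)

# Cache-object format: the same data wrapped in a `Cache` instance.
cache = DynamicCache.from_legacy_cache(legacy_cache)
print(cache.get_seq_length())  # 4
```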
@@ -1126,7 +1100,6 @@ MOTIF_INPUTS_DOCSTRING = r"""
 class MotifModel(MotifPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MotifDecoderLayer`].
-
     Args:
         config: MotifConfig
     """
@@ -1375,7 +1348,6 @@ class MotifModel(MotifPreTrainedModel):
         """
         Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
         `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
-
         Args:
             attention_mask (`torch.Tensor`):
                 A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
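The expansion this docstring describes follows a common pattern: combine a lower-triangular causal mask with the 2D padding mask and convert the result to an additive mask. A hedged sketch of that pattern (not the file's exact helper):

```python
import torch


def to_causal_4d_sketch(attention_mask_2d: torch.Tensor, query_length: int,
                        dtype: torch.dtype = torch.float32) -> torch.Tensor:
    # (batch, key_len) 0/1 padding mask -> (batch, 1, query_len, key_len) additive mask,
    # with disallowed positions set to the dtype's most negative value.
    _, key_len = attention_mask_2d.shape
    device = attention_mask_2d.device
    causal = torch.tril(
        torch.ones(query_length, key_len, dtype=torch.bool, device=device),
        diagonal=key_len - query_length,  # align the last query with the last key
    )
    allowed = causal[None, None, :, :] & attention_mask_2d[:, None, None, :].bool()
    zero = torch.zeros((), dtype=dtype, device=device)
    neg_inf = torch.full((), torch.finfo(dtype).min, dtype=dtype, device=device)
    return torch.where(allowed, zero, neg_inf)
```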
@@ -1434,11 +1406,6 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
         self.multi_token_heads = config.multi_token_heads
 
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        else:
-            self.tokenwise_last_layers = nn.ModuleList(
-                [MotifDecoderLayer(config, config.num_hidden_layers - 1) for _ in range(self.multi_token_heads)])
-            self.tokenwise_lm_heads = nn.ModuleList(
-                [nn.Linear(config.hidden_size, config.vocab_size, bias=False) for _ in range(self.multi_token_heads)])
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -1490,25 +1457,18 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
            Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
            config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
            (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
        num_logits_to_keep (`int`, *optional*):
            Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
            `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
            token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-
        Returns:
-
        Example:
-
        ```python
        >>> from transformers import AutoTokenizer, MotifForCausalLM
-
        >>> model = MotifForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS, trust_remote_code=True)
        >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER, trust_remote_code=True)
-
        >>> prompt = "Hey, are you conscious? Can you talk to me?"
        >>> inputs = tokenizer(prompt, return_tensors="pt")
-
        >>> # Generate
        >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
        >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
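        >>> # Hedged editorial sketch, continuing the example above (not part of the original docstring):
        >>> # `num_logits_to_keep`, documented earlier in this docstring, can be set to 1 so that only
        >>> # the final position's logits are computed, saving memory on long prompts.
        >>> outputs = model(**inputs, num_logits_to_keep=1)
        >>> outputs.logits.shape[1]
        1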
 