bugfix
modeling_motif.py (+0 -40)
@@ -98,18 +98,15 @@ ALL_LAYERNORM_LAYERS.append(MotifRMSNorm)
 class MotifRotaryEmbeddingWithCache(nn.Module):
     """
     Rotary positional embedding module with caching for efficiency.
-
     Args:
         dim (int): Dimensionality of the embedding.
         max_position_embeddings (int): Maximum sequence length for caching. Default is 2048.
         base (int): Base for computing inverse frequency. Default is 10000.
        device (torch.device, optional): Device for tensor storage.
-
     Methods:
         forward(x, seq_len=None):
             Computes cosine and sine embeddings for input sequence length.
             Automatically updates cache if `seq_len` exceeds cached length.
-
     Attributes:
         inv_freq (torch.Tensor): Inverse frequency tensor for position encoding.
         cos_cached (torch.Tensor): Cached cosine embeddings.
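For context on the docstring above: a minimal, self-contained sketch of what a cached rotary embedding of this shape typically does, assuming the standard RoPE formulation. The class and method names below are illustrative and are not taken from `modeling_motif.py`.

```python
import torch
from torch import nn


class RotaryCacheSketch(nn.Module):
    """Illustrative cached rotary embedding (hypothetical names, standard RoPE math)."""

    def __init__(self, dim, max_position_embeddings=2048, base=10000, device=None):
        super().__init__()
        # inv_freq[i] = 1 / base**(2i / dim): one frequency per pair of channels.
        inv_freq = 1.0 / (base ** (torch.arange(0, dim, 2, dtype=torch.float32, device=device) / dim))
        self.register_buffer("inv_freq", inv_freq, persistent=False)
        self._build_cache(max_position_embeddings)

    def _build_cache(self, seq_len):
        t = torch.arange(seq_len, dtype=torch.float32, device=self.inv_freq.device)
        freqs = torch.outer(t, self.inv_freq)    # (seq_len, dim / 2)
        emb = torch.cat((freqs, freqs), dim=-1)  # (seq_len, dim)
        self.register_buffer("cos_cached", emb.cos(), persistent=False)
        self.register_buffer("sin_cached", emb.sin(), persistent=False)
        self.max_seq_len_cached = seq_len

    def forward(self, x, seq_len=None):
        # Grow the cache only when a longer sequence than previously seen arrives.
        if seq_len is not None and seq_len > self.max_seq_len_cached:
            self._build_cache(seq_len)
        return self.cos_cached[:seq_len].to(x.dtype), self.sin_cached[:seq_len].to(x.dtype)
```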
@@ -241,10 +238,8 @@ class MotifRotaryEmbedding(nn.Module):
 def rotate_half(x):
     """
     Rotates half of the dimensions of the input tensor using torch.roll and in-place negation.
-
     Args:
         x (torch.Tensor): The input tensor.
-
     Returns:
         torch.Tensor: A tensor where the latter half of the dimensions are negated
             and moved before the first half.
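To make the wording above concrete, here is one way the roll-plus-negation variant of the rotation can be written; this is a sketch of the standard operation, not a copy of the Motif function.

```python
import torch


def rotate_half_sketch(x: torch.Tensor) -> torch.Tensor:
    half = x.shape[-1] // 2
    # Roll the last dimension by half its size: the latter half moves in front of the first half.
    rolled = torch.roll(x, shifts=half, dims=-1)
    # Negate what is now the leading block (originally the latter half), giving (-x2, x1).
    rolled[..., :half] *= -1
    return rolled


# Equivalent formulation seen in many RoPE implementations:
#   torch.cat((-x[..., half:], x[..., :half]), dim=-1)
```

For example, `rotate_half_sketch(torch.arange(4.).reshape(1, 4))` yields `[[-2., -3., 0., 1.]]`.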
@@ -259,7 +254,6 @@ def rotate_half(x):
 def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, fused_rope=True):
     """
     Applies rotary position embeddings to the input tensors.
-
     Args:
         q (torch.Tensor): Query tensor of shape (B, NH, S, D_KV).
         k (torch.Tensor): Key tensor of shape (B, NH, S, D_KV).
@@ -270,7 +264,6 @@ def apply_rotary_pos_emb(q, k, cos, sin, position_ids=None, unsqueeze_dim=1, fus
         fused_rope (bool, optional): If True, applies fused rotary embeddings using
             `moreh_ops.apply_rotary_emb`. If False, computes rotary embeddings manually.
             Defaults to False.
-
     Returns:
         Tuple[torch.Tensor, torch.Tensor]: Returns transformed query and key tensors after applying rotary embeddings.
     """
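As a companion to the docstring above, a self-contained sketch of the non-fused path (`fused_rope=False`), i.e. the usual `q * cos + rotate_half(q) * sin` formulation. Shapes follow the (B, NH, S, D) convention from the docstring; the helper names are illustrative, and the fused `moreh_ops` kernel is not reproduced here.

```python
import torch


def _rotate_half(x):
    half = x.shape[-1] // 2
    return torch.cat((-x[..., half:], x[..., :half]), dim=-1)


def apply_rotary_pos_emb_sketch(q, k, cos, sin, position_ids=None, unsqueeze_dim=1):
    # q, k: (B, NH, S, D); cos, sin: (max_seq_len, D) as produced by a rotary cache.
    if position_ids is not None:
        cos, sin = cos[position_ids], sin[position_ids]              # (B, S, D)
    else:
        cos, sin = cos[None, :q.shape[-2]], sin[None, :q.shape[-2]]  # (1, S, D)
    cos = cos.unsqueeze(unsqueeze_dim)  # insert the head axis so it broadcasts against q and k
    sin = sin.unsqueeze(unsqueeze_dim)
    q_embed = (q * cos) + (_rotate_half(q) * sin)
    k_embed = (k * cos) + (_rotate_half(k) * sin)
    return q_embed, k_embed
```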
@@ -322,26 +315,21 @@ def repeat_kv(hidden_states: torch.Tensor, n_rep: int) -> torch.Tensor:
 class MotifAttention(nn.Module):
     """
     Differential Attention (DiffAttention) module.
-
     Implements the Differential Attention from
     "DIFFERENTIAL TRANSFORMER" (https://arxiv.org/pdf/2410.05258).
-
     Overview
     Standard transformers often over-allocate attention to irrelevant context.
     DiffAttention addresses this by computing attention as the difference between
     two separate softmax attention maps, effectively canceling noise and promoting
     sparse, structured attention patterns.
-
     Reference Implementation
     https://github.com/microsoft/unilm/tree/master/Diff-Transformer
-
     Args
     The differential attention mechanism computes attention as the difference of two softmax attention scores, weighted by a learnable scalar λ.
     λ is re-parameterized as λ = exp(λ_q1 · λ_k1) − exp(λ_q2 · λ_k2) + λ_init.
     - lambda_q1, lambda_q2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for query transformations.
     - lambda_k1, lambda_k2 (nn.Parameter): Learnable vectors used to compute the first and second components of λ for key transformations.
     - lambda_init (float): A constant used for initializing λ, typically set as λ_init = 0.8 − 0.6 × exp(−0.3 × (layer_index − 1)).
-
     """
 
     def __init__(self, config: MotifConfig, layer_idx: Optional[int] = None):
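Because the λ re-parameterization in the docstring above is easy to misread, here is a short sketch of the score computation it describes. Shapes and the per-head λ vectors follow the Microsoft reference implementation linked above; this is not Motif's exact code, and causal masking is omitted for brevity.

```python
import math
import torch
import torch.nn.functional as F


def lambda_init_fn(layer_idx: int) -> float:
    # λ_init = 0.8 − 0.6 · exp(−0.3 · (layer_index − 1)), as stated in the docstring.
    return 0.8 - 0.6 * math.exp(-0.3 * (layer_idx - 1))


def diff_attention_sketch(q1, k1, q2, k2, v,
                          lambda_q1, lambda_k1, lambda_q2, lambda_k2,
                          layer_idx: int) -> torch.Tensor:
    # q*/k*: (B, NH, S, Dh); v: (B, NH, S, Dv); lambda_*: learnable 1-D vectors of size Dh.
    scale = q1.shape[-1] ** -0.5
    a1 = F.softmax(q1 @ k1.transpose(-1, -2) * scale, dim=-1)  # first softmax map
    a2 = F.softmax(q2 @ k2.transpose(-1, -2) * scale, dim=-1)  # second softmax map
    lam = (torch.exp(torch.dot(lambda_q1, lambda_k1))
           - torch.exp(torch.dot(lambda_q2, lambda_k2))
           + lambda_init_fn(layer_idx))
    # Differential attention: noise common to both maps cancels in the difference.
    return (a1 - lam * a2) @ v
```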
@@ -964,11 +952,9 @@ MOTIF_START_DOCSTRING = r"""
     This model inherits from [`PreTrainedModel`]. Check the superclass documentation for the generic methods the
     library implements for all its model (such as downloading or saving, resizing the input embeddings, pruning heads
     etc.)
-
     This model is also a PyTorch [torch.nn.Module](https://pytorch.org/docs/stable/nn.html#torch.nn.Module) subclass.
     Use it as a regular PyTorch Module and refer to the PyTorch documentation for all matter related to general usage
     and behavior.
-
     Parameters:
         config ([`MotifConfig`]):
             Model configuration class with all the parameters of the model. Initializing with a config file does not
@@ -1049,51 +1035,39 @@ MOTIF_INPUTS_DOCSTRING = r"""
         input_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`):
             Indices of input sequence tokens in the vocabulary. Padding will be ignored by default should you provide
             it.
-
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
-
             [What are input IDs?](../glossary#input-ids)
         attention_mask (`torch.Tensor` of shape `(batch_size, sequence_length)`, *optional*):
             Mask to avoid performing attention on padding token indices. Mask values selected in `[0, 1]`:
-
             - 1 for tokens that are **not masked**,
             - 0 for tokens that are **masked**.
-
             [What are attention masks?](../glossary#attention-mask)
-
             Indices can be obtained using [`AutoTokenizer`]. See [`PreTrainedTokenizer.encode`] and
             [`PreTrainedTokenizer.__call__`] for details.
-
             If `past_key_values` is used, optionally only the last `decoder_input_ids` have to be input (see
             `past_key_values`).
-
             If you want to change padding behavior, you should read [`modeling_opt._prepare_decoder_attention_mask`]
             and modify to your needs. See diagram 1 in [the paper](https://arxiv.org/abs/1910.13461) for more
             information on the default strategy.
-
             - 1 indicates the head is **not masked**,
             - 0 indicates the head is **masked**.
         position_ids (`torch.LongTensor` of shape `(batch_size, sequence_length)`, *optional*):
             Indices of positions of each input sequence tokens in the position embeddings. Selected in the range `[0,
             config.n_positions - 1]`.
-
             [What are position IDs?](../glossary#position-ids)
         past_key_values (`Cache` or `tuple(tuple(torch.FloatTensor))`, *optional*):
             Pre-computed hidden-states (key and values in the self-attention blocks and in the cross-attention
             blocks) that can be used to speed up sequential decoding. This typically consists in the `past_key_values`
             returned by the model at a previous stage of decoding, when `use_cache=True` or `config.use_cache=True`.
-
             Two formats are allowed:
             - a [`~cache_utils.Cache`] instance, see our
               [kv cache guide](https://huggingface.co/docs/transformers/en/kv_cache);
             - Tuple of `tuple(torch.FloatTensor)` of length `config.n_layers`, with each tuple having 2 tensors of
               shape `(batch_size, num_heads, sequence_length, embed_size_per_head)`). This is also known as the legacy
               cache format.
-
             The model will output the same cache format that is fed as input. If no `past_key_values` are passed, the
             legacy cache format will be returned.
-
             If `past_key_values` are used, the user can optionally input only the last `input_ids` (those that don't
             have their past key value states given to this model) of shape `(batch_size, 1)` instead of all `input_ids`
             of shape `(batch_size, sequence_length)`.
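To illustrate the two cache formats described above, a small stand-alone example of the legacy tuple-of-tuples layout; all sizes are invented for the example, and the `DynamicCache` conversion in the comment refers to the transformers utility rather than anything defined in this file.

```python
import torch

# Legacy cache: one (key_states, value_states) pair per decoder layer, each of shape
# (batch_size, num_heads, sequence_length, embed_size_per_head).
batch_size, num_heads, seq_len, head_dim = 1, 8, 16, 64
num_layers = 2  # illustrative only; the real model has config.num_hidden_layers entries
legacy_cache = tuple(
    (torch.zeros(batch_size, num_heads, seq_len, head_dim),
     torch.zeros(batch_size, num_heads, seq_len, head_dim))
    for _ in range(num_layers)
)
assert len(legacy_cache) == num_layers and len(legacy_cache[0]) == 2

# Recent transformers versions can wrap this into a Cache instance, e.g.:
#   from transformers.cache_utils import DynamicCache
#   cache = DynamicCache.from_legacy_cache(legacy_cache)
```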
@@ -1126,7 +1100,6 @@ MOTIF_INPUTS_DOCSTRING = r"""
 class MotifModel(MotifPreTrainedModel):
     """
     Transformer decoder consisting of *config.num_hidden_layers* layers. Each layer is a [`MotifDecoderLayer`]
-
     Args:
         config: MotifConfig
     """
@@ -1375,7 +1348,6 @@ class MotifModel(MotifPreTrainedModel):
     """
     Creates a causal 4D mask of shape `(batch_size, 1, query_length, key_value_length)` from a 2D mask of shape
     `(batch_size, key_value_length)`, or if the input `attention_mask` is already 4D, do nothing.
-
     Args:
         attention_mask (`torch.Tensor`):
             A 2D attention mask of shape `(batch_size, key_value_length)` or a 4D attention mask of shape `(batch_size, 1, query_length, key_value_length)`.
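A sketch of how a 2D padding mask is commonly expanded into the additive causal 4D mask described above; this follows the usual transformers-style construction and is written out for clarity, not copied from the Motif helper.

```python
import torch


def make_causal_4d_mask_sketch(attention_mask_2d: torch.Tensor,
                               query_length: int,
                               dtype: torch.dtype = torch.float32) -> torch.Tensor:
    # attention_mask_2d: (batch_size, key_value_length), 1 = attend, 0 = padding.
    kv_len = attention_mask_2d.shape[-1]
    min_val = torch.finfo(dtype).min

    # Causal part: query position i may only see key positions j <= i + (kv_len - query_length).
    causal = torch.triu(
        torch.full((query_length, kv_len), min_val, dtype=dtype),
        diagonal=kv_len - query_length + 1,
    )

    # Padding part: masked-out keys get the large negative value for every query position.
    padding = (1.0 - attention_mask_2d[:, None, None, :].to(dtype)) * min_val

    # (batch_size, 1, query_length, key_value_length); clamp keeps the sum from overflowing.
    return (causal[None, None, :, :] + padding).clamp(min=min_val)
```

For a batch of two unpadded length-5 sequences, `make_causal_4d_mask_sketch(torch.ones(2, 5), query_length=5)` returns a `(2, 1, 5, 5)` mask whose strictly upper-triangular entries hold the large negative value.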
@@ -1434,11 +1406,6 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
         self.multi_token_heads = config.multi_token_heads
 
         self.lm_head = nn.Linear(config.hidden_size, config.vocab_size, bias=False)
-        else:
-            self.tokenwise_last_layers = nn.ModuleList(
-                [MotifDecoderLayer(config, config.num_hidden_layers - 1) for _ in range(self.multi_token_heads)])
-            self.tokenwise_lm_heads = nn.ModuleList(
-                [nn.Linear(config.hidden_size, config.vocab_size, bias=False) for _ in range(self.multi_token_heads)])
 
         # Initialize weights and apply final processing
         self.post_init()
@@ -1490,25 +1457,18 @@ class MotifForCausalLM(MotifPreTrainedModel, GenerationMixin):
             Labels for computing the masked language modeling loss. Indices should either be in `[0, ...,
             config.vocab_size]` or -100 (see `input_ids` docstring). Tokens with indices set to `-100` are ignored
             (masked), the loss is only computed for the tokens with labels in `[0, ..., config.vocab_size]`.
-
         num_logits_to_keep (`int`, *optional*):
             Calculate logits for the last `num_logits_to_keep` tokens. If `0`, calculate logits for all
             `input_ids` (special case). Only last token logits are needed for generation, and calculating them only for that
             token can save memory, which becomes pretty significant for long sequences or large vocabulary size.
-
         Returns:
-
         Example:
-
         ```python
         >>> from transformers import AutoTokenizer, MotifForCausalLM
-
         >>> model = MotifForCausalLM.from_pretrained(PATH_TO_CONVERTED_WEIGHTS, trust_remote_code = True)
         >>> tokenizer = AutoTokenizer.from_pretrained(PATH_TO_CONVERTED_TOKENIZER, trust_remote_code = True)
-
         >>> prompt = "Hey, are you conscious? Can you talk to me?"
         >>> inputs = tokenizer(prompt, return_tensors="pt")
-
         >>> # Generate
         >>> generate_ids = model.generate(inputs.input_ids, max_length=30)
         >>> tokenizer.batch_decode(generate_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False)[0]
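As a hedged continuation of the doctest above (reusing its `model` and `inputs`, and assuming the forward signature accepts `num_logits_to_keep` exactly as documented), restricting the logits to the final position would look like:

```python
>>> out_full = model(**inputs)                        # logits shape: (batch_size, seq_len, vocab_size)
>>> out_last = model(**inputs, num_logits_to_keep=1)  # logits shape: (batch_size, 1, vocab_size)
```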