Update modeling_motif.py
modeling_motif.py  +3 -25  CHANGED
@@ -545,9 +545,9 @@ class MotifFlashAttention2(MotifAttention):
 
         bsz = query_states.shape[0]
 
-        return _flash_attention_forward(query_states,
-                                        key_states,
-                                        value_states,
+        return _flash_attention_forward(query_states.bfloat16(),
+                                        key_states.bfloat16(),
+                                        value_states.bfloat16(),
                                         attention_mask,
                                         q_len,
                                         position_ids=position_ids,
@@ -604,28 +604,6 @@ class MotifFlashAttention2(MotifAttention):
         value_states = repeat_kv(value_states, self.num_key_value_groups)
         dropout_rate = 0.0 if not self.training else self.attention_dropout
 
-        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
-        # therefore the input hidden states gets silently casted in float32. Hence, we need
-        # cast them back in float16 just to be sure everything works as expected.
-        input_dtype = query_states.dtype
-        if input_dtype == torch.float32:
-            if torch.is_autocast_enabled():
-                target_dtype = torch.get_autocast_gpu_dtype()
-            # Handle the case where the model is quantized
-            elif hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
-            else:
-                target_dtype = self.q_proj.weight.dtype
-
-            logger.warning_once(
-                f"The input hidden states seems to be silently casted in float32, this might be related to"
-                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
-                f" {target_dtype}.")
-
-            query_states = query_states.to(target_dtype)
-            key_states = key_states.to(target_dtype)
-            value_states = value_states.to(target_dtype)
-
         q_len = query_states.shape[-2]
         kv_seq_len = key_states.shape[-2]
 
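Net effect of this diff: the query/key/value tensors are now cast to bfloat16 unconditionally right before _flash_attention_forward is called, which makes the removed float32-recovery block (autocast dtype / _pre_quantization_dtype / q_proj weight dtype probing) redundant. The sketch below is a minimal illustration of that cast, not code from this repository; the helper name and tensor shapes are assumptions.

import torch

# Minimal sketch: cast attention inputs to bfloat16 before handing them to a
# flash-attention kernel, which typically accepts only fp16/bf16 tensors.
# bfloat16 keeps float32's exponent range, so no conditional dtype probing is
# needed. `cast_qkv_to_bf16` and the shapes below are illustrative only.
def cast_qkv_to_bf16(query_states, key_states, value_states):
    return (query_states.bfloat16(),
            key_states.bfloat16(),
            value_states.bfloat16())

q = torch.randn(2, 8, 16, 64)  # illustrative (batch, heads, seq_len, head_dim), float32
k = torch.randn(2, 8, 16, 64)
v = torch.randn(2, 8, 16, 64)
q_bf16, k_bf16, v_bf16 = cast_qkv_to_bf16(q, k, v)
assert q_bf16.dtype == k_bf16.dtype == v_bf16.dtype == torch.bfloat16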