Update modeling_motif.py
modeling_motif.py  +3 -25  CHANGED
@@ -545,9 +545,9 @@ class MotifFlashAttention2(MotifAttention):
 
         bsz = query_states.shape[0]
 
-        return _flash_attention_forward(query_states,
-                                        key_states,
-                                        value_states,
+        return _flash_attention_forward(query_states.bfloat16(),
+                                        key_states.bfloat16(),
+                                        value_states.bfloat16(),
                                         attention_mask,
                                         q_len,
                                         position_ids=position_ids,
@@ -604,28 +604,6 @@ class MotifFlashAttention2(MotifAttention):
         value_states = repeat_kv(value_states, self.num_key_value_groups)
         dropout_rate = 0.0 if not self.training else self.attention_dropout
 
-        # In PEFT, usually we cast the layer norms in float32 for training stability reasons
-        # therefore the input hidden states gets silently casted in float32. Hence, we need
-        # cast them back in float16 just to be sure everything works as expected.
-        input_dtype = query_states.dtype
-        if input_dtype == torch.float32:
-            if torch.is_autocast_enabled():
-                target_dtype = torch.get_autocast_gpu_dtype()
-            # Handle the case where the model is quantized
-            elif hasattr(self.config, "_pre_quantization_dtype"):
-                target_dtype = self.config._pre_quantization_dtype
-            else:
-                target_dtype = self.q_proj.weight.dtype
-
-            logger.warning_once(
-                f"The input hidden states seems to be silently casted in float32, this might be related to"
-                f" the fact you have upcasted embedding or layer norm layers in float32. We will cast back the input in"
-                f" {target_dtype}.")
-
-            query_states = query_states.to(target_dtype)
-            key_states = key_states.to(target_dtype)
-            value_states = value_states.to(target_dtype)
-
         q_len = query_states.shape[-2]
         kv_seq_len = key_states.shape[-2]
 
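Net effect of this diff: the query/key/value tensors are now cast to bfloat16 unconditionally right before _flash_attention_forward is called, which makes the removed float32-recovery block (autocast dtype / _pre_quantization_dtype / q_proj weight dtype probing) redundant. The sketch below is a minimal illustration of that cast, not code from this repository; the helper name and tensor shapes are assumptions.

import torch

# Minimal sketch: cast attention inputs to bfloat16 before handing them to a
# flash-attention kernel, which typically accepts only fp16/bf16 tensors.
# bfloat16 keeps float32's exponent range, so no conditional dtype probing is
# needed. `cast_qkv_to_bf16` and the shapes below are illustrative only.
def cast_qkv_to_bf16(query_states, key_states, value_states):
    return (query_states.bfloat16(),
            key_states.bfloat16(),
            value_states.bfloat16())

q = torch.randn(2, 8, 16, 64)  # illustrative (batch, heads, seq_len, head_dim), float32
k = torch.randn(2, 8, 16, 64)
v = torch.randn(2, 8, 16, 64)
q_bf16, k_bf16, v_bf16 = cast_qkv_to_bf16(q, k, v)
assert q_bf16.dtype == k_bf16.dtype == v_bf16.dtype == torch.bfloat16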