katuni4ka committed d21e013 (verified) · 1 parent: 9ffce0b

Update modeling_phi4mm.py

Files changed (1):
  1. modeling_phi4mm.py +54 -1
modeling_phi4mm.py CHANGED
@@ -1505,7 +1505,7 @@ PHI4MM_START_DOCSTRING = r"""
     "The bare Phi-4-MM model outputting raw hidden-states without any specific head on top.",
     PHI4MM_START_DOCSTRING,
 )
-class Phi4MMPreTrainedModel(PreTrainedModel):
+class Phi4MMPreTrainedModel(PreTrainedModel, GenerationMixin):
     config_class = Phi4MMConfig
     base_model_prefix = "model"
     supports_gradient_checkpointing = True
@@ -1932,6 +1932,59 @@ class Phi4MMModel(Phi4MMPreTrainedModel):
         )
         return causal_mask
 
+    def prepare_inputs_for_generation(
+        self,
+        input_ids,
+        past_key_values=None,
+        attention_mask=None,
+        inputs_embeds=None,
+        input_image_embeds=None,
+        image_sizes=None,
+        image_attention_mask=None,
+        input_audio_embeds=None,
+        audio_embed_sizes=None,
+        audio_attention_mask=None,
+        input_mode=None,
+        cache_position=None,
+        position_ids=None,
+        use_cache=True,
+        num_logits_to_keep=0,
+        **kwargs,
+    ):
+        # Overwritten -- this model may need to switch between the short and long RoPE scaling factors,
+        # invalidating the cache in the process.
+
+        # The first time the input length reaches the long/short factor switching point, force the cache
+        # to be re-computed. This slows down that single token position, but is better than the failure
+        # that would occur otherwise.
+        if (
+            past_key_values
+            and self.config.rope_scaling
+            and input_ids.shape[1] >= self.config.original_max_position_embeddings + 1
+        ):
+            past_length = cache_position[0]
+            if past_length <= self.config.original_max_position_embeddings:
+                past_key_values = None
+
+        model_inputs = super().prepare_inputs_for_generation(
+            input_ids=input_ids,
+            past_key_values=past_key_values,
+            attention_mask=attention_mask,
+            inputs_embeds=inputs_embeds,
+            input_image_embeds=input_image_embeds,
+            image_sizes=image_sizes,
+            image_attention_mask=image_attention_mask,
+            input_audio_embeds=input_audio_embeds,
+            audio_embed_sizes=audio_embed_sizes,
+            audio_attention_mask=audio_attention_mask,
+            input_mode=input_mode,
+            cache_position=cache_position,
+            position_ids=position_ids,
+            use_cache=use_cache,
+            num_logits_to_keep=num_logits_to_keep or 0,
+            **kwargs,
+        )
+        return model_inputs
+
 
 class Phi4MMForCausalLM(Phi4MMPreTrainedModel, GenerationMixin):
     _tied_weights_keys = ["lm_head.weight"]
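For readers following the change: the guard added in prepare_inputs_for_generation is the usual LongRoPE cache workaround. Once the sequence grows past config.original_max_position_embeddings, the model switches from the short to the long RoPE scaling factors, so a KV cache built with the short factors becomes stale exactly once, at the switching point. Below is a minimal, self-contained sketch of that predicate; SketchConfig, should_drop_cache, and the value 4096 are illustrative stand-ins, not part of modeling_phi4mm.py.

# Illustrative sketch only -- SketchConfig and should_drop_cache are hypothetical names;
# the predicate mirrors the condition in the committed prepare_inputs_for_generation.
from dataclasses import dataclass
from typing import Optional


@dataclass
class SketchConfig:
    rope_scaling: Optional[dict]
    original_max_position_embeddings: int


def should_drop_cache(config: SketchConfig, seq_len: int, past_length: int) -> bool:
    # seq_len plays the role of input_ids.shape[1]; past_length plays the role of cache_position[0].
    # Once seq_len exceeds original_max_position_embeddings, the long RoPE factors apply, so a cache
    # that was still built with the short factors (past_length <= threshold) has to be re-computed.
    crossed_switch_point = (
        config.rope_scaling is not None
        and seq_len >= config.original_max_position_embeddings + 1
    )
    return crossed_switch_point and past_length <= config.original_max_position_embeddings


config = SketchConfig(rope_scaling={"type": "longrope"}, original_max_position_embeddings=4096)
print(should_drop_cache(config, seq_len=4097, past_length=4096))  # True: cache was built with short factors
print(should_drop_cache(config, seq_len=5000, past_length=4999))  # False: long factors already in effect

Dropping past_key_values once at the switching point trades a single slow, full re-prefill for correctness; every later step sees past_length above the threshold and keeps its cache.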