Update modeling_gemma3mm.py

#3
by urroxyz - opened
Files changed (1)
  1. modeling_gemma3mm.py +7 -6
modeling_gemma3mm.py CHANGED
@@ -491,12 +491,13 @@ class Gemma3MMForConditionalGeneration(Gemma3MMPreTrainedModel, GenerationMixin)
491
 
492
  if not is_torchdynamo_compiling() and inputs_embeds[special_audio_mask].numel() != masked_audio_features.numel():
493
  audio_tokens_in_text = (special_audio_mask).sum(dim=1).sum(dim=0)[0]
494
- masked_audio_size = audio_embed_sizes.sum()[0]
495
- raise ValueError(
496
- f"Number of images does not match number of special image tokens in the input text. "
497
- f"Got {audio_tokens_in_text} image tokens in the text but {masked_audio_size} "
498
- "tokens from image embeddings."
499
- )
 
500
  masked_audio_features = masked_audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
501
  inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, masked_audio_features)
502
 
 
491
 
492
  if not is_torchdynamo_compiling() and inputs_embeds[special_audio_mask].numel() != masked_audio_features.numel():
493
  audio_tokens_in_text = (special_audio_mask).sum(dim=1).sum(dim=0)[0]
494
+ masked_audio_size = audio_embed_sizes.sum()
495
+ #raise ValueError(
496
+ # f"Number of images does not match number of special image tokens in the input text. "
497
+ # f"Got {audio_tokens_in_text} image tokens in the text but {masked_audio_size} "
498
+ # "tokens from image embeddings."
499
+ #)
500
+ pass
501
  masked_audio_features = masked_audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
502
  inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, masked_audio_features)
503