Update modeling_gemma3mm.py
#3
by
urroxyz
- opened
- modeling_gemma3mm.py +7 -6
modeling_gemma3mm.py
CHANGED
@@ -491,12 +491,13 @@ class Gemma3MMForConditionalGeneration(Gemma3MMPreTrainedModel, GenerationMixin)
|
|
491 |
|
492 |
if not is_torchdynamo_compiling() and inputs_embeds[special_audio_mask].numel() != masked_audio_features.numel():
|
493 |
audio_tokens_in_text = (special_audio_mask).sum(dim=1).sum(dim=0)[0]
|
494 |
-
masked_audio_size = audio_embed_sizes.sum()
|
495 |
-
raise ValueError(
|
496 |
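-
f"Number of images does not match number of special image tokens in the input text. "
|
497 |
-
f"Got {audio_tokens_in_text} image tokens in the text but {masked_audio_size} "
|
498 |
-
"tokens from image embeddings."
|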
499 |
-
)
|
|
|
500 |
masked_audio_features = masked_audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
501 |
inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, masked_audio_features)
|
502 |
|
|
|
491 |
|
492 |
if not is_torchdynamo_compiling() and inputs_embeds[special_audio_mask].numel() != masked_audio_features.numel():
|
493 |
audio_tokens_in_text = (special_audio_mask).sum(dim=1).sum(dim=0)[0]
|
494 |
+
masked_audio_size = audio_embed_sizes.sum()
|
495 |
+
#raise ValueError(
|
496 |
+
# f"Number of images does not match number of special image tokens in the input text. "
|
497 |
+
# f"Got {audio_tokens_in_text} image tokens in the text but {masked_audio_size} "
|
498 |
+
# "tokens from image embeddings."
|
499 |
+
#)
|
500 |
+
pass
|
501 |
masked_audio_features = masked_audio_features.to(inputs_embeds.device, inputs_embeds.dtype)
|
502 |
inputs_embeds = inputs_embeds.masked_scatter(special_audio_mask, masked_audio_features)
|
503 |
|