Upload processing_gemma3mm.py
processing_gemma3mm.py  CHANGED  (+14 -12)
(Removed-line text that the page extract did not capture in full is shown as "…".)
@@ -131,15 +131,15 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
 
     def __call__(
         self,
-        …
+        audio: List[AudioInput],
         return_tensors: Optional[Union[str, TensorType]] = None,
     ):
         # Ref: https://github.com/huggingface/transformers/blob/v4.47.0/src/transformers/models/audio_spectrogram_transformer/feature_extraction_audio_spectrogram_transformer.py#L161
         returned_input_audio_embeds = []
         returned_audio_embed_sizes = []
         audio_frames_list = []
-        …
-        for audio_data …
+        sample_rate = 16000
+        for audio_data in audio:
            audio_embeds = self._extract_features(audio_data, sample_rate)
            audio_frames = len(audio_embeds) * self.audio_feat_stride
            audio_embed_size = self._compute_audio_embed_size(audio_frames)

@@ -152,7 +152,7 @@ class Gemma3AudioFeatureExtractor(SequenceFeatureExtractor):
            )
        returned_audio_embed_sizes = torch.stack(returned_audio_embed_sizes, dim=0)
        audio_frames = torch.tensor(audio_frames_list)
-       returned_audio_attention_mask = torch.arange(0, audio_frames.max()).unsqueeze(0) < audio_frames.unsqueeze(1) if len(…
+       returned_audio_attention_mask = torch.arange(0, audio_frames.max()).unsqueeze(0) < audio_frames.unsqueeze(1) if len(audio) > 1 else None
 
        data = {
            "input_audio_embeds": returned_input_audio_embeds,

@@ -291,6 +291,7 @@ class Gemma3MMProcessor(ProcessorMixin):
        self.image_seq_length = image_seq_length
        self.image_token_id = tokenizer.image_token_id
        self.boi_token = tokenizer.boi_token
+       self.image_token = tokenizer.boi_token
        image_tokens_expanded = "".join([tokenizer.image_token] * image_seq_length)
        self.full_image_sequence = f"\n\n{tokenizer.boi_token}{image_tokens_expanded}{tokenizer.eoi_token}\n\n"
 

@@ -312,7 +313,7 @@ class Gemma3MMProcessor(ProcessorMixin):
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        videos=None,
-       …
+       audio: List[AudioInput] = None,
        **kwargs: Unpack[Gemma3ProcessorKwargs],
    ) -> BatchFeature:
        if text is None and images is None:

@@ -344,8 +345,8 @@ class Gemma3MMProcessor(ProcessorMixin):
            )
 
        # Replace image tokens by the full expanded sequence
-       …
-       …
+       num_crops = to_py_obj(image_inputs.pop("num_crops"))
+       batch_num_crops = [[num_crops.pop(0) for _ in range(len(images))] for images in batched_images]
        for batch_idx, (prompt, images, num_crops) in enumerate(zip(text, batched_images, batch_num_crops)):
            image_indexes = [m.start() for m in re.finditer(self.boi_token, prompt)]
 

@@ -362,14 +363,15 @@ class Gemma3MMProcessor(ProcessorMixin):
                    + " ".join([self.boi_token] * num)
                )
                prompt = prompt[:idx] + formatted_image_text + prompt[idx + len(self.boi_token) :]
-           …
+           text[batch_idx] = prompt
 
        # Expand placeholder image tokens to the full image token sequence
        text = [prompt.replace(self.boi_token, self.full_image_sequence) for prompt in text]
+
        return_tensors = output_kwargs["text_kwargs"].pop("return_tensors", None)
 
        audio_inputs = {}
-       if …
+       if audio is not None:
            def replace_tokens_sequentially(prompt, boa_token, audio_sequences):
                parts = prompt.split(boa_token)
                result = ""

@@ -383,7 +385,7 @@ class Gemma3MMProcessor(ProcessorMixin):
                return result
 
            full_audio_sequences = []
-           audio_inputs = self.feature_extractor(…
+           audio_inputs = self.feature_extractor(audio)
 
            for i, embed_size in enumerate(audio_inputs.audio_embed_sizes):
                audio_tokens_expanded = "".join([self.audio_token] * embed_size)

@@ -395,7 +397,7 @@ class Gemma3MMProcessor(ProcessorMixin):
        text_inputs = self.tokenizer(text=text, **output_kwargs["text_kwargs"], return_tensors="np")
 
        # Add token type ids manually, as tokenizer can't do arbitrary position token types
-       array_ids = …
+       array_ids = text_inputs["input_ids"]
        mm_token_type_ids = np.zeros_like(text_inputs["input_ids"])
        mm_token_type_ids[array_ids == self.image_token_id] = 1
        mm_token_type_ids[array_ids == self.audio_token_id] = 2

@@ -409,7 +411,7 @@ class Gemma3MMProcessor(ProcessorMixin):
        text_inputs["token_type_ids"] = mm_token_type_ids.tolist()
        text_inputs["input_modes"] = input_modes.tolist()
 
-       return BatchFeature(data={**text_inputs, **image_inputs, **audio_inputs…
+       return BatchFeature(data={**text_inputs, **image_inputs, **audio_inputs}, tensor_type=return_tensors)
 
    # Copied from transformers.models.clip.processing_clip.CLIPProcessor.batch_decode with CLIP->Gemma
    def batch_decode(self, *args, **kwargs):
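
The new returned_audio_attention_mask line in the feature extractor builds a per-clip padding mask purely by tensor broadcasting. Below is a minimal sketch of that comparison in isolation; the frame counts are made-up values, and nothing here comes from the file beyond the expression itself.

import torch

# Stand-in for audio_frames = torch.tensor(audio_frames_list): three clips of different lengths.
audio_frames = torch.tensor([120, 80, 100])

# Same expression as the added line: a row of positions 0..max-1 compared against each
# clip's frame count yields a (num_clips, max_frames) boolean mask, True where frames are real.
mask = torch.arange(0, audio_frames.max()).unsqueeze(0) < audio_frames.unsqueeze(1)

print(mask.shape)     # torch.Size([3, 120])
print(mask[1].sum())  # tensor(80): only the first 80 positions are valid for the second clip

Note that the diff only builds this mask when more than one clip is passed (len(audio) > 1); a single clip gets None.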
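End to end, the commit threads an audio keyword from Gemma3MMProcessor.__call__ through Gemma3AudioFeatureExtractor and marks the resulting positions in token_type_ids and input_modes. A minimal usage sketch follows, assuming the file ships as remote code in a Hub repo loadable via AutoProcessor; the repo id, the audio placeholder string, and the waveform type are assumptions, not taken from the diff.

import numpy as np
from transformers import AutoProcessor

# Hypothetical repo id; any checkpoint that bundles this processing_gemma3mm.py would do.
processor = AutoProcessor.from_pretrained("your-org/gemma-3-mm", trust_remote_code=True)

# The extractor hard-codes sample_rate = 16000 and does not resample,
# so waveforms should already be 16 kHz mono float arrays.
waveform = np.zeros(16000, dtype=np.float32)  # one second of silence as a stand-in

inputs = processor(
    text=["Transcribe this clip: <start_of_audio>"],  # audio placeholder string is assumed
    audio=[waveform],                                  # new keyword introduced by this commit
    return_tensors="pt",
)

# Per the diff, the returned BatchFeature mixes text, image, and audio entries,
# e.g. input_ids, token_type_ids (0=text, 1=image, 2=audio), input_modes,
# input_audio_embeds, and audio_embed_sizes.
print(inputs.keys())

Because the sample rate is fixed inside __call__, callers are responsible for resampling audio to 16 kHz before passing it in.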