Incomplete Transcripts (with Transformers package)
Hi everyone,
I'm experiencing issues with the Granite speech-to-text model (version 3.3.1, transformers 4.52.4) when transcribing German audio samples. Specifically, large portions of the audio content are omitted from the generated transcript, leaving significant gaps between spoken words or phrases. These omissions occur sporadically throughout the transcription output, despite the corresponding audio data being present. Running out of tokens shouldn't be the cause: I already tried increasing max_new_tokens to 4096 and still get the same behaviour.
Notably, these omissions occur even when using high-quality, noise-free audio inputs, which should ideally yield accurate and complete transcripts.
Can anyone provide guidance on how to troubleshoot or resolve this issue?
Here is my code:
import io
from enum import Enum, auto
from importlib.resources import files

import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

from speech_recognition.deploy import model as models
import os

# Workaround for torch.classes path inspection issues (e.g. under Streamlit)
torch.classes.__path__ = []


class GraniteSpeechRecognitionModel:
    def __init__(
        self,
        model_id: str = "granite-speech-3.3-8b",
    ) -> None:
        print("Model loading...")
        self.model_id = str(files(models) / model_id)
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            self.model_id,
            device_map=self.device,
            torch_dtype=torch.bfloat16,
        )
        self.processor = AutoProcessor.from_pretrained(self.model_id)
        self.tokenizer = self.processor.tokenizer
        print("Loaded model")

    def transcribe(
        self,
        audio_file: io.BytesIO,
        chat=None,
    ) -> str:
        try:
            # Load audio and convert to mono, 16 kHz
            wav, sr = torchaudio.load(audio_file, normalize=True)
            if wav.shape[0] != 1 or sr != 16000:
                wav = wav.mean(dim=0, keepdim=True)
                wav = torchaudio.functional.resample(wav, sr, 16000)
                sr = 16000
            if chat is None:
                chat = [
                    {
                        "role": "system",
                        "content": "Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. You are a helpful AI assistant",
                    },
                    {
                        "role": "user",
                        "content": "<|audio|>can you transcribe the speech into a written format?",
                    },
                ]
            prompt = self.tokenizer.apply_chat_template(
                chat, tokenize=False, add_generation_prompt=True
            )
            model_inputs = self.processor(
                prompt,
                wav,
                device=self.device,
                return_tensors="pt",
            ).to(self.device)
            model_outputs = self.model.generate(
                **model_inputs, max_new_tokens=1024, do_sample=False, num_beams=1
            )
            # Decode only the newly generated tokens, skipping the prompt
            num_input_tokens = model_inputs["input_ids"].shape[-1]
            new_tokens = torch.unsqueeze(model_outputs[0, num_input_tokens:], dim=0)
            output_text = self.tokenizer.batch_decode(
                new_tokens, add_special_tokens=False, skip_special_tokens=True
            )
            pred_text = output_text[0].strip()
        except Exception as e:
            print(e)
            pred_text = None
        return pred_text if pred_text else None
Hi @KevinN15, the model was trained on short utterances and may have trouble with long-form audio. You could try segmenting the audio files, either using VAD or at fixed time intervals (say 30 secs), and stitching the outputs together to get the full transcript. The model also struggles with speaker changes in the audio because it was mostly trained on single-speaker utterances.
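To illustrate the fixed-interval approach, here is a minimal sketch that cuts the waveform into 30-second windows and pushes each chunk through the transcribe() method from the class above, then joins the partial transcripts. The helper name, the chunk length, the small overlap, and the in-memory WAV round-trip are assumptions made for illustration, not part of the model's API; a VAD-based splitter (e.g. Silero VAD) could replace the fixed windows to avoid cutting words in half.

import io

import torchaudio


def transcribe_long_audio(
    model,                          # GraniteSpeechRecognitionModel instance (see above)
    audio_file: io.BytesIO,
    chunk_seconds: float = 30.0,
    overlap_seconds: float = 1.0,   # small overlap so words at chunk boundaries aren't lost
) -> str:
    # Load once, convert to mono 16 kHz as the model expects
    wav, sr = torchaudio.load(audio_file, normalize=True)
    wav = wav.mean(dim=0, keepdim=True)
    if sr != 16000:
        wav = torchaudio.functional.resample(wav, sr, 16000)
        sr = 16000

    chunk_len = int(chunk_seconds * sr)
    hop = chunk_len - int(overlap_seconds * sr)
    pieces = []
    for start in range(0, wav.shape[1], hop):
        chunk = wav[:, start:start + chunk_len]
        if chunk.shape[1] == 0:
            break
        # Serialize the chunk to an in-memory WAV so it matches transcribe()'s
        # io.BytesIO interface.
        buf = io.BytesIO()
        torchaudio.save(buf, chunk, sr, format="wav")
        buf.seek(0)
        text = model.transcribe(buf)
        if text:
            pieces.append(text)
        if start + chunk_len >= wav.shape[1]:
            break
    return " ".join(pieces)


# Usage (hypothetical file name):
# model = GraniteSpeechRecognitionModel()
# with open("long_recording.wav", "rb") as f:
#     transcript = transcribe_long_audio(model, io.BytesIO(f.read()))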