Incomplete Transcripts (with Transformers package)
Hi everyone,
I'm experiencing issues with the Granite speech-to-text model (version 3.3.1, transformers 4.52.4) when transcribing German audio samples. Specifically, large portions of the audio content are omitted from the generated transcript, leaving significant gaps between spoken words or phrases. These omissions occur sporadically throughout the transcription output, despite the corresponding audio data being present. Running out of tokens shouldn't be the cause: I already tried increasing max_new_tokens to 4096 and still get the same behaviour.
Notably, these omissions occur even when using high-quality, noise-free audio inputs, which should ideally yield accurate and complete transcripts.
Can anyone provide guidance on how to troubleshoot or resolve this issue?
Here is my code:
import io
from enum import Enum, auto
from importlib.resources import files

import torch
import torchaudio
from transformers import AutoProcessor, AutoModelForSpeechSeq2Seq

from speech_recognition.deploy import model as models
import os

# Workaround for torch.classes path inspection issues (e.g. under Streamlit)
torch.classes.__path__ = []


class GraniteSpeechRecognitionModel:
    def __init__(
        self,
        model_id: str = "granite-speech-3.3-8b",
    ) -> None:
        print("Model loading...")
        self.model_id = str(files(models) / model_id)
        self.device = "cuda:0" if torch.cuda.is_available() else "cpu"
        self.model = AutoModelForSpeechSeq2Seq.from_pretrained(
            self.model_id,
            device_map=self.device,
            torch_dtype=torch.bfloat16,
        )
        self.processor = AutoProcessor.from_pretrained(self.model_id)
        self.tokenizer = self.processor.tokenizer
        print("Loaded model")

    def transcribe(
        self,
        audio_file: io.BytesIO,
        chat=None,
    ) -> str:
        try:
            # Load audio and convert to mono, 16 kHz
            wav, sr = torchaudio.load(audio_file, normalize=True)
            if wav.shape[0] != 1 or sr != 16000:
                wav = wav.mean(dim=0, keepdim=True)
                wav = torchaudio.functional.resample(wav, sr, 16000)
                sr = 16000
            if chat is None:
                chat = [
                    {
                        "role": "system",
                        "content": "Knowledge Cutoff Date: April 2024.\nToday's Date: April 9, 2025.\nYou are Granite, developed by IBM. You are a helpful AI assistant",
                    },
                    {
                        "role": "user",
                        "content": "<|audio|>can you transcribe the speech into a written format?",
                    },
                ]
            prompt = self.tokenizer.apply_chat_template(
                chat, tokenize=False, add_generation_prompt=True
            )
            model_inputs = self.processor(
                prompt,
                wav,
                device=self.device,
                return_tensors="pt",
            ).to(self.device)
            model_outputs = self.model.generate(
                **model_inputs, max_new_tokens=1024, do_sample=False, num_beams=1
            )
            # Decode only the newly generated tokens, skipping the prompt
            num_input_tokens = model_inputs["input_ids"].shape[-1]
            new_tokens = torch.unsqueeze(model_outputs[0, num_input_tokens:], dim=0)
            output_text = self.tokenizer.batch_decode(
                new_tokens, add_special_tokens=False, skip_special_tokens=True
            )
            pred_text = output_text[0].strip()
        except Exception as e:
            print(e)
            pred_text = None
        return pred_text if pred_text else None
Hi @KevinN15, the model was trained on short utterances and may have trouble with long-form audio. You could try segmenting the audio files, either using VAD or at fixed time intervals (say 30 secs), and stitching the outputs together to get the full transcript. The model also struggles with speaker changes in the audio because it was mostly trained on single-speaker utterances.
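To illustrate the fixed-interval approach, here is a minimal sketch that cuts the waveform into 30-second windows and pushes each chunk through the transcribe() method from the class above, then joins the partial transcripts. The helper name, the chunk length, the small overlap, and the in-memory WAV round-trip are assumptions made for illustration, not part of the model's API; a VAD-based splitter (e.g. Silero VAD) could replace the fixed windows to avoid cutting words in half.

import io

import torchaudio


def transcribe_long_audio(
    model,                          # GraniteSpeechRecognitionModel instance (see above)
    audio_file: io.BytesIO,
    chunk_seconds: float = 30.0,
    overlap_seconds: float = 1.0,   # small overlap so words at chunk boundaries aren't lost
) -> str:
    # Load once, convert to mono 16 kHz as the model expects
    wav, sr = torchaudio.load(audio_file, normalize=True)
    wav = wav.mean(dim=0, keepdim=True)
    if sr != 16000:
        wav = torchaudio.functional.resample(wav, sr, 16000)
        sr = 16000

    chunk_len = int(chunk_seconds * sr)
    hop = chunk_len - int(overlap_seconds * sr)
    pieces = []
    for start in range(0, wav.shape[1], hop):
        chunk = wav[:, start:start + chunk_len]
        if chunk.shape[1] == 0:
            break
        # Serialize the chunk to an in-memory WAV so it matches transcribe()'s
        # io.BytesIO interface.
        buf = io.BytesIO()
        torchaudio.save(buf, chunk, sr, format="wav")
        buf.seek(0)
        text = model.transcribe(buf)
        if text:
            pieces.append(text)
        if start + chunk_len >= wav.shape[1]:
            break
    return " ".join(pieces)


# Usage (hypothetical file name):
# model = GraniteSpeechRecognitionModel()
# with open("long_recording.wav", "rb") as f:
#     transcript = transcribe_long_audio(model, io.BytesIO(f.read()))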