This is a Whisper large-v3-turbo model fine-tuned on the Kazakh Speech Corpus 2 (KSC2), about 1,000 hours of transcribed audio from diverse sources.
After training on the Train partition, it achieved a 9.16% word error rate (WER) on the Test partition.
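For short clips (up to 30 seconds) the model can also be used through the `transformers` automatic-speech-recognition pipeline. The snippet below is a minimal sketch; the file name `audio.wav` is a placeholder:

```python
from transformers import pipeline

# Load the fine-tuned checkpoint into an ASR pipeline (device=0 selects the first GPU, -1 the CPU)
asr = pipeline(
    "automatic-speech-recognition",
    model="abilmansplus/whisper-turbo-ksc2",
    device=0,
)

# "audio.wav" is a placeholder; the pipeline resamples the file to 16 kHz internally
result = asr("audio.wav", generate_kwargs={"language": "kazakh", "task": "transcribe"})
print(result["text"])
```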
For longer audio (more than 30 seconds), you can split it into 30-second chunks with a small overlap, transcribe each chunk separately, and then merge the results.
Below is an example implementation of a transcriber that handles both short and long audio files:
```python
import librosa
import numpy as np
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration


class Transcriber:
    def __init__(
        self,
        model_path="abilmansplus/whisper-turbo-ksc2",
        device="cuda:0",
        sampling_rate=16_000,
        language="kazakh",  # set to None if the audio is not always Kazakh; Kazakh will still be transcribed well
        task="transcribe",
        num_beams=5,
        chunk_length_s=30,  # chunk duration (seconds)
        stride_length_s=1,  # overlap (seconds) between chunks
    ):
        self.processor = WhisperProcessor.from_pretrained(
            model_path,
            language=language,
            task=task,
        )
        self.model = WhisperForConditionalGeneration.from_pretrained(model_path)
        self.model = self.model.to(device)
        self.sr = sampling_rate
        self.language = language  # either None or "kazakh" works with this model
        self.task = task
        self.num_beams = num_beams
        self.chunk_length_s = chunk_length_s  # chunk length in seconds
        self.stride_length_s = stride_length_s  # overlap between chunks in seconds

    def transcribe(self, audio_path: str) -> str:
        """Transcribes the audio chunk by chunk and merges the results.

        Args:
            audio_path (str): path to the audio file to be transcribed
        Returns:
            full_transcription (str): transcription of the entire audio
        """
        speech_array, sampling_rate = librosa.load(audio_path, sr=self.sr)
        audio_length_s = len(speech_array) / self.sr

        # If the audio fits into a single chunk, process it directly
        if audio_length_s <= self.chunk_length_s:
            full_transcription = self._transcribe_chunk(speech_array)
            return full_transcription

        # For longer audio, process in overlapping chunks
        chunk_length_samples = int(self.chunk_length_s * self.sr)
        stride_length_samples = int(self.stride_length_s * self.sr)

        # Calculate the number of chunks
        num_samples = len(speech_array)
        num_chunks = max(
            1,
            int(
                1
                + np.ceil(
                    (num_samples - chunk_length_samples)
                    / (chunk_length_samples - stride_length_samples)
                )
            ),
        )

        transcriptions = []
        for i in range(num_chunks):
            # Calculate chunk start and end
            start = max(0, i * (chunk_length_samples - stride_length_samples))
            end = min(num_samples, start + chunk_length_samples)
            # Get the audio chunk
            chunk = speech_array[start:end]
            # Transcribe the chunk
            chunk_transcription = self._transcribe_chunk(chunk)
            transcriptions.append(chunk_transcription)

        # Combine the chunk transcriptions
        full_transcription = " ".join(transcriptions)
        return full_transcription

    def _transcribe_chunk(self, audio_chunk) -> str:
        # Extract log-Mel input features
        inputs = self.processor(
            audio_chunk,
            sampling_rate=self.sr,
            return_tensors="pt",
        ).input_features.to(self.model.device)

        # Get forced decoder IDs for the language and task
        forced_decoder_ids = self.processor.get_decoder_prompt_ids(
            language=self.language,
            task=self.task,
        )

        # The attention mask is 1 for every frame of the input features,
        # which have shape (batch, n_mels, frames)
        attention_mask = torch.ones_like(inputs[:, 0, :])

        # Generate the transcription
        with torch.no_grad():
            generated_ids = self.model.generate(
                inputs,
                forced_decoder_ids=forced_decoder_ids,
                max_length=448,
                num_beams=self.num_beams,
                attention_mask=attention_mask,
            )

        # Decode the generated IDs to text
        transcription = self.processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )[0]
        return transcription
```
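A minimal usage sketch of the `Transcriber` class above (`audio.wav` is a placeholder path):

```python
# Instantiate the transcriber and run it on a local file
transcriber = Transcriber(model_path="abilmansplus/whisper-turbo-ksc2", device="cuda:0")
text = transcriber.transcribe("audio.wav")  # placeholder path
print(text)
```

Note that the 1-second overlap between chunks helps avoid cutting words at chunk boundaries, but the merged transcriptions are not deduplicated, so a word that falls inside the overlap may occasionally appear twice.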