This is a Whisper large-v3-turbo model fine-tuned on the Kazakh Speech Corpus 2 (KSC2), about 1,000 hours of transcribed audio from diverse sources.
After training on the Train partition, it achieved a 9.16% word error rate (WER) on the Test partition.
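For short clips (up to 30 seconds) the model can also be used through the `transformers` automatic-speech-recognition pipeline. The snippet below is a minimal sketch; the file name `audio.wav` is a placeholder:

```python
from transformers import pipeline

# Load the fine-tuned checkpoint into an ASR pipeline (device=0 selects the first GPU, -1 the CPU)
asr = pipeline(
    "automatic-speech-recognition",
    model="abilmansplus/whisper-turbo-ksc2",
    device=0,
)

# "audio.wav" is a placeholder; the pipeline resamples the file to 16 kHz internally
result = asr("audio.wav", generate_kwargs={"language": "kazakh", "task": "transcribe"})
print(result["text"])
```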
For longer audio (more than 30 seconds), you can split it into 30-second chunks with a small overlap, transcribe each chunk separately, and then merge the results.
Below is an example implementation of a transcriber that handles both short and long audio files:
```python
import librosa
import numpy as np
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration


class Transcriber:
    def __init__(
        self,
        model_path="abilmansplus/whisper-turbo-ksc2",
        device="cuda:0",
        sampling_rate=16_000,
        language="kazakh",  # set to None if the audio is not always Kazakh; Kazakh will still be transcribed well
        task="transcribe",
        num_beams=5,
        chunk_length_s=30,  # chunk duration (seconds)
        stride_length_s=1,  # overlap (seconds) between chunks
    ):
        self.processor = WhisperProcessor.from_pretrained(
            model_path,
            language=language,
            task=task,
        )
        self.model = WhisperForConditionalGeneration.from_pretrained(model_path)
        self.model = self.model.to(device)
        self.sr = sampling_rate
        self.language = language  # either None or "kazakh" works with this model
        self.task = task
        self.num_beams = num_beams
        self.chunk_length_s = chunk_length_s  # chunk length in seconds
        self.stride_length_s = stride_length_s  # overlap between chunks in seconds

    def transcribe(self, audio_path: str) -> str:
        """Transcribes the audio chunk by chunk and merges the results.

        Args:
            audio_path (str): path to the audio file to be transcribed
        Returns:
            full_transcription (str): transcription of the entire audio
        """
        speech_array, sampling_rate = librosa.load(audio_path, sr=self.sr)
        audio_length_s = len(speech_array) / self.sr

        # If the audio fits into a single chunk, process it directly
        if audio_length_s <= self.chunk_length_s:
            full_transcription = self._transcribe_chunk(speech_array)
            return full_transcription

        # For longer audio, process in overlapping chunks
        chunk_length_samples = int(self.chunk_length_s * self.sr)
        stride_length_samples = int(self.stride_length_s * self.sr)

        # Calculate the number of chunks
        num_samples = len(speech_array)
        num_chunks = max(
            1,
            int(
                1
                + np.ceil(
                    (num_samples - chunk_length_samples)
                    / (chunk_length_samples - stride_length_samples)
                )
            ),
        )

        transcriptions = []
        for i in range(num_chunks):
            # Calculate chunk start and end
            start = max(0, i * (chunk_length_samples - stride_length_samples))
            end = min(num_samples, start + chunk_length_samples)
            # Get the audio chunk
            chunk = speech_array[start:end]
            # Transcribe the chunk
            chunk_transcription = self._transcribe_chunk(chunk)
            transcriptions.append(chunk_transcription)

        # Combine the chunk transcriptions
        full_transcription = " ".join(transcriptions)
        return full_transcription

    def _transcribe_chunk(self, audio_chunk) -> str:
        # Extract log-Mel input features
        inputs = self.processor(
            audio_chunk,
            sampling_rate=self.sr,
            return_tensors="pt",
        ).input_features.to(self.model.device)

        # Get forced decoder IDs for the language and task
        forced_decoder_ids = self.processor.get_decoder_prompt_ids(
            language=self.language,
            task=self.task,
        )

        # The attention mask is 1 for every frame of the input features,
        # which have shape (batch, n_mels, frames)
        attention_mask = torch.ones_like(inputs[:, 0, :])

        # Generate the transcription
        with torch.no_grad():
            generated_ids = self.model.generate(
                inputs,
                forced_decoder_ids=forced_decoder_ids,
                max_length=448,
                num_beams=self.num_beams,
                attention_mask=attention_mask,
            )

        # Decode the generated IDs to text
        transcription = self.processor.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )[0]
        return transcription
```
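A minimal usage sketch of the `Transcriber` class above (`audio.wav` is a placeholder path):

```python
# Instantiate the transcriber and run it on a local file
transcriber = Transcriber(model_path="abilmansplus/whisper-turbo-ksc2", device="cuda:0")
text = transcriber.transcribe("audio.wav")  # placeholder path
print(text)
```

Note that the 1-second overlap between chunks helps avoid cutting words at chunk boundaries, but the merged transcriptions are not deduplicated, so a word that falls inside the overlap may occasionally appear twice.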