|
import gradio as gr |
|
import torch |
|
from transformers import WhisperProcessor, WhisperForConditionalGeneration |
|
import librosa |
|
|
|
|
|
def load_components():
    """Build the Whisper processor and model used for transcription.

    The processor is loaded from ``openai/whisper-large-v3-turbo`` and the
    model from ``AkitoP/whisper-jsut5000-voicevox-phone-lora``. The decoder
    prompt ids for Japanese transcription are stored on
    ``model.config.forced_decoder_ids`` and the model is switched to eval mode.

    Returns:
        A ``(processor, model)`` tuple.
    """
    whisper_processor = WhisperProcessor.from_pretrained(
        "openai/whisper-large-v3-turbo", language="Japanese", task="transcribe"
    )

    # "auto" is only requested when CUDA is available; otherwise pin to CPU.
    placement = "auto" if torch.cuda.is_available() else "cpu"
    whisper_model = WhisperForConditionalGeneration.from_pretrained(
        "AkitoP/whisper-jsut5000-voicevox-phone-lora", device_map=placement
    )

    # Pin the decoder prompt so generation is configured for Japanese transcription.
    prompt_ids = whisper_processor.get_decoder_prompt_ids(
        language="japanese", task="transcribe"
    )
    whisper_model.config.forced_decoder_ids = prompt_ids
    whisper_model.eval()

    return whisper_processor, whisper_model
|
|
|
# Load the processor and model once at import time; transcribe_audio reads
# these module-level names on every call.
processor, model = load_components()
|
|
|
|
|
def transcribe_audio(audio_path):
    """Transcribe an audio file with the module-level ``processor`` and ``model``.

    Args:
        audio_path: Path of the audio file to transcribe. ``None`` or an empty
            string (e.g. a UI form submitted without a file) is reported as an
            error string rather than passed to the audio loader.

    Returns:
        The decoded text of the first generated sequence, or a string starting
        with ``"Error: "`` when no file was given or any step raised.
    """
    # Guard the "nothing supplied" case explicitly so the caller sees a clear
    # message instead of a loader's internal exception text.
    if not audio_path:
        return "Error: no audio file provided"

    sample_rate = 16000  # used both for loading and for feature extraction

    try:
        # librosa resamples to the requested rate; the returned rate is unused.
        waveform, _ = librosa.load(audio_path, sr=sample_rate)

        features = processor.feature_extractor(
            waveform,
            sampling_rate=sample_rate,
            return_tensors="pt",
        ).input_features.to(model.device)

        # Inference only: skip gradient tracking.
        with torch.no_grad():
            generated_ids = model.generate(features, max_length=256)

        return processor.tokenizer.batch_decode(
            generated_ids,
            skip_special_tokens=True,
        )[0]
    except Exception as e:
        # UI boundary: report the failure as text instead of raising.
        return f"Error: {e}"
|
|
|
|
|
# Single-file upload in, plain text out.
audio_input = gr.Audio(sources="upload", type="filepath")
text_output = gr.Textbox(label="output")

# Wire transcribe_audio into a simple web UI.
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=audio_input,
    outputs=text_output,
    title="ASR",
    description="Whisper trained on Prosodic&Phonemic transcription on JSUT5000 & VOICEVOX generated dataset.",
    allow_flagging="never",
)

# Start the app as soon as the module is executed.
demo.launch()