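# Gradio demo: Japanese speech recognition with a Whisper large-v3-turbo
# checkpoint fine-tuned on JSUT5000 and VOICEVOX-generated data.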
import gradio as gr
import torch
from transformers import WhisperProcessor, WhisperForConditionalGeneration
import librosa

# Initialize components
def load_components():
    processor = WhisperProcessor.from_pretrained(
        "openai/whisper-large-v3-turbo",
        language="Japanese",
        task="transcribe"
    )
    model = WhisperForConditionalGeneration.from_pretrained(
        "AkitoP/whisper-jsut5000-voicevox-phone-lora",
        device_map="auto" if torch.cuda.is_available() else "cpu"
    )
    # Set forced decoder ids so generation always transcribes in Japanese
    forced_decoder_ids = processor.get_decoder_prompt_ids(
        language="japanese",
        task="transcribe"
    )
    model.config.forced_decoder_ids = forced_decoder_ids
    model.eval()
    return processor, model
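
# Load the processor and model once at startup so every request reuses them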
processor, model = load_components()

# Speech-to-text function
def transcribe_audio(audio_path):
    try:
        # Load the audio at Whisper's expected 16 kHz sampling rate
        audio, sr = librosa.load(audio_path, sr=16000)
        # Extract log-mel input features
        inputs = processor.feature_extractor(
            audio,
            sampling_rate=16000,
            return_tensors="pt"
        ).input_features.to(model.device)
        # Generate token ids
        with torch.no_grad():
            generated_ids = model.generate(inputs, max_length=256)
        # Decode the result
        text = processor.tokenizer.batch_decode(
            generated_ids,
            skip_special_tokens=True
        )[0]
        return text
    except Exception as e:
        return f"Error: {str(e)}"

# Build the Gradio interface
demo = gr.Interface(
    fn=transcribe_audio,
    inputs=gr.Audio(sources=["upload"], type="filepath"),
    outputs=gr.Textbox(label="output"),
    title="ASR",
    description="Whisper fine-tuned for prosodic & phonemic transcription on JSUT5000 and a VOICEVOX-generated dataset.",
    allow_flagging="never"
)
demo.launch()