Spaces:

Luigi
/

Whisper-vs-Sensevoice-Small

Running on Zero

App Files Files Community

Whisper-vs-Sensevoice-Small / app.py

Luigi

Add MediaTek-Research/Breeze-ASR-25

d6c150b 14 days ago

raw

history blame contribute delete

18.1 kB

	import os
	import re
	import tempfile

	import torch
	import gradio as gr
	from faster_whisper import BatchedInferencePipeline, WhisperModel
	from pydub import AudioSegment, effects
	from pyannote.audio import Pipeline as DiarizationPipeline
	import opencc

	import spaces # zeroGPU support
	from funasr import AutoModel
	from funasr.utils.postprocess_utils import rich_transcription_postprocess

	from termcolor import cprint
	import time
	import torchaudio
	from pyannote.audio.pipelines.utils.hook import ProgressHook

	# —————— Model Lists ——————
	# Model identifiers offered in the "Whisper Model" dropdown; each one is passed
	# to WhisperModel(...) by get_fwhisper_model. The first entry is the dropdown's
	# default value.
	WHISPER_MODELS = [
	"SoybeanMilk/faster-whisper-Breeze-ASR-25",
	"asadfgglie/faster-whisper-large-v3-zh-TW",
	"deepdml/faster-whisper-large-v3-turbo-ct2",
	"guillaumekln/faster-whisper-tiny",
	"Systran/faster-whisper-large-v3",
	"XA9/Belle-faster-whisper-large-v3-zh-punct",
	"guillaumekln/faster-whisper-medium",
	"guillaumekln/faster-whisper-small",
	"guillaumekln/faster-whisper-base",
	"Luigi/whisper-small-zh_tw-ct2",
	]

	# Model identifiers offered in the "SenseVoice Model" dropdown; each one is
	# passed to funasr's AutoModel(model=..., hub="hf") by get_sense_model.
	# The first entry is the dropdown's default value.
	SENSEVOICE_MODELS = [
	"FunAudioLLM/SenseVoiceSmall",
	"funasr/paraformer-zh",
	]

	# —————— Language Options ——————
	# Choices for the "Whisper Language" dropdown. "auto" is not forwarded as-is:
	# the Whisper transcription code maps it to language=None.
	WHISPER_LANGUAGES = [
	"zh", "af","am","ar","as","az","ba","be","bg","bn","bo",
	"br","bs","ca","cs","cy","da","de","el","en","es","et",
	"eu","fa","fi","fo","fr","gl","gu","ha","haw","he","hi",
	"hr","ht","hu","hy","id","is","it","ja","jw","ka","kk",
	"km","kn","ko","la","lb","ln","lo","lt","lv","mg","mi",
	"mk","ml","mn","mr","ms","mt","my","ne","nl","nn","no",
	"oc","pa","pl","ps","pt","ro","ru","sa","sd","si","sk",
	"sl","sn","so","sq","sr","su","sv","sw","ta","te","tg",
	"th","tk","tl","tr","tt","uk","ur","uz","vi","yi","yo",
	"yue", "auto",
	]

	# Choices for the "SenseVoice Language" dropdown; the selected value is passed
	# unchanged as `language=` to the SenseVoice model's generate() call.
	SENSEVOICE_LANGUAGES = ["zh", "yue", "en", "ja", "ko", "auto", "nospeech"]

	# —————— Caches ——————
	# NOTE(review): whisper_pipes is not referenced anywhere else in the visible
	# code (the faster-whisper cache actually used is _fwhisper_models) — confirm
	# it can be removed.
	whisper_pipes = {}
	# Loaded SenseVoice models, keyed by (model_id, device_str); see get_sense_model.
	sense_models = {}
	# Diarization pipeline, loaded on first use by get_diarization_pipe.
	dar_pipe = None

	# OpenCC converter built with the 's2t' configuration; applied to every
	# transcript before display.
	converter = opencc.OpenCC('s2t')

	# —————— Diarization Formatter ——————
	def format_diarization_html(snippets):
	    """
	    Render speaker-tagged transcript snippets as colored HTML paragraphs.

	    Each snippet is expected to look like "[SPEAKER] text". A snippet that
	    does not start with "[" or contains no "]" is treated as text with no
	    speaker. Snippets whose text is empty after stripping are skipped.

	    Every distinct speaker is assigned a color from a fixed palette in order
	    of first appearance (cycling when the palette is exhausted). The speaker
	    label is shown only on the first paragraph of a run of consecutive
	    snippets by the same speaker, and never for snippets without a speaker.

	    Speaker labels and text are HTML-escaped, so transcript content such as
	    "<" or "&" is displayed literally instead of being parsed as markup.

	    Args:
	        snippets: iterable of snippet strings.

	    Returns:
	        A single "<div>...</div>" string containing one "<p>" per non-empty
	        snippet.
	    """
	    # Function-scope import keeps this block self-contained.
	    import html

	    palette = ["#e74c3c", "#3498db", "#27ae60", "#e67e22", "#9b59b6", "#16a085", "#f1c40f"]
	    speaker_colors = {}
	    html_lines = []
	    last_spk = None
	    for snippet in snippets:
	        if snippet.startswith("[") and "]" in snippet:
	            spk, txt = snippet[1:].split("]", 1)
	            spk, txt = spk.strip(), txt.strip()
	        else:
	            spk, txt = "", snippet.strip()
	        # Hide empty lines.
	        if not txt:
	            continue
	        # Assign the next palette color the first time a speaker is seen.
	        if spk not in speaker_colors:
	            speaker_colors[spk] = palette[len(speaker_colors) % len(palette)]
	        color = speaker_colors[spk]
	        safe_txt = html.escape(txt)
	        # Drop the label when the speaker is unchanged or unknown.
	        if spk == last_spk or not spk:
	            display = safe_txt
	        else:
	            display = f"<strong>{html.escape(spk)}:</strong> {safe_txt}"
	        last_spk = spk
	        html_lines.append(
	            f"<p style='margin:4px 0; font-family:monospace; color:{color};'>{display}</p>"
	        )
	    return "<div>" + "".join(html_lines) + "</div>"

	# —————— Helpers ——————

	# —————— Faster-Whisper Cache & Factory ——————
	# Loaded faster-whisper pipelines, keyed by (model_id, device).
	_fwhisper_models: dict[tuple[str, str], BatchedInferencePipeline] = {}

	def get_fwhisper_model(model_id: str, device: str) -> BatchedInferencePipeline:
	    """
	    Return the cached batched faster-whisper pipeline for (model_id, device).

	    On first use for a given key, a WhisperModel is created and wrapped in a
	    BatchedInferencePipeline; the wrapper (not the bare model) is what is
	    cached and returned.

	    Args:
	        model_id: model identifier passed to WhisperModel.
	        device: device string passed to WhisperModel; callers in this file
	            use "cpu" or "cuda".

	    Returns:
	        The BatchedInferencePipeline for this model/device pair.
	    """
	    key = (model_id, device)
	    if key not in _fwhisper_models:
	        # float16 on CUDA devices, int8 everywhere else.
	        compute_type = "float16" if device.startswith("cuda") else "int8"
	        model = WhisperModel(
	            model_id,
	            device=device,
	            compute_type=compute_type,
	        )
	        _fwhisper_models[key] = BatchedInferencePipeline(model=model)
	    return _fwhisper_models[key]

	def get_sense_model(model_id: str, device_str: str):
	    """
	    Return the FunASR model for (model_id, device_str), building it on first use.

	    Models are kept in the module-level `sense_models` dict so each
	    model/device combination is constructed only once.
	    """
	    cache_key = (model_id, device_str)
	    if cache_key in sense_models:
	        return sense_models[cache_key]

	    # First request for this combination: construct the model with the
	    # "fsmn-vad" VAD front end and remember it.
	    loaded = AutoModel(
	        model=model_id,
	        vad_model="fsmn-vad",
	        vad_kwargs={"max_single_segment_time": 300000},
	        device=device_str,
	        ban_emo_unk=False,
	        hub="hf",
	    )
	    sense_models[cache_key] = loaded
	    return loaded


	def get_diarization_pipe():
	    """
	    Return the shared pyannote diarization pipeline, loading it on first use.

	    The pipeline is stored in the module-level `dar_pipe` so later calls
	    reuse it. "pyannote/speaker-diarization-3.1" is tried first; if loading
	    raises or yields no pipeline, "pyannote/speaker-diarization@2.1" is
	    loaded instead. The access token is read from the HF_TOKEN or
	    HUGGINGFACE_TOKEN environment variable; when neither is set, `True` is
	    passed as `use_auth_token`.

	    Returns:
	        The loaded diarization pipeline.

	    Raises:
	        RuntimeError: if neither pipeline could be loaded.
	        Exception: whatever the fallback load itself raises.
	    """
	    global dar_pipe
	    if dar_pipe is not None:
	        return dar_pipe

	    token = os.getenv("HF_TOKEN") or os.getenv("HUGGINGFACE_TOKEN")
	    auth = token or True

	    pipe = None
	    try:
	        pipe = DiarizationPipeline.from_pretrained(
	            "pyannote/speaker-diarization-3.1",
	            use_auth_token=auth
	        )
	    except Exception as e:
	        print(f"Failed to load pyannote/speaker-diarization-3.1: {e}\nFalling back to pyannote/speaker-diarization@2.1.")

	    # NOTE(review): from_pretrained is understood to return None (rather than
	    # raise) when a gated model cannot be downloaded — confirm against the
	    # installed pyannote.audio version. Treat that the same as a failure.
	    if pipe is None:
	        pipe = DiarizationPipeline.from_pretrained(
	            "pyannote/speaker-diarization@2.1",
	            use_auth_token=auth
	        )
	    if pipe is None:
	        raise RuntimeError(
	            "Could not load a pyannote speaker-diarization pipeline; "
	            "check HF_TOKEN / HUGGINGFACE_TOKEN and model access."
	        )

	    dar_pipe = pipe
	    return dar_pipe

	# —————— Whisper Transcription ——————
	def _transcribe_fwhisper_stream_common(
	    model_id,
	    language,
	    audio_path,
	    whisper_multilingual_en,
	    enable_punct,
	    backend,
	    device,
	    banner_text,
	    banner_color
	):
	    """
	    Diarize an audio file, transcribe each speaker turn with Faster-Whisper,
	    and stream the accumulated transcript as HTML.

	    For every diarized turn the matching slice of audio is normalized with
	    pydub, written to a temporary WAV file and transcribed. The text is run
	    through the module-level OpenCC converter. Consecutive turns by the same
	    speaker are merged into one snippet. When `enable_punct` is true and the
	    text does not already end in sentence punctuation, a Chinese period is
	    appended; otherwise a single space is appended as a separator.

	    Args:
	        model_id: Whisper model identifier.
	        language: language code, or "auto" to let the model detect it.
	        audio_path: path to the audio file.
	        whisper_multilingual_en: forwarded as `multilingual=` to transcribe().
	        enable_punct: whether to append a Chinese period to text lacking
	            trailing punctuation.
	        backend: device string for get_fwhisper_model ("cpu" or "cuda").
	        device: torch.device used for the waveform and the diarizer.
	        banner_text: label for the console banner (e.g. "CPU" or "CUDA").
	        banner_color: color for the console banner.

	    Yields:
	        ("", html) after every diarized turn, where html is
	        format_diarization_html() of all snippets collected so far.
	    """
	    # Detects sentence-ending punctuation at the end of a transcript.
	    end_punct = re.compile(r'[。！？…~～\.\!?]+$')

	    # Initialize whisper pipe
	    pipe = get_fwhisper_model(model_id, backend)
	    cprint(f'Whisper (faster-whisper) using {banner_text} [stream]', banner_color)

	    # Load diarizer and audio. The diarizer is a shared, cached object, so it
	    # is always moved to the requested device (same as the SenseVoice path)
	    # rather than only when CUDA is selected.
	    diarizer = get_diarization_pipe()
	    diarizer.to(device)
	    waveform, sample_rate = torchaudio.load(audio_path)
	    if device.type == 'cuda':
	        waveform = waveform.to(device)

	    # Run diarization
	    with ProgressHook() as hook:
	        diary = diarizer({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)

	    # Decode the file once; each turn only needs a slice of it.
	    full_audio = AudioSegment.from_file(audio_path)

	    snippets = []
	    for turn, _, speaker in diary.itertracks(yield_label=True):
	        # Extract and normalize the audio for this turn.
	        start_ms = int(turn.start * 1000)
	        end_ms = int(turn.end * 1000)
	        segment = effects.normalize(full_audio[start_ms:end_ms])

	        # Reserve a temp path, then make sure it is removed even if export
	        # or transcription fails.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	            tmp_path = tmp.name
	        try:
	            segment.export(tmp_path, format="wav")
	            segments, _ = pipe.transcribe(
	                tmp_path,
	                beam_size=3,
	                best_of=3,
	                language=None if language == "auto" else language,
	                vad_filter=True,
	                batch_size=16,
	                multilingual=whisper_multilingual_en,
	            )
	            # Consume the segments while the temp file is still present.
	            raw_text = "".join(s.text for s in segments).strip()
	        finally:
	            os.unlink(tmp_path)

	        # Convert and clean text
	        text = converter.convert(raw_text)
	        if text:
	            tag = f"[{speaker}]"
	            if enable_punct and not end_punct.search(text):
	                text = f'{text}。'
	            else:
	                text = f'{text} '
	            if snippets and snippets[-1].startswith(tag):
	                # Same speaker: merge into the previous snippet.
	                prev_text = snippets[-1].split('] ', 1)[1]
	                snippets[-1] = f"{tag} {prev_text}{text}"
	            else:
	                # New speaker: start a new snippet.
	                snippets.append(f"{tag} {text}")

	        # Yield accumulated HTML
	        yield "", format_diarization_html(snippets)

	def _transcribe_fwhisper_cpu_stream(
	    model_id,
	    language,
	    audio_path,
	    whisper_multilingual_en,
	    enable_punct
	):
	    """
	    Run the shared Faster-Whisper streaming generator on the CPU.

	    Forwards the caller's arguments unchanged and pins the backend, torch
	    device and console banner to their CPU values.
	    """
	    cpu_device = torch.device('cpu')
	    yield from _transcribe_fwhisper_stream_common(
	        model_id=model_id,
	        language=language,
	        audio_path=audio_path,
	        whisper_multilingual_en=whisper_multilingual_en,
	        enable_punct=enable_punct,
	        backend="cpu",
	        device=cpu_device,
	        banner_text="CPU",
	        banner_color="red",
	    )

	@spaces.GPU
	def _transcribe_fwhisper_gpu_stream(
	    model_id,
	    language,
	    audio_path,
	    whisper_multilingual_en,
	    enable_punct
	):
	    """
	    Run the shared Faster-Whisper streaming generator on CUDA.

	    Forwards the caller's arguments unchanged and pins the backend, torch
	    device and console banner to their CUDA values. Decorated with
	    `spaces.GPU`.
	    """
	    cuda_device = torch.device('cuda')
	    yield from _transcribe_fwhisper_stream_common(
	        model_id=model_id,
	        language=language,
	        audio_path=audio_path,
	        whisper_multilingual_en=whisper_multilingual_en,
	        enable_punct=enable_punct,
	        backend="cuda",
	        device=cuda_device,
	        banner_text="CUDA",
	        banner_color="green",
	    )

	def transcribe_fwhisper_stream(model_id, language, audio_path, device_sel, whisper_multilingual_en, enable_punct):
	    """
	    Stream a Faster-Whisper transcription from the GPU or the CPU generator.

	    The GPU generator is used only when `device_sel` is "GPU" and CUDA is
	    available; in every other case the CPU generator is used. The two-value
	    tuples produced by the chosen generator are passed through unchanged.
	    """
	    wants_gpu = device_sel == "GPU" and torch.cuda.is_available()
	    worker = _transcribe_fwhisper_gpu_stream if wants_gpu else _transcribe_fwhisper_cpu_stream
	    yield from worker(model_id, language, audio_path, whisper_multilingual_en, enable_punct)

	# —————— SenseVoice Transcription ——————
	def _transcribe_sense_stream_common(
	    model_id: str,
	    language: str,
	    audio_path: str,
	    enable_punct: bool,
	    backend: str,
	    device: torch.device,
	    banner_text: str,
	    banner_color: str
	):
	    """
	    Diarize an audio file, transcribe each speaker turn with a SenseVoice
	    (FunASR) model, and stream the accumulated transcript as HTML.

	    For every diarized turn the matching slice of audio is written to a
	    temporary WAV file and passed to the model's generate(). A failure in
	    generate() is printed and that turn is skipped. The text is cleaned with
	    rich_transcription_postprocess and converted with the module-level OpenCC
	    converter. When `enable_punct` is false, every character that is neither
	    a word character nor whitespace is removed. When it is true and the text
	    does not already end in sentence punctuation, a Chinese period is
	    appended; otherwise a single space is appended as a separator.
	    Consecutive turns by the same speaker are merged into one snippet.

	    Args:
	        model_id: model identifier for get_sense_model.
	        language: language code, forwarded unchanged to generate().
	        audio_path: path to the audio file.
	        enable_punct: forwarded as `use_itn=`; also controls punctuation
	            removal and period appending as described above.
	        backend: device string for get_sense_model ("cpu" or "cuda:0").
	        device: torch.device used for the waveform and the diarizer.
	        banner_text: label for the console banner.
	        banner_color: color for the console banner.

	    Yields:
	        ("", html) after every diarized turn, where html is
	        format_diarization_html() of all snippets collected so far.
	    """
	    # Detects sentence-ending punctuation at the end of a transcript.
	    end_punct = re.compile(r'[。！？…~～\.\!?]+$')

	    # Load model
	    model = get_sense_model(model_id, backend)
	    cprint(f'SenseVoiceSmall using {banner_text} [stream]', banner_color)

	    # Prepare diarizer and audio
	    diarizer = get_diarization_pipe()
	    diarizer.to(device)
	    waveform, sample_rate = torchaudio.load(audio_path)
	    if device.type == 'cuda':
	        waveform = waveform.to(device)

	    # Run diarization
	    with ProgressHook() as hook:
	        diary = diarizer({"waveform": waveform, "sample_rate": sample_rate}, hook=hook)

	    # Decode the file once; each turn only needs a slice of it.
	    full_audio = AudioSegment.from_file(audio_path)

	    snippets = []
	    # The same dict is handed to every generate() call.
	    cache = {}
	    for turn, _, speaker in diary.itertracks(yield_label=True):
	        start_ms = int(turn.start * 1000)
	        end_ms = int(turn.end * 1000)
	        segment = full_audio[start_ms:end_ms]

	        # Reserve a temp path, then make sure it is removed even if the
	        # export fails.
	        with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
	            tmp_path = tmp.name
	        try:
	            segment.export(tmp_path, format="wav")
	            try:
	                segs = model.generate(
	                    input=tmp_path,
	                    cache=cache,
	                    language=language,
	                    use_itn=enable_punct,
	                    batch_size_s=300
	                )
	            except Exception as e:
	                # Best effort: report the failure and skip this turn.
	                cprint(f'Error: {e}', 'red')
	                segs = None
	        finally:
	            os.unlink(tmp_path)

	        # Post-process text
	        if segs:
	            txt = rich_transcription_postprocess(segs[0]['text'])
	            # Remove all punctuation if disabled
	            if not enable_punct:
	                txt = re.sub(r"[^\w\s]", "", txt)

	            if txt:
	                txt = converter.convert(txt)
	                tag = f"[{speaker}]"
	                if enable_punct and not end_punct.search(txt):
	                    txt = f'{txt}。'
	                else:
	                    txt = f'{txt} '
	                if snippets and snippets[-1].startswith(tag):
	                    # Same speaker: merge into the previous snippet.
	                    prev_text = snippets[-1].split('] ', 1)[1]
	                    snippets[-1] = f"{tag} {prev_text}{txt}"
	                else:
	                    # New speaker: start a new snippet.
	                    snippets.append(f"{tag} {txt}")

	        # Yield accumulated HTML
	        yield "", format_diarization_html(snippets)

	def _transcribe_sense_cpu_stream(
	    model_id: str,
	    language: str,
	    audio_path: str,
	    enable_punct: bool
	):
	    """
	    Run the shared SenseVoice streaming generator on the CPU.

	    Forwards the caller's arguments unchanged and pins the backend, torch
	    device and console banner to their CPU values.
	    """
	    cpu_settings = dict(
	        backend="cpu",
	        device=torch.device('cpu'),
	        banner_text="CPU",
	        banner_color="red",
	    )
	    yield from _transcribe_sense_stream_common(
	        model_id, language, audio_path, enable_punct, **cpu_settings
	    )

	@spaces.GPU(duration=120)
	def _transcribe_sense_gpu_stream(
	    model_id: str,
	    language: str,
	    audio_path: str,
	    enable_punct: bool
	):
	    """
	    Run the shared SenseVoice streaming generator on CUDA.

	    Forwards the caller's arguments unchanged and pins the backend, torch
	    device and console banner to their CUDA values. Decorated with
	    `spaces.GPU(duration=120)`.
	    """
	    cuda_settings = dict(
	        backend="cuda:0",
	        device=torch.device('cuda'),
	        banner_text="CUDA",
	        banner_color="green",
	    )
	    yield from _transcribe_sense_stream_common(
	        model_id, language, audio_path, enable_punct, **cuda_settings
	    )

	def transcribe_sense_steam(model_id: str,
	                           language: str,
	                           audio_path: str,
	                           enable_punct: bool,
	                           device_sel: str):
	    """
	    Stream a SenseVoice transcription from the GPU or the CPU generator.

	    The GPU generator is used only when `device_sel` is "GPU" and CUDA is
	    available; in every other case the CPU generator is used. The tuples
	    produced by the chosen generator are passed through unchanged.
	    """
	    wants_gpu = device_sel == "GPU" and torch.cuda.is_available()
	    worker = _transcribe_sense_gpu_stream if wants_gpu else _transcribe_sense_cpu_stream
	    yield from worker(model_id, language, audio_path, enable_punct)

	# —————— Gradio UI ——————
	# Stylesheet handed to gr.Blocks below. It targets the "diar" class, which the
	# two gr.HTML transcript panes set through elem_classes=["diar"].
	DEMO_CSS = """
	.diar {
	padding: 0.5rem;
	color: #f1f1f1;
	font-family: monospace;
	font-size: 0.9rem;
	}
	"""
	# Top-level Gradio app; launched at the bottom of the file when run as a script.
	Demo = gr.Blocks(css=DEMO_CSS)
	with Demo:
	gr.Markdown("## Faster-Whisper vs. SenseVoice")
	# Single audio source shared by both engines; its value is what the two
	# callbacks receive as `audio_path`.
	audio_input = gr.Audio(sources=["upload", "microphone"], type="filepath", label="Audio Input")
	# Example clips wired to audio_input.
	# NOTE(review): the mp3 paths are relative — presumably the files sit next to
	# this script; confirm.
	examples = gr.Examples(
	examples=[["interview.mp3"], ["news.mp3"], ["meeting.mp3"]],
	inputs=[audio_input],
	label="Example Audio Files"
	)

	# ────────────────────────────────────────────────────────────────
	# 1) CONTROL PANELS (still side-by-side)
	with gr.Row():
	with gr.Column():
	gr.Markdown("### Faster-Whisper ASR")
	whisper_dd = gr.Dropdown(choices=WHISPER_MODELS, value=WHISPER_MODELS[0], label="Whisper Model")
	whisper_lang = gr.Dropdown(choices=WHISPER_LANGUAGES, value="auto", label="Whisper Language")
	# "GPU" is only honored when torch.cuda.is_available(); otherwise the
	# dispatcher falls back to the CPU path.
	device_radio = gr.Radio(choices=["GPU","CPU"], value="GPU", label="Device")
	whisper_punct_chk = gr.Checkbox(label="Enable Punctuation", value=True)
	# Forwarded as `multilingual=` to the faster-whisper transcribe() call.
	whisper_multilingual_en = gr.Checkbox(label="Multilingual", value=False)
	btn_w = gr.Button("Transcribe with Faster-Whisper")

	with gr.Column():
	gr.Markdown("### FunASR SenseVoice ASR")
	sense_dd = gr.Dropdown(choices=SENSEVOICE_MODELS, value=SENSEVOICE_MODELS[0], label="SenseVoice Model")
	sense_lang = gr.Dropdown(choices=SENSEVOICE_LANGUAGES, value="auto", label="SenseVoice Language")
	# Same GPU/CPU fallback rule as the Whisper device selector.
	device_radio_s = gr.Radio(choices=["GPU","CPU"], value="GPU", label="Device")
	sense_punct_chk = gr.Checkbox(label="Enable Punctuation", value=True)
	btn_s = gr.Button("Transcribe with SenseVoice")

	# ────────────────────────────────────────────────────────────────
	# 2) SHARED TRANSCRIPT ROW (aligned side-by-side)
	with gr.Row():
	with gr.Column():
	gr.Markdown("### Faster-Whisper Output")
	# Hidden textbox: receives the first element of each yielded tuple, which the
	# transcription generators always set to "".
	out_w = gr.Textbox(label="Raw Transcript", visible=False)
	out_w_d = gr.HTML(label="Diarized Transcript", elem_classes=["diar"])

	with gr.Column():
	gr.Markdown("### SenseVoice Output")
	# Hidden textbox, same role as out_w above.
	out_s = gr.Textbox(label="Raw Transcript", visible=False)
	out_s_d = gr.HTML(label="Diarized Transcript", elem_classes=["diar"])

	# ────────────────────────────────────────────────────────────────
	# 3) WIRING UP TOGGLES & BUTTONS

	# wire the callbacks into those shared boxes
	# Input order matches the positional parameters of transcribe_fwhisper_stream:
	# (model_id, language, audio_path, device_sel, whisper_multilingual_en, enable_punct).
	btn_w.click(
	fn=transcribe_fwhisper_stream,
	inputs=[whisper_dd, whisper_lang, audio_input, device_radio, whisper_multilingual_en, whisper_punct_chk],
	outputs=[out_w, out_w_d]
	)
	# Input order matches the positional parameters of transcribe_sense_steam:
	# (model_id, language, audio_path, enable_punct, device_sel).
	btn_s.click(
	fn=transcribe_sense_steam,
	inputs=[sense_dd, sense_lang, audio_input, sense_punct_chk, device_radio_s],
	outputs=[out_s, out_s_d]
	)

	# Start the Gradio server only when this file is executed directly.
	if __name__ == "__main__":
	Demo.launch()