Spaces:

faimlab
/

Persian_ASR_Model_Fast_Conformer

Running

App Files Files Community

Persian_ASR_Model_Fast_Conformer / app.py

saeedzou

Update app.py

cef39c8 verified 2 months ago

raw

history blame contribute delete

5.81 kB

	import os
	import re
	import tempfile
	import time

	import gradio as gr
	import nemo.collections.asr as nemo_asr
	import numpy as np
	import yt_dlp as youtube_dl
	from hazm import Normalizer
	from huggingface_hub import login
	from pydub import AudioSegment

	# Fetch the token from an environment variable. The app refuses to start
	# without it because the model below is loaded after authenticating.
	HF_TOKEN = os.getenv("HF_TOKEN")
	if not HF_TOKEN:
	    raise ValueError("HF_TOKEN environment variable not set. Please provide a valid Hugging Face token.")

	# Authenticate with Hugging Face
	login(HF_TOKEN)

	# Load the private NeMo ASR model
	try:
	    asr_model = nemo_asr.models.EncDecHybridRNNTCTCBPEModel.from_pretrained(
	        model_name="faimlab/stt_fa_fastconformer_hybrid_large_dataset_v30"
	    )
	except Exception as e:
	    # Chain explicitly so the original failure is recorded as the direct
	    # cause of the RuntimeError in the traceback.
	    raise RuntimeError(f"Failed to load model: {str(e)}") from e

	# Text normalizer applied to the final transcript.
	normalizer = Normalizer()

	def load_audio(audio_path):
	    """Decode a media file into a peak-normalized mono float32 waveform.

	    The file is decoded with pydub, down-mixed to a single channel and
	    resampled to 16 kHz.

	    Args:
	        audio_path: Path of the file to decode.

	    Returns:
	        A ``(samples, frame_rate)`` tuple. ``samples`` is a float32 NumPy
	        array divided by its largest absolute value; it is returned
	        unscaled when the audio is empty or entirely silent.
	        ``frame_rate`` is the frame rate of the resampled segment.
	    """
	    audio = AudioSegment.from_file(audio_path)
	    audio = audio.set_channels(1).set_frame_rate(16000)
	    audio_samples = np.array(audio.get_array_of_samples(), dtype=np.float32)

	    # np.max raises on an empty array, and a zero peak (pure silence) would
	    # turn every sample into NaN, so only scale when there is a real peak.
	    peak = np.max(np.abs(audio_samples)) if audio_samples.size else 0.0
	    if peak > 0:
	        audio_samples /= peak
	    return audio_samples, audio.frame_rate

	def transcribe_chunk(audio_chunk, model):
	    """Transcribe one waveform chunk and return its text.

	    Args:
	        audio_chunk: The samples of a single chunk.
	        model: Object exposing ``transcribe(list, batch_size=..., verbose=...)``
	            whose results carry a ``text`` attribute.

	    Returns:
	        The ``text`` of the first (and only requested) result.
	    """
	    # The chunk is wrapped in a one-element batch, so only index 0 matters.
	    hypotheses = model.transcribe([audio_chunk], batch_size=1, verbose=False)
	    first_hypothesis = hypotheses[0]
	    return first_hypothesis.text

	def transcribe_audio(file_path, model, chunk_size=30*16000):
	    """Transcribe a whole file by splitting it into fixed-size chunks.

	    Args:
	        file_path: Path of the media file to transcribe.
	        model: ASR model forwarded to ``transcribe_chunk``.
	        chunk_size: Chunk length in samples. The default is 30 * 16000,
	            i.e. 30 seconds at the 16 kHz rate produced by ``load_audio``.

	    Returns:
	        The chunk transcripts joined with single spaces and passed through
	        the module-level ``normalizer``.
	    """
	    samples, _ = load_audio(file_path)

	    # Slicing clamps the end index, so the last chunk is simply shorter.
	    pieces = [
	        transcribe_chunk(samples[offset:offset + chunk_size], model)
	        for offset in range(0, len(samples), chunk_size)
	    ]

	    joined = ' '.join(pieces)
	    # Collapse runs of spaces left by empty or padded chunk transcripts.
	    collapsed = re.sub(' +', ' ', joined)
	    return normalizer.normalize(collapsed)

	# YouTube audio download function
	YT_LENGTH_LIMIT_S = 3600  # Longest accepted video, in seconds.


	def download_yt_audio(yt_url, filename, cookie_file="cookies.txt"):
	    """Download a YouTube video to ``filename`` after checking its length.

	    Args:
	        yt_url: URL of the video to download.
	        filename: Output path, handed to yt-dlp as ``outtmpl``.
	        cookie_file: Path of a cookies file for yt-dlp. It is only passed
	            on when the file actually exists.

	    Raises:
	        gr.Error: If the metadata lookup or the download fails, or if the
	            video is longer than ``YT_LENGTH_LIMIT_S`` seconds.
	    """
	    # Metadata-only lookup first, so over-long videos are rejected before
	    # anything is downloaded.
	    with youtube_dl.YoutubeDL() as info_loader:
	        try:
	            info = info_loader.extract_info(yt_url, download=False)
	        except youtube_dl.utils.DownloadError as err:
	            raise gr.Error(str(err)) from err

	    # "S", "M:S" or "H:M:S" -> seconds, folding from the most significant part.
	    file_length_s = 0
	    for part in info["duration_string"].split(":"):
	        file_length_s = file_length_s * 60 + int(part)

	    if file_length_s > YT_LENGTH_LIMIT_S:
	        yt_length_limit_hms = time.strftime("%HH:%MM:%SS", time.gmtime(YT_LENGTH_LIMIT_S))
	        file_length_hms = time.strftime("%HH:%MM:%SS", time.gmtime(file_length_s))
	        raise gr.Error(f"Maximum YouTube length is {yt_length_limit_hms}, got {file_length_hms} YouTube video.")

	    ydl_opts = {
	        "outtmpl": filename,
	        "format": "worstvideo[ext=mp4]+bestaudio[ext=m4a]/best[ext=mp4]/best",
	    }
	    # The YoutubeDL option is named "cookiefile" ("--cookies" is only the
	    # command-line flag); an unknown "cookies" key is never read.
	    if cookie_file and os.path.isfile(cookie_file):
	        ydl_opts["cookiefile"] = cookie_file

	    with youtube_dl.YoutubeDL(ydl_opts) as ydl:
	        try:
	            ydl.download([yt_url])
	        except (youtube_dl.utils.DownloadError, youtube_dl.utils.ExtractorError) as err:
	            # DownloadError matches the metadata lookup above; ExtractorError
	            # is kept so anything the original handler caught still is.
	            raise gr.Error(str(err)) from err


	# Gradio Interface
	def transcribe(audio):
	    """Gradio handler for the microphone and file-upload tabs.

	    Args:
	        audio: Path of the recorded or uploaded audio file, or ``None`` when
	            nothing was provided.

	    Returns:
	        The transcript, or a prompt asking for an upload when ``audio`` is
	        ``None``.
	    """
	    if audio is not None:
	        return transcribe_audio(audio, asr_model)
	    return "Please upload an audio file."

	def transcribe_yt(yt_url):
	    """Gradio handler for the YouTube tab: download, then transcribe.

	    Args:
	        yt_url: URL of the YouTube video.

	    Returns:
	        The transcript, or a prompt asking for a URL when ``yt_url`` is
	        empty or blank.

	    Raises:
	        gr.Error: Propagated from ``download_yt_audio``.
	    """
	    if not yt_url or not yt_url.strip():
	        return "Please enter a YouTube URL."

	    # A fresh directory per request: a fixed path is shared by concurrent
	    # requests and is never removed, so a leftover file from an earlier
	    # video could be picked up instead of the one just requested.
	    with tempfile.TemporaryDirectory() as work_dir:
	        media_path = os.path.join(work_dir, "yt_audio.mp4")
	        download_yt_audio(yt_url, media_path)
	        return transcribe_audio(media_path, asr_model)

	# Title and description shared by the microphone and file-upload tabs.
	_ASR_TITLE = "Persian ASR Transcription with NeMo Fast Conformer"
	_ASR_DESCRIPTION = (
	    "Transcribe long-form microphone or audio inputs with the click of a button! Demo uses the NeMo's Fast Conformer Hybrid Large.\n\n"
	    "Trained on ~800 hours of Persian speech dataset (Common Voice 17 (~300 hours), YouTube (~400 hours), NasleMana (~90 hours), In-house dataset (~70 hours)).\n\n"
	    "For commercial applications, contact us via email: <[email protected]>.\n\n"
	    "Credit FAIM Group, Sharif University of Technology.\n\n"
	)

	# Microphone tab
	mf_transcribe = gr.Interface(
	    fn=transcribe,
	    inputs=gr.Microphone(type="filepath"),
	    outputs=gr.Textbox(label="Transcription"),
	    theme="huggingface",
	    title=_ASR_TITLE,
	    description=_ASR_DESCRIPTION,
	    allow_flagging="never",
	)

	# File upload tab
	file_transcribe = gr.Interface(
	    fn=transcribe,
	    inputs=gr.Audio(type="filepath", label="Audio file"),
	    outputs=gr.Textbox(label="Transcription"),
	    theme="huggingface",
	    title=_ASR_TITLE,
	    description=_ASR_DESCRIPTION,
	    allow_flagging="never",
	)

	# YouTube tab: a URL text box in, the transcript text box out.
	_YT_DESCRIPTION = "Transcribe audio from a YouTube video by providing its URL. Currently YouTube is blocking the requests. So you will see the app showing error"
	_yt_url_input = gr.Textbox(label="YouTube URL", placeholder="Enter the YouTube URL here")
	_yt_text_output = gr.Textbox(label="Transcription")

	yt_transcribe = gr.Interface(
	    fn=transcribe_yt,
	    inputs=_yt_url_input,
	    outputs=_yt_text_output,
	    theme="huggingface",
	    title="Transcribe YouTube Video",
	    description=_YT_DESCRIPTION,
	    allow_flagging="never",
	)

	# Gradio Interface: a Blocks container holding the three tabs.
	with gr.Blocks() as demo:
	    # Interfaces and tab labels are matched by position.
	    gr.TabbedInterface(
	        [mf_transcribe, file_transcribe, yt_transcribe],
	        ["Microphone", "Audio file", "YouTube"],
	    )

	demo.launch()