Spaces:

osman
/

Uyghur_TTS_Demo

Sleeping

App Files Files Community

Uyghur_TTS_Demo / app.py

osman

Update app.py

0fee520 verified about 1 month ago

raw

history blame contribute delete

5.36 kB

	"""
	Uyghur Text-to-Speech Application
	Main application file for the Gradio interface.
	"""

	import io
	import os

	import gradio as gr
	import soundfile as sf
	import torch
	from huggingface_hub import login
	from transformers import VitsModel, AutoTokenizer

	# Import Uyghur text processing utilities
	from utils import preprocess_uyghur_text

	# Authenticate with the Hugging Face Hub only when HF_TOKEN is set to a
	# non-empty value; otherwise skip the login entirely.
	_hf_token = os.environ.get("HF_TOKEN")
	if _hf_token:
	    login(token=_hf_token)


	# Display name of each selectable TTS voice -> Hugging Face repository id.
	MODEL_OPTIONS = {"Muhsin": "osman/uyghur_arabic_script_tts"}

	# Module-level caches, keyed by the display names used in MODEL_OPTIONS,
	# holding already-loaded models and tokenizers.
	model_cache, tokenizer_cache = {}, {}


	def load_model_and_tokenizer(model_name):
	    """
	    Load a model and its tokenizer, caching both to avoid reloading.

	    Both objects are loaded into local variables first and only stored in
	    the caches once both loads have succeeded. This prevents a failed
	    tokenizer load from leaving a model cached without its tokenizer, which
	    would make every later call skip loading and fail with a ``KeyError``
	    on ``tokenizer_cache``.

	    Args:
	        model_name (str): Name of the model from MODEL_OPTIONS.

	    Returns:
	        tuple: (model, tokenizer)

	    Raises:
	        KeyError: If ``model_name`` is not cached and is not a key of
	            MODEL_OPTIONS.
	    """
	    if model_name not in model_cache or model_name not in tokenizer_cache:
	        repo_id = MODEL_OPTIONS[model_name]
	        model = VitsModel.from_pretrained(repo_id)
	        tokenizer = AutoTokenizer.from_pretrained(repo_id)
	        # Publish to the caches only after both loads succeeded so the two
	        # caches always stay in sync.
	        model_cache[model_name] = model
	        tokenizer_cache[model_name] = tokenizer
	    return model_cache[model_name], tokenizer_cache[model_name]


	def text_to_speech(text, model_name):
	    """
	    Convert input text to speech using the selected TTS model.

	    The WAV data is encoded into an in-memory buffer instead of a file with
	    a fixed name in the working directory, so simultaneous requests cannot
	    overwrite or delete each other's output and no file is left behind if
	    synthesis fails part-way.

	    Args:
	        text (str): Input text to convert to speech.
	        model_name (str): Name of the TTS model to use (a key of
	            MODEL_OPTIONS).

	    Returns:
	        bytes: Audio data in WAV format.
	    """
	    # Load (or fetch from cache) the selected model and tokenizer
	    model, tokenizer = load_model_and_tokenizer(model_name)

	    # Normalise the text with the project's Uyghur preprocessing helper
	    processed_text = preprocess_uyghur_text(text)
	    print(f"Processed text: {processed_text}")

	    # Tokenize into PyTorch tensors
	    inputs = tokenizer(processed_text, return_tensors="pt")

	    # Inference only: gradients are not needed
	    with torch.no_grad():
	        waveform = model(**inputs).waveform

	    # Drop size-1 dimensions and convert to a NumPy array for soundfile
	    audio_data = waveform.squeeze().numpy()
	    sample_rate = model.config.sampling_rate  # sample rate from model config

	    # soundfile cannot infer the container from a file-like object, so the
	    # format is passed explicitly.
	    wav_buffer = io.BytesIO()
	    sf.write(wav_buffer, audio_data, sample_rate, format="WAV")
	    return wav_buffer.getvalue()


	# Rows for the Gradio Examples component: each row is [text, model name],
	# and every sample sentence is paired with the "Muhsin" model.
	examples = [
	    [sample_text, "Muhsin"]
	    for sample_text in (
	        "ھاكىمىيەتكە بولغان قارىغۇلارچە ئىتائەت ھەقىقەتنىڭ ئەڭ زور دۈشمىنىدۇر. — ئالبېرت ئېينىشتىيىن",
	        "خۇش خەۋەر: Mozilla شىركىتى CommonVoice ساندىنىنىڭ 21 - نەشرىنى ئېلان قىلدى.",
	        "ئىزاھات: بۇ ئەپتە piyazon نىڭ قىسمەن كودلىرىنى ئىشلەتتىم.",
	        "باشنىڭ يېرىمى ئاغرىسا، بىر داس ئىسسىق سۇغا ئىككى قولنى تەخمىنەن يېرىم سائەت ئەتراپىدا چىلاپ بەرسە، باش ئاغرىقى ئاستا-ئاستا يېنىكلەيدۇ.",
	        "ئەسلىدىكى دوختۇر تور بېكىتى، ھازىرقى دوختۇرلار تور بېكىتى نامىدا كەڭ تورداشلارغا خىزمەت سۇنماقتا.",
	        "ھەممە ئادەم ئەركىن بولۇپ تۇغۇلىدۇ، ھەمدە ئىززەت-ھۆرمەت ۋە ھوقۇقتا باب-باراۋەر بولىدۇ.",
	        "ۋالىبول: ساغلاملىق، ھەمكارلىق ۋە ھاياتىي كۈچنىڭ مۇكەممەل بىرىكىشى",
	        # Disabled sample, kept for reference:
	        # "ئايلانمىسى: 65-67 سانتىمېتىر (cm).",
	        "«تۈركىي تىللار دىۋانى» ناملىق بۇ ئەدەبىي ۋە تارىخىي قامۇسنىڭ مۇئەللىپى ھۇسەيىن ئوغلى مەخمۇد كاشغەرى، كاشغەرنىڭ ئوپال دېگەن يېرىدە 1008-يىللىرى دۇنياغا كەلگەن.",
	    )
	]


	# Stylesheet for the interface: loads the Noto Sans Arabic web font, makes
	# the input textarea right-to-left, and right-aligns the first column of
	# tables inside ".table-wrap" (presumably the examples table).
	_INTERFACE_CSS = """
	@import url('https://fonts.googleapis.com/css2?family=Noto+Sans+Arabic&display=swap');
	.rtl-text textarea {
	direction: rtl;
	width: 100%;
	height: 200px;
	font-size: 17px;
	font-family: "Noto Sans Arabic" !important;
	}
	.table-wrap{
	font-family: "Noto Sans Arabic" !important;
	}
	.table-wrap table tbody tr td:first-child {
	direction: rtl;
	text-align: right;
	}
	"""

	# Multi-line text box; the "rtl-text" class hooks it up to the CSS above.
	_text_input = gr.Textbox(
	    label="Enter text to convert to speech",
	    elem_classes="rtl-text",
	    elem_id="input-textbox",
	    lines=6,
	    max_lines=15,
	)

	# Model selector listing every key of MODEL_OPTIONS.
	_model_selector = gr.Dropdown(
	    choices=list(MODEL_OPTIONS),
	    label="Select TTS Model",
	    value="Muhsin",
	)

	# Audio player for the synthesised speech.
	_speech_output = gr.Audio(label="Generated Speech", type="filepath")

	# Assemble the Gradio interface: text + model selection in, audio out.
	demo = gr.Interface(
	    fn=text_to_speech,
	    inputs=[_text_input, _model_selector],
	    outputs=_speech_output,
	    title="Text-to-Speech with Uyghur Arabic Script TTS",
	    description="Uyghur TTS Text To Speech",
	    examples=examples,
	    css=_INTERFACE_CSS,
	)


	# Launch the Gradio app only when this file is executed as a script,
	# not when it is imported as a module.
	if __name__ == "__main__":
	    demo.launch()