Spaces:

alakxender
/

tts-dhivehi-demo-mms

Running on Zero

466ec79 8 days ago

8.62 kB

	import gradio as gr
	import torch
	from transformers import VitsTokenizer, VitsModel, set_seed
	import tempfile
	import numpy as np
	from scipy.io.wavfile import write
	from dv_normalize.dv_sentence import DhivehiTextProcessor
	import spaces

	# Prefer CUDA when torch reports it as available; otherwise fall back to CPU.
	if torch.cuda.is_available():
	    device = torch.device("cuda")
	else:
	    device = torch.device("cpu")
	print(f"Using device: {device}")

	# Text normalizer that converts written Dhivehi to its spoken form
	# (process_and_tts calls its spoken_dv method before tokenizing).
	dv_processor = DhivehiTextProcessor()


	# UI label -> Hugging Face checkpoint id ("model") and the seed used when the
	# caller does not supply one ("seed").
	# Insertion order matters: the UI picks its default model and its example
	# models by position in this mapping.
	models = {
	    "MMS TTS Base": {
	        "model": "alakxender/mms-tts-div",
	        "seed": 555,
	    },
	    "Female F01 (CV)": {
	        "model": "alakxender/mms-tts-div-finetuned-md-f01",
	        "seed": 555,
	    },
	    "Female F02 (CV, pitch/tempo changed)": {
	        "model": "alakxender/mms-tts-div-finetuned-md-f02",
	        "seed": 555,
	    },
	    "Female F03 (CV, pitch/tempo changed)": {
	        "model": "alakxender/mms-tts-div-finetuned-md-f03",
	        "seed": 555,
	    },
	    "Female F04 (CV, rvc-test)": {
	        "model": "alakxender/mms-tts-speak-f01",
	        "seed": 555,
	    },
	    "Female F01 (z-test)": {
	        "model": "alakxender/mms-tts-div-ft-spk01-f01",
	        "seed": 555,
	    },
	    #"Female Unknown 👩🏽 (🤷‍♀️)": {"model": "alakxender/mms-tts-div-finetuned-sm-fu01", "seed": 555},
	    "Male M01 (CV)": {
	        "model": "alakxender/mms-tts-div-finetuned-md-m01",
	        "seed": 555,
	    },
	    #"Male M02 (javaabu/shaafiu)": {"model": "alakxender/mms-tts-div-finetuned-sm-mu01", "seed": 555},
	    "Male M02 (z-test)": {
	        "model": "alakxender/mms-tts-div-ft-spk01-m01",
	        "seed": 620,
	    },
	    "Male M02 (z-test-sm)": {
	        "model": "alakxender/mms-tts-div-finetuned-m-spk01-t1",
	        "seed": 555,
	    },
	}

	@spaces.GPU
	def process_and_tts(text: str, model_name: str, seed_value: int = None):
	    """Normalize Dhivehi text and synthesize it into a temporary WAV file.

	    Args:
	        text: Raw input text; must be non-blank and at most 2000 characters.
	        model_name: A key of the module-level ``models`` mapping.
	        seed_value: Seed passed to ``set_seed`` before generation. When
	            ``None``, the seed configured for ``model_name`` is used.

	    Returns:
	        A ``(normalized_text, wav_path)`` tuple: the result of
	        ``dv_processor.spoken_dv(text)`` and the path of a ``.wav`` file
	        holding the generated waveform. The file is created with
	        ``delete=False``, so this function does not remove it.

	    Raises:
	        gr.Error: If the text is blank or too long, or if the model is
	            missing or not a key of ``models``.
	    """
	    # Validate up front so the user gets a readable message instead of a
	    # TypeError/KeyError from deeper in the pipeline.
	    if not text or not text.strip():
	        raise gr.Error("huh! nothing to read. type some text first.")
	    if len(text) > 2000:
	        raise gr.Error(f"huh! using free cpu here!, try a small chunk of data. Yours is {len(text)}. try to fit to 2000 chars.")
	    if model_name is None:
	        raise gr.Error("huh! not sure what to do without a model. select a model.")
	    if model_name not in models:
	        raise gr.Error(f"huh! unknown model '{model_name}'. select a model from the list.")

	    model_config = models[model_name]
	    checkpoint = model_config["model"]

	    # normalize the dv text from written to spoken
	    print(f"Normalizing: {text}")
	    normalized_text = dv_processor.spoken_dv(text)
	    print(f"Normalized: {normalized_text}")

	    # Use the model's default seed if none provided; UI sliders may hand
	    # over a float, so coerce to int before seeding.
	    if seed_value is None:
	        seed_value = model_config["seed"]
	    seed_value = int(seed_value)

	    print(f"Loading...{checkpoint}")
	    # Load the MMS-TTS model (reloaded on every call).
	    # NOTE(review): neither the model nor the inputs are moved to the
	    # module-level `device`, so inference appears to run on the default
	    # device despite @spaces.GPU - confirm this is intended. Moving it may
	    # change the audio produced for a given seed.
	    tokenizer = VitsTokenizer.from_pretrained(checkpoint)
	    model = VitsModel.from_pretrained(checkpoint)
	    print("Model loaded.")

	    # Preprocess the input text
	    inputs = tokenizer(text=normalized_text, return_tensors="pt")
	    print("Preprocess done.")

	    # Make the speech synthesis deterministic with user-defined seed
	    print(f"Setting seed to: {seed_value}")
	    set_seed(seed_value)

	    # Generate the audio waveform
	    print("Generating audio...")
	    with torch.no_grad():
	        outputs = model(**inputs)
	    # First item of the batch; .cpu() keeps the numpy conversion valid even
	    # if the tensors ever live on an accelerator.
	    audio = outputs.waveform[0].detach().cpu().numpy().T
	    sample_rate = model.config.sampling_rate

	    # Reserve a unique path, then close the handle before scipy reopens the
	    # file by name (reopening an open temp file is not portable).
	    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
	        waveform_file = f.name
	    write(waveform_file, sample_rate, audio)

	    print("done.")
	    return normalized_text, waveform_file

	def get_default_seed(model_name):
	    """Return the seed configured for ``model_name`` in ``models``.

	    Raises ``KeyError`` if ``model_name`` is not a key of ``models``.
	    """
	    model_entry = models[model_name]
	    return model_entry["seed"]

	# Custom stylesheet handed to gr.Blocks(css=...): sets font size, font family
	# and line height for textareas inside elements carrying the "textbox1" class.
	css = """
	.textbox1 textarea {
	font-size: 18px !important;
	font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important;
	line-height: 1.8 !important;
	}
	"""

	# Gradio UI: input/normalized text on the left, model + seed + audio on the
	# right, a synthesize button, and a set of clickable examples.
	# NOTE(review): indentation was lost in this copy of the file; the nesting
	# below is reconstructed from the inline comments - confirm against the
	# deployed layout.
	with gr.Blocks(css=css) as demo:
	    # Materialize the model labels once; positions below index into this list.
	    model_names = list(models)
	    default_model = model_names[5]  # Default to sixth model

	    gr.Markdown("# <center> DV Text-To-Speech </center>")
	    gr.Markdown("This interface converts Divehi text into natural-sounding speech using a fine-tuned Text-to-Speech model. Leveraging the capabilities of Massively Multilingual Speech (MMS) and VITS models. Text normalization is also incorporated to handle various input formats effectively.")

	    with gr.Row():
	        with gr.Column(scale=3):
	            text = gr.TextArea(
	                label="Input text",
	                placeholder="ދިވެހި ބަހުން ކޮންމެވެސް އެއްޗެކޭ މިތާ ލިޔެބަލަ",
	                rtl=True,
	                elem_classes="textbox1",
	            )
	            normalized_text = gr.TextArea(
	                label="Normalized text",
	                rtl=True,
	                elem_classes="textbox1",
	                interactive=False,
	            )
	        with gr.Column(scale=1):
	            model_name = gr.Dropdown(
	                choices=model_names,
	                label="Select TTS Model",
	                value=default_model,
	            )
	            seed_slider = gr.Slider(
	                minimum=0,
	                maximum=1000,
	                # Start on the default model's own seed so the slider and the
	                # dropdown agree before the first change event fires.
	                value=models[default_model]["seed"],
	                step=1,
	                label="Seed Value (affects voice variation)",
	            )

	            # Update seed slider when model changes
	            model_name.change(
	                fn=get_default_seed,
	                inputs=[model_name],
	                outputs=[seed_slider],
	            )

	            # Place audio output below settings in the right column
	            output_audio = gr.Audio(label="Speech Output")

	    # Button in original position (outside columns)
	    btn = gr.Button("Text-To-Speech")

	    # Add examples section
	    with gr.Accordion("Examples", open=True):
	        # Define example texts
	        example_texts = [
	            """ނަމްބަރު އައިނު ހައްދަން އާސަންދަ އިން ކޮންމެ ދެ އަހަރަކުން އެއް ފަހަރު ދޭ 1،000ރ. ގެ ބަދަލުގައި އެ އަދަދު 2،000ރ. އަށް ބޮޑުކުރުމާ އެކު، އޭގެ ނާޖާއިޒު ފައިދާ ނެގުން ހުއްޓުވުމަށްޓަކައި ސަރުކާރުގެ މައި ހޮސްޕިޓަލް އައިޖީއެމްއެޗުގައި އައިނުގެ ވިޔަފާރި ފަށަން ނިންމާފައިވާ ކަން ރައީސް ޑރ. މުހައްމަދު މުއިއްޒު އިއުލާން ކުރައްވައިފި އެވެ.

	މިދިޔަ ބުދަ ދުވަހުގެ ރޭ ރައީސް ވިދާޅުވެފައި ވަނީ މާދަމާ އިން ފެށިގެން ލޮލުގެ ނަމްބަރު އައިނު ހައްދަން ކޮންމެ ދެ އަހަރަކުން އެއް އަހަރު އާސަންދަ އިން 2،000ރ. ލިބޭނެ ގޮތް ހަދަން އިދާރާތަކަށް އަންގަވާފައިވާނެ ކަމަށެވެ.""",
	            "އައްޑޫގެ ގުޅިފައިވާ ރަށްތަކުގައި އެންމެ މަތިން ކަރަންޓު ބޭނުންވާ ގަޑިތަކުގައި 12 މެގަވޮޓްގެ ކަރަންޓު ބޭނުންވެ އެވެ. ކަރަންޓު ފޯރުކޮށްދިނުމަށް ހިތަދޫގައި ބަހައްޓާފައި ވަނީ 20 ޖެނަރޭޓަރު ސެޓެވެ. އޭގެ ކެޕޭސިޓީއަކީ 26.8 މެގަވޮޓެވެ. އެކަމަކު އޭގެ ތެރެއިން ފަސް ޖެނަރޭޓަރު ހަލާކުވުމާ ގުޅިގެން އޭރު އުފެއްދުނީ 15 މެގަވޮޓެވެ.",
	            "މަރުޙަބާ! ކިހިނެއްތަ ހާލު؟ މިއަދު ވަރަށް ރީތި ދުވަހެއް.",
	            "މިއަދު މާލޭގައި މޫސުން ވަރަށް ހޫނު. ވިއްސާރަވާނެ ކަމަށް ލަފާކުރެވޭ.",
	            "ސްކޫލް ފެށޭނީ ޖޫން 15 ވަނަ ދުވަހު. ކްލާސްތައް ހުންނާނީ ހެނދުނު 7:30 އިން މެންދުރު 1:30 އަށް. 2025 ވަނަ އަހަރުގެ އަހަރީ ފީއަކީ 24،500 ރުފިޔާ.",
	        ]

	        # (example text index, model index) pairs; each example carries the
	        # paired model's default seed.
	        # NOTE(review): example_texts[4] is defined but not used by any
	        # pair - confirm whether it was meant to be shown.
	        example_pairs = [(0, 7), (1, 5), (2, 1), (3, 6)]
	        examples = [
	            [
	                example_texts[text_idx],
	                model_names[model_idx],
	                models[model_names[model_idx]]["seed"],
	            ]
	            for text_idx, model_idx in example_pairs
	        ]

	        # Pass all examples to the Gradio Examples component
	        gr.Examples(
	            examples,
	            [text, model_name, seed_slider],
	            fn=process_and_tts,
	            outputs=[normalized_text, output_audio],
	            cache_examples=False,
	        )

	    # Pressing Enter in the input box and clicking the button run the same
	    # synthesis call with the same inputs/outputs.
	    tts_wiring = {
	        "fn": process_and_tts,
	        "inputs": [text, model_name, seed_slider],
	        "outputs": [normalized_text, output_audio],
	    }
	    text.submit(**tts_wiring)
	    btn.click(**tts_wiring)

	# Start the Gradio app only when this file is executed as a script.
	if __name__ == "__main__":
	    demo.launch()