# Hugging Face Space page header captured with the source: "Spaces: Running on Zero".
import gradio as gr | |
import torch | |
from transformers import VitsTokenizer, VitsModel, set_seed | |
import tempfile | |
import numpy as np | |
from scipy.io.wavfile import write | |
from dv_normalize.dv_sentence import DhivehiTextProcessor | |
import spaces | |
# Choose the compute device once at import time: CUDA when torch reports it
# as available, otherwise CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(f"Using device: {device}")

# Text normalizer used to turn written Dhivehi into its spoken form.
dv_processor = DhivehiTextProcessor()

# Voice registry: display name -> Hub repo id ("model") and default seed
# ("seed"). Insertion order matters because the UI picks entries by position.
models = {
    "MMS TTS Base": dict(model="alakxender/mms-tts-div", seed=555),
    "Female F01 (CV)": dict(model="alakxender/mms-tts-div-finetuned-md-f01", seed=555),
    "Female F02 (CV, pitch/tempo changed)": dict(model="alakxender/mms-tts-div-finetuned-md-f02", seed=555),
    "Female F03 (CV, pitch/tempo changed)": dict(model="alakxender/mms-tts-div-finetuned-md-f03", seed=555),
    "Female F04 (CV, rvc-test)": dict(model="alakxender/mms-tts-speak-f01", seed=555),
    "Female F01 (z-test)": dict(model="alakxender/mms-tts-div-ft-spk01-f01", seed=555),
    # Disabled voice:
    # "Female Unknown 👩🏽 (🤷♀️)": {"model": "alakxender/mms-tts-div-finetuned-sm-fu01", "seed": 555},
    "Male M01 (CV)": dict(model="alakxender/mms-tts-div-finetuned-md-m01", seed=555),
    # Disabled voice:
    # "Male M02 (javaabu/shaafiu)": {"model": "alakxender/mms-tts-div-finetuned-sm-mu01", "seed": 555},
    "Male M02 (z-test)": dict(model="alakxender/mms-tts-div-ft-spk01-m01", seed=620),
    "Male M02 (z-test-sm)": dict(model="alakxender/mms-tts-div-finetuned-m-spk01-t1", seed=555),
}
# Tokenizer/model pairs that have already been loaded, keyed by Hub repo id.
# Only repo ids taken from `models` are ever inserted, so the cache stays small.
_loaded_voices = {}


def _load_voice(repo_id):
    """Return the (tokenizer, model) pair for repo_id, loading it on first use."""
    if repo_id not in _loaded_voices:
        print(f"Loading...{repo_id}")
        tokenizer = VitsTokenizer.from_pretrained(repo_id)
        model = VitsModel.from_pretrained(repo_id)
        print("Model loaded.")
        _loaded_voices[repo_id] = (tokenizer, model)
    return _loaded_voices[repo_id]


def process_and_tts(text: str, model_name: str, seed_value: int = None):
    """Normalize Dhivehi text and synthesize it to a WAV file.

    Args:
        text: Input text; at most 2000 characters.
        model_name: A key of the module-level ``models`` registry.
        seed_value: Seed passed to ``set_seed`` before generation. When
            ``None``, the seed registered for ``model_name`` is used.

    Returns:
        A ``(normalized_text, waveform_file)`` tuple: the normalized text and
        the path of a temporary ``.wav`` file holding the generated audio.
        The file is created with ``delete=False`` and is not removed here.

    Raises:
        gr.Error: If the text is empty or too long, or if no known model is
            selected.
    """
    # Reject missing/blank input up front instead of failing inside len() or
    # the tokenizer.
    if text is None or not text.strip():
        raise gr.Error("huh! nothing to speak. type some text first.")
    if len(text) > 2000:
        raise gr.Error(f"huh! using free cpu here!, try a small chunk of data. Yours is {len(text)}. try to fit to 2000 chars.")
    if model_name is None:
        raise gr.Error("huh! not sure what to do without a model. select a model.")
    if model_name not in models:
        raise gr.Error(f"huh! unknown model '{model_name}'. select a model from the list.")

    voice = models[model_name]

    # Normalize the dv text from written to spoken.
    print(f"Normalizing: {text}")
    normalized_text = dv_processor.spoken_dv(text)
    print(f"Normalized: {normalized_text}")

    # Use the model's default seed if none was provided; coerce to int because
    # the seed may arrive as a float from the UI and the seeders expect an int.
    if seed_value is None:
        seed_value = voice["seed"]
    seed_value = int(seed_value)

    # Reuse an already-loaded tokenizer/model instead of reloading per request.
    # NOTE: the model is not moved to any device here, so it stays wherever
    # from_pretrained placed it.
    tokenizer, model = _load_voice(voice["model"])

    # Preprocess the input text.
    inputs = tokenizer(text=normalized_text, return_tensors="pt")
    print("Preprocess done.")

    # Make the speech synthesis deterministic with the chosen seed.
    print(f"Setting seed to: {seed_value}")
    set_seed(seed_value)

    # Generate the audio waveform.
    print("Generating audio...")
    with torch.no_grad():
        outputs = model(**inputs)
    waveform = outputs.waveform[0]
    sample_rate = model.config.sampling_rate

    # Reserve a temp path (kept after close), then write the WAV once the
    # handle is closed so the file is never open twice.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as f:
        waveform_file = f.name
    write(waveform_file, sample_rate, waveform.cpu().numpy().T)

    print("done.")
    return normalized_text, waveform_file
def get_default_seed(model_name):
    """Return the default seed registered in ``models`` for ``model_name``."""
    entry = models[model_name]
    return entry["seed"]
# Styling for the two text areas (larger text, Thaana-capable font stack).
css = """
.textbox1 textarea {
    font-size: 18px !important;
    font-family: 'MV_Faseyha', 'Faruma', 'A_Faruma' !important;
    line-height: 1.8 !important;
}
"""

# Build the ordered list of voice names once; the UI and the examples below
# pick voices by position in the registry.
model_names = list(models)
default_model = model_names[5]  # Default to sixth model

with gr.Blocks(css=css) as demo:
    gr.Markdown("# <center> DV Text-To-Speech </center>")
    gr.Markdown("This interface converts Divehi text into natural-sounding speech using a fine-tuned Text-to-Speech model. Leveraging the capabilities of Massively Multilingual Speech (MMS) and VITS models. Text normalization is also incorporated to handle various input formats effectively.")

    # NOTE(review): the nesting below was reconstructed from the layout
    # comments in the original file; confirm against the deployed app.
    with gr.Row():
        with gr.Column(scale=3):
            text = gr.TextArea(
                label="Input text",
                placeholder="ދިވެހި ބަހުން ކޮންމެވެސް އެއްޗެކޭ މިތާ ލިޔެބަލަ",
                rtl=True,
                elem_classes="textbox1",
            )
            normalized_text = gr.TextArea(
                label="Normalized text",
                rtl=True,
                elem_classes="textbox1",
                interactive=False,
            )
        with gr.Column(scale=1):
            model_name = gr.Dropdown(
                choices=model_names,
                label="Select TTS Model",
                value=default_model,
            )
            seed_slider = gr.Slider(
                minimum=0,
                maximum=1000,
                # Start at the default model's registered seed so the slider
                # and the registry cannot drift apart.
                value=models[default_model]["seed"],
                step=1,
                label="Seed Value (affects voice variation)",
            )
            # Update seed slider when model changes
            model_name.change(
                fn=get_default_seed,
                inputs=[model_name],
                outputs=[seed_slider],
            )
            # Place audio output below settings in the right column
            output_audio = gr.Audio(label="Speech Output")

    # Button in original position (outside columns)
    btn = gr.Button("Text-To-Speech")

    # Add examples section
    with gr.Accordion("Examples", open=True):
        # Define example texts
        # NOTE: the fifth text is defined but not wired into any example below.
        example_texts = [
            """ނަމްބަރު އައިނު ހައްދަން އާސަންދަ އިން ކޮންމެ ދެ އަހަރަކުން އެއް ފަހަރު ދޭ 1،000ރ. ގެ ބަދަލުގައި އެ އަދަދު 2،000ރ. އަށް ބޮޑުކުރުމާ އެކު، އޭގެ ނާޖާއިޒު ފައިދާ ނެގުން ހުއްޓުވުމަށްޓަކައި ސަރުކާރުގެ މައި ހޮސްޕިޓަލް އައިޖީއެމްއެޗުގައި އައިނުގެ ވިޔަފާރި ފަށަން ނިންމާފައިވާ ކަން ރައީސް ޑރ. މުހައްމަދު މުއިއްޒު އިއުލާން ކުރައްވައިފި އެވެ.
މިދިޔަ ބުދަ ދުވަހުގެ ރޭ ރައީސް ވިދާޅުވެފައި ވަނީ މާދަމާ އިން ފެށިގެން ލޮލުގެ ނަމްބަރު އައިނު ހައްދަން ކޮންމެ ދެ އަހަރަކުން އެއް އަހަރު އާސަންދަ އިން 2،000ރ. ލިބޭނެ ގޮތް ހަދަން އިދާރާތަކަށް އަންގަވާފައިވާނެ ކަމަށެވެ.""",
            "އައްޑޫގެ ގުޅިފައިވާ ރަށްތަކުގައި އެންމެ މަތިން ކަރަންޓު ބޭނުންވާ ގަޑިތަކުގައި 12 މެގަވޮޓްގެ ކަރަންޓު ބޭނުންވެ އެވެ. ކަރަންޓު ފޯރުކޮށްދިނުމަށް ހިތަދޫގައި ބަހައްޓާފައި ވަނީ 20 ޖެނަރޭޓަރު ސެޓެވެ. އޭގެ ކެޕޭސިޓީއަކީ 26.8 މެގަވޮޓެވެ. އެކަމަކު އޭގެ ތެރެއިން ފަސް ޖެނަރޭޓަރު ހަލާކުވުމާ ގުޅިގެން އޭރު އުފެއްދުނީ 15 މެގަވޮޓެވެ.",
            "މަރުޙަބާ! ކިހިނެއްތަ ހާލު؟ މިއަދު ވަރަށް ރީތި ދުވަހެއް.",
            "މިއަދު މާލޭގައި މޫސުން ވަރަށް ހޫނު. ވިއްސާރަވާނެ ކަމަށް ލަފާކުރެވޭ.",
            "ސްކޫލް ފެށޭނީ ޖޫން 15 ވަނަ ދުވަހު. ކްލާސްތައް ހުންނާނީ ހެނދުނު 7:30 އިން މެންދުރު 1:30 އަށް. 2025 ވަނަ އަހަރުގެ އަހަރީ ފީއަކީ 24،500 ރުފިޔާ.",
        ]

        # Create examples for different model combinations: each pair is
        # (index into example_texts, index into model_names).
        example_pairs = [(0, 7), (1, 5), (2, 1), (3, 6)]
        examples = [
            [
                example_texts[text_idx],
                model_names[model_idx],
                models[model_names[model_idx]]["seed"],
            ]
            for text_idx, model_idx in example_pairs
        ]

        # Pass all examples to the Gradio Examples component
        gr.Examples(
            examples,
            [text, model_name, seed_slider],
            fn=process_and_tts,
            outputs=[normalized_text, output_audio],
            cache_examples=False,
        )

    # Both pressing Enter in the text box and clicking the button run synthesis.
    text.submit(fn=process_and_tts, inputs=[text, model_name, seed_slider], outputs=[normalized_text, output_audio])
    btn.click(fn=process_and_tts, inputs=[text, model_name, seed_slider], outputs=[normalized_text, output_audio])

# Launch the Gradio app
if __name__ == "__main__":
    demo.launch()