# jsbeaudry — Update app.py (commit 898905b, verified)
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
import torch
from IPython.display import Audio
import gradio as gr
# Load the fine-tuned Haitian Creole TTS model once at startup (SpeechT5-style
# pipeline: it requires a speaker embedding at inference time).
synthesiser = pipeline("text-to-speech", "jsbeaudry/haitian_creole_tts_11K")
# CMU ARCTIC x-vector speaker embeddings; index 7306 selects one fixed voice.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# unsqueeze(0) adds the batch dimension the pipeline expects: (1, 512).
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
# You can replace this embedding with your own as well.
def generate_speech(text):
    """Synthesize speech for Haitian Creole ``text`` and return a WAV file path.

    Uses the module-level ``synthesiser`` pipeline and ``speaker_embedding``.

    Args:
        text: Input text to synthesize.

    Returns:
        Path to a WAV file containing the generated audio.
    """
    import tempfile

    speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
    # Write to a unique temporary file instead of a fixed "generated_speech.wav"
    # in the CWD: with a fixed name, concurrent Gradio requests overwrite each
    # other's audio. delete=False so the file outlives this function — Gradio
    # reads the audio back by path after we return.
    with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as tmp:
        out_path = tmp.name
    sf.write(out_path, speech["audio"], samplerate=speech["sampling_rate"])
    return out_path
# Wire the synthesis function into a simple web UI: one text box in,
# one audio player out.
text_input = gr.Textbox(
    lines=2,
    placeholder="Enter Haitian Creole text here...",
)
audio_output = gr.Audio(label="Generated Speech")

iface = gr.Interface(
    fn=generate_speech,
    inputs=text_input,
    outputs=audio_output,
    title="Haitian Creole Text-to-Speech",
    description="Generate speech from Haitian Creole text using a fine-tuned Text-to-Speech model.",
)
iface.launch()
# ---------------------------------------------------------------------------
# Legacy implementation (commented out, kept for reference): same demo but
# with Haitian Creole number-to-word conversion and text normalization applied
# before synthesis, using an earlier model checkpoint.
# ---------------------------------------------------------------------------
# import gradio as gr
# import torch
# import soundfile as sf
# from IPython.display import Audio
# from transformers import pipeline
# from datasets import load_dataset
# import re
# number_words = {
# 0: "zewo", 1: "en", 2: "de", 3: "twa", 4: "kat", 5: "senk", 6: "sis", 7: "sèt", 8: "uit", 9: "nèf",
# 10: "dis", 11: "onz", 12: "douz", 13: "trez", 14: "katorz", 15: "kenz", 16: "sèz", 17: "dis sèt",
# 18: "dis uit", 19: "dis nèf", 20: "vent", 30: "trant", 40: "karant", 50: "senkant", 60: "swasant",
# 70: "swasant diz", 80: "katreven", 90: "katreven diz", 100: "san", 1000: "mil"
# }
# def number_to_words(number):
# if number < 20:
# return number_words[number]
# elif number < 100:
# tens, unit = divmod(number, 10)
# return number_words[tens * 10] + (" " + number_words[unit] if unit else "")
# elif number < 1000:
# hundreds, remainder = divmod(number, 100)
# return (number_words[hundreds] + " san" if hundreds > 1 else "san") + (" " + number_to_words(remainder) if remainder else "")
# elif number < 1000000:
# thousands, remainder = divmod(number, 1000)
# return (number_to_words(thousands) + " mil" if thousands > 1 else "mil") + (" " + number_to_words(remainder) if remainder else "")
# elif number < 1000000000:
# millions, remainder = divmod(number, 1000000)
# return number_to_words(millions) + " milyon" + (" " + number_to_words(remainder) if remainder else "")
# elif number < 1000000000000:
# billions, remainder = divmod(number, 1000000000)
# return number_to_words(billions) + " milya" + (" " + number_to_words(remainder) if remainder else "")
# else:
# return str(number)
# def replace_numbers_with_words2(text):
# def replace(match):
# number = int(match.group())
# return number_to_words(number)
# return re.sub(r'\b\d+\b', replace, text)
# # Function to clean up text using the replacement pairs
# def normalize_text2(text):
# # Convert to lowercase
# text = text.lower()
# # Remove punctuation (except apostrophes)
# text = re.sub(r'[^\w\s\']', '', text)
# # Remove extra whitespace
# text = ' '.join(text.split())
# return text
# replacements = [
# ("b", "b"), ("d", "d"), ("f", "f"), ("g", "ɡ"), ("h", "h"),
# ("j", "ʒ"), ("k", "k"), ("l", "l"), ("m", "m"), ("n", "n"),
# ("p", "p"), ("r", "r"), ("s", "s"), ("t", "t"), ("v", "v"),
# ("w", "w"), ("y", "y"), ("z", "z"),
# ("a", "a"), ("e", "e"), ("è", "ɛ"), ("i", "i"), ("o", "o"),
# ("ò", "ɔ")
# ]
# def cleanup_text2(cleaned_text):
# for src, dst in replacements:
# cleaned_text = cleaned_text.replace(src, dst)
# return cleaned_text
# # Load the text-to-speech pipeline and speaker embedding
# synthesiser = pipeline("text-to-speech", "jsbeaudry/haitian_creole")
# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# speaker_embedding = torch.tensor(embeddings_dataset[7106]["xvector"]).unsqueeze(0)
# def generate_audio(text):
# converted_text = replace_numbers_with_words2(text)
# cleaned_text = cleanup_text2(converted_text)
# final_text = normalize_text2(cleaned_text)
# print(final_text)
# speech = synthesiser(final_text, forward_params={"speaker_embeddings": speaker_embedding})
# sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])
# return "speech.wav"
# # generate_audio("Kalkile koefisyan regresyon ak entèsepsyon yo lè l sèvi avèk metòd kare ki pi piti.")
# # Audio("speech.wav")
# iface = gr.Interface(
# fn=generate_audio,
# inputs=gr.Textbox(label="Enter text in Haitian Creole"),
# outputs=gr.Audio(label="Generated Audio"),
# title="Haitian Creole Text-to-Speech",
# description="Enter text to generate Haitian Creole speech."
# )
# iface.launch()