# Hugging Face Spaces status header ("Spaces: Running") — page-scrape artifact kept as a comment.
from transformers import pipeline
from datasets import load_dataset
import soundfile as sf
import torch
from IPython.display import Audio
import gradio as gr

# Text-to-speech pipeline for Haitian Creole. The model is passed
# "speaker_embeddings" via forward_params below, so it presumably follows the
# SpeechT5-style interface — confirm against the model card if changing it.
synthesiser = pipeline("text-to-speech", "jsbeaudry/haitian_creole_tts_11K")

# CMU ARCTIC x-vector dataset; row 7306 picks one speaker voice.
# You can replace this embedding with your own as well.
embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# Shape (1, embedding_dim): unsqueeze(0) adds the batch dimension the model expects.
speaker_embedding = torch.tensor(embeddings_dataset[7306]["xvector"]).unsqueeze(0)
def generate_speech(text, output_path="generated_speech.wav"):
    """Synthesize Haitian Creole speech for *text* and return the WAV file path.

    Parameters
    ----------
    text : str
        Input text in Haitian Creole.
    output_path : str, optional
        Where to write the generated audio (default keeps the original
        hard-coded "generated_speech.wav" so existing callers are unchanged).

    Returns
    -------
    str
        Path of the written WAV file, suitable for a gr.Audio output.
    """
    # The pipeline returns a dict with "audio" (waveform array) and "sampling_rate".
    speech = synthesiser(text, forward_params={"speaker_embeddings": speaker_embedding})
    sf.write(output_path, speech["audio"], samplerate=speech["sampling_rate"])
    return output_path
# Gradio UI: one textbox in, one audio player out, backed by generate_speech.
iface = gr.Interface(
    fn=generate_speech,
    inputs=gr.Textbox(lines=2, placeholder="Enter Haitian Creole text here..."),
    outputs=gr.Audio(label="Generated Speech"),
    title="Haitian Creole Text-to-Speech",
    description="Generate speech from Haitian Creole text using a fine-tuned Text-to-Speech model.",
)

# Start the web server (blocking call on Spaces).
iface.launch()
# import gradio as gr
# import torch
# import soundfile as sf
# from IPython.display import Audio
# from transformers import pipeline
# from datasets import load_dataset
# import re
# number_words = {
#     0: "zewo", 1: "en", 2: "de", 3: "twa", 4: "kat", 5: "senk", 6: "sis", 7: "sèt", 8: "uit", 9: "nèf",
#     10: "dis", 11: "onz", 12: "douz", 13: "trez", 14: "katorz", 15: "kenz", 16: "sèz", 17: "dis sèt",
#     18: "dis uit", 19: "dis nèf", 20: "vent", 30: "trant", 40: "karant", 50: "senkant", 60: "swasant",
#     70: "swasant diz", 80: "katreven", 90: "katreven diz", 100: "san", 1000: "mil"
# }
# def number_to_words(number):
#     if number < 20:
#         return number_words[number]
#     elif number < 100:
#         tens, unit = divmod(number, 10)
#         return number_words[tens * 10] + (" " + number_words[unit] if unit else "")
#     elif number < 1000:
#         hundreds, remainder = divmod(number, 100)
#         return (number_words[hundreds] + " san" if hundreds > 1 else "san") + (" " + number_to_words(remainder) if remainder else "")
#     elif number < 1000000:
#         thousands, remainder = divmod(number, 1000)
#         return (number_to_words(thousands) + " mil" if thousands > 1 else "mil") + (" " + number_to_words(remainder) if remainder else "")
#     elif number < 1000000000:
#         millions, remainder = divmod(number, 1000000)
#         return number_to_words(millions) + " milyon" + (" " + number_to_words(remainder) if remainder else "")
#     elif number < 1000000000000:
#         billions, remainder = divmod(number, 1000000000)
#         return number_to_words(billions) + " milya" + (" " + number_to_words(remainder) if remainder else "")
#     else:
#         return str(number)
# def replace_numbers_with_words2(text):
#     def replace(match):
#         number = int(match.group())
#         return number_to_words(number)
#     return re.sub(r'\b\d+\b', replace, text)
# # Function to clean up text using the replacement pairs
# def normalize_text2(text):
#     # Convert to lowercase
#     text = text.lower()
#     # Remove punctuation (except apostrophes)
#     text = re.sub(r'[^\w\s\']', '', text)
#     # Remove extra whitespace
#     text = ' '.join(text.split())
#     return text
# replacements = [
#     ("b", "b"), ("d", "d"), ("f", "f"), ("g", "ɡ"), ("h", "h"),
#     ("j", "ʒ"), ("k", "k"), ("l", "l"), ("m", "m"), ("n", "n"),
#     ("p", "p"), ("r", "r"), ("s", "s"), ("t", "t"), ("v", "v"),
#     ("w", "w"), ("y", "y"), ("z", "z"),
#     ("a", "a"), ("e", "e"), ("è", "ɛ"), ("i", "i"), ("o", "o"),
#     ("ò", "ɔ")
# ]
# def cleanup_text2(cleaned_text):
#     for src, dst in replacements:
#         cleaned_text = cleaned_text.replace(src, dst)
#     return cleaned_text
# # Load the text-to-speech pipeline and speaker embedding
# synthesiser = pipeline("text-to-speech", "jsbeaudry/haitian_creole")
# embeddings_dataset = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
# speaker_embedding = torch.tensor(embeddings_dataset[7106]["xvector"]).unsqueeze(0)
# def generate_audio(text):
#     converted_text = replace_numbers_with_words2(text)
#     cleaned_text = cleanup_text2(converted_text)
#     final_text = normalize_text2(cleaned_text)
#     print(final_text)
#     speech = synthesiser(final_text, forward_params={"speaker_embeddings": speaker_embedding})
#     sf.write("speech.wav", speech["audio"], samplerate=speech["sampling_rate"])
#     return "speech.wav"
# # generate_audio("Kalkile koefisyan regresyon ak entèsepsyon yo lè l sèvi avèk metòd kare ki pi piti.")
# # Audio("speech.wav")
# iface = gr.Interface(
#     fn=generate_audio,
#     inputs=gr.Textbox(label="Enter text in Haitian Creole"),
#     outputs=gr.Audio(label="Generated Audio"),
#     title="Haitian Creole Text-to-Speech",
#     description="Enter text to generate Haitian Creole speech."
# )
# iface.launch()