# labadaba / app.py
# Hugging Face Space file view — uploader: Somalitts; commit a1a999a ("Update app.py", verified); 4.47 kB.
import gradio as gr
import torch
import torchaudio
import re
import os
from transformers import SpeechT5Processor, SpeechT5ForTextToSpeech, SpeechT5HifiGan
from speechbrain.pretrained import EncoderClassifier
# Use the GPU when PyTorch reports one; otherwise everything runs on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Text processor and vocoder come from the base microsoft/speecht5 checkpoints.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")
vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# One SpeechT5 text-to-speech checkpoint per selectable voice.
model_male = SpeechT5ForTextToSpeech.from_pretrained("HusseinBashir/xus23")
model_male = model_male.to(device)
model_female = SpeechT5ForTextToSpeech.from_pretrained("Somalitts/8aad")
model_female = model_female.to(device)

# SpeechBrain speaker encoder used to turn a reference recording into a
# speaker embedding; its files are kept under ./spk_model.
speaker_model = EncoderClassifier.from_hparams(
    source="speechbrain/spkrec-xvect-voxceleb",
    savedir="./spk_model",
    run_opts={"device": device},
)
# Auto-generate embedding
def get_embedding(wav_path, pt_path):
    """Return the speaker embedding for one voice, computing it on first use.

    Args:
        wav_path: Reference recording to encode when no cached embedding exists.
        pt_path: File the embedding is cached in (written with ``torch.save``).

    Returns:
        The embedding tensor on ``device``. When ``pt_path`` already exists it
        is simply loaded; otherwise it is computed from ``wav_path``, saved to
        ``pt_path`` as a CPU tensor, and returned.
    """
    if not os.path.exists(pt_path):
        waveform, sample_rate = torchaudio.load(wav_path)
        # Bring the recording to 16 kHz before encoding.
        waveform = torchaudio.functional.resample(waveform, sample_rate, 16000)
        # Average over dim 0, then restore a leading dimension of size 1.
        mono_batch = waveform.mean(dim=0).unsqueeze(0).to(device)
        with torch.no_grad():
            encoded = speaker_model.encode_batch(mono_batch)
            encoded = torch.nn.functional.normalize(encoded, dim=2)
            speaker_embedding = encoded.squeeze()
        # Cache a CPU copy so the file loads regardless of the current device.
        torch.save(speaker_embedding.cpu(), pt_path)
        return speaker_embedding

    return torch.load(pt_path).to(device)
# Load each voice's cached speaker embedding, or build it from the reference
# recording the first time the app starts.
embedding_male = get_embedding(wav_path="Hussein.wav", pt_path="male_embedding.pt")
embedding_female = get_embedding(wav_path="caasho.wav", pt_path="female_embedding.pt")
# Somali numbers to words
# Somali words for the values that cannot be composed from smaller parts:
# 0-19, the tens, and the scale words for one hundred and one thousand.
number_words = {
    0: "eber", 1: "koow", 2: "labo", 3: "seddex", 4: "afar", 5: "shan",
    6: "lix", 7: "todobo", 8: "sideed", 9: "sagaal", 10: "toban",
    11: "toban iyo koow", 12: "toban iyo labo", 13: "toban iyo seddex",
    14: "toban iyo afar", 15: "toban iyo shan", 16: "toban iyo lix",
    17: "toban iyo todobo", 18: "toban iyo sideed", 19: "toban iyo sagaal",
    20: "labaatan", 30: "sodon", 40: "afartan", 50: "konton",
    60: "lixdan", 70: "todobaatan", 80: "sideetan", 90: "sagaashan",
    100: "boqol", 1000: "kun",
}


def number_to_words(number):
    """Spell out an integer using the words in ``number_words``.

    Values from 0 up to (but not including) 10**12 are spelled out. A count
    of exactly one hundred or one thousand is written as the bare scale word
    ("boqol", "kun"); larger counts are prefixed with the spelled-out count
    ("labo boqol", "toban kun"). Millions ("malyan") and billions ("milyaar")
    always carry their count.

    The result is always lower-case, matching every other word this function
    emits, so a bare hundred/thousand no longer comes back upper-cased.

    Args:
        number: Integer to spell out.

    Returns:
        The spelled-out number, or ``str(number)`` when the value is negative
        or is 10**12 or larger (no words are defined for those).
    """
    # Out-of-range on the low side gets the same fallback as the high side
    # instead of failing on a missing table entry.
    if number < 0:
        return str(number)
    if number < 20:
        return number_words[number]

    if number < 100:
        count, rest = divmod(number, 10)
        head = number_words[count * 10]
    elif number < 1000:
        count, rest = divmod(number, 100)
        scale_word = number_words[100]
        head = scale_word if count == 1 else number_words[count] + " " + scale_word
    elif number < 10 ** 6:
        count, rest = divmod(number, 1000)
        scale_word = number_words[1000]
        head = scale_word if count == 1 else number_to_words(count) + " " + scale_word
    elif number < 10 ** 9:
        count, rest = divmod(number, 10 ** 6)
        head = number_to_words(count) + " malyan"
    elif number < 10 ** 12:
        count, rest = divmod(number, 10 ** 9)
        head = number_to_words(count) + " milyaar"
    else:
        return str(number)

    # Append the spelled-out remainder only when there is one, so round
    # values do not end in a trailing "eber".
    if rest:
        return head + " " + number_to_words(rest)
    return head
def replace_numbers_with_words(text):
    """Replace every standalone run of digits in ``text`` with its spelled-out form.

    Only digit runs delimited by word boundaries are matched, so digits glued
    to letters (e.g. ``"a1b"``) are left untouched.
    """
    def _spell(match):
        # The matched text is digits only, so int() cannot fail here.
        return number_to_words(int(match.group()))

    return re.sub(r'\b\d+\b', _spell, text)
def normalize_text(text):
    """Prepare raw input text for the TTS tokenizer.

    The text is lower-cased, standalone numbers are spelled out, and every
    character that is neither a word character nor whitespace is removed.
    """
    lowered = text.lower()
    with_number_words = replace_numbers_with_words(lowered)
    # Punctuation is deleted outright (not replaced by a space).
    return re.sub(r'[^\w\s]', '', with_number_words)
# Main TTS function
def text_to_speech(text, voice):
    """Synthesise speech for ``text`` with the selected voice.

    Args:
        text: Raw user text. It is passed through ``normalize_text`` before
            tokenisation. ``None`` is treated as empty text.
        voice: ``"Male"`` selects the male model and embedding; any other
            value selects the female pair.

    Returns:
        A ``(16000, waveform)`` tuple, with the waveform as a NumPy array, or
        ``None`` when the normalised text is empty (nothing to speak), which
        leaves the audio output empty instead of synthesising from a blank
        prompt.
    """
    text = normalize_text(text or "")
    # Empty, whitespace-only or punctuation-only input normalises to nothing;
    # skip the model entirely rather than generating audio for an empty prompt.
    if not text.strip():
        return None

    inputs = processor(text=text, return_tensors="pt").to(device)

    if voice == "Male":
        model, embedding = model_male, embedding_male
    else:
        model, embedding = model_female, embedding_female

    with torch.no_grad():
        # generate_speech is given the embedding with a leading dim of size 1.
        speech = model.generate_speech(
            inputs["input_ids"],
            embedding.unsqueeze(0),
            vocoder=vocoder,
        )
    return (16000, speech.cpu().numpy())
# Gradio Interface
# Web UI: a text box and a voice selector feed text_to_speech, and the
# returned (sample_rate, waveform) pair is played back by the audio component.
iface = gr.Interface(
    title="Somali TTS (Lab & Dhedig)",
    description="Dooro codka aad rabto, geli qoraal af-soomaali ah, codka ayaa la abuuri doonaa adigoo isticmaalaya Somali TTS (SpeechT5).",
    fn=text_to_speech,
    inputs=[
        gr.Textbox(
            placeholder="Tusaale: Baro aqoonta casriga ah...",
            label="Geli qoraalka Af-Soomaaliga",
        ),
        # The female voice is preselected.
        gr.Radio(choices=["Male", "Female"], value="Female", label="Dooro Codka"),
    ],
    outputs=gr.Audio(type="numpy", label="Codka la abuuray"),
)

# Start the Gradio server.
iface.launch()