In [None]:
from inference import StyleTTS2

import librosa
import IPython.display as ipd
import torch.cuda

# Pick the compute device: CUDA when torch reports it as available, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'

### Load G2P

If you did not use eSpeak to phonemize your language, replace `get_phoneme` below with your own G2P (grapheme-to-phoneme) function.

In [None]:
import sys
import phonemizer
# Windows only: point phonemizer's EspeakWrapper at the eSpeak NG library path
# reported by espeakng_loader. This is best effort -- any failure is printed
# and the notebook keeps going.
if sys.platform.startswith("win"):
    try:
        from phonemizer.backend.espeak.wrapper import EspeakWrapper
        import espeakng_loader

        library_path = espeakng_loader.get_library_path()
        EspeakWrapper.set_library(library_path)
    except Exception as err:
        print(err)

def get_phoneme(text, lang):
    """Phonemize ``text`` with phonemizer's eSpeak backend.

    Args:
        text: The text to convert to phonemes.
        lang: Language code handed to ``EspeakBackend`` as ``language``
            (this notebook calls it with ``"en-us"``).

    Returns:
        The first element of the backend's output for ``[text]``, or ``None``
        when anything in the process raises. The exception is printed, not
        re-raised, so callers must be prepared for ``None``.
    """
    try:
        # A fresh backend is built on every call, keeping punctuation and
        # stress marks and removing language-switch flags from the output.
        backend = phonemizer.backend.EspeakBackend(
            language=lang,
            preserve_punctuation=True,
            with_stress=True,
            language_switch='remove-flags',
        )
        phonemized = backend.phonemize([text])
        return phonemized[0]
    except Exception as err:
        print(err)
        return None

### Load models

In [None]:
# Relative paths to the inference checkpoint and its YAML configuration;
# both are passed to StyleTTS2 when the model is built.
models_path = "Models/inference/model.pth"
config_path = "Models/config.yaml"

### Synthesize speech

Note: The reference audio has a huge impact on the result. It is best to choose a clip about 10 seconds long that is consistent in both tone and speed.

In [None]:
# Reference speaker settings; this dict is later handed to model.get_styles.
speaker = {
    "path": "./Audio/1_heart.wav",  # Reference audio path
    "speed": 1.0,  # Speaking speed
}

# Preview the reference audio: load at 24 kHz, trim silence, cap the length,
# and render an inline player.
max_samples = 24000 * 20  # at most 20 seconds of reference audio at 24 kHz
print(speaker['path'])

# librosa.load resamples to the requested rate, so `sr` is always 24000 here
# and no separate resampling step is needed afterwards.
wave, sr = librosa.load(speaker['path'], sr=24000)

# Strip leading/trailing silence (30 dB threshold); `index` is the kept interval.
audio, index = librosa.effects.trim(wave, top_db=30)

# Slicing is a no-op when the clip is already shorter than the cap.
audio = audio[:max_samples]
display(ipd.Audio(audio, rate=24000, normalize=True))

In [None]:
# Input text to synthesize; it is phonemized with get_phoneme in a later cell.
text = '''
Nearly 300 scholars currently working in the United States have applied for positions at Aix-Marseille University in France, which has announced a program to provide a haven for academics affected by the Trump administration's policies.
Aix-Marseille launched the "Safe Place for Science" initiative earlier this year, offering three-year funded placements for approximately 20 researchers. The program aims to support scholars facing budget cuts and policy changes that have disrupted U.S. academic institutions.
'''

In [None]:
# Build the model from the config and checkpoint paths, then call .eval()
# and move it to the selected device.
model = StyleTTS2(config_path, models_path)
model = model.eval().to(device)

# Synthesis options used by the generation cell.
avg_style = True  # bool: split the reference audio and average the styles
stabilize = False  # bool: stabilize speaking speed
denoise = 0.3  # float in [0, 1]: strength of the denoiser
n_merge = 16  # int: merge sentences that have fewer than n words, to avoid short sentences

In [None]:
# Phonemize the text, extract styles from the reference speaker, and generate
# audio -- all with gradient tracking turned off.
with torch.no_grad():
    phonemes = get_phoneme(text, "en-us")
    styles = model.get_styles(speaker, denoise, avg_style)
    r = model.generate(phonemes, styles, stabilize, n_merge)

# Play the result inline at 24 kHz.
print('Synthesized:')
display(ipd.Audio(r, rate=24000, normalize=True))