Dionyssos's picture
Audionar long form
a1338da
import numpy as np
import soundfile
import msinference # Prefer live_demo.py instead as this demo.py has no split to sentences to prevent OOM
from audiocraft.builders import AudioGen # fixed bug for repeated calls
def tts_entry(text='A quick brown fox jumps over the lazy dog. Sweet dreams are made of this, I traveled the world and the seven seas.',
              voice='en_US/m-ailabs_low#mary_ann',  # Listen to voices https://huggingface.co/dkounadis/artificial-styletts2/discussions/1
              soundscape='birds fomig'):  # purposeful spells for AudioGen (behaves as controllable top-p)
    """Synthesize speech for `text` and optionally mix in a generated soundscape.

    Args:
        text: Text passed to the `msinference` synthesis call.
        voice: Voice identifier. If it contains 'en_US/' or 'en_UK/', a style
            wav is loaded from 'assets/wavs/style_vector/'; otherwise, if it
            contains '_', from 'assets/wavs/mimic3_foreign_4x/'. Any other
            value is forwarded as `lang` to `msinference.foreign`.
        soundscape: Prompt handed to `AudioGen.generate`; pass None to return
            the speech alone.

    Returns:
        The peak-normalised speech, or `.6 * speech + .4 * background` when a
        soundscape is requested.
        NOTE(review): presumably a 1-D NumPy array at 16 kHz - confirm against
        `msinference`.
    """
    # Choose which asset folder (if any) holds the style reference wav.
    if 'en_US/' in voice or 'en_UK/' in voice:
        style_dir = 'assets/wavs/style_vector/'
    elif '_' in voice:
        style_dir = 'assets/wavs/mimic3_foreign_4x/'
    else:
        style_dir = None

    if style_dir is None:
        speech = msinference.foreign(text=text, lang=voice)
    else:
        # Map the voice identifier onto the wav file stem; the replacements
        # are applied in this exact order.
        stem = voice
        for old, new in (('/', '_'),
                         ('#', '_'),
                         ('cmu-arctic', 'cmu_arctic'),
                         ('_low', '')):
            stem = stem.replace(old, new)
        style_vector = msinference.compute_style(style_dir + stem + '.wav')
        speech = msinference.inference(text, style_vector)

    # Scale in place so the peak sits just inside [-1, 1]; the epsilon guards
    # against division by zero on an all-zero signal.
    peak = np.abs(speech).max()
    speech /= 1.02 * peak + 1e-7

    if soundscape is None:
        return speech

    generator = AudioGen().to('cuda:0').eval()
    n_samples = len(speech)
    # Duration is in seconds (samples / 16000) plus a .74 s margin; the
    # background is then cropped to the speech length below.
    generated = generator.generate(soundscape,
                                   duration=n_samples / 16000 + .74)
    background = generated.detach().cpu().numpy()
    return .6 * speech + .4 * background[:n_samples]
# Run the demo with the default text/voice/soundscape and save it as a
# 16000 Hz wav file in the current working directory.
demo_audio = tts_entry()
soundfile.write('demo.wav', demo_audio, 16000)