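"""Streamlit text-to-speech demo built on Real-Time Voice Cloning (rtvc).

Splits the input text into sentences with spaCy, synthesizes a mel spectrogram
per sentence from a precomputed speaker embedding, renders audio with
Griffin-Lim, and returns the result as a base64-encoded WAV for an HTML
<audio> element. (Summary inferred from the code below.)
"""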
import argparse
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"  # force CPU inference; must be set before torch is imported
import sys
sys.path.append('rtvc/')
from pathlib import Path
import time
import spacy
import matplotlib.pyplot as plt
import librosa
import numpy as np
import soundfile as sf
import torch
import noisereduce as nr
import io
from scipy.io.wavfile import write
import base64
import streamlit as st

from rtvc.encoder import inference as encoder
from rtvc.encoder.params_data import *
from rtvc.encoder.params_model import model_embedding_size as speaker_embedding_size
from rtvc.synthesizer.inference import Synthesizer_infer
from rtvc.utils.argutils import print_args
from rtvc.utils.default_models import ensure_default_models
from rtvc.vocoder import inference as vocoder
from rtvc.vocoder.display import save_attention_multiple, save_spectrogram, save_stop_tokens
from rtvc.synthesizer.utils.cleaners import english_cleaners_predict
from rtvc.speed_changer.fixSpeed import *


def tts(text, embed_name, nlp, autoplay=True):
    run_id = "default"
    models_dir = Path("rtvc/saved_models")
    embed_path = f"embeds/{embed_name}.npy"

    # Effectively a no-op on this deployment, since CUDA is disabled via
    # CUDA_VISIBLE_DEVICES above; kept from the original.
    if torch.cuda.is_available():
        device_id = torch.cuda.current_device()
        gpu_properties = torch.cuda.get_device_properties(device_id)

    ensure_default_models(run_id, models_dir)
    synthesizer = Synthesizer_infer(list(models_dir.glob(f"{run_id}/synthesizer.pt"))[0])
    # vocoder.load_model(list(models_dir.glob(f"{run_id}/vocoder.pt"))[0])

    ## Generating the spectrogram
    # The synthesizer works in batches, so the inputs must be wrapped in a list or numpy array.
    def split_text(text):
        text = english_cleaners_predict(text)
        texts = [i.text.strip() for i in nlp(text).sents]  # split the paragraph into sentences
        return texts
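    # Illustrative example: split_text("Hi there. How are you?") yields one string
    # per sentence, e.g. ["Hi there.", "How are you?"] (the exact form depends on
    # what english_cleaners_predict normalizes).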
    texts = split_text(text)
    print(f"Input sentences:\n{texts}")
    embed = np.load(embed_path)
    specs = []
    alignments = []
    stop_tokens = []
    for sentence in texts:
        spec, align, stop_token = synthesizer.synthesize_spectrograms([sentence], [embed], require_visualization=True)
        specs.append(spec[0])
        alignments.append(align[0])
        stop_tokens.append(stop_token[0])
    breaks = [s.shape[1] for s in specs]
    spec = np.concatenate(specs, axis=1)
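    # `breaks` records each sentence's mel length in frames; it is used further
    # down to cut the rendered waveform back into per-sentence chunks before
    # inserting pauses.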
    ## Save synthesizer visualization results
    os.makedirs("syn_results", exist_ok=True)
    save_attention_multiple(alignments, "syn_results/attention")
    save_stop_tokens(stop_tokens, "syn_results/stop_tokens")
    save_spectrogram(spec, "syn_results/mel")
    print("Created the mel spectrogram")
    ## Generating the waveform
    print("Synthesizing the waveform:")
    # Synthesizing the waveform is fairly straightforward. Remember that the longer the
    # spectrogram, the more time-efficient the vocoder.
    wav = synthesizer.griffin_lim(spec)
    wav = vocoder.waveform_denoising(wav)
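    # Note: the neural vocoder load above is commented out, so the waveform comes
    # from Griffin-Lim, followed by the vocoder module's denoising pass.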
    # Add breaks: cut the waveform back into per-sentence chunks and insert
    # 150 ms of silence after each one.
    b_ends = np.cumsum(np.array(breaks) * Synthesizer_infer.hparams.hop_size)
    b_starts = np.concatenate(([0], b_ends[:-1]))
    wavs = [wav[start:end] for start, end in zip(b_starts, b_ends)]
    breaks = [np.zeros(int(0.15 * Synthesizer_infer.sample_rate))] * len(breaks)
    wav = np.concatenate([i for w, b in zip(wavs, breaks) for i in (w, b)])

    # Trim excess silences to compensate for gaps in spectrograms (issue #53)
    # wav = encoder.preprocess_wav(wav)

    # Normalize to unit peak amplitude
    wav = wav / np.abs(wav).max()
    if autoplay:
        # Play the audio (non-blocking), then sleep for roughly the clip duration
        import sounddevice as sd
        try:
            sd.stop()
            sd.play(wav, synthesizer.sample_rate)
            time_span = len(wav) // synthesizer.sample_rate + 1
            time.sleep(time_span)
        except sd.PortAudioError as e:
            print("\nCaught exception: %s" % repr(e))
            print("Continuing without audio playback. Pass autoplay=False to suppress this message.\n")
    # Serialize the waveform to an in-memory WAV file and return it as base64
    byte_io = io.BytesIO()
    write(byte_io, synthesizer.sample_rate, wav.astype(np.float32))
    byte_io.seek(0)  # rewind: read() straight after write() would return empty bytes
    result_bytes = byte_io.read()
    return base64.b64encode(result_bytes).decode()
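
# Alternative to the raw-HTML player in __main__ below (a suggestion, not part
# of the original script): Streamlit's built-in player accepts WAV bytes
# directly, e.g. st.audio(base64.b64decode(b64), format="audio/wav")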

if __name__ == "__main__":
    text = "Adkins was raised by a young single mother in various working-class neighbourhoods of London."
    embed_name = "Adele"
    nlp = spacy.load('en_core_web_sm')
    b64 = tts(text, embed_name, nlp, autoplay=False)
    md = f"""
    <audio controls autoplay>
        <source src="data:audio/wav;base64,{b64}" type="audio/wav">
        Your browser does not support the audio element.
    </audio>
    """
    st.markdown(md, unsafe_allow_html=True)