{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "5a3ddcc8",
   "metadata": {},
   "outputs": [],
   "source": [
    "from inference import StyleTTS2\n",
    "\n",
    "import librosa\n",
    "import IPython.display as ipd\n",
    "import torch.cuda\n",
    "\n",
    "# Run on the GPU when one is available, otherwise fall back to the CPU.\n",
    "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "092cfb69",
   "metadata": {},
   "source": [
    "### Load G2P"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "a152ec13",
   "metadata": {},
   "source": [
    "If you did not use eSpeak for your language, please add your own G2P."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "ca224f37",
   "metadata": {},
   "outputs": [],
   "source": [
    "# Imports are kept in this cell so the whole G2P step can be swapped out as one unit.\n",
    "import functools\n",
    "import sys\n",
    "\n",
    "import phonemizer\n",
    "\n",
    "# On Windows, point phonemizer at the eSpeak NG library path reported by espeakng_loader.\n",
    "if sys.platform.startswith(\"win\"):\n",
    "    try:\n",
    "        from phonemizer.backend.espeak.wrapper import EspeakWrapper\n",
    "        import espeakng_loader\n",
    "        EspeakWrapper.set_library(espeakng_loader.get_library_path())\n",
    "    except Exception as e:\n",
    "        # Best effort: report the problem and keep going.\n",
    "        print(e)\n",
    "\n",
    "\n",
    "@functools.lru_cache(maxsize=None)\n",
    "def _get_espeak_backend(lang):\n",
    "    \"\"\"Create the eSpeak backend for `lang` once and reuse it on later calls.\"\"\"\n",
    "    return phonemizer.backend.EspeakBackend(\n",
    "        language=lang,\n",
    "        preserve_punctuation=True,\n",
    "        with_stress=True,\n",
    "        language_switch='remove-flags',\n",
    "    )\n",
    "\n",
    "\n",
    "def get_phoneme(text, lang):\n",
    "    \"\"\"Convert `text` to a phoneme string using the eSpeak backend.\n",
    "\n",
    "    Args:\n",
    "        text: Text to phonemize.\n",
    "        lang: Language code passed to the eSpeak backend, e.g. \"en-us\".\n",
    "\n",
    "    Returns:\n",
    "        The phonemized text, or None when phonemization fails. Failures are\n",
    "        printed rather than raised, so callers must check for None.\n",
    "    \"\"\"\n",
    "    try:\n",
    "        return _get_espeak_backend(lang).phonemize([text])[0]\n",
    "    except Exception as e:\n",
    "        print(e)\n",
    "        return None"
   ]
  },
  {
   "cell_type": "markdown",
   "id": "7b9cecbe",
   "metadata": {},
   "source": [
    "### Load models"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "e7b9c01d",
   "metadata": {},
   "outputs": [],
   "source": [
    "config_path = \"Models/config.yaml\"\n",
    "models_path = \"Models/inference/model.pth\""
   ]
  },
  {
   "cell_type": "markdown",
   "id": "b803110e",
   "metadata": {},
   "source": [
    "### Synthesize speech\n",
    "\n",
    "Little Note: Reference audio has a huge impact on the result. It is best to select audio around 10s long and consistent in both tone and speed."
]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "78396f70",
   "metadata": {},
   "outputs": [],
   "source": [
    "speaker = {\n",
    "    \"path\": \"./Audio/1_heart.wav\",  #Ref audio path\n",
    "    \"speed\": 1.0,                   #Speaking speed\n",
    "}\n",
    "\n",
    "SAMPLE_RATE = 24000               #Sample rate (Hz) used for all audio in this notebook\n",
    "max_samples = SAMPLE_RATE*20      #max 20 seconds ref audio\n",
    "print(speaker['path'])\n",
    "# librosa.load resamples to the requested rate, so `sr` always equals SAMPLE_RATE\n",
    "# and no separate resampling step is needed afterwards.\n",
    "wave, sr = librosa.load(speaker['path'], sr=SAMPLE_RATE)\n",
    "# Trim leading/trailing silence using a 30 dB threshold.\n",
    "audio, index = librosa.effects.trim(wave, top_db=30)\n",
    "# Slicing is a no-op when the clip is already short enough.\n",
    "audio = audio[:max_samples]\n",
    "display(ipd.Audio(audio, rate=SAMPLE_RATE, normalize=True))"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "395959f1",
   "metadata": {},
   "outputs": [],
   "source": [
    "text = '''\n",
    "Nearly 300 scholars currently working in the United States have applied for positions at Aix-Marseille University in France, which has announced a program to provide a haven for academics affected by the Trump administration's policies.\n",
    "Aix-Marseille launched the \"Safe Place for Science\" initiative earlier this year, offering three-year funded placements for approximately 20 researchers. The program aims to support scholars facing budget cuts and policy changes that have disrupted U.S. academic institutions.\n",
    "'''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "16194211",
   "metadata": {},
   "outputs": [],
   "source": [
    "model = StyleTTS2(config_path, models_path).eval().to(device)\n",
    "avg_style = True    #BOOL   Split the ref audio and calculate the avg styles.\n",
    "stabilize = False   #BOOL   Stabilize speaking speed.\n",
    "denoise   = 0.3     #FLOAT  Adjust the strength of the denoiser. Value range is [0, 1]\n",
    "n_merge   = 16      #INT    Avoid short sentences by merging when a sentence has fewer than n words"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "980c6fbb",
   "metadata": {},
   "outputs": [],
   "source": [
    "with torch.no_grad():\n",
    "    phonemes = get_phoneme(text=text, lang=\"en-us\")\n",
    "    # get_phoneme prints the error and returns None on failure; stop here with a\n",
    "    # clear message instead of handing None to the model.\n",
    "    if phonemes is None:\n",
    "        raise RuntimeError(\"Phonemization failed; see the error printed above.\")\n",
    "\n",
    "    styles = model.get_styles(speaker, denoise, avg_style)\n",
    "    r = model.generate(phonemes, styles, stabilize, n_merge)\n",
    "\n",
    "print('Synthesized:')\n",
    "display(ipd.Audio(r, rate=SAMPLE_RATE, normalize=True))"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "base",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.7"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}