diff --git "a/run.ipynb" "b/run.ipynb" --- "a/run.ipynb" +++ "b/run.ipynb" @@ -2,7 +2,7 @@ "cells": [ { "cell_type": "code", - "execution_count": null, + "execution_count": 1, "id": "5a3ddcc8", "metadata": {}, "outputs": [ @@ -10,11 +10,11 @@ "name": "stderr", "output_type": "stream", "text": [ + "c:\\Users\\catto\\anaconda3\\envs\\test_env\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", + " from .autonotebook import tqdm as notebook_tqdm\n", "[nltk_data] Downloading package punkt_tab to\n", "[nltk_data] C:\\Users\\catto\\AppData\\Roaming\\nltk_data...\n", - "[nltk_data] Package punkt_tab is already up-to-date!\n", - "c:\\Users\\catto\\anaconda3\\envs\\test_env\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n", - " from .autonotebook import tqdm as notebook_tqdm\n" + "[nltk_data] Package punkt_tab is already up-to-date!\n" ] } ], @@ -38,7 +38,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 2, "id": "e7b9c01d", "metadata": {}, "outputs": [], @@ -58,18 +58,12 @@ "\n", "- You don't need to add language tokens everywhere, espeak can detect and handle them automatically most of the time.\n", "\n", - "- Because of my preprocess method, the model speaks at a slower rate. Increase the speed value to greater than 1 to restore it to normal.\n", - "\n", - "- Reference audio has a huge impact on the result. It is best to select audio around 10s long and consistent in both tone and speed.\n", - "\n", - "- I would not set randomness, smooth_dur, or t_denoise too high, as they can distort the audio.\n", - "\n", - "- For high‑quality reference audio, I recommend setting denoise to False." + "- Reference audio has a huge impact on the result. It is best to select audio around 10s long and consistent in both tone and speed." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 3, "id": "78396f70", "metadata": {}, "outputs": [ @@ -147,7 +141,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 4, "id": "395959f1", "metadata": {}, "outputs": [], @@ -165,7 +159,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 5, "id": "16194211", "metadata": {}, "outputs": [ @@ -194,7 +188,7 @@ ], "source": [ "model = StyleTTS2(config_path, models_path).eval().to(device)\n", - "default_speaker = \"[id_1]\" #INT Default speaker used when no speaker_id is provided in the input\n", + "default_speaker = \"[id_1]\" #STR Default speaker used when no speaker_id is provided in the input\n", "avg_style = True #BOOL Split the ref audio and calculate the avg styles.\n", "stabilize = True #BOOL Stabilize speaking speed.\n", "denoise = 0.6 #FLOAT Adjust the strength of the denoiser. Value range is [0, 1]\n", @@ -203,7 +197,7 @@ }, { "cell_type": "code", - "execution_count": null, + "execution_count": 6, "id": "d98bdb71", "metadata": {}, "outputs": [ @@ -222,7 +216,7 @@ "text/html": [ "\n", " \n", " " @@ -237,7 +231,8 @@ ], "source": [ "with torch.no_grad():\n", - " r = model.generate(text, speakers, avg_style, stabilize, denoise, n_merge, default_speaker)\n", + " styles = model.get_styles(speakers, denoise, avg_style)\n", + " r = model.generate(text, styles, stabilize, n_merge, default_speaker)\n", "\n", "print('Synthesized:')\n", "display(ipd.Audio(r, rate=24000, normalize=True))"