# StyleTTS2 multilingual inference demo (cleaned up from run.ipynb).
#
# Loads a StyleTTS2 checkpoint, previews/prepares reference audio for each
# speaker, and synthesizes a multi-speaker Vietnamese/English dialogue.

from inference import StyleTTS2

import librosa
import IPython.display as ipd
import torch.cuda  # binds `torch`, used below for torch.no_grad()

# Use the GPU when one is available, otherwise fall back to CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# --- Load models ------------------------------------------------------------
config_path = "Models/config.yml"
models_path = "Models/model.pth"

# --- Synthesize speech ------------------------------------------------------
# Notes:
# - You don't need to add language tokens everywhere; espeak can detect and
#   handle them automatically most of the time.
# - Because of the preprocessing method, the model speaks at a slower rate.
#   Increase the speed value above 1 to restore it to normal.
# - Reference audio has a huge impact on the result. It is best to select
#   audio around 10 s long that is consistent in both tone and speed.
# - Do not set randomness, smooth_dur, or t_denoise too high, as they can
#   distort the audio.
# - For high-quality reference audio, setting denoise to False is recommended.

speakers = {
    "id_1": {
        "path": "./reference_audio/vn_2.wav",  # reference audio path
        "lang": "vi",                          # default language
        "speed": 1.2,                          # speaking speed
    },
    "id_2": {
        "path": "./reference_audio/vn_4.wav",
        "lang": "vi",
        "speed": 1.2,
    },
}

SAMPLE_RATE = 24000
MAX_REF_SAMPLES = SAMPLE_RATE * 30  # cap reference audio at 30 seconds

# Preview (and implicitly validate) each speaker's reference audio.
for spk_id, spk in speakers.items():  # `spk_id`: avoid shadowing builtin `id`
    print(spk['path'])
    # librosa.load with sr=24000 already resamples, so `sr` is always 24000.
    wave, sr = librosa.load(spk['path'], sr=SAMPLE_RATE)
    audio, _ = librosa.effects.trim(wave, top_db=30)  # trim interval unused
    if sr != SAMPLE_RATE:
        # Defensive only (unreachable given the forced-sr load above).
        # NOTE: librosa >= 0.10 removed positional arguments for resample;
        # orig_sr/target_sr must be passed as keywords or this raises TypeError.
        audio = librosa.resample(audio, orig_sr=sr, target_sr=SAMPLE_RATE)
    if len(audio) > MAX_REF_SAMPLES:
        audio = audio[:MAX_REF_SAMPLES]
    display(ipd.Audio(audio, rate=SAMPLE_RATE, normalize=True))

# Dialogue to synthesize. [id_N] selects the speaker for the paragraph;
# [lang]{...} switches phonemization language for the braced span.
text = '''
[id_1][en-us]{What's up hommie}, dạo này đang học tí [en-us]{English}. Thấy bảo [en-us]{Building a strong vocabulary} khá là quan trọng. [en-us]{Bro} thấy sao?

[id_2][en-us]{That's right}, tôi thấy [en-us]{bro} nên bắt đầu với việc đọc sách và báo tiếng Anh để quen với cách sử dụng từ, cũng như tập trung vào [en-us]{listening exercises} để cải thiện khả năng nghe.

[id_1]Nghe nói rằng [en-us]{speaking practice} là bước quan trọng để giao tiếp tự tin. [en-us]{Bro} có muốn luyện tập với tôi không?

[id_2][en-us]{For sure my hommie} à, cứ cho mình cái hẹn nhé.
'''

# --- Load model and configure generation ------------------------------------
model = StyleTTS2(config_path, models_path).eval().to(device)

default_speaker = "[id_1]"  # STR   speaker tag used when the input provides no speaker id
avg_style = True            # BOOL  split the ref audio and average the computed styles
stabilize = True            # BOOL  stabilize speaking speed
denoise = 0.6               # FLOAT denoiser strength, valid range [0, 1]
n_merge = 16                # INT   merge sentences shorter than n words

# --- Generate ----------------------------------------------------------------
with torch.no_grad():
    r = model.generate(text, speakers, avg_style, stabilize, denoise, n_merge, default_speaker)

print('Synthesized:')
display(ipd.Audio(r, rate=SAMPLE_RATE, normalize=True))