diff --git "a/run.ipynb" "b/run.ipynb"
new file mode 100644
--- /dev/null
+++ "b/run.ipynb"
@@ -0,0 +1,268 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "5a3ddcc8",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "[nltk_data] Downloading package punkt_tab to\n",
+ "[nltk_data] C:\\Users\\catto\\AppData\\Roaming\\nltk_data...\n",
+ "[nltk_data] Package punkt_tab is already up-to-date!\n",
+ "c:\\Users\\catto\\anaconda3\\envs\\test_env\\Lib\\site-packages\\tqdm\\auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
+ " from .autonotebook import tqdm as notebook_tqdm\n"
+ ]
+ }
+ ],
+ "source": [
+ "from inference import StyleTTS2\n",
+ "\n",
+ "import librosa\n",
+ "import IPython.display as ipd\n",
+ "import torch.cuda\n",
+ "\n",
+ "device = 'cuda' if torch.cuda.is_available() else 'cpu'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "7b9cecbe",
+ "metadata": {},
+ "source": [
+ "### Load models"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "e7b9c01d",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "config_path = \"Models/config.yml\"\n",
+ "models_path = \"Models/model.pth\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "id": "b803110e",
+ "metadata": {},
+ "source": [
+ "### Synthesize speech\n",
+ "\n",
+ "Little Note:\n",
+ "\n",
+ "- You don't need to add language tokens everywhere, espeak can detect and handle them automatically most of the time.\n",
+ "\n",
+ "- Because of my preprocess method, the model speaks at a slower rate. Increase the speed value to greater than 1 to restore it to normal.\n",
+ "\n",
+ "- Reference audio has a huge impact on the result. It is best to select audio around 10s long and consistent in both tone and speed.\n",
+ "\n",
+ "- I would not set randomness, smooth_dur, or t_denoise too high, as they can distort the audio.\n",
+ "\n",
+    "- For high-quality reference audio, I recommend setting denoise to 0 (denoising off)."
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "78396f70",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "./reference_audio/vn_2.wav\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "./reference_audio/vn_4.wav\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "speakers = {\n",
+ " \"id_1\": {\n",
+ " \"path\": \"./reference_audio/vn_2.wav\", #Ref audio path\n",
+ " \"lang\": \"vi\", #Default language\n",
+ " \"speed\": 1.2, #Speaking speed\n",
+ " },\n",
+ " \"id_2\": {\n",
+ " \"path\": \"./reference_audio/vn_4.wav\",\n",
+ " \"lang\": \"vi\",\n",
+ " \"speed\": 1.2,\n",
+ " },\n",
+ "}\n",
+    "max_samples = 24000*30 #max 30 seconds ref audio\n",
+    "for spk_id in speakers:\n",
+    "    print(speakers[spk_id]['path'])\n",
+    "    wave, sr = librosa.load(speakers[spk_id]['path'], sr=24000)\n",
+ " audio, index = librosa.effects.trim(wave, top_db=30)\n",
+    "    if sr != 24000: audio = librosa.resample(audio, orig_sr=sr, target_sr=24000)\n",
+ " if len(audio) > max_samples: audio = audio[:max_samples]\n",
+ " display(ipd.Audio(audio, rate=24000, normalize=True))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "395959f1",
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "text = '''\n",
+ "[id_1][en-us]{What's up hommie}, dạo này đang học tí [en-us]{English}. Thấy bảo [en-us]{Building a strong vocabulary} khá là quan trọng. [en-us]{Bro} thấy sao?\n",
+ "\n",
+ "[id_2][en-us]{That's right}, tôi thấy [en-us]{bro} nên bắt đầu với việc đọc sách và báo tiếng Anh để quen với cách sử dụng từ, cũng như tập trung vào [en-us]{listening exercises} để cải thiện khả năng nghe.\n",
+ "\n",
+ "[id_1]Nghe nói rằng [en-us]{speaking practice} là bước quan trọng để giao tiếp tự tin. [en-us]{Bro} có muốn luyện tập với tôi không?\n",
+ "\n",
+ "[id_2][en-us]{For sure my hommie} à, cứ cho mình cái hẹn nhé.\n",
+ "'''"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "16194211",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stderr",
+ "output_type": "stream",
+ "text": [
+ "c:\\Users\\catto\\anaconda3\\envs\\test_env\\Lib\\site-packages\\torch\\nn\\utils\\weight_norm.py:143: FutureWarning: `torch.nn.utils.weight_norm` is deprecated in favor of `torch.nn.utils.parametrizations.weight_norm`.\n",
+ " WeightNorm.apply(module, name, dim)\n",
+ "c:\\Users\\catto\\anaconda3\\envs\\test_env\\Lib\\site-packages\\torch\\nn\\modules\\rnn.py:123: UserWarning: dropout option adds dropout after all but last recurrent layer, so non-zero dropout expects num_layers greater than 1, but got dropout=0.2 and num_layers=1\n",
+ " warnings.warn(\n"
+ ]
+ },
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "decoder : 54289492\n",
+ "predictor : 16194612\n",
+ "text_encoder : 5612032\n",
+ "style_encoder : 13845440\n",
+ "\n",
+ "Total : 89941576\n"
+ ]
+ }
+ ],
+ "source": [
+ "model = StyleTTS2(config_path, models_path).eval().to(device)\n",
+    "default_speaker = \"[id_1]\" #STR Default speaker used when no speaker_id is provided in the input\n",
+ "avg_style = True #BOOL Split the ref audio and calculate the avg styles.\n",
+ "stabilize = True #BOOL Stabilize speaking speed.\n",
+ "denoise = 0.6 #FLOAT Adjust the strength of the denoiser. Value range is [0, 1]\n",
+ "n_merge = 16 #INT Avoid short sentences by merging when a sentence has fewer than n words"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "id": "d98bdb71",
+ "metadata": {},
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "Computing the style for: ./reference_audio/vn_2.wav\n",
+ "Computing the style for: ./reference_audio/vn_4.wav\n",
+ "Generating Audio...\n",
+ "Synthesized:\n"
+ ]
+ },
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ " \n",
+ " "
+ ],
+ "text/plain": [
+ ""
+ ]
+ },
+ "metadata": {},
+ "output_type": "display_data"
+ }
+ ],
+ "source": [
+ "with torch.no_grad():\n",
+ " r = model.generate(text, speakers, avg_style, stabilize, denoise, n_merge, default_speaker)\n",
+ "\n",
+ "print('Synthesized:')\n",
+ "display(ipd.Audio(r, rate=24000, normalize=True))"
+ ]
+ }
+ ],
+ "metadata": {
+ "kernelspec": {
+ "display_name": "test_env",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 5
+}