Commit 0d15013
Parent: 2db5c30

update extension, fix speed has no effect

Files changed:
  Models/{config.yml → config.yaml}  +70 -70
  app.py                             +1  -1
  inference.py                       +3  -1
  run.ipynb                          +7  -7
Models/{config.yml → config.yaml}
RENAMED (contents unchanged; only the extension differs)

log_dir: "Models/Finetune_Extend"
save_freq: 1
log_interval: 5
device: "cuda"
epochs: 50
batch_size: 3
max_len: 210 # maximum number of frames
pretrained_model: "Models/Finetune_Extend/current_model.pth"
load_only_params: false # set to true if do not want to load epoch numbers and optimizer parameters

data_params:
  train_data: "../../Data_Speech/viVoice/train.txt"
  val_data: "../../Data_Speech/combine/combine_val.txt"
  root_path: "../../Data_Speech/"
  min_length: 50 # sample until texts with this size are obtained for OOD texts

preprocess_params:
  sr: 24000
  spect_params:
    n_fft: 2048
    win_length: 1200
    hop_length: 300

model_params:
  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80

  n_token: 189 # number of phoneme tokens
  max_dur: 50 # maximum duration of a single phoneme
  style_dim: 128 # style vector size

  dropout: 0.2

  ASR_params:
    input_dim: 80
    hidden_dim: 256
    n_token: 189 # number of phoneme tokens
    n_layers: 6
    token_embedding_dim: 512

  JDC_params:
    num_class: 1
    seq_len: 192

  # config for decoder
  decoder:
    type: 'hifigan' # either hifigan or istftnet
    resblock_kernel_sizes: [3,7,11]
    upsample_rates: [10,5,3,2]
    upsample_initial_channel: 512
    resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]]
    upsample_kernel_sizes: [20,10,6,4]

loss_params:
  lambda_mel: 5.   # mel reconstruction loss
  lambda_gen: 1.   # generator loss

  lambda_mono: 1.  # monotonic alignment loss (TMA)
  lambda_s2s: 1.   # sequence-to-sequence loss (TMA)

  lambda_F0: 1.    # F0 reconstruction loss
  lambda_norm: 1.  # norm reconstruction loss
  lambda_dur: 1.   # duration loss
  lambda_ce: 20.   # duration predictor probability output CE loss

optimizer_params:
  lr: 0.0001     # general learning rate
  ft_lr: 0.00001 # learning rate for acoustic modules
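Since only the extension changed, parsing is unaffected. A minimal sketch of consuming this config, assuming plain PyYAML rather than whatever loader the repo actually wraps around it:

import yaml

# Load the renamed config; it is ordinary YAML, so the rename from
# .yml to .yaml has no effect on how it is read.
with open("Models/config.yaml") as f:
    config = yaml.safe_load(f)

# Nested sections come back as plain dicts.
print(config["preprocess_params"]["sr"])          # 24000
print(config["model_params"]["decoder"]["type"])  # hifigan
print(config["optimizer_params"]["ft_lr"])        # 1e-05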
app.py
CHANGED
@@ -7,7 +7,7 @@ import traceback
 from inference import StyleTTS2
 repo_dir = './'
 device = 'cuda' if torch.cuda.is_available() else 'cpu'
-config_path = os.path.join(repo_dir, "Models", "config.yml")
+config_path = os.path.join(repo_dir, "Models", "config.yaml")
 models_path = os.path.join(repo_dir, "Models", "model.pth")
 model = StyleTTS2(config_path, models_path).eval().to(device)
 voice_path = os.path.join(repo_dir, "reference_audio")
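One practical consequence of the rename: a checkout that still has the old Models/config.yml will now fail at startup. A purely illustrative fallback, not code from this repo, that accepts either name:

import os

def find_config(models_dir):
    # Hypothetical helper: prefer the new config.yaml, fall back to
    # the legacy config.yml for older checkouts.
    for name in ("config.yaml", "config.yml"):
        path = os.path.join(models_dir, name)
        if os.path.exists(path):
            return path
    raise FileNotFoundError(f"no config.yaml or config.yml in {models_dir}")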
inference.py
CHANGED
@@ -261,7 +261,7 @@ class StyleTTS2(torch.nn.Module):
         # cal alignment
         d = self.predictor.text_encoder(t_en, s, input_lengths, text_mask)
         x, _ = self.predictor.lstm(d)
-        duration = self.predictor.duration_proj(x) / speed
+        duration = self.predictor.duration_proj(x)
         duration = torch.sigmoid(duration).sum(axis=-1)

         if prev_d_mean != 0:#Stabilize speaking speed between splits
@@ -270,6 +270,8 @@
         dur_stats = torch.empty(duration.shape).normal_(mean=duration.mean(), std=duration.std()).to(device)
         duration = duration*(1-t) + dur_stats*t
         duration[:,1:-2] = self.__replace_outliers_zscore(duration[:,1:-2]) #Normalize outlier
+
+        duration /= speed

         pred_dur = torch.round(duration.squeeze()).clamp(min=1)
         pred_aln_trg = torch.zeros(input_lengths, int(pred_dur.sum().data))
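The substance of the fix: speed is now applied after duration stabilization and outlier normalization, directly to the summed frame durations. Assuming the removed line divided the duration logits by speed before the sigmoid, as the diff suggests, that placement explains the "speed has no effect" symptom: the sigmoid saturates, and the stabilization step then re-draws durations from the tensor's own statistics anyway. A small self-contained sketch of the difference, using stand-in logits rather than the repo's tensors:

import torch

torch.manual_seed(0)
logits = torch.randn(1, 20, 50) * 4   # stand-in duration logits, mostly saturated
speed = 1.5

base = torch.sigmoid(logits).sum(axis=-1)          # no speed applied
old  = torch.sigmoid(logits / speed).sum(axis=-1)  # old placement: scale the logits
new  = torch.sigmoid(logits).sum(axis=-1) / speed  # new placement: scale the durations

print(base.mean().item())  # ~25 frames per token
print(old.mean().item())   # ~25 as well: dividing saturated logits barely moves the sum
print(new.mean().item())   # ~16.7: exactly base/speed, a real 1.5x speed-up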
run.ipynb
CHANGED
@@ -38,12 +38,12 @@
 },
 {
  "cell_type": "code",
- "execution_count":
+ "execution_count": null,
  "id": "e7b9c01d",
  "metadata": {},
  "outputs": [],
  "source": [
-  "config_path = \"Models/config.yml\"\n",
+  "config_path = \"Models/config.yaml\"\n",
   "models_path = \"Models/model.pth\""
  ]
 },
@@ -63,7 +63,7 @@
 },
 {
  "cell_type": "code",
- "execution_count":
+ "execution_count": null,
  "id": "78396f70",
  "metadata": {},
  "outputs": [
@@ -121,12 +121,12 @@
  " \"id_1\": {\n",
  " \"path\": \"./reference_audio/vn_3.wav\", #Ref audio path\n",
  " \"lang\": \"vi\", #Default language\n",
- " \"speed\": 1.
+ " \"speed\": 1.0, #Speaking speed\n",
  " },\n",
  " \"id_2\": {\n",
  " \"path\": \"./reference_audio/vn_4.wav\",\n",
  " \"lang\": \"vi\",\n",
- " \"speed\": 1.
+ " \"speed\": 1.0,\n",
  " },\n",
  "}\n",
  "for id in speakers:\n",
@@ -159,7 +159,7 @@
 },
 {
  "cell_type": "code",
- "execution_count":
+ "execution_count": null,
  "id": "16194211",
  "metadata": {},
  "outputs": [
@@ -192,7 +192,7 @@
  "avg_style = True #BOOL Split the ref audio and calculate the avg styles.\n",
  "stabilize = True #BOOL Stabilize speaking speed.\n",
  "denoise = 0.6 #FLOAT Adjust the strength of the denoiser. Value range is [0, 1]\n",
- "n_merge =
+ "n_merge = 18 #INT Avoid short sentences by merging when a sentence has fewer than n words"
  ]
 },
 {