# Fine-tuning run configuration.
#
# NOTE(review): the nesting below was restored from a copy whose indentation
# had been flattened. Top-level section membership follows the key grouping;
# the placement of `spect_params`, `ASR_params`, `JDC_params` and `decoder`
# is called out where it occurs -- confirm against the code that loads this
# file.

# Run-level settings: output directory, logging cadence, device, schedule and
# the checkpoint to start from.
log_dir: ./Models/Finetune
save_freq: 1
log_interval: 10
device: cuda
epochs: 50
batch_size: 2
max_len: 310
pretrained_model: ./Models/Finetune/base_model.pth
load_only_params: false
debug: true

# Dataset locations; all three are relative paths.
data_params:
  train_data: "../../Data_Speech/viVoice/train.txt"
  val_data: "../../Data_Speech/combine/combine_val.txt"
  root_path: "../../Data_Speech/"

# Symbol inventory, split by character class.
symbol:
  pad: "$"
  # Single-quoted because the value contains a double quote; the trailing
  # space before the closing quote is part of the value.
  punctuation: ';:,.!?¡¿—…"«»“” '
  letters: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
  letters_ipa: "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
  extend: "∫̆ăη͡123456"

# Audio preprocessing settings.
preprocess_params:
  sr: 24000
  # NOTE(review): assumed to be a child of preprocess_params -- confirm.
  spect_params:
    n_fft: 2048
    win_length: 1200
    hop_length: 300

training_strats:
  # Each list holds one empty string; this is not the same value as an empty
  # list `[]`. Presumably it means "no module selected" -- confirm against the
  # training script before changing it.
  freeze_modules: ['']
  ignore_modules: ['']

model_params:
  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80
  max_dur: 50
  style_dim: 128

  dropout: 0.2

  # NOTE(review): ASR_params, JDC_params and decoder are assumed to be
  # children of model_params. ASR_params must at least be its own mapping,
  # because it defines a second `hidden_dim` with a different value.
  ASR_params:
    input_dim: 80
    hidden_dim: 256
    n_layers: 6
    token_embedding_dim: 512

  JDC_params:
    num_class: 1
    seq_len: 192

  decoder:
    type: 'hifigan'
    resblock_kernel_sizes: [3, 7, 11]
    # The product of these rates is 300, the same number as
    # preprocess_params.spect_params.hop_length. Presumably the two must be
    # kept in sync -- confirm before editing either.
    upsample_rates: [10, 5, 3, 2]
    upsample_initial_channel: 512
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    upsample_kernel_sizes: [20, 10, 6, 4]

# Presumably per-term loss weights (all keys are `lambda_*`) -- confirm.
# Values are floats, written with an explicit fractional digit.
loss_params:
  lambda_mel: 5.0
  lambda_gen: 1.0

  lambda_mono: 1.0
  lambda_s2s: 1.0

  lambda_F0: 1.0
  lambda_norm: 1.0
  lambda_dur: 1.0
  lambda_ce: 20.0

# Two learning rates; ft_lr is ten times smaller than lr.
optimizer_params:
  lr: 0.0001
  ft_lr: 0.00001