log_dir: "Models/Finetune_Extend" save_freq: 1 log_interval: 5 device: "cuda" epochs: 50 batch_size: 3 max_len: 210 # maximum number of frames pretrained_model: "Models/Finetune_Extend/current_model.pth" load_only_params: false # set to true if do not want to load epoch numbers and optimizer parameters data_params: train_data: "../../Data_Speech/viVoice/train.txt" val_data: "../../Data_Speech/combine/combine_val.txt" root_path: "../../Data_Speech/" min_length: 50 # sample until texts with this size are obtained for OOD texts preprocess_params: sr: 24000 spect_params: n_fft: 2048 win_length: 1200 hop_length: 300 model_params: dim_in: 64 hidden_dim: 512 max_conv_dim: 512 n_layer: 3 n_mels: 80 n_token: 189 # number of phoneme tokens max_dur: 50 # maximum duration of a single phoneme style_dim: 128 # style vector size dropout: 0.2 ASR_params: input_dim: 80 hidden_dim: 256 n_token: 189 # number of phoneme tokens n_layers: 6 token_embedding_dim: 512 JDC_params: num_class: 1 seq_len: 192 # config for decoder decoder: type: 'hifigan' # either hifigan or istftnet resblock_kernel_sizes: [3,7,11] upsample_rates : [10,5,3,2] upsample_initial_channel: 512 resblock_dilation_sizes: [[1,3,5], [1,3,5], [1,3,5]] upsample_kernel_sizes: [20,10,6,4] loss_params: lambda_mel: 5. # mel reconstruction loss lambda_gen: 1. # generator loss lambda_mono: 1. # monotonic alignment loss (TMA) lambda_s2s: 1. # sequence-to-sequence loss (TMA) lambda_F0: 1. # F0 reconstruction loss lambda_norm: 1. # norm reconstruction loss lambda_dur: 1. # duration loss lambda_ce: 20. # duration predictor probability output CE loss optimizer_params: lr: 0.0001 # general learning rate ft_lr: 0.00001 # learning rate for acoustic modules