log_dir: ./Models/Finetune
save_freq: 1
log_interval: 10
device: cuda
epochs: 50
batch_size: 2
max_len: 310 # maximum number of frames
pretrained_model: ./Models/Finetune/base_model.pth
load_only_params: false # set to true if you do not want to load epoch numbers and optimizer parameters
debug: true

data_params:
  train_data: "../../Data_Speech/viVoice/train.txt"
  val_data: "../../Data_Speech/combine/combine_val.txt"
  root_path: "../../Data_Speech/"

symbol: # 189 symbols in total
  pad: "$"
  punctuation: ';:,.!?¡¿—…"«»“” '
  letters: "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz"
  letters_ipa: "ɑɐɒæɓʙβɔɕçɗɖðʤəɘɚɛɜɝɞɟʄɡɠɢʛɦɧħɥʜɨɪʝɭɬɫɮʟɱɯɰŋɳɲɴøɵɸθœɶʘɹɺɾɻʀʁɽʂʃʈʧʉʊʋⱱʌɣɤʍχʎʏʑʐʒʔʡʕʢǀǁǂǃˈˌːˑʼʴʰʱʲʷˠˤ˞↓↑→↗↘'̩'ᵻ"
  extend: "∫̆ăη͡123456" # ADD MORE SYMBOLS HERE

preprocess_params:
  sr: 24000
  spect_params:
    n_fft: 2048
    win_length: 1200
    hop_length: 300

training_strats: # all modules: 'decoder', 'predictor', 'text_encoder', 'style_encoder', 'text_aligner', 'pitch_extractor', 'mpd', 'msd'
  freeze_modules: [''] # listed modules are not updated during training
  ignore_modules: [''] # listed modules are not loaded from the checkpoint => trained from scratch. IMPORTANT: 'text_aligner' and 'pitch_extractor' are pretrained utility models; do NOT ignore them

model_params:
  dim_in: 64
  hidden_dim: 512
  max_conv_dim: 512
  n_layer: 3
  n_mels: 80
  max_dur: 50 # maximum duration of a single phoneme
  style_dim: 128 # style vector size
  dropout: 0.2

  ASR_params:
    input_dim: 80
    hidden_dim: 256
    n_layers: 6
    token_embedding_dim: 512

  JDC_params:
    num_class: 1
    seq_len: 192

  # config for decoder
  decoder:
    type: 'hifigan' # either hifigan or istftnet
    resblock_kernel_sizes: [3, 7, 11]
    upsample_rates: [10, 5, 3, 2]
    upsample_initial_channel: 512
    resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
    upsample_kernel_sizes: [20, 10, 6, 4]

loss_params:
  lambda_mel: 5.   # mel reconstruction loss
  lambda_gen: 1.   # generator loss
  lambda_mono: 1.  # monotonic alignment loss (TMA)
  lambda_s2s: 1.   # sequence-to-sequence loss (TMA)
  lambda_F0: 1.    # F0 reconstruction loss
  lambda_norm: 1.  # norm reconstruction loss
  lambda_dur: 1.   # duration loss
  lambda_ce: 20.   # duration predictor probability output CE loss

optimizer_params:
  lr: 0.0001     # general learning rate
  ft_lr: 0.00001 # learning rate for acoustic modules
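
# ---------------------------------------------------------------------------
# Usage sketch (not part of the original config): a minimal, hedged example of
# how a training script typically consumes a file like this one. The file name
# 'config_ft.yml' and the specific key accesses are assumptions for
# illustration, not the repo's confirmed entry point.
#
#   import yaml
#
#   with open("config_ft.yml", "r", encoding="utf-8") as f:
#       config = yaml.safe_load(f)  # parse this file into a nested dict
#
#   batch_size = config["batch_size"]                 # -> 2
#   sr = config["preprocess_params"]["sr"]            # -> 24000
#   style_dim = config["model_params"]["style_dim"]   # -> 128
# ---------------------------------------------------------------------------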