# File size: 3,566 Bytes
# 335b152 (file-viewer metadata; presumably the revision this copy was taken from)
# (line-number gutter 1-232 emitted by the file viewer removed; it is not part of the config)
# Two rows of three integers each. With the inner indentation lost, the
# block form parsed as [[480], 1200, 80, [240], 1200, 160]; flow-style rows
# keep the nesting unambiguous even if whitespace is mangled again.
# NOTE(review): the second row (240, 1200, 160) equals hop_size,
# fft_size/win_size and audio_num_mel_bins in this file, so each row is
# presumably (hop, window/FFT size, mel bins) - confirm against the consumer.
acous_params:
- [480, 1200, 80]
- [240, 1200, 160]
# Flat model/audio hyper-parameters, kept in their original (alphabetical)
# key order. Short integer lists are written in flow style; values are
# unchanged.
amp: false
audio_num_mel_bins: 160
audio_sample_rate: 24000
c_spk_enc: 512
char_dict_size: 15000
conv_use_pos: false
dec0_dilations: [1, 2, 4, 1, 2, 4, 1]
dec0_kernel_size: 3
dec_dilations: [1, 2, 1, 2, 1]
dec_ffn_kernel_size: 9
dec_inp_add_noise: false
dec_kernel_size: 5
dec_layers: 4
dec_post_net_kernel: 3
decoder_rnn_dim: 0
decoder_type: conv
dropout: 0.0
ds_add_pitch_embed: false
dur_alpha: 1.0
dur_context_enc: true
dur_log: true
dur_predictor_kernel: 3
dur_predictor_layers: 2
dur_use_char: true
dur_use_spk: true
enc_dec_norm: ln
enc_dilations: [1, 1, 1, 1]
enc_ffn_kernel_size: 5
enc_kernel_size: 5
enc_layers: 8
enc_post_net_kernel: 3
enc_pre_ln: true
enc_prenet: true
encoder_K: 8
encoder_type: rel_fft
f0_max: 600
f0_min: 60
ffn_act: gelu
ffn_hidden_size: 1024
fft_size: 1200
fg_spk_enc_hidden: 256
fmax: 12000
fmin: 0
frames_multiple: 8
hidden_size: 512
hop_size: 240
ignore_begin_end_sil: false
keep_c0_init: true
kl_min: 0
kl_start_steps: 1
lat_for_dur: false
latent_dim: 16
latent_size: 256
layers_in_block: 2
# NOTE(review): nine sizes are listed here but ling_labels below names only
# one label - confirm how the consumer pairs the two lists.
ling_label_dict_size: [20, 4, 5, 2, 3, 3, 3, 6, 15]
ling_labels: [tone]
load_ckpt: ''
loud_norm: false
mel_vmax: 0.5
mel_vmin: -6
min_frames: 50
# NOTE(review): mixed_precision (bf16) and precision (fp16, further down)
# name different dtypes - confirm which key the consumer actually reads.
mixed_precision: bf16
no_text_enc: false
# Plain `none` is the string "none" in YAML, not null.
nsf_type: none
num_heads: 2
out_wav_norm: true
pad_frames: false
precision: fp16
predict_pitch: false
# Quoted on purpose: the value is the string '1', not the integer 1.
resblock: '1'
# Three rows of dilations. With the inner indentation lost, the block form
# parsed as [[1], 3, 5, [1], 3, 5, [1], 3, 5]; flow-style rows restore the
# list-of-lists shape and survive whitespace stripping.
# NOTE(review): three rows here and three entries in resblock_kernel_sizes -
# presumably one dilation row per kernel size; confirm against the consumer.
resblock_dilation_sizes:
- [1, 3, 5]
- [1, 3, 5]
- [1, 3, 5]
# Remaining flat hyper-parameters, kept in their original (alphabetical)
# key order. Short integer lists are written in flow style; values are
# unchanged.
resblock_kernel_sizes: [3, 7, 11]
train_spk_embed_only: false
upsample_initial_channel: 512
upsample_kernel_sizes: [12, 11, 8, 4]
# 6 * 5 * 4 * 2 = 240, the same value as hop_size earlier in this file.
upsample_rates: [6, 5, 4, 2]
use_bert_input: false
use_cfg: true
use_char: true
use_cur_global: false
use_cur_global_dec: true
use_dur_embed: true
use_dur_mask_embed: true
use_ema: false
use_expand_ph: true
use_finegrained_spk: false
use_global_lat: false
use_gt_dur: false
use_gt_f0: false
use_mix_spk_embed: false
use_new_vae: false
use_ph_level_f0: false
use_ph_pos_embed: true
use_pitch_embed: false
use_pitch_embed_dec: false
use_pitch_pred: true
use_pos_embed: true
use_qk_norm: true
use_random_spk_embed: false
use_seq_cfg: true
use_spk_embed: false
use_spk_enc: true
use_spk_id: false
use_uv: true
use_vae: true
use_vpcfm: true
use_vqvae: true
use_word_encoder: true
use_word_input: false
vae_dur_grad: 0.1
vae_enc_hidden_size: 384
vae_stride: 4
vae_word_conder_layers: 0
vq_stride: 8
win_size: 1200
word_dict_size: 10000
# Settings grouped under melgan_config. The keys are re-nested here: in the
# flattened text `melgan_config:` was a bare null key and every setting below
# it landed at top level.
# NOTE(review): nesting is inferred from the alphabetical key order restarting
# after `melgan_config:` - confirm the consumer reads these keys through
# melgan_config rather than from the top level.
# Flags in this section mix 0/1 integers with true/false booleans; the types
# are left exactly as found because the consumer's expectations are not
# visible from this file.
melgan_config:
  all_noise: false
  backbone_resampling: librosa_kaiser_best
  batch_size: 8
  cond_disc: false
  dim_pitch_condition: 1
  downsamp_factor: 4
  epochs: 1000
  frame_shift: 240
  lambda_feat: 0.0
  lambda_log_pitch: 0.4
  lambda_voiced: 1.0
  load_D: 1
  log_interval: 100
  loss_pitch: 1.0
  loss_speaker: 1.0
  loss_stft: 0.0
  lr: 0.0005
  mode_pitch_condition: singgan_torch
  multi_resolution: 0
  n_layers_D: 4
  n_mel_channels: 160
  n_residual_layers: 4
  n_test_samples: 5
  ndf: 16
  noise_index: 1.0
  nr: 0
  num_D: 3
  num_band: 1
  num_workers: 0
  offset: 0
  pretrain_steps: 0
  res_layers: 1
  run_hdfs: 0
  sampling_rate: 24000
  save_interval: 5000
  seq_len: 100
  single_stft: 0
  sub_dis: 1
  tf: 1
  tf_end_ratio: 0.0
  tf_end_step: 0
  tf_start_ratio: 0.0
  tf_start_step: 0
  # 5 * 4 * 4 * 3 = 240, the same value as frame_shift above.
  up_sample: [5, 4, 4, 3]
  use_F_dis: 0
  use_aug_pitch: 0
  use_interpolate: 0
  use_lsgan: 1
  use_mel_loss: 1
  use_melnorm: 0
  use_msg_gan: 0
  use_pitch_condition: false
  use_pitch_prediction: 1
  use_sbd: 0
  use_speaker_prediction: 0
  use_tanh: true
  # A trailing ` |` (file-viewer residue) was removed; it made this value the
  # string "1 |" instead of the integer 1.
  use_time_loss: 1