Text-to-Speech
Safetensors
English
Chinese
File size: 3,566 Bytes
335b152
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
# Top-level hyper-parameters (keys kept in their original alphabetical order).
# Short leaf-level integer lists are written in flow style; the parsed values
# are identical to the block-style originals.

# Two integer triples. The second row (240, 1200, 160) equals hop_size,
# fft_size / win_size and audio_num_mel_bins from this file.
# NOTE(review): presumably (hop, fft/win, mel bins) per scale - confirm.
acous_params:
- [480, 1200, 80]
- [240, 1200, 160]
amp: false
audio_num_mel_bins: 160
audio_sample_rate: 24000
c_spk_enc: 512
char_dict_size: 15000
conv_use_pos: false
dec0_dilations: [1, 2, 4, 1, 2, 4, 1]
dec0_kernel_size: 3
dec_dilations: [1, 2, 1, 2, 1]
dec_ffn_kernel_size: 9
dec_inp_add_noise: false
dec_kernel_size: 5
dec_layers: 4
dec_post_net_kernel: 3
decoder_rnn_dim: 0
decoder_type: conv
dropout: 0.0
ds_add_pitch_embed: false
dur_alpha: 1.0
dur_context_enc: true
dur_log: true
dur_predictor_kernel: 3
dur_predictor_layers: 2
dur_use_char: true
dur_use_spk: true
enc_dec_norm: ln
enc_dilations: [1, 1, 1, 1]
enc_ffn_kernel_size: 5
enc_kernel_size: 5
enc_layers: 8
enc_post_net_kernel: 3
enc_pre_ln: true
enc_prenet: true
encoder_K: 8
encoder_type: rel_fft
f0_max: 600
f0_min: 60
ffn_act: gelu
ffn_hidden_size: 1024
fft_size: 1200
fg_spk_enc_hidden: 256
# 12000 is exactly audio_sample_rate / 2.
fmax: 12000
fmin: 0
frames_multiple: 8
hidden_size: 512
hop_size: 240
ignore_begin_end_sil: false
keep_c0_init: true
kl_min: 0
kl_start_steps: 1
lat_for_dur: false
latent_dim: 16
latent_size: 256
layers_in_block: 2
# NOTE(review): nine sizes are listed here but ling_labels below names only
# one label - confirm the two are not expected to have matching lengths.
ling_label_dict_size: [20, 4, 5, 2, 3, 3, 3, 6, 15]
ling_labels: [tone]
# Explicit empty string (not null).
load_ckpt: ''
loud_norm: false
mel_vmax: 0.5
mel_vmin: -6
min_frames: 50
# NOTE(review): this file also sets `precision: fp16`; confirm which of the
# two keys the consumer actually reads.
mixed_precision: bf16
no_text_enc: false
# Plain `none` is loaded as the string "none", not as a YAML null.
nsf_type: none
num_heads: 2
out_wav_norm: true
pad_frames: false

# Top-level hyper-parameters, continued (original key order preserved).
# Short leaf-level integer lists are written in flow style; the parsed values
# are identical to the block-style originals.

# NOTE(review): this file also sets `mixed_precision: bf16`; confirm which of
# the two keys the consumer actually reads.
precision: fp16
# NOTE(review): predict_pitch is false while use_pitch_pred (below) is true -
# confirm these are independent switches.
predict_pitch: false
# Quoted on purpose: the value is the string "1", not the integer 1.
resblock: '1'
resblock_dilation_sizes:
- [1, 3, 5]
- [1, 3, 5]
- [1, 3, 5]
resblock_kernel_sizes: [3, 7, 11]
train_spk_embed_only: false
upsample_initial_channel: 512
upsample_kernel_sizes: [12, 11, 8, 4]
# 6 * 5 * 4 * 2 = 240, the same value as hop_size in this file.
upsample_rates: [6, 5, 4, 2]
use_bert_input: false
use_cfg: true
use_char: true
use_cur_global: false
use_cur_global_dec: true
use_dur_embed: true
use_dur_mask_embed: true
use_ema: false
use_expand_ph: true
use_finegrained_spk: false
use_global_lat: false
use_gt_dur: false
use_gt_f0: false
use_mix_spk_embed: false
use_new_vae: false
use_ph_level_f0: false
use_ph_pos_embed: true
use_pitch_embed: false
use_pitch_embed_dec: false
use_pitch_pred: true
use_pos_embed: true
use_qk_norm: true
use_random_spk_embed: false
use_seq_cfg: true
use_spk_embed: false
use_spk_enc: true
use_spk_id: false
use_uv: true
use_vae: true
use_vpcfm: true
use_vqvae: true
use_word_encoder: true
use_word_input: false
vae_dur_grad: 0.1
vae_enc_hidden_size: 384
vae_stride: 4
vae_word_conder_layers: 0
vq_stride: 8
win_size: 1200
word_dict_size: 10000
# Nested settings grouped under a single `melgan_config` mapping.
# Flags in this mapping mix 0/1 integers with true/false booleans; the types
# are kept exactly as found because the consumer may depend on them.
melgan_config:
  all_noise: false
  backbone_resampling: librosa_kaiser_best
  batch_size: 8
  cond_disc: false
  dim_pitch_condition: 1
  downsamp_factor: 4
  epochs: 1000
  frame_shift: 240
  lambda_feat: 0.0
  lambda_log_pitch: 0.4
  lambda_voiced: 1.0
  load_D: 1
  log_interval: 100
  loss_pitch: 1.0
  loss_speaker: 1.0
  loss_stft: 0.0
  lr: 0.0005
  mode_pitch_condition: singgan_torch
  multi_resolution: 0
  n_layers_D: 4
  n_mel_channels: 160
  n_residual_layers: 4
  n_test_samples: 5
  ndf: 16
  noise_index: 1.0
  nr: 0
  num_D: 3
  num_band: 1
  num_workers: 0
  offset: 0
  pretrain_steps: 0
  res_layers: 1
  run_hdfs: 0
  sampling_rate: 24000
  save_interval: 5000
  seq_len: 100
  single_stft: 0
  sub_dis: 1
  tf: 1
  tf_end_ratio: 0.0
  tf_end_step: 0
  tf_start_ratio: 0.0
  tf_start_step: 0
  # 5 * 4 * 4 * 3 = 240, the same value as frame_shift above.
  up_sample: [5, 4, 4, 3]
  use_F_dis: 0
  use_aug_pitch: 0
  use_interpolate: 0
  use_lsgan: 1
  use_mel_loss: 1
  use_melnorm: 0
  use_msg_gan: 0
  # NOTE(review): use_pitch_condition is false while use_pitch_prediction is
  # 1 - confirm both are intended.
  use_pitch_condition: false
  use_pitch_prediction: 1
  use_sbd: 0
  use_speaker_prediction: 0
  use_tanh: true
  use_time_loss: 1