DBDXSS committed on
Commit 4691746 · 1 Parent(s): 1c5a443
Files changed (3)
  1. cosyvoice.yaml +140 -0
  2. cosyvoice2.yaml +239 -0
  3. cosyvoice2_end2end.yaml +236 -0
cosyvoice.yaml ADDED
@@ -0,0 +1,140 @@
+ # set random seed, so that you can reproduce your results.
+ __set_seed1: !apply:random.seed [1986]
+ __set_seed2: !apply:numpy.random.seed [1986]
+ __set_seed3: !apply:torch.manual_seed [1986]
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
+
+ # fixed params
+ sample_rate: 24000
+ llm_input_size: 896
+ llm_output_size: 896
+ spk_embed_dim: 192
+ qwen_pretrain_path: ''
+
+ # model params
+ # for all classes/functions included in this repo, we use !new: or !name: for initialization, so that users can find every corresponding class/function from this single yaml.
+ # for system/third_party classes/functions, we do not require this.
+ llm: !new:cosyvoice.llm.llm.Qwen2LM
+     llm_input_size: !ref <llm_input_size>
+     llm_output_size: !ref <llm_output_size>
+     speech_token_size: 6561
+     length_normalized_loss: True
+     lsm_weight: 0
+     llm: !new:cosyvoice.llm.llm.Qwen2Encoder
+         pretrain_path: !ref <qwen_pretrain_path>
+     sampling: !name:cosyvoice.utils.common.ras_sampling
+         top_p: 0.8
+         top_k: 25
+         win_size: 10
+         tau_r: 0.1
+
+ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
+     input_size: 512
+     output_size: 80
+     spk_embed_dim: !ref <spk_embed_dim>
+     output_type: 'mel'
+     vocab_size: 6561
+     input_frame_rate: 25
+     only_mask_loss: True
+     token_mel_ratio: 2
+     pre_lookahead_len: 3
+     encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
+         output_size: 512
+         attention_heads: 8
+         linear_units: 2048
+         num_blocks: 6
+         dropout_rate: 0.1
+         positional_dropout_rate: 0.1
+         attention_dropout_rate: 0.1
+         normalize_before: True
+         input_layer: 'linear'
+         pos_enc_layer_type: 'rel_pos_espnet'
+         selfattention_layer_type: 'rel_selfattn'
+         input_size: 512
+         use_cnn_module: False
+         macaron_style: False
+     decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
+         in_channels: 240
+         n_spks: 1
+         spk_emb_dim: 80
+         cfm_params: !new:omegaconf.DictConfig
+             content:
+                 sigma_min: 1e-06
+                 solver: 'euler'
+                 t_scheduler: 'cosine'
+                 training_cfg_rate: 0.2
+                 inference_cfg_rate: 0.7
+                 reg_loss_type: 'l1'
+         estimator: !new:cosyvoice.flow.decoder.ConditionalDecoder
+             in_channels: 320
+             out_channels: 80
+             causal: True
+             channels: [256]
+             dropout: 0.0
+             attention_head_dim: 64
+             n_blocks: 4
+             num_mid_blocks: 12
+             num_heads: 8
+             act_fn: 'gelu'
+
+ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
+     in_channels: 80
+     base_channels: 512
+     nb_harmonics: 8
+     sampling_rate: !ref <sample_rate>
+     nsf_alpha: 0.1
+     nsf_sigma: 0.003
+     nsf_voiced_threshold: 10
+     upsample_rates: [8, 5, 3]
+     upsample_kernel_sizes: [16, 11, 7]
+     istft_params:
+         n_fft: 16
+         hop_len: 4
+     resblock_kernel_sizes: [3, 7, 11]
+     resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+     source_resblock_kernel_sizes: [7, 7, 11]
+     source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+     lrelu_slope: 0.1
+     audio_limit: 0.99
+     f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
+         num_class: 1
+         in_channels: 80
+         cond_channels: 512
+
+ # processor functions
+ parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
+ get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
+     token_path: !ref <qwen_pretrain_path>
+     skip_special_tokens: True
+ allowed_special: 'all'
+ tokenize: !name:cosyvoice.dataset.processor.tokenize
+     get_tokenizer: !ref <get_tokenizer>
+     allowed_special: !ref <allowed_special>
+ filter: !name:cosyvoice.dataset.processor.filter
+     max_length: 40960
+     min_length: 0
+     token_max_length: 200
+     token_min_length: 1
+ resample: !name:cosyvoice.dataset.processor.resample
+     resample_rate: !ref <sample_rate>
+ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+     n_fft: 1920
+     num_mels: 80
+     sampling_rate: !ref <sample_rate>
+     hop_size: 480
+     win_size: 1920
+     fmin: 0
+     fmax: 8000
+     center: False
+ compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
+     feat_extractor: !ref <feat_extractor>
+ parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
+     normalize: True
+ shuffle: !name:cosyvoice.dataset.processor.shuffle
+     shuffle_size: 1000
+ sort: !name:cosyvoice.dataset.processor.sort
+     sort_size: 500 # sort_size should be less than shuffle_size
+ batch: !name:cosyvoice.dataset.processor.batch
+     batch_type: 'dynamic'
+     max_frames_in_batch: 2000
+ padding: !name:cosyvoice.dataset.processor.padding
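
The file above is a HyperPyYAML config: !new: instantiates the named class, !name: binds keyword arguments into a callable (a functools.partial), and !ref <key> points back at another top-level key. As a minimal sketch that is not part of this commit, and assuming the CosyVoice repo and its Matcha-TTS dependency are importable and that qwen_pretrain_path is overridden to a local Qwen2 checkpoint (the path below is only a placeholder), the config could be resolved into live modules like this:

    # Sketch only: resolve cosyvoice.yaml into model objects with HyperPyYAML.
    # '/path/to/qwen/checkpoint' is a placeholder, not something this commit defines.
    from hyperpyyaml import load_hyperpyyaml

    with open('cosyvoice.yaml', 'r') as f:
        configs = load_hyperpyyaml(f, overrides={'qwen_pretrain_path': '/path/to/qwen/checkpoint'})

    # Top-level keys become the instantiated modules declared in the yaml.
    llm, flow, hift = configs['llm'], configs['flow'], configs['hift']
    print(type(llm).__name__, type(flow).__name__, type(hift).__name__)
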
cosyvoice2.yaml ADDED
@@ -0,0 +1,239 @@
+ # set random seed, so that you can reproduce your results.
+ __set_seed1: !apply:random.seed [1986]
+ __set_seed2: !apply:numpy.random.seed [1986]
+ __set_seed3: !apply:torch.manual_seed [1986]
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
+
+ # fixed params
+ sample_rate: 24000
+ llm_input_size: 896
+ llm_output_size: 896
+ spk_embed_dim: 192
+ qwen_pretrain_path: ''
+ token_frame_rate: 25
+ token_mel_ratio: 2
+ cpm_pretrain_path: ''
+ # cpm_pretrain_path: '/mnt/afs/zhoufangru/agent/end2end/pretrained_models/MiniCPM-o-2_6'
+
+ # stream related params
+ chunk_size: 25 # streaming inference chunk size, in tokens
+ num_decoding_left_chunks: 1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
+
+ # model params
+ # for all classes/functions included in this repo, we use !new: or !name: for initialization, so that users can find every corresponding class/function from this single yaml.
+ # for system/third_party classes/functions, we do not require this.
+ llm: !new:cosyvoice.llm.llm.Qwen2LM
+     llm_input_size: !ref <llm_input_size>
+     llm_output_size: !ref <llm_output_size>
+     speech_token_size: 6561
+     length_normalized_loss: True
+     lsm_weight: 0
+     mix_ratio: [5, 15]
+     chat_path: !ref <cpm_pretrain_path>
+     llm: !new:cosyvoice.llm.llm.Qwen2Encoder
+         pretrain_path: !ref <qwen_pretrain_path>
+     sampling: !name:cosyvoice.utils.common.ras_sampling
+         top_p: 0.8
+         top_k: 25
+         win_size: 10
+         tau_r: 0.1
+
+ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
+     input_size: 512
+     output_size: 80
+     spk_embed_dim: !ref <spk_embed_dim>
+     output_type: 'mel'
+     vocab_size: 6561
+     input_frame_rate: !ref <token_frame_rate>
+     only_mask_loss: True
+     token_mel_ratio: !ref <token_mel_ratio>
+     pre_lookahead_len: 3
+     encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
+         output_size: 512
+         attention_heads: 8
+         linear_units: 2048
+         num_blocks: 6
+         dropout_rate: 0.1
+         positional_dropout_rate: 0.1
+         attention_dropout_rate: 0.1
+         normalize_before: True
+         input_layer: 'linear'
+         pos_enc_layer_type: 'rel_pos_espnet'
+         selfattention_layer_type: 'rel_selfattn'
+         input_size: 512
+         use_cnn_module: False
+         macaron_style: False
+         static_chunk_size: !ref <chunk_size>
+     decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
+         in_channels: 240
+         n_spks: 1
+         spk_emb_dim: 80
+         cfm_params: !new:omegaconf.DictConfig
+             content:
+                 sigma_min: 1e-06
+                 solver: 'euler'
+                 t_scheduler: 'cosine'
+                 training_cfg_rate: 0.2
+                 inference_cfg_rate: 0.7
+                 reg_loss_type: 'l1'
+         estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
+             in_channels: 320
+             out_channels: 80
+             channels: [256]
+             dropout: 0.0
+             attention_head_dim: 64
+             n_blocks: 4
+             num_mid_blocks: 12
+             num_heads: 8
+             act_fn: 'gelu'
+             static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
+             num_decoding_left_chunks: !ref <num_decoding_left_chunks>
+
+ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
+     in_channels: 80
+     base_channels: 512
+     nb_harmonics: 8
+     sampling_rate: !ref <sample_rate>
+     nsf_alpha: 0.1
+     nsf_sigma: 0.003
+     nsf_voiced_threshold: 10
+     upsample_rates: [8, 5, 3]
+     upsample_kernel_sizes: [16, 11, 7]
+     istft_params:
+         n_fft: 16
+         hop_len: 4
+     resblock_kernel_sizes: [3, 7, 11]
+     resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+     source_resblock_kernel_sizes: [7, 7, 11]
+     source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+     lrelu_slope: 0.1
+     audio_limit: 0.99
+     f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
+         num_class: 1
+         in_channels: 80
+         cond_channels: 512
+
+ # gan related module
+ mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
+     n_fft: 1920
+     num_mels: 80
+     sampling_rate: !ref <sample_rate>
+     hop_size: 480
+     win_size: 1920
+     fmin: 0
+     fmax: null
+     center: False
+ hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
+     generator: !ref <hift>
+     discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
+         mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
+         mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
+     mel_spec_transform: [
+         !ref <mel_spec_transform1>
+     ]
+
+ # processor functions
+ parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
+ get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
+     token_path: !ref <qwen_pretrain_path>
+     skip_special_tokens: True
+ allowed_special: 'all'
+ tokenize: !name:cosyvoice.dataset.processor.tokenize
+     get_tokenizer: !ref <get_tokenizer>
+     allowed_special: !ref <allowed_special>
+ tokenize_llm: !name:cosyvoice.dataset.processor.tokenize_llm
+     tokenizer_path: !ref <cpm_pretrain_path>
+ filter: !name:cosyvoice.dataset.processor.filter
+     max_length: 40960
+     min_length: 100
+     token_max_length: 200
+     token_min_length: 1
+ resample: !name:cosyvoice.dataset.processor.resample
+     resample_rate: !ref <sample_rate>
+ truncate: !name:cosyvoice.dataset.processor.truncate
+     truncate_length: 24480 # must be a multiple of hop_size
+ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+     n_fft: 1920
+     num_mels: 80
+     sampling_rate: !ref <sample_rate>
+     hop_size: 480
+     win_size: 1920
+     fmin: 0
+     fmax: 8000
+     center: False
+ compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
+     feat_extractor: !ref <feat_extractor>
+ compute_f0: !name:cosyvoice.dataset.processor.compute_f0
+     sample_rate: !ref <sample_rate>
+     hop_size: 480
+ parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
+     normalize: True
+ shuffle: !name:cosyvoice.dataset.processor.shuffle
+     shuffle_size: 1000
+ sort: !name:cosyvoice.dataset.processor.sort
+     sort_size: 500 # sort_size should be less than shuffle_size
+ batch: !name:cosyvoice.dataset.processor.batch
+     batch_type: 'dynamic'
+     max_frames_in_batch: 2000
+ padding: !name:cosyvoice.dataset.processor.padding
+     use_spk_embedding: False # change to True during sft
+
+
+ # dataset processor pipeline
+ data_pipeline: [
+     !ref <parquet_opener>,
+     # !ref <tokenize>,
+     !ref <tokenize_llm>,
+     !ref <filter>,
+     !ref <resample>,
+     !ref <compute_fbank>,
+     !ref <parse_embedding>,
+     !ref <shuffle>,
+     !ref <sort>,
+     !ref <batch>,
+     !ref <padding>,
+ ]
+ data_pipeline_gan: [
+     !ref <parquet_opener>,
+     !ref <tokenize>,
+     !ref <filter>,
+     !ref <resample>,
+     !ref <truncate>,
+     !ref <compute_fbank>,
+     !ref <compute_f0>,
+     !ref <parse_embedding>,
+     !ref <shuffle>,
+     !ref <sort>,
+     !ref <batch>,
+     !ref <padding>,
+ ]
+
+ # llm flow train conf
+ train_conf:
+     optim: adam
+     optim_conf:
+         lr: 1e-4 # change to 1e-5 during sft
+     scheduler: constantlr # change to constantlr during sft
+     scheduler_conf:
+         warmup_steps: 2500
+     max_epoch: 200
+     grad_clip: 5
+     accum_grad: 2
+     log_interval: 100
+     save_per_step: -1
+
+ # gan train conf
+ train_conf_gan:
+     optim: adam
+     optim_conf:
+         lr: 0.0002 # use small lr for gan training
+     scheduler: constantlr
+     optim_d: adam
+     optim_conf_d:
+         lr: 0.0002 # use small lr for gan training
+     scheduler_d: constantlr
+     max_epoch: 200
+     grad_clip: 5
+     accum_grad: 1 # in gan training, accum_grad must be 1
+     log_interval: 100
+     save_per_step: -1
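
A quick sanity check of the streaming parameters above, using only values defined in this config (the arithmetic script itself is illustrative, not part of the commit): a chunk of 25 speech tokens at token_frame_rate 25 spans one second, token_mel_ratio 2 turns it into 50 mel frames, and with hop_size 480 at 24 kHz that is exactly 24000 samples, i.e. one second of audio per decoding chunk. The 50 is also what the !ref <chunk_size> * <token_mel_ratio> expression passed to the flow estimator's static_chunk_size evaluates to.

    # Illustrative arithmetic only, using values from cosyvoice2.yaml.
    sample_rate = 24000
    token_frame_rate = 25    # speech tokens per second
    token_mel_ratio = 2      # mel frames per speech token
    chunk_size = 25          # streaming chunk size, in tokens
    hop_size = 480           # mel hop length, in samples

    mel_frames_per_chunk = chunk_size * token_mel_ratio      # 50 == estimator static_chunk_size
    samples_per_chunk = mel_frames_per_chunk * hop_size      # 24000 samples
    seconds_per_chunk = samples_per_chunk / sample_rate      # 1.0 second of audio per chunk
    print(mel_frames_per_chunk, samples_per_chunk, seconds_per_chunk)
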
cosyvoice2_end2end.yaml ADDED
@@ -0,0 +1,236 @@
+ # set random seed, so that you can reproduce your results.
+ __set_seed1: !apply:random.seed [1986]
+ __set_seed2: !apply:numpy.random.seed [1986]
+ __set_seed3: !apply:torch.manual_seed [1986]
+ __set_seed4: !apply:torch.cuda.manual_seed_all [1986]
+
+ # fixed params
+ sample_rate: 24000
+ llm_input_size: 896
+ llm_output_size: 896
+ spk_embed_dim: 192
+ qwen_pretrain_path: ''
+ token_frame_rate: 25
+ token_mel_ratio: 2
+ chat_pretrain_path: ''
+
+ # stream related params
+ chunk_size: 25 # streaming inference chunk size, in tokens
+ num_decoding_left_chunks: 1 # streaming inference flow decoder left chunk size, <0 means use all left chunks
+
+ # model params
+ # for all classes/functions included in this repo, we use !new: or !name: for initialization, so that users can find every corresponding class/function from this single yaml.
+ # for system/third_party classes/functions, we do not require this.
+ llm: !new:cosyvoice.llm.llm.Qwen2LM
+     llm_input_size: !ref <llm_input_size>
+     llm_output_size: !ref <llm_output_size>
+     speech_token_size: 6561
+     length_normalized_loss: True
+     lsm_weight: 0
+     mix_ratio: [5, 15]
+     chat: !new:cosyvoice.llm.llm.Qwen2Chat
+         pretrain_path: !ref <chat_pretrain_path>
+     llm: !new:cosyvoice.llm.llm.Qwen2Encoder
+         pretrain_path: !ref <qwen_pretrain_path>
+     sampling: !name:cosyvoice.utils.common.ras_sampling
+         top_p: 0.8
+         top_k: 25
+         win_size: 10
+         tau_r: 0.1
+
+ flow: !new:cosyvoice.flow.flow.CausalMaskedDiffWithXvec
+     input_size: 512
+     output_size: 80
+     spk_embed_dim: !ref <spk_embed_dim>
+     output_type: 'mel'
+     vocab_size: 6561
+     input_frame_rate: !ref <token_frame_rate>
+     only_mask_loss: True
+     token_mel_ratio: !ref <token_mel_ratio>
+     pre_lookahead_len: 3
+     encoder: !new:cosyvoice.transformer.upsample_encoder.UpsampleConformerEncoder
+         output_size: 512
+         attention_heads: 8
+         linear_units: 2048
+         num_blocks: 6
+         dropout_rate: 0.1
+         positional_dropout_rate: 0.1
+         attention_dropout_rate: 0.1
+         normalize_before: True
+         input_layer: 'linear'
+         pos_enc_layer_type: 'rel_pos_espnet'
+         selfattention_layer_type: 'rel_selfattn'
+         input_size: 512
+         use_cnn_module: False
+         macaron_style: False
+         static_chunk_size: !ref <chunk_size>
+     decoder: !new:cosyvoice.flow.flow_matching.CausalConditionalCFM
+         in_channels: 240
+         n_spks: 1
+         spk_emb_dim: 80
+         cfm_params: !new:omegaconf.DictConfig
+             content:
+                 sigma_min: 1e-06
+                 solver: 'euler'
+                 t_scheduler: 'cosine'
+                 training_cfg_rate: 0.2
+                 inference_cfg_rate: 0.7
+                 reg_loss_type: 'l1'
+         estimator: !new:cosyvoice.flow.decoder.CausalConditionalDecoder
+             in_channels: 320
+             out_channels: 80
+             channels: [256]
+             dropout: 0.0
+             attention_head_dim: 64
+             n_blocks: 4
+             num_mid_blocks: 12
+             num_heads: 8
+             act_fn: 'gelu'
+             static_chunk_size: !ref <chunk_size> * <token_mel_ratio>
+             num_decoding_left_chunks: !ref <num_decoding_left_chunks>
+
+ hift: !new:cosyvoice.hifigan.generator.HiFTGenerator
+     in_channels: 80
+     base_channels: 512
+     nb_harmonics: 8
+     sampling_rate: !ref <sample_rate>
+     nsf_alpha: 0.1
+     nsf_sigma: 0.003
+     nsf_voiced_threshold: 10
+     upsample_rates: [8, 5, 3]
+     upsample_kernel_sizes: [16, 11, 7]
+     istft_params:
+         n_fft: 16
+         hop_len: 4
+     resblock_kernel_sizes: [3, 7, 11]
+     resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+     source_resblock_kernel_sizes: [7, 7, 11]
+     source_resblock_dilation_sizes: [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
+     lrelu_slope: 0.1
+     audio_limit: 0.99
+     f0_predictor: !new:cosyvoice.hifigan.f0_predictor.ConvRNNF0Predictor
+         num_class: 1
+         in_channels: 80
+         cond_channels: 512
+
+ # gan related module
+ mel_spec_transform1: !name:matcha.utils.audio.mel_spectrogram
+     n_fft: 1920
+     num_mels: 80
+     sampling_rate: !ref <sample_rate>
+     hop_size: 480
+     win_size: 1920
+     fmin: 0
+     fmax: null
+     center: False
+ hifigan: !new:cosyvoice.hifigan.hifigan.HiFiGan
+     generator: !ref <hift>
+     discriminator: !new:cosyvoice.hifigan.discriminator.MultipleDiscriminator
+         mpd: !new:matcha.hifigan.models.MultiPeriodDiscriminator
+         mrd: !new:cosyvoice.hifigan.discriminator.MultiResSpecDiscriminator
+     mel_spec_transform: [
+         !ref <mel_spec_transform1>
+     ]
+
+ # processor functions
+ parquet_opener: !name:cosyvoice.dataset.processor.parquet_opener
+ get_tokenizer: !name:cosyvoice.tokenizer.tokenizer.get_qwen_tokenizer
+     token_path: !ref <qwen_pretrain_path>
+     skip_special_tokens: True
+ allowed_special: 'all'
+ tokenize: !name:cosyvoice.dataset.processor.tokenize
+     get_tokenizer: !ref <get_tokenizer>
+     allowed_special: !ref <allowed_special>
+ tokenize_llm: !name:cosyvoice.dataset.processor.tokenize_llm
+     tokenizer_path: !ref <chat_pretrain_path>
+ filter: !name:cosyvoice.dataset.processor.filter
+     token_max_length: 500
+     token_min_length: 1
+ resample: !name:cosyvoice.dataset.processor.resample
+     resample_rate: !ref <sample_rate>
+ truncate: !name:cosyvoice.dataset.processor.truncate
+     truncate_length: 24480 # must be a multiple of hop_size
+ feat_extractor: !name:matcha.utils.audio.mel_spectrogram
+     n_fft: 1920
+     num_mels: 80
+     sampling_rate: !ref <sample_rate>
+     hop_size: 480
+     win_size: 1920
+     fmin: 0
+     fmax: 8000
+     center: False
+ compute_fbank: !name:cosyvoice.dataset.processor.compute_fbank
+     feat_extractor: !ref <feat_extractor>
+ compute_f0: !name:cosyvoice.dataset.processor.compute_f0
+     sample_rate: !ref <sample_rate>
+     hop_size: 480
+ parse_embedding: !name:cosyvoice.dataset.processor.parse_embedding
+     normalize: True
+ shuffle: !name:cosyvoice.dataset.processor.shuffle
+     shuffle_size: 1000
+ sort: !name:cosyvoice.dataset.processor.sort
+     sort_size: 500 # sort_size should be less than shuffle_size
+ # batch: !name:cosyvoice.dataset.processor.batch
+ #     batch_type: 'dynamic'
+ #     max_frames_in_batch: 2000
+ batch: !name:cosyvoice.dataset.processor.batch
+     batch_type: 'static'
+     batch_size: 1
+ padding: !name:cosyvoice.dataset.processor.padding
+     use_spk_embedding: False # change to True during sft
+
+
+ # dataset processor pipeline
+ data_pipeline: [
+     !ref <parquet_opener>,
+     !ref <tokenize_llm>,
+     !ref <filter>,
+     !ref <shuffle>,
+     !ref <sort>,
+     !ref <batch>,
+     !ref <padding>,
+ ]
+ data_pipeline_gan: [
+     !ref <parquet_opener>,
+     !ref <tokenize>,
+     !ref <filter>,
+     !ref <resample>,
+     !ref <truncate>,
+     !ref <compute_fbank>,
+     !ref <compute_f0>,
+     !ref <parse_embedding>,
+     !ref <shuffle>,
+     !ref <sort>,
+     !ref <batch>,
+     !ref <padding>,
+ ]
+
+ # llm flow train conf
+ train_conf:
+     optim: adam
+     optim_conf:
+         lr: 1e-5 # change to 1e-5 during sft
+     scheduler: constantlr # change to constantlr during sft
+     scheduler_conf:
+         warmup_steps: 2500
+     max_epoch: 5
+     grad_clip: 5
+     accum_grad: 2
+     log_interval: 100
+     save_per_step: -1
+
+ # gan train conf
+ train_conf_gan:
+     optim: adam
+     optim_conf:
+         lr: 0.0002 # use small lr for gan training
+     scheduler: constantlr
+     optim_d: adam
+     optim_conf_d:
+         lr: 0.0002 # use small lr for gan training
+     scheduler_d: constantlr
+     max_epoch: 200
+     grad_clip: 5
+     accum_grad: 1 # in gan training, accum_grad must be 1
+     log_interval: 100
+     save_per_step: -1
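
In both data_pipeline lists above, every !name: entry resolves to a functools.partial over a generator-style processor, so a data loader can fold the stages over a stream of sample dicts. The sketch below only illustrates that composition; it is not the dataset code shipped with this repo, and the exact processor signatures (for example any extra train/inference mode flag) live in cosyvoice.dataset and are assumed away here.

    # Generic sketch of composing a processor pipeline such as data_pipeline.
    # Assumes each stage is a callable that consumes and yields sample dicts.
    from typing import Callable, Iterable, List

    def apply_pipeline(source: Iterable[dict], pipeline: List[Callable]) -> Iterable[dict]:
        data = source
        for processor in pipeline:
            # Each !name: stage (tokenize_llm, filter, shuffle, sort, batch,
            # padding, ...) wraps the previous iterator and yields transformed samples.
            data = processor(data)
        return data

    # e.g. apply_pipeline(raw_samples, configs['data_pipeline']) after loading the yaml.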