Upload hyperparams.yaml with huggingface_hub
Browse files- hyperparams.yaml +186 -0
hyperparams.yaml
ADDED
@@ -0,0 +1,186 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
# Generated 2025-03-24 from:
|
2 |
+
# /workspace/speechbrain/recipes/LJSpeech/TTS/vocoder/hifigan/hparams/finetune_all.yaml
|
3 |
+
# yamllint disable
|
4 |
+
###################################
|
5 |
+
# Experiment Parameters and setup #
|
6 |
+
###################################
|
7 |
+
seed: 1234
|
8 |
+
__set_seed: !apply:speechbrain.utils.seed_everything [1234]
|
9 |
+
output_folder: ./results/hifi_gan_finetune_all/1234
|
10 |
+
save_folder: ./results/hifi_gan_finetune_all/1234/save
|
11 |
+
train_log: ./results/hifi_gan_finetune_all/1234/train_log.txt
|
12 |
+
progress_sample_path: ./results/hifi_gan_finetune_all/1234/samples
|
13 |
+
epochs: 500 # Reduced epochs for finetuning
|
14 |
+
keep_checkpoint_interval: 50
|
15 |
+
use_tensorboard: true
|
16 |
+
|
17 |
+
#################################
|
18 |
+
# Data files and pre-processing #
|
19 |
+
#################################
|
20 |
+
data_folder: all_wav_files
|
21 |
+
# e.g., /path/to/your/wav/files
|
22 |
+
train_json: ./results/hifi_gan_finetune_all/1234/save/train.json
|
23 |
+
valid_json: ./results/hifi_gan_finetune_all/1234/save/valid.json
|
24 |
+
test_json: ./results/hifi_gan_finetune_all/1234/save/test.json
|
25 |
+
|
26 |
+
splits: [train, valid]
|
27 |
+
split_ratio: [90, 10]
|
28 |
+
################################
|
29 |
+
# Audio Parameters #
|
30 |
+
################################
|
31 |
+
skip_prep: false
|
32 |
+
|
33 |
+
segment_size: 8192
|
34 |
+
sample_rate: 22050
|
35 |
+
hop_length: 256
|
36 |
+
win_length: 1024
|
37 |
+
n_mel_channels: 80
|
38 |
+
n_fft: 1024
|
39 |
+
mel_fmin: 0.0
|
40 |
+
mel_fmax: 8000
|
41 |
+
mel_normalized: false
|
42 |
+
power: 1
|
43 |
+
norm: slaney
|
44 |
+
mel_scale: slaney
|
45 |
+
dynamic_range_compression: true
|
46 |
+
|
47 |
+
|
48 |
+
################################
|
49 |
+
# Optimization Hyperparameters #
|
50 |
+
################################
|
51 |
+
learning_rate: 0.00005 # Lower learning rate for finetuning
|
52 |
+
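weight_decay: 0.9999 # NOTE(review): same value as the ExponentialLR gamma below; the AdamW entries in this file receive only lr and betas - confirm intended use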
|
53 |
+
adam_b1: 0.8
|
54 |
+
adam_b2: 0.99
|
55 |
+
batch_size: 32
|
56 |
+
num_workers: 8
|
57 |
+
|
58 |
+
train_dataloader_opts:
|
59 |
+
batch_size: 32
|
60 |
+
drop_last: false
|
61 |
+
num_workers: 8
|
62 |
+
|
63 |
+
valid_dataloader_opts:
|
64 |
+
batch_size: 1
|
65 |
+
num_workers: 8
|
66 |
+
|
67 |
+
test_dataloader_opts:
|
68 |
+
batch_size: 1
|
69 |
+
num_workers: 8
|
70 |
+
|
71 |
+
################################
|
72 |
+
# Model Parameters and model #
|
73 |
+
################################
|
74 |
+
|
75 |
+
# generator params
|
76 |
+
in_channels: 80
|
77 |
+
out_channels: 1
|
78 |
+
|
79 |
+
resblock_type: '1'
|
80 |
+
resblock_dilation_sizes: &id001 [[1, 3, 5], [1, 3, 5], [1, 3, 5]]
|
81 |
+
resblock_kernel_sizes: &id002 [3, 7, 11]
|
82 |
+
upsample_kernel_sizes: &id003 [16, 16, 4, 4]
|
83 |
+
upsample_initial_channel: 512
|
84 |
+
upsample_factors: &id004 [8, 8, 2, 2]
|
85 |
+
|
86 |
+
inference_padding: 5
|
87 |
+
cond_channels: 0
|
88 |
+
conv_post_bias: true
|
89 |
+
|
90 |
+
mel_spectogram: !name:speechbrain.lobes.models.HifiGAN.mel_spectogram
|
91 |
+
sample_rate: 22050
|
92 |
+
hop_length: 256
|
93 |
+
win_length: 1024
|
94 |
+
n_fft: 1024
|
95 |
+
n_mels: 80
|
96 |
+
f_min: 0.0
|
97 |
+
f_max: 8000
|
98 |
+
power: 1
|
99 |
+
normalized: false
|
100 |
+
norm: slaney
|
101 |
+
mel_scale: slaney
|
102 |
+
compression: true
|
103 |
+
|
104 |
+
generator: &id005 !new:speechbrain.lobes.models.HifiGAN.HifiganGenerator
|
105 |
+
in_channels: 80
|
106 |
+
out_channels: 1
|
107 |
+
resblock_type: '1'
|
108 |
+
resblock_dilation_sizes: *id001
|
109 |
+
resblock_kernel_sizes: *id002
|
110 |
+
upsample_kernel_sizes: *id003
|
111 |
+
upsample_initial_channel: 512
|
112 |
+
upsample_factors: *id004
|
113 |
+
inference_padding: 5
|
114 |
+
cond_channels: 0
|
115 |
+
conv_post_bias: true
|
116 |
+
|
117 |
+
discriminator: &id006 !new:speechbrain.lobes.models.HifiGAN.HifiganDiscriminator
|
118 |
+
|
119 |
+
#generator loss
|
120 |
+
|
121 |
+
modules:
|
122 |
+
generator: *id005
|
123 |
+
discriminator: *id006
|
124 |
+
stft_loss:
|
125 |
+
mseg_loss: &id007 !new:speechbrain.lobes.models.HifiGAN.MSEGLoss
|
126 |
+
feat_match_loss: &id008 !new:speechbrain.lobes.models.HifiGAN.MelganFeatureLoss
|
127 |
+
l1_spec_loss: &id009 !new:speechbrain.lobes.models.HifiGAN.L1SpecLoss
|
128 |
+
sample_rate: 22050
|
129 |
+
hop_length: 256
|
130 |
+
win_length: 1024
|
131 |
+
n_mel_channels: 80
|
132 |
+
n_fft: 1024
|
133 |
+
n_stft: 513
|
134 |
+
mel_fmin: 0.0
|
135 |
+
mel_fmax:
|
136 |
+
mel_normalized: false
|
137 |
+
power: 1
|
138 |
+
dynamic_range_compression: true
|
139 |
+
|
140 |
+
generator_loss: !new:speechbrain.lobes.models.HifiGAN.GeneratorLoss
|
141 |
+
stft_loss:
|
142 |
+
stft_loss_weight: 0
|
143 |
+
mseg_loss: *id007
|
144 |
+
mseg_loss_weight: 1
|
145 |
+
feat_match_loss: *id008
|
146 |
+
feat_match_loss_weight: 10
|
147 |
+
l1_spec_loss: *id009
|
148 |
+
l1_spec_loss_weight: 45
|
149 |
+
|
150 |
+
#discriminator loss
|
151 |
+
msed_loss: &id010 !new:speechbrain.lobes.models.HifiGAN.MSEDLoss
|
152 |
+
|
153 |
+
#optimizer
|
154 |
+
|
155 |
+
discriminator_loss: !new:speechbrain.lobes.models.HifiGAN.DiscriminatorLoss
|
156 |
+
msed_loss: *id010
|
157 |
+
opt_class_generator: !name:torch.optim.AdamW
|
158 |
+
lr: 0.00005
|
159 |
+
betas: [0.8, 0.99]
|
160 |
+
|
161 |
+
opt_class_discriminator: !name:torch.optim.AdamW
|
162 |
+
lr: 0.00005
|
163 |
+
betas: [0.8, 0.99]
|
164 |
+
|
165 |
+
sch_class_generator: !name:torch.optim.lr_scheduler.ExponentialLR
|
166 |
+
gamma: 0.9999
|
167 |
+
last_epoch: -1
|
168 |
+
|
169 |
+
sch_class_discriminator: !name:torch.optim.lr_scheduler.ExponentialLR
|
170 |
+
gamma: 0.9999
|
171 |
+
last_epoch: -1
|
172 |
+
|
173 |
+
#epoch object
|
174 |
+
epoch_counter: &id011 !new:speechbrain.utils.epoch_loop.EpochCounter
|
175 |
+
limit: 500
|
176 |
+
|
177 |
+
train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
|
178 |
+
save_file: ./results/hifi_gan_finetune_all/1234/train_log.txt
|
179 |
+
|
180 |
+
#checkpointer
|
181 |
+
checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
|
182 |
+
checkpoints_dir: ./results/hifi_gan_finetune_all/1234/save
|
183 |
+
recoverables:
|
184 |
+
generator: *id005
|
185 |
+
discriminator: *id006
|
186 |
+
counter: *id011
|