muhtasham
/

hifigan-ar-v2

+# Generated 2025-03-24 from:
+# /workspace/speechbrain/recipes/LJSpeech/TTS/vocoder/hifigan/hparams/finetune_all.yaml
+# yamllint disable
+###################################
+# Experiment Parameters and setup #
+###################################
+seed: 1234
+__set_seed: !apply:speechbrain.utils.seed_everything [1234]  # same value as `seed` above; keep the two in sync
+output_folder: ./results/hifi_gan_finetune_all/1234  # last path component equals the seed value
+save_folder: ./results/hifi_gan_finetune_all/1234/save  # output_folder + /save; also used as checkpointer.checkpoints_dir
+train_log: ./results/hifi_gan_finetune_all/1234/train_log.txt  # same path as train_logger.save_file
+progress_sample_path: ./results/hifi_gan_finetune_all/1234/samples  # output_folder + /samples
+epochs: 500  # Reduced epochs for finetuning; same value as epoch_counter.limit
+keep_checkpoint_interval: 50  # not referenced elsewhere in this file; presumably an interval in epochs -- TODO confirm against the training script
+use_tensorboard: true  # not referenced elsewhere in this file; presumably read by the training script -- TODO confirm
+#################################
+# Data files and pre-processing #
+#################################
+data_folder: all_wav_files  # folder containing the wav files (a relative path here)
+                          # e.g., /path/to/your/wav/files
+train_json: ./results/hifi_gan_finetune_all/1234/save/train.json  # located under save_folder
+valid_json: ./results/hifi_gan_finetune_all/1234/save/valid.json  # located under save_folder
+test_json: ./results/hifi_gan_finetune_all/1234/save/test.json  # NOTE(review): `splits` below has no test entry -- confirm whether this manifest is ever produced
+splits: [train, valid]
+split_ratio: [90, 10]  # two values for the two entries in `splits`; presumably paired positionally, as percentages -- TODO confirm
+################################
+# Audio Parameters             #
+################################
+skip_prep: false  # not referenced elsewhere in this file; presumably controls whether data preparation is skipped -- TODO confirm
+segment_size: 8192  # equals 32 * hop_length (32 * 256)
+sample_rate: 22050
+hop_length: 256  # equals the product of upsample_factors [8, 8, 2, 2]
+win_length: 1024
+n_mel_channels: 80  # same value as in_channels in the model section
+n_fft: 1024  # same value as win_length
+mel_fmin: 0.0
+mel_fmax: 8000  # below sample_rate / 2 (11025); l1_spec_loss sets its own mel_fmax to an empty (null) value
+mel_normalized: false
+power: 1
+norm: slaney
+mel_scale: slaney
+dynamic_range_compression: true
+################################
+# Optimization Hyperparameters #
+################################
+learning_rate: 0.00005  # Lower learning rate for finetuning; same value as `lr` in both optimizer entries
+weight_decay: 0.9999  # NOTE(review): same value as the ExponentialLR `gamma` entries, and neither AdamW entry sets weight_decay -- looks like an LR decay factor rather than an optimizer weight decay; confirm
+adam_b1: 0.8  # same value as betas[0] in both optimizer entries
+adam_b2: 0.99  # same value as betas[1] in both optimizer entries
+batch_size: 32  # repeated in train_dataloader_opts
+num_workers: 8  # repeated in all three *_dataloader_opts mappings
+train_dataloader_opts:
+  batch_size: 32
+  drop_last: false
+  num_workers: 8
+valid_dataloader_opts:
+  batch_size: 1  # validation uses a batch size of 1, unlike training (32)
+  num_workers: 8
+test_dataloader_opts:
+  batch_size: 1  # same as validation
+  num_workers: 8
+################################
+# Model Parameters and model   #
+################################
+# generator params
+in_channels: 80  # same value as n_mel_channels
+out_channels: 1
+resblock_type: '1'  # quoted, so the value is a string rather than an integer
+resblock_dilation_sizes: &id001 [[1, 3, 5], [1, 3, 5], [1, 3, 5]]  # anchor &id001 (aliased in `generator`); three inner lists, matching the three entries of resblock_kernel_sizes
+resblock_kernel_sizes: &id002 [3, 7, 11]  # anchor &id002 (aliased in `generator`)
+upsample_kernel_sizes: &id003 [16, 16, 4, 4]  # anchor &id003 (aliased in `generator`); each value is twice the upsample_factors entry at the same position
+upsample_initial_channel: 512
+upsample_factors: &id004 [8, 8, 2, 2]  # anchor &id004 (aliased in `generator`); product is 256, the same as hop_length
+inference_padding: 5
+cond_channels: 0
+conv_post_bias: true
+mel_spectogram: !name:speechbrain.lobes.models.HifiGAN.mel_spectogram  # key spelling ("spectogram") matches the referenced path on this line -- do not "correct" one without the other
+  sample_rate: 22050  # the values in this mapping repeat the top-level audio parameters
+  hop_length: 256
+  win_length: 1024
+  n_fft: 1024
+  n_mels: 80  # same value as top-level n_mel_channels
+  f_min: 0.0  # same value as top-level mel_fmin
+  f_max: 8000  # same value as top-level mel_fmax
+  power: 1
+  normalized: false  # same value as top-level mel_normalized
+  norm: slaney
+  mel_scale: slaney
+  compression: true  # same value as top-level dynamic_range_compression
+generator: &id005 !new:speechbrain.lobes.models.HifiGAN.HifiganGenerator  # anchor &id005 is aliased by modules.generator and checkpointer.recoverables.generator
+  in_channels: 80  # scalar arguments below repeat the top-level generator params
+  out_channels: 1
+  resblock_type: '1'
+  resblock_dilation_sizes: *id001  # alias of top-level resblock_dilation_sizes
+  resblock_kernel_sizes: *id002  # alias of top-level resblock_kernel_sizes
+  upsample_kernel_sizes: *id003  # alias of top-level upsample_kernel_sizes
+  upsample_initial_channel: 512
+  upsample_factors: *id004  # alias of top-level upsample_factors
+  inference_padding: 5
+  cond_channels: 0
+  conv_post_bias: true
+discriminator: &id006 !new:speechbrain.lobes.models.HifiGAN.HifiganDiscriminator  # no arguments given here; anchor &id006 is aliased by modules.discriminator and checkpointer.recoverables.discriminator
+# Modules; the generator-loss entries begin at stft_loss below
+modules:  # groups the generator and discriminator entries under one key
+  generator: *id005  # alias of the top-level `generator` entry
+  discriminator: *id006  # alias of the top-level `discriminator` entry
+stft_loss:  # empty value (YAML null); generator_loss likewise receives an empty stft_loss with weight 0
+mseg_loss: &id007 !new:speechbrain.lobes.models.HifiGAN.MSEGLoss  # anchor &id007, aliased by generator_loss.mseg_loss
+feat_match_loss: &id008 !new:speechbrain.lobes.models.HifiGAN.MelganFeatureLoss  # anchor &id008, aliased by generator_loss.feat_match_loss
+l1_spec_loss: &id009 !new:speechbrain.lobes.models.HifiGAN.L1SpecLoss  # anchor &id009, aliased by generator_loss.l1_spec_loss
+  sample_rate: 22050  # values below repeat the top-level audio parameters unless noted
+  hop_length: 256
+  win_length: 1024
+  n_mel_channels: 80
+  n_fft: 1024
+  n_stft: 513  # equals n_fft // 2 + 1 for n_fft = 1024
+  mel_fmin: 0.0
+  mel_fmax:  # empty value (YAML null), unlike the top-level mel_fmax of 8000 -- NOTE(review): presumably intentional, confirm
+  mel_normalized: false
+  power: 1
+  dynamic_range_compression: true
+generator_loss: !new:speechbrain.lobes.models.HifiGAN.GeneratorLoss  # receives the four loss entries defined above, each paired with a *_weight value
+  stft_loss:  # empty value (YAML null)
+  stft_loss_weight: 0
+  mseg_loss: *id007  # alias of top-level mseg_loss
+  mseg_loss_weight: 1
+  feat_match_loss: *id008  # alias of top-level feat_match_loss
+  feat_match_loss_weight: 10
+  l1_spec_loss: *id009  # alias of top-level l1_spec_loss
+  l1_spec_loss_weight: 45  # largest of the four weights
+#discriminator loss
+msed_loss: &id010 !new:speechbrain.lobes.models.HifiGAN.MSEDLoss  # no arguments given here; anchor &id010 is aliased by discriminator_loss.msed_loss
+# Discriminator loss wrapper; the optimizer entries begin at opt_class_generator below
+discriminator_loss: !new:speechbrain.lobes.models.HifiGAN.DiscriminatorLoss  # receives msed_loss as its only argument
+  msed_loss: *id010  # alias of top-level msed_loss
+opt_class_generator: !name:torch.optim.AdamW  # only lr and betas are set; no weight_decay key is given
+  lr: 0.00005  # same value as top-level learning_rate
+  betas: [0.8, 0.99]  # same values as top-level adam_b1 / adam_b2
+opt_class_discriminator: !name:torch.optim.AdamW  # same settings as opt_class_generator
+  lr: 0.00005
+  betas: [0.8, 0.99]
+sch_class_generator: !name:torch.optim.lr_scheduler.ExponentialLR
+  gamma: 0.9999  # same value as the top-level key named weight_decay
+  last_epoch: -1
+sch_class_discriminator: !name:torch.optim.lr_scheduler.ExponentialLR  # same settings as sch_class_generator
+  gamma: 0.9999
+  last_epoch: -1
+#epoch object
+epoch_counter: &id011 !new:speechbrain.utils.epoch_loop.EpochCounter  # anchor &id011, aliased by checkpointer.recoverables.counter
+  limit: 500  # same value as top-level epochs
+train_logger: !new:speechbrain.utils.train_logger.FileTrainLogger
+  save_file: ./results/hifi_gan_finetune_all/1234/train_log.txt  # same path as top-level train_log
+#checkpointer
+checkpointer: !new:speechbrain.utils.checkpoints.Checkpointer
+  checkpoints_dir: ./results/hifi_gan_finetune_all/1234/save  # same path as top-level save_folder
+  recoverables:  # the three entries below are aliases of objects defined earlier in this file
+    generator: *id005  # alias of top-level generator
+    discriminator: *id006  # alias of top-level discriminator
+    counter: *id011  # alias of epoch_counter