speechbrainteam committed on
Commit
f2143d6
·
1 Parent(s): 9b2fb80

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +51 -0
README.md CHANGED
@@ -35,6 +35,7 @@ Please notice that we encourage you to read our tutorials and learn more about
35
 
36
  ### Using the Vocoder
37
 
 
38
  ```python
39
  import torch
40
  from speechbrain.pretrained import HIFIGAN
@@ -45,6 +46,56 @@ mel_specs = torch.rand(2, 80,298)
45
  waveforms = hifi_gan.decode_batch(mel_specs)
46
  ```
47
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
48
  ### Using the Vocoder with the TTS
49
  ```python
50
  import torchaudio
 
35
 
36
  ### Using the Vocoder
37
 
38
+ **Basic Usage:**
39
  ```python
40
  import torch
41
  from speechbrain.pretrained import HIFIGAN
 
46
  waveforms = hifi_gan.decode_batch(mel_specs)
47
  ```
48
 
49
+ **Spectrogram to Waveform Conversion:**
50
+ -
51
+ ```python
52
+ import torchaudio
53
+ from speechbrain.pretrained import HIFIGAN
54
+ from speechbrain.lobes.models.FastSpeech2 import mel_spectogram
55
+
56
+ # Load a pretrained HIFIGAN Vocoder
57
+ hifi_gan = HIFIGAN.from_hparams(source="speechbrain/tts-hifigan-libritts-22050Hz", savedir="tmpdir_voc22050")
58
+
59
+ # Load an audio file (an example file can be found in this repository)
60
+ # Ensure that the audio signal is sampled at 22050 Hz; refer to the provided link for a 16000 Hz Vocoder.
61
+ #signal, rate = torchaudio.load('speechbrain/tts-hifigan-libritts-22050Hz/example_22kHz.wav')
62
+ signal, rate = torchaudio.load('/home/mirco/Downloads/example_22kHz.wav')
63
+
64
+ # Ensure the audio is single channel
65
+ signal = signal[0].squeeze()
66
+
67
+ torchaudio.save('waveform.wav', signal.unsqueeze(0), 22050)
68
+
69
+ # Compute the mel spectrogram.
70
+ # IMPORTANT: Use these specific parameters to match the Vocoder's training settings for optimal results.
71
+ spectrogram, _ = mel_spectogram(
72
+ audio=signal.squeeze(),
73
+ sample_rate=22050,
74
+ hop_length=256,
75
+ win_length=1024,
76
+ n_mels=80,
77
+ n_fft=1024,
78
+ f_min=0.0,
79
+ f_max=8000.0,
80
+ power=1,
81
+ normalized=False,
82
+ min_max_energy_norm=True,
83
+ norm="slaney",
84
+ mel_scale="slaney",
85
+ compression=True
86
+ )
87
+
88
+ # Convert the spectrogram to waveform
89
+ waveforms = hifi_gan.decode_batch(spectrogram)
90
+
91
+ # Save the reconstructed audio as a waveform
92
+ torchaudio.save('waveform_reconstructed.wav', waveforms.squeeze(1), 22050)
93
+
94
+ # If everything is set up correctly, the original and reconstructed audio should be nearly indistinguishable.
95
+
96
+ ```
97
+
98
+
99
  ### Using the Vocoder with the TTS
100
  ```python
101
  import torchaudio