Rasmus Lellep committed
Commit 5a03f53 · 1 Parent(s): 09cf1ac

initial commit

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .DS_Store +0 -0
  2. TTS/.DS_Store +0 -0
  3. TTS/.models.json +959 -0
  4. TTS/__init__.py +33 -0
  5. TTS/__pycache__/__init__.cpython-310.pyc +0 -0
  6. TTS/__pycache__/__init__.cpython-311.pyc +0 -0
  7. TTS/__pycache__/__init__.cpython-39.pyc +0 -0
  8. TTS/__pycache__/model.cpython-310.pyc +0 -0
  9. TTS/__pycache__/model.cpython-311.pyc +0 -0
  10. TTS/__pycache__/model.cpython-39.pyc +0 -0
  11. TTS/api.py +499 -0
  12. TTS/bin/__init__.py +0 -0
  13. TTS/bin/collect_env_info.py +49 -0
  14. TTS/bin/compute_attention_masks.py +170 -0
  15. TTS/bin/compute_embeddings.py +202 -0
  16. TTS/bin/compute_statistics.py +106 -0
  17. TTS/bin/eval_encoder.py +93 -0
  18. TTS/bin/extract_tts_spectrograms.py +305 -0
  19. TTS/bin/find_unique_chars.py +41 -0
  20. TTS/bin/find_unique_phonemes.py +88 -0
  21. TTS/bin/remove_silence_using_vad.py +129 -0
  22. TTS/bin/resample.py +90 -0
  23. TTS/bin/synthesize.py +438 -0
  24. TTS/bin/train_encoder.py +340 -0
  25. TTS/bin/train_tts.py +76 -0
  26. TTS/bin/train_vocoder.py +84 -0
  27. TTS/bin/tune_wavegrad.py +108 -0
  28. TTS/config/__init__.py +139 -0
  29. TTS/config/__pycache__/__init__.cpython-310.pyc +0 -0
  30. TTS/config/__pycache__/__init__.cpython-311.pyc +0 -0
  31. TTS/config/__pycache__/__init__.cpython-39.pyc +0 -0
  32. TTS/config/__pycache__/shared_configs.cpython-310.pyc +0 -0
  33. TTS/config/__pycache__/shared_configs.cpython-311.pyc +0 -0
  34. TTS/config/__pycache__/shared_configs.cpython-39.pyc +0 -0
  35. TTS/config/shared_configs.py +268 -0
  36. TTS/demos/xtts_ft_demo/requirements.txt +2 -0
  37. TTS/demos/xtts_ft_demo/utils/formatter.py +161 -0
  38. TTS/demos/xtts_ft_demo/utils/gpt_train.py +172 -0
  39. TTS/demos/xtts_ft_demo/xtts_demo.py +433 -0
  40. TTS/encoder/.DS_Store +0 -0
  41. TTS/encoder/README.md +18 -0
  42. TTS/encoder/__init__.py +0 -0
  43. TTS/encoder/__pycache__/__init__.cpython-310.pyc +0 -0
  44. TTS/encoder/__pycache__/__init__.cpython-311.pyc +0 -0
  45. TTS/encoder/__pycache__/__init__.cpython-39.pyc +0 -0
  46. TTS/encoder/__pycache__/losses.cpython-310.pyc +0 -0
  47. TTS/encoder/__pycache__/losses.cpython-311.pyc +0 -0
  48. TTS/encoder/__pycache__/losses.cpython-39.pyc +0 -0
  49. TTS/encoder/configs/base_encoder_config.py +61 -0
  50. TTS/encoder/configs/emotion_encoder_config.py +12 -0
.DS_Store ADDED
Binary file (6.15 kB).
 
TTS/.DS_Store ADDED
Binary file (8.2 kB).
 
TTS/.models.json ADDED
@@ -0,0 +1,959 @@
1
+ {
2
+ "tts_models": {
3
+ "multilingual": {
4
+ "multi-dataset": {
5
+ "xtts_v2": {
6
+ "description": "XTTS-v2.0.3 by Coqui with 17 languages.",
7
+ "hf_url": [
8
+ "https://huggingface.co/coqui/XTTS-v2/resolve/main/model.pth",
9
+ "https://huggingface.co/coqui/XTTS-v2/resolve/main/config.json",
10
+ "https://huggingface.co/coqui/XTTS-v2/resolve/main/vocab.json",
11
+ "https://huggingface.co/coqui/XTTS-v2/resolve/main/hash.md5",
12
+ "https://huggingface.co/coqui/XTTS-v2/resolve/main/speakers_xtts.pth"
13
+ ],
14
+ "model_hash": "10f92b55c512af7a8d39d650547a15a7",
15
+ "default_vocoder": null,
16
+ "commit": "480a6cdf7",
17
+ "license": "CPML",
18
+ "contact": "[email protected]",
19
+ "tos_required": true
20
+ },
21
+ "xtts_v1.1": {
22
+ "description": "XTTS-v1.1 by Coqui with 14 languages, cross-language voice cloning and reference leak fixed.",
23
+ "hf_url": [
24
+ "https://huggingface.co/coqui/XTTS-v1/resolve/v1.1.2/model.pth",
25
+ "https://huggingface.co/coqui/XTTS-v1/resolve/v1.1.2/config.json",
26
+ "https://huggingface.co/coqui/XTTS-v1/resolve/v1.1.2/vocab.json",
27
+ "https://huggingface.co/coqui/XTTS-v1/resolve/v1.1.2/hash.md5"
28
+ ],
29
+ "model_hash": "7c62beaf58d39b729de287330dc254e7b515677416839b649a50e7cf74c3df59",
30
+ "default_vocoder": null,
31
+ "commit": "82910a63",
32
+ "license": "CPML",
33
+ "contact": "[email protected]",
34
+ "tos_required": true
35
+ },
36
+ "your_tts": {
37
+ "description": "Your TTS model accompanying the paper https://arxiv.org/abs/2112.02418",
38
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.10.1_models/tts_models--multilingual--multi-dataset--your_tts.zip",
39
+ "default_vocoder": null,
40
+ "commit": "e9a1953e",
41
+ "license": "CC BY-NC-ND 4.0",
42
+ "contact": "[email protected]"
43
+ },
44
+ "bark": {
45
+ "description": "🐶 Bark TTS model released by suno-ai. You can find the original implementation in https://github.com/suno-ai/bark.",
46
+ "hf_url": [
47
+ "https://huggingface.co/erogol/bark/resolve/main/coarse_2.pt",
48
+ "https://huggingface.co/erogol/bark/resolve/main/fine_2.pt",
49
+ "https://huggingface.co/erogol/bark/resolve/main/text_2.pt",
50
+ "https://huggingface.co/erogol/bark/resolve/main/config.json",
51
+ "https://huggingface.co/erogol/bark/resolve/main/tokenizer.pth"
52
+ ],
53
+ "default_vocoder": null,
54
+ "commit": "e9a1953e",
55
+ "license": "MIT",
56
+ "contact": "https://www.suno.ai/"
57
+ }
58
+ }
59
+ },
60
+ "bg": {
61
+ "cv": {
62
+ "vits": {
63
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--bg--cv--vits.zip",
64
+ "default_vocoder": null,
65
+ "commit": null,
66
+ "author": "@NeonGeckoCom",
67
+ "license": "bsd-3-clause"
68
+ }
69
+ }
70
+ },
71
+ "cs": {
72
+ "cv": {
73
+ "vits": {
74
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--cs--cv--vits.zip",
75
+ "default_vocoder": null,
76
+ "commit": null,
77
+ "author": "@NeonGeckoCom",
78
+ "license": "bsd-3-clause"
79
+ }
80
+ }
81
+ },
82
+ "da": {
83
+ "cv": {
84
+ "vits": {
85
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--da--cv--vits.zip",
86
+ "default_vocoder": null,
87
+ "commit": null,
88
+ "author": "@NeonGeckoCom",
89
+ "license": "bsd-3-clause"
90
+ }
91
+ }
92
+ },
93
+ "et": {
94
+ "cv": {
95
+ "vits": {
96
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--et--cv--vits.zip",
97
+ "default_vocoder": null,
98
+ "commit": null,
99
+ "author": "@NeonGeckoCom",
100
+ "license": "bsd-3-clause"
101
+ }
102
+ }
103
+ },
104
+ "ga": {
105
+ "cv": {
106
+ "vits": {
107
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--ga--cv--vits.zip",
108
+ "default_vocoder": null,
109
+ "commit": null,
110
+ "author": "@NeonGeckoCom",
111
+ "license": "bsd-3-clause"
112
+ }
113
+ }
114
+ },
115
+ "en": {
116
+ "ek1": {
117
+ "tacotron2": {
118
+ "description": "EK1 en-rp tacotron2 by NMStoker",
119
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--ek1--tacotron2.zip",
120
+ "default_vocoder": "vocoder_models/en/ek1/wavegrad",
121
+ "commit": "c802255",
122
+ "license": "apache 2.0"
123
+ }
124
+ },
125
+ "ljspeech": {
126
+ "tacotron2-DDC": {
127
+ "description": "Tacotron2 with Double Decoder Consistency.",
128
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC.zip",
129
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
130
+ "commit": "bae2ad0f",
131
+ "author": "Eren Gölge @erogol",
132
+ "license": "apache 2.0",
133
+ "contact": "[email protected]"
134
+ },
135
+ "tacotron2-DDC_ph": {
136
+ "description": "Tacotron2 with Double Decoder Consistency with phonemes.",
137
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DDC_ph.zip",
138
+ "default_vocoder": "vocoder_models/en/ljspeech/univnet",
139
+ "commit": "3900448",
140
+ "author": "Eren Gölge @erogol",
141
+ "license": "apache 2.0",
142
+ "contact": "[email protected]"
143
+ },
144
+ "glow-tts": {
145
+ "description": "",
146
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--ljspeech--glow-tts.zip",
147
+ "stats_file": null,
148
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
149
+ "commit": "",
150
+ "author": "Eren Gölge @erogol",
151
+ "license": "MPL",
152
+ "contact": "[email protected]"
153
+ },
154
+ "speedy-speech": {
155
+ "description": "Speedy Speech model trained on LJSpeech dataset using the Alignment Network for learning the durations.",
156
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--ljspeech--speedy-speech.zip",
157
+ "stats_file": null,
158
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
159
+ "commit": "4581e3d",
160
+ "author": "Eren Gölge @erogol",
161
+ "license": "apache 2.0",
162
+ "contact": "[email protected]"
163
+ },
164
+ "tacotron2-DCA": {
165
+ "description": "",
166
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--ljspeech--tacotron2-DCA.zip",
167
+ "default_vocoder": "vocoder_models/en/ljspeech/multiband-melgan",
168
+ "commit": "",
169
+ "author": "Eren Gölge @erogol",
170
+ "license": "MPL",
171
+ "contact": "[email protected]"
172
+ },
173
+ "vits": {
174
+ "description": "VITS is an End2End TTS model trained on LJSpeech dataset with phonemes.",
175
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--ljspeech--vits.zip",
176
+ "default_vocoder": null,
177
+ "commit": "3900448",
178
+ "author": "Eren Gölge @erogol",
179
+ "license": "apache 2.0",
180
+ "contact": "[email protected]"
181
+ },
182
+ "vits--neon": {
183
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--en--ljspeech--vits.zip",
184
+ "default_vocoder": null,
185
+ "author": "@NeonGeckoCom",
186
+ "license": "bsd-3-clause",
187
+ "contact": null,
188
+ "commit": null
189
+ },
190
+ "fast_pitch": {
191
+ "description": "FastPitch model trained on LJSpeech using the Aligner Network",
192
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--ljspeech--fast_pitch.zip",
193
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
194
+ "commit": "b27b3ba",
195
+ "author": "Eren Gölge @erogol",
196
+ "license": "apache 2.0",
197
+ "contact": "[email protected]"
198
+ },
199
+ "overflow": {
200
+ "description": "Overflow model trained on LJSpeech",
201
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.10.0_models/tts_models--en--ljspeech--overflow.zip",
202
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
203
+ "commit": "3b1a28f",
204
+ "author": "Eren Gölge @erogol",
205
+ "license": "apache 2.0",
206
+ "contact": "[email protected]"
207
+ },
208
+ "neural_hmm": {
209
+ "description": "Neural HMM model trained on LJSpeech",
210
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.11.0_models/tts_models--en--ljspeech--neural_hmm.zip",
211
+ "default_vocoder": "vocoder_models/en/ljspeech/hifigan_v2",
212
+ "commit": "3b1a28f",
213
+ "author": "Shivam Metha @shivammehta25",
214
+ "license": "apache 2.0",
215
+ "contact": "d83ee8fe45e3c0d776d4a865aca21d7c2ac324c4"
216
+ }
217
+ },
218
+ "vctk": {
219
+ "vits": {
220
+ "description": "VITS End2End TTS model trained on VCTK dataset with 109 different speakers with EN accent.",
221
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--vctk--vits.zip",
222
+ "default_vocoder": null,
223
+ "commit": "3900448",
224
+ "author": "Eren @erogol",
225
+ "license": "apache 2.0",
226
+ "contact": "[email protected]"
227
+ },
228
+ "fast_pitch": {
229
+ "description": "FastPitch model trained on VCTK dataseset.",
230
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--vctk--fast_pitch.zip",
231
+ "default_vocoder": null,
232
+ "commit": "bdab788d",
233
+ "author": "Eren @erogol",
234
+ "license": "CC BY-NC-ND 4.0",
235
+ "contact": "[email protected]"
236
+ }
237
+ },
238
+ "sam": {
239
+ "tacotron-DDC": {
240
+ "description": "Tacotron2 with Double Decoder Consistency trained with Aceenture's Sam dataset.",
241
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--en--sam--tacotron-DDC.zip",
242
+ "default_vocoder": "vocoder_models/en/sam/hifigan_v2",
243
+ "commit": "bae2ad0f",
244
+ "author": "Eren Gölge @erogol",
245
+ "license": "apache 2.0",
246
+ "contact": "[email protected]"
247
+ }
248
+ },
249
+ "blizzard2013": {
250
+ "capacitron-t2-c50": {
251
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 50 as in https://arxiv.org/pdf/1906.03402.pdf",
252
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.7.0_models/tts_models--en--blizzard2013--capacitron-t2-c50.zip",
253
+ "commit": "d6284e7",
254
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
255
+ "author": "Adam Froghyar @a-froghyar",
256
+ "license": "apache 2.0",
257
+ "contact": "[email protected]"
258
+ },
259
+ "capacitron-t2-c150_v2": {
260
+ "description": "Capacitron additions to Tacotron 2 with Capacity at 150 as in https://arxiv.org/pdf/1906.03402.pdf",
261
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.7.1_models/tts_models--en--blizzard2013--capacitron-t2-c150_v2.zip",
262
+ "commit": "a67039d",
263
+ "default_vocoder": "vocoder_models/en/blizzard2013/hifigan_v2",
264
+ "author": "Adam Froghyar @a-froghyar",
265
+ "license": "apache 2.0",
266
+ "contact": "[email protected]"
267
+ }
268
+ },
269
+ "multi-dataset": {
270
+ "tortoise-v2": {
271
+ "description": "Tortoise tts model https://github.com/neonbjb/tortoise-tts",
272
+ "github_rls_url": [
273
+ "https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/autoregressive.pth",
274
+ "https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/clvp2.pth",
275
+ "https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/cvvp.pth",
276
+ "https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/diffusion_decoder.pth",
277
+ "https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/rlg_auto.pth",
278
+ "https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/rlg_diffuser.pth",
279
+ "https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/vocoder.pth",
280
+ "https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/mel_norms.pth",
281
+ "https://github.com/coqui-ai/TTS/releases/download/v0.14.1_models/config.json"
282
+ ],
283
+ "commit": "c1875f6",
284
+ "default_vocoder": null,
285
+ "author": "@neonbjb - James Betker, @manmay-nakhashi Manmay Nakhashi",
286
+ "license": "apache 2.0"
287
+ }
288
+ },
289
+ "jenny": {
290
+ "jenny": {
291
+ "description": "VITS model trained with Jenny(Dioco) dataset. Named as Jenny as demanded by the license. Original URL for the model https://www.kaggle.com/datasets/noml4u/tts-models--en--jenny-dioco--vits",
292
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.14.0_models/tts_models--en--jenny--jenny.zip",
293
+ "default_vocoder": null,
294
+ "commit": "ba40a1c",
295
+ "license": "custom - see https://github.com/dioco-group/jenny-tts-dataset#important",
296
+ "author": "@noml4u"
297
+ }
298
+ }
299
+ },
300
+ "es": {
301
+ "mai": {
302
+ "tacotron2-DDC": {
303
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--es--mai--tacotron2-DDC.zip",
304
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
305
+ "commit": "",
306
+ "author": "Eren Gölge @erogol",
307
+ "license": "MPL",
308
+ "contact": "[email protected]"
309
+ }
310
+ },
311
+ "css10": {
312
+ "vits": {
313
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--es--css10--vits.zip",
314
+ "default_vocoder": null,
315
+ "commit": null,
316
+ "author": "@NeonGeckoCom",
317
+ "license": "bsd-3-clause"
318
+ }
319
+ }
320
+ },
321
+ "fr": {
322
+ "mai": {
323
+ "tacotron2-DDC": {
324
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--fr--mai--tacotron2-DDC.zip",
325
+ "default_vocoder": "vocoder_models/universal/libri-tts/fullband-melgan",
326
+ "commit": null,
327
+ "author": "Eren Gölge @erogol",
328
+ "license": "MPL",
329
+ "contact": "[email protected]"
330
+ }
331
+ },
332
+ "css10": {
333
+ "vits": {
334
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--fr--css10--vits.zip",
335
+ "default_vocoder": null,
336
+ "commit": null,
337
+ "author": "@NeonGeckoCom",
338
+ "license": "bsd-3-clause"
339
+ }
340
+ }
341
+ },
342
+ "uk": {
343
+ "mai": {
344
+ "glow-tts": {
345
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--uk--mai--glow-tts.zip",
346
+ "author": "@robinhad",
347
+ "commit": "bdab788d",
348
+ "license": "MIT",
349
+ "contact": "",
350
+ "default_vocoder": "vocoder_models/uk/mai/multiband-melgan"
351
+ },
352
+ "vits": {
353
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--uk--mai--vits.zip",
354
+ "default_vocoder": null,
355
+ "commit": null,
356
+ "author": "@NeonGeckoCom",
357
+ "license": "bsd-3-clause"
358
+ }
359
+ }
360
+ },
361
+ "zh-CN": {
362
+ "baker": {
363
+ "tacotron2-DDC-GST": {
364
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--zh-CN--baker--tacotron2-DDC-GST.zip",
365
+ "commit": "unknown",
366
+ "author": "@kirianguiller",
367
+ "license": "apache 2.0",
368
+ "default_vocoder": null
369
+ }
370
+ }
371
+ },
372
+ "nl": {
373
+ "mai": {
374
+ "tacotron2-DDC": {
375
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--nl--mai--tacotron2-DDC.zip",
376
+ "author": "@r-dh",
377
+ "license": "apache 2.0",
378
+ "default_vocoder": "vocoder_models/nl/mai/parallel-wavegan",
379
+ "stats_file": null,
380
+ "commit": "540d811"
381
+ }
382
+ },
383
+ "css10": {
384
+ "vits": {
385
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--nl--css10--vits.zip",
386
+ "default_vocoder": null,
387
+ "commit": null,
388
+ "author": "@NeonGeckoCom",
389
+ "license": "bsd-3-clause"
390
+ }
391
+ }
392
+ },
393
+ "de": {
394
+ "thorsten": {
395
+ "tacotron2-DCA": {
396
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--de--thorsten--tacotron2-DCA.zip",
397
+ "default_vocoder": "vocoder_models/de/thorsten/fullband-melgan",
398
+ "author": "@thorstenMueller",
399
+ "license": "apache 2.0",
400
+ "commit": "unknown"
401
+ },
402
+ "vits": {
403
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.7.0_models/tts_models--de--thorsten--vits.zip",
404
+ "default_vocoder": null,
405
+ "author": "@thorstenMueller",
406
+ "license": "apache 2.0",
407
+ "commit": "unknown"
408
+ },
409
+ "tacotron2-DDC": {
410
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--de--thorsten--tacotron2-DDC.zip",
411
+ "default_vocoder": "vocoder_models/de/thorsten/hifigan_v1",
412
+ "description": "Thorsten-Dec2021-22k-DDC",
413
+ "author": "@thorstenMueller",
414
+ "license": "apache 2.0",
415
+ "commit": "unknown"
416
+ }
417
+ },
418
+ "css10": {
419
+ "vits-neon": {
420
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--de--css10--vits.zip",
421
+ "default_vocoder": null,
422
+ "author": "@NeonGeckoCom",
423
+ "license": "bsd-3-clause",
424
+ "commit": null
425
+ }
426
+ }
427
+ },
428
+ "ja": {
429
+ "kokoro": {
430
+ "tacotron2-DDC": {
431
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--ja--kokoro--tacotron2-DDC.zip",
432
+ "default_vocoder": "vocoder_models/ja/kokoro/hifigan_v1",
433
+ "description": "Tacotron2 with Double Decoder Consistency trained with Kokoro Speech Dataset.",
434
+ "author": "@kaiidams",
435
+ "license": "apache 2.0",
436
+ "commit": "401fbd89"
437
+ }
438
+ }
439
+ },
440
+ "tr": {
441
+ "common-voice": {
442
+ "glow-tts": {
443
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--tr--common-voice--glow-tts.zip",
444
+ "default_vocoder": "vocoder_models/tr/common-voice/hifigan",
445
+ "license": "MIT",
446
+ "description": "Turkish GlowTTS model using an unknown speaker from the Common-Voice dataset.",
447
+ "author": "Fatih Akademi",
448
+ "commit": null
449
+ }
450
+ }
451
+ },
452
+ "it": {
453
+ "mai_female": {
454
+ "glow-tts": {
455
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--it--mai_female--glow-tts.zip",
456
+ "default_vocoder": null,
457
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
458
+ "author": "@nicolalandro",
459
+ "license": "apache 2.0",
460
+ "commit": null
461
+ },
462
+ "vits": {
463
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--it--mai_female--vits.zip",
464
+ "default_vocoder": null,
465
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
466
+ "author": "@nicolalandro",
467
+ "license": "apache 2.0",
468
+ "commit": null
469
+ }
470
+ },
471
+ "mai_male": {
472
+ "glow-tts": {
473
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--it--mai_male--glow-tts.zip",
474
+ "default_vocoder": null,
475
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
476
+ "author": "@nicolalandro",
477
+ "license": "apache 2.0",
478
+ "commit": null
479
+ },
480
+ "vits": {
481
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/tts_models--it--mai_male--vits.zip",
482
+ "default_vocoder": null,
483
+ "description": "GlowTTS model as explained on https://github.com/coqui-ai/TTS/issues/1148.",
484
+ "author": "@nicolalandro",
485
+ "license": "apache 2.0",
486
+ "commit": null
487
+ }
488
+ }
489
+ },
490
+ "ewe": {
491
+ "openbible": {
492
+ "vits": {
493
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.2_models/tts_models--ewe--openbible--vits.zip",
494
+ "default_vocoder": null,
495
+ "license": "CC-BY-SA 4.0",
496
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
497
+ "author": "@coqui_ai",
498
+ "commit": "1b22f03"
499
+ }
500
+ }
501
+ },
502
+ "hau": {
503
+ "openbible": {
504
+ "vits": {
505
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.2_models/tts_models--hau--openbible--vits.zip",
506
+ "default_vocoder": null,
507
+ "license": "CC-BY-SA 4.0",
508
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
509
+ "author": "@coqui_ai",
510
+ "commit": "1b22f03"
511
+ }
512
+ }
513
+ },
514
+ "lin": {
515
+ "openbible": {
516
+ "vits": {
517
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.2_models/tts_models--lin--openbible--vits.zip",
518
+ "default_vocoder": null,
519
+ "license": "CC-BY-SA 4.0",
520
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
521
+ "author": "@coqui_ai",
522
+ "commit": "1b22f03"
523
+ }
524
+ }
525
+ },
526
+ "tw_akuapem": {
527
+ "openbible": {
528
+ "vits": {
529
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.2_models/tts_models--tw_akuapem--openbible--vits.zip",
530
+ "default_vocoder": null,
531
+ "license": "CC-BY-SA 4.0",
532
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
533
+ "author": "@coqui_ai",
534
+ "commit": "1b22f03"
535
+ }
536
+ }
537
+ },
538
+ "tw_asante": {
539
+ "openbible": {
540
+ "vits": {
541
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.2_models/tts_models--tw_asante--openbible--vits.zip",
542
+ "default_vocoder": null,
543
+ "license": "CC-BY-SA 4.0",
544
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
545
+ "author": "@coqui_ai",
546
+ "commit": "1b22f03"
547
+ }
548
+ }
549
+ },
550
+ "yor": {
551
+ "openbible": {
552
+ "vits": {
553
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.2_models/tts_models--yor--openbible--vits.zip",
554
+ "default_vocoder": null,
555
+ "license": "CC-BY-SA 4.0",
556
+ "description": "Original work (audio and text) by Biblica available for free at www.biblica.com and open.bible.",
557
+ "author": "@coqui_ai",
558
+ "commit": "1b22f03"
559
+ }
560
+ }
561
+ },
562
+ "hu": {
563
+ "css10": {
564
+ "vits": {
565
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--hu--css10--vits.zip",
566
+ "default_vocoder": null,
567
+ "commit": null,
568
+ "author": "@NeonGeckoCom",
569
+ "license": "bsd-3-clause"
570
+ }
571
+ }
572
+ },
573
+ "el": {
574
+ "cv": {
575
+ "vits": {
576
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--el--cv--vits.zip",
577
+ "default_vocoder": null,
578
+ "commit": null,
579
+ "author": "@NeonGeckoCom",
580
+ "license": "bsd-3-clause"
581
+ }
582
+ }
583
+ },
584
+ "fi": {
585
+ "css10": {
586
+ "vits": {
587
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--fi--css10--vits.zip",
588
+ "default_vocoder": null,
589
+ "commit": null,
590
+ "author": "@NeonGeckoCom",
591
+ "license": "bsd-3-clause"
592
+ }
593
+ }
594
+ },
595
+ "hr": {
596
+ "cv": {
597
+ "vits": {
598
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--hr--cv--vits.zip",
599
+ "default_vocoder": null,
600
+ "commit": null,
601
+ "author": "@NeonGeckoCom",
602
+ "license": "bsd-3-clause"
603
+ }
604
+ }
605
+ },
606
+ "lt": {
607
+ "cv": {
608
+ "vits": {
609
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--lt--cv--vits.zip",
610
+ "default_vocoder": null,
611
+ "commit": null,
612
+ "author": "@NeonGeckoCom",
613
+ "license": "bsd-3-clause"
614
+ }
615
+ }
616
+ },
617
+ "lv": {
618
+ "cv": {
619
+ "vits": {
620
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--lv--cv--vits.zip",
621
+ "default_vocoder": null,
622
+ "commit": null,
623
+ "author": "@NeonGeckoCom",
624
+ "license": "bsd-3-clause"
625
+ }
626
+ }
627
+ },
628
+ "mt": {
629
+ "cv": {
630
+ "vits": {
631
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--mt--cv--vits.zip",
632
+ "default_vocoder": null,
633
+ "commit": null,
634
+ "author": "@NeonGeckoCom",
635
+ "license": "bsd-3-clause"
636
+ }
637
+ }
638
+ },
639
+ "pl": {
640
+ "mai_female": {
641
+ "vits": {
642
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--pl--mai_female--vits.zip",
643
+ "default_vocoder": null,
644
+ "commit": null,
645
+ "author": "@NeonGeckoCom",
646
+ "license": "bsd-3-clause"
647
+ }
648
+ }
649
+ },
650
+ "pt": {
651
+ "cv": {
652
+ "vits": {
653
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--pt--cv--vits.zip",
654
+ "default_vocoder": null,
655
+ "commit": null,
656
+ "author": "@NeonGeckoCom",
657
+ "license": "bsd-3-clause"
658
+ }
659
+ }
660
+ },
661
+ "ro": {
662
+ "cv": {
663
+ "vits": {
664
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--ro--cv--vits.zip",
665
+ "default_vocoder": null,
666
+ "commit": null,
667
+ "author": "@NeonGeckoCom",
668
+ "license": "bsd-3-clause"
669
+ }
670
+ }
671
+ },
672
+ "sk": {
673
+ "cv": {
674
+ "vits": {
675
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--sk--cv--vits.zip",
676
+ "default_vocoder": null,
677
+ "commit": null,
678
+ "author": "@NeonGeckoCom",
679
+ "license": "bsd-3-clause"
680
+ }
681
+ }
682
+ },
683
+ "sl": {
684
+ "cv": {
685
+ "vits": {
686
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--sl--cv--vits.zip",
687
+ "default_vocoder": null,
688
+ "commit": null,
689
+ "author": "@NeonGeckoCom",
690
+ "license": "bsd-3-clause"
691
+ }
692
+ }
693
+ },
694
+ "sv": {
695
+ "cv": {
696
+ "vits": {
697
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/tts_models--sv--cv--vits.zip",
698
+ "default_vocoder": null,
699
+ "commit": null,
700
+ "author": "@NeonGeckoCom",
701
+ "license": "bsd-3-clause"
702
+ }
703
+ }
704
+ },
705
+ "ca": {
706
+ "custom": {
707
+ "vits": {
708
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.10.1_models/tts_models--ca--custom--vits.zip",
709
+ "default_vocoder": null,
710
+ "commit": null,
711
+ "description": " It is trained from zero with 101460 utterances consisting of 257 speakers, approx 138 hours of speech. We used three datasets;\nFestcat and Google Catalan TTS (both TTS datasets) and also a part of Common Voice 8. It is trained with TTS v0.8.0.\nhttps://github.com/coqui-ai/TTS/discussions/930#discussioncomment-4466345",
712
+ "author": "@gullabi",
713
+ "license": "CC-BY-4.0"
714
+ }
715
+ }
716
+ },
717
+ "fa": {
718
+ "custom": {
719
+ "glow-tts": {
720
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.10.1_models/tts_models--fa--custom--glow-tts.zip",
721
+ "default_vocoder": null,
722
+ "commit": null,
723
+ "description": "persian-tts-female-glow_tts model for text to speech purposes. Single-speaker female voice Trained on persian-tts-dataset-famale. \nThis model has no compatible vocoder thus the output quality is not very good. \nDataset: https://www.kaggle.com/datasets/magnoliasis/persian-tts-dataset-famale.",
724
+ "author": "@karim23657",
725
+ "license": "CC-BY-4.0"
726
+ }
727
+ }
728
+ },
729
+ "bn": {
730
+ "custom": {
731
+ "vits-male": {
732
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.3_models/tts_models--bn--custom--vits_male.zip",
733
+ "default_vocoder": null,
734
+ "commit": null,
735
+ "description": "Single speaker Bangla male model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
736
+ "author": "@mobassir94",
737
+ "license": "Apache 2.0"
738
+ },
739
+ "vits-female": {
740
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.3_models/tts_models--bn--custom--vits_female.zip",
741
+ "default_vocoder": null,
742
+ "commit": null,
743
+ "description": "Single speaker Bangla female model. For more information -> https://github.com/mobassir94/comprehensive-bangla-tts",
744
+ "author": "@mobassir94",
745
+ "license": "Apache 2.0"
746
+ }
747
+ }
748
+ },
749
+ "be": {
750
+ "common-voice": {
751
+ "glow-tts":{
752
+ "description": "Belarusian GlowTTS model created by @alex73 (Github).",
753
+ "github_rls_url":"https://github.com/coqui-ai/TTS/releases/download/v0.16.6/tts_models--be--common-voice--glow-tts.zip",
754
+ "default_vocoder": "vocoder_models/be/common-voice/hifigan",
755
+ "commit": "c0aabb85",
756
+ "license": "CC-BY-SA 4.0",
757
+ "contact": "[email protected]"
758
+ }
759
+ }
760
+ }
761
+ },
762
+ "vocoder_models": {
763
+ "universal": {
764
+ "libri-tts": {
765
+ "wavegrad": {
766
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--universal--libri-tts--wavegrad.zip",
767
+ "commit": "ea976b0",
768
+ "author": "Eren Gölge @erogol",
769
+ "license": "MPL",
770
+ "contact": "[email protected]"
771
+ },
772
+ "fullband-melgan": {
773
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--universal--libri-tts--fullband-melgan.zip",
774
+ "commit": "4132240",
775
+ "author": "Eren Gölge @erogol",
776
+ "license": "MPL",
777
+ "contact": "[email protected]"
778
+ }
779
+ }
780
+ },
781
+ "en": {
782
+ "ek1": {
783
+ "wavegrad": {
784
+ "description": "EK1 en-rp wavegrad by NMStoker",
785
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ek1--wavegrad.zip",
786
+ "commit": "c802255",
787
+ "license": "apache 2.0"
788
+ }
789
+ },
790
+ "ljspeech": {
791
+ "multiband-melgan": {
792
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--multiband-melgan.zip",
793
+ "commit": "ea976b0",
794
+ "author": "Eren Gölge @erogol",
795
+ "license": "MPL",
796
+ "contact": "[email protected]"
797
+ },
798
+ "hifigan_v2": {
799
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
800
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--hifigan_v2.zip",
801
+ "commit": "bae2ad0f",
802
+ "author": "@erogol",
803
+ "license": "apache 2.0",
804
+ "contact": "[email protected]"
805
+ },
806
+ "univnet": {
807
+ "description": "UnivNet model finetuned on TacotronDDC_ph spectrograms for better compatibility.",
808
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--ljspeech--univnet_v2.zip",
809
+ "commit": "4581e3d",
810
+ "author": "Eren @erogol",
811
+ "license": "apache 2.0",
812
+ "contact": "[email protected]"
813
+ }
814
+ },
815
+ "blizzard2013": {
816
+ "hifigan_v2": {
817
+ "description": "HiFiGAN_v2 LJSpeech vocoder from https://arxiv.org/abs/2010.05646.",
818
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.7.0_models/vocoder_models--en--blizzard2013--hifigan_v2.zip",
819
+ "commit": "d6284e7",
820
+ "author": "Adam Froghyar @a-froghyar",
821
+ "license": "apache 2.0",
822
+ "contact": "[email protected]"
823
+ }
824
+ },
825
+ "vctk": {
826
+ "hifigan_v2": {
827
+ "description": "Finetuned and intended to be used with tts_models/en/vctk/sc-glow-tts",
828
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--vctk--hifigan_v2.zip",
829
+ "commit": "2f07160",
830
+ "author": "Edresson Casanova",
831
+ "license": "apache 2.0",
832
+ "contact": ""
833
+ }
834
+ },
835
+ "sam": {
836
+ "hifigan_v2": {
837
+ "description": "Finetuned and intended to be used with tts_models/en/sam/tacotron_DDC",
838
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--en--sam--hifigan_v2.zip",
839
+ "commit": "2f07160",
840
+ "author": "Eren Gölge @erogol",
841
+ "license": "apache 2.0",
842
+ "contact": "[email protected]"
843
+ }
844
+ }
845
+ },
846
+ "nl": {
847
+ "mai": {
848
+ "parallel-wavegan": {
849
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--nl--mai--parallel-wavegan.zip",
850
+ "author": "@r-dh",
851
+ "license": "apache 2.0",
852
+ "commit": "unknown"
853
+ }
854
+ }
855
+ },
856
+ "de": {
857
+ "thorsten": {
858
+ "wavegrad": {
859
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--de--thorsten--wavegrad.zip",
860
+ "author": "@thorstenMueller",
861
+ "license": "apache 2.0",
862
+ "commit": "unknown"
863
+ },
864
+ "fullband-melgan": {
865
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--de--thorsten--fullband-melgan.zip",
866
+ "author": "@thorstenMueller",
867
+ "license": "apache 2.0",
868
+ "commit": "unknown"
869
+ },
870
+ "hifigan_v1": {
871
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.8.0_models/vocoder_models--de--thorsten--hifigan_v1.zip",
872
+ "description": "HifiGAN vocoder model for Thorsten Neutral Dec2021 22k Samplerate Tacotron2 DDC model",
873
+ "author": "@thorstenMueller",
874
+ "license": "apache 2.0",
875
+ "commit": "unknown"
876
+ }
877
+ }
878
+ },
879
+ "ja": {
880
+ "kokoro": {
881
+ "hifigan_v1": {
882
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--ja--kokoro--hifigan_v1.zip",
883
+ "description": "HifiGAN model trained for kokoro dataset by @kaiidams",
884
+ "author": "@kaiidams",
885
+ "license": "apache 2.0",
886
+ "commit": "3900448"
887
+ }
888
+ }
889
+ },
890
+ "uk": {
891
+ "mai": {
892
+ "multiband-melgan": {
893
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--uk--mai--multiband-melgan.zip",
894
+ "author": "@robinhad",
895
+ "commit": "bdab788d",
896
+ "license": "MIT",
897
+ "contact": ""
898
+ }
899
+ }
900
+ },
901
+ "tr": {
902
+ "common-voice": {
903
+ "hifigan": {
904
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.6.1_models/vocoder_models--tr--common-voice--hifigan.zip",
905
+ "description": "HifiGAN model using an unknown speaker from the Common-Voice dataset.",
906
+ "author": "Fatih Akademi",
907
+ "license": "MIT",
908
+ "commit": null
909
+ }
910
+ }
911
+ },
912
+ "be": {
913
+ "common-voice": {
914
+ "hifigan": {
915
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.16.6/vocoder_models--be--common-voice--hifigan.zip",
916
+ "description": "Belarusian HiFiGAN model created by @alex73 (Github).",
917
+ "author": "@alex73",
918
+ "license": "CC-BY-SA 4.0",
919
+ "commit": "c0aabb85"
920
+ }
921
+ }
922
+ }
923
+ },
924
+ "voice_conversion_models": {
925
+ "multilingual": {
926
+ "vctk": {
927
+ "freevc24": {
928
+ "github_rls_url": "https://github.com/coqui-ai/TTS/releases/download/v0.13.0_models/voice_conversion_models--multilingual--vctk--freevc24.zip",
929
+ "description": "FreeVC model trained on VCTK dataset from https://github.com/OlaWod/FreeVC",
930
+ "author": "Jing-Yi Li @OlaWod",
931
+ "license": "MIT",
932
+ "commit": null
933
+ }
934
+ },
935
+ "multi-dataset": {
936
+ "openvoice_v1": {
937
+ "hf_url": [
938
+ "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/config.json",
939
+ "https://huggingface.co/myshell-ai/OpenVoice/resolve/main/checkpoints/converter/checkpoint.pth"
940
+ ],
941
+ "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2",
942
+ "author": "MyShell.ai",
943
+ "license": "MIT",
944
+ "commit": null
945
+ },
946
+ "openvoice_v2": {
947
+ "hf_url": [
948
+ "https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/config.json",
949
+ "https://huggingface.co/myshell-ai/OpenVoiceV2/resolve/main/converter/checkpoint.pth"
950
+ ],
951
+ "description": "OpenVoice VC model from https://huggingface.co/myshell-ai/OpenVoiceV2",
952
+ "author": "MyShell.ai",
953
+ "license": "MIT",
954
+ "commit": null
955
+ }
956
+ }
957
+ }
958
+ }
959
+ }
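
For reference, .models.json is the model registry that TTS/api.py reads through ModelManager: entries nest as type / language / dataset / model, and the joined path is the model name passed to TTS(model_name=...). A minimal sketch of walking the registry directly, assuming it is read from the repo root rather than through ModelManager.list_models():

import json
from pathlib import Path

# Walk the nested registry: <type> -> <language> -> <dataset> -> <model>.
models_file = Path("TTS/.models.json")  # assumed to be run from the repo root
registry = json.loads(models_file.read_text(encoding="utf-8"))

for model_type, languages in registry.items():
    for language, datasets in languages.items():
        for dataset, models in datasets.items():
            for model_name in models:
                # e.g. "tts_models/multilingual/multi-dataset/xtts_v2"
                print(f"{model_type}/{language}/{dataset}/{model_name}")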
TTS/__init__.py ADDED
@@ -0,0 +1,33 @@
+ #import importlib.metadata
+
+ from TTS.utils.generic_utils import is_pytorch_at_least_2_4
+
+ #__version__ = importlib.metadata.version("coqui-tts")
+
+
+ if is_pytorch_at_least_2_4():
+     import _codecs
+     from collections import defaultdict
+
+     import numpy as np
+     import torch
+
+     from TTS.config.shared_configs import BaseDatasetConfig
+     from TTS.tts.configs.xtts_config import XttsConfig
+     from TTS.tts.models.xtts import XttsArgs, XttsAudioConfig
+     from TTS.utils.radam import RAdam
+
+     torch.serialization.add_safe_globals([dict, defaultdict, RAdam])
+
+     # Bark
+     torch.serialization.add_safe_globals(
+         [
+             np.core.multiarray.scalar,
+             np.dtype,
+             np.dtypes.Float64DType,
+             _codecs.encode,  # TODO: safe by default from Pytorch 2.5
+         ]
+     )
+
+     # XTTS
+     torch.serialization.add_safe_globals([BaseDatasetConfig, XttsConfig, XttsAudioConfig, XttsArgs])
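
The add_safe_globals() registrations above exist so that checkpoints containing these classes can be deserialized by PyTorch's restricted unpickler (torch.load with weights_only=True); add_safe_globals() is available from PyTorch 2.4, which is why the block is gated on is_pytorch_at_least_2_4(). A rough usage sketch, assuming PyTorch >= 2.4; the checkpoint filename is hypothetical:

import torch

import TTS  # noqa: F401 - importing the package runs the add_safe_globals() registrations above

# With the classes registered, a restricted (weights_only) load of a checkpoint
# that pickles e.g. XttsConfig or RAdam no longer raises an UnpicklingError.
checkpoint = torch.load("model.pth", map_location="cpu", weights_only=True)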
TTS/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (784 Bytes).
 
TTS/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (1.28 kB).
 
TTS/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (882 Bytes).
 
TTS/__pycache__/model.cpython-310.pyc ADDED
Binary file (2.75 kB).
 
TTS/__pycache__/model.cpython-311.pyc ADDED
Binary file (3.31 kB).
 
TTS/__pycache__/model.cpython-39.pyc ADDED
Binary file (2.75 kB).
 
TTS/api.py ADDED
@@ -0,0 +1,499 @@
1
+ """Coqui TTS Python API."""
2
+
3
+ import logging
4
+ import tempfile
5
+ import warnings
6
+ from pathlib import Path
7
+ from typing import Optional
8
+
9
+ from torch import nn
10
+
11
+ from TTS.config import load_config
12
+ from TTS.utils.manage import ModelManager
13
+ from TTS.utils.synthesizer import Synthesizer
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+
18
+ class TTS(nn.Module):
19
+ """TODO: Add voice conversion and Capacitron support."""
20
+
21
+ def __init__(
22
+ self,
23
+ model_name: str = "",
24
+ *,
25
+ model_path: Optional[str] = None,
26
+ config_path: Optional[str] = None,
27
+ vocoder_name: Optional[str] = None,
28
+ vocoder_path: Optional[str] = None,
29
+ vocoder_config_path: Optional[str] = None,
30
+ encoder_path: Optional[str] = None,
31
+ encoder_config_path: Optional[str] = None,
32
+ speakers_file_path: Optional[str] = None,
33
+ language_ids_file_path: Optional[str] = None,
34
+ progress_bar: bool = True,
35
+ gpu: bool = False,
36
+ ) -> None:
37
+ """🐸TTS python interface that allows to load and use the released models.
38
+
39
+ Example with a multi-speaker model:
40
+ >>> from TTS.api import TTS
41
+ >>> tts = TTS(TTS.list_models()[0])
42
+ >>> wav = tts.tts("This is a test! This is also a test!!", speaker=tts.speakers[0], language=tts.languages[0])
43
+ >>> tts.tts_to_file(text="Hello world!", speaker=tts.speakers[0], language=tts.languages[0], file_path="output.wav")
44
+
45
+ Example with a single-speaker model:
46
+ >>> tts = TTS(model_name="tts_models/de/thorsten/tacotron2-DDC", progress_bar=False)
47
+ >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
48
+
49
+ Example loading a model from a path:
50
+ >>> tts = TTS(model_path="/path/to/checkpoint_100000.pth", config_path="/path/to/config.json", progress_bar=False)
51
+ >>> tts.tts_to_file(text="Ich bin eine Testnachricht.", file_path="output.wav")
52
+
53
+ Example voice cloning with YourTTS in English, French and Portuguese:
54
+ >>> tts = TTS(model_name="tts_models/multilingual/multi-dataset/your_tts", progress_bar=False).to("cuda")
55
+ >>> tts.tts_to_file("This is voice cloning.", speaker_wav="my/cloning/audio.wav", language="en", file_path="thisisit.wav")
56
+ >>> tts.tts_to_file("C'est le clonage de la voix.", speaker_wav="my/cloning/audio.wav", language="fr", file_path="thisisit.wav")
57
+ >>> tts.tts_to_file("Isso é clonagem de voz.", speaker_wav="my/cloning/audio.wav", language="pt", file_path="thisisit.wav")
58
+
59
+ Example Fairseq TTS models (uses ISO language codes in https://dl.fbaipublicfiles.com/mms/tts/all-tts-languages.html):
60
+ >>> tts = TTS(model_name="tts_models/eng/fairseq/vits", progress_bar=False).to("cuda")
61
+ >>> tts.tts_to_file("This is a test.", file_path="output.wav")
62
+
63
+ Args:
64
+ model_name (str, optional): Model name to load. You can list models by ```tts.models```. Defaults to None.
65
+ model_path (str, optional): Path to the model checkpoint. Defaults to None.
66
+ config_path (str, optional): Path to the model config. Defaults to None.
67
+ vocoder_name (str, optional): Pre-trained vocoder to use. Defaults to None, i.e. using the default vocoder.
68
+ vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
69
+ vocoder_config_path (str, optional): Path to the vocoder config. Defaults to None.
70
+ encoder_path: Path to speaker encoder checkpoint. Default to None.
71
+ encoder_config_path: Path to speaker encoder config file. Defaults to None.
72
+ speakers_file_path: JSON file for multi-speaker model. Defaults to None.
73
+ language_ids_file_path: JSON file for multilingual model. Defaults to None
74
+ progress_bar (bool, optional): Whether to print a progress bar while downloading a model. Defaults to True.
75
+ gpu (bool, optional): Enable/disable GPU. Defaults to False. DEPRECATED, use TTS(...).to("cuda")
76
+ """
77
+ super().__init__()
78
+ self.manager = ModelManager(models_file=self.get_models_file_path(), progress_bar=progress_bar)
79
+ self.config = load_config(config_path) if config_path else None
80
+ self.synthesizer = None
81
+ self.voice_converter = None
82
+ self.model_name = ""
83
+
84
+ self.vocoder_path = vocoder_path
85
+ self.vocoder_config_path = vocoder_config_path
86
+ self.encoder_path = encoder_path
87
+ self.encoder_config_path = encoder_config_path
88
+ self.speakers_file_path = speakers_file_path
89
+ self.language_ids_file_path = language_ids_file_path
90
+
91
+ if gpu:
92
+ warnings.warn("`gpu` will be deprecated. Please use `tts.to(device)` instead.")
93
+
94
+ if model_name is not None and len(model_name) > 0:
95
+ if "tts_models" in model_name:
96
+ self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu)
97
+ elif "voice_conversion_models" in model_name:
98
+ self.load_vc_model_by_name(model_name, gpu=gpu)
99
+ # To allow just TTS("xtts")
100
+ else:
101
+ self.load_model_by_name(model_name, vocoder_name, gpu=gpu)
102
+
103
+ if model_path:
104
+ self.load_tts_model_by_path(model_path, config_path, gpu=gpu)
105
+
106
+ @property
107
+ def models(self) -> list[str]:
108
+ return self.manager.list_tts_models()
109
+
110
+ @property
111
+ def is_multi_speaker(self) -> bool:
112
+ if (
113
+ self.synthesizer is not None
114
+ and hasattr(self.synthesizer.tts_model, "speaker_manager")
115
+ and self.synthesizer.tts_model.speaker_manager
116
+ ):
117
+ return self.synthesizer.tts_model.speaker_manager.num_speakers > 1
118
+ return False
119
+
120
+ @property
121
+ def is_multi_lingual(self) -> bool:
122
+ # Not sure what sets this to None, but applied a fix to prevent crashing.
123
+ if (
124
+ isinstance(self.model_name, str)
125
+ and "xtts" in self.model_name
126
+ or self.config
127
+ and ("xtts" in self.config.model or "languages" in self.config and len(self.config.languages) > 1)
128
+ ):
129
+ return True
130
+ if (
131
+ self.synthesizer is not None
132
+ and hasattr(self.synthesizer.tts_model, "language_manager")
133
+ and self.synthesizer.tts_model.language_manager
134
+ ):
135
+ return self.synthesizer.tts_model.language_manager.num_languages > 1
136
+ return False
137
+
138
+ @property
139
+ def speakers(self) -> list[str]:
140
+ if not self.is_multi_speaker:
141
+ return None
142
+ return self.synthesizer.tts_model.speaker_manager.speaker_names
143
+
144
+ @property
145
+ def languages(self) -> list[str]:
146
+ if not self.is_multi_lingual:
147
+ return None
148
+ return self.synthesizer.tts_model.language_manager.language_names
149
+
150
+ @staticmethod
151
+ def get_models_file_path() -> Path:
152
+ return Path(__file__).parent / ".models.json"
153
+
154
+ @staticmethod
155
+ def list_models() -> list[str]:
156
+ return ModelManager(models_file=TTS.get_models_file_path(), progress_bar=False).list_models()
157
+
158
+ def download_model_by_name(
159
+ self, model_name: str, vocoder_name: Optional[str] = None
160
+ ) -> tuple[Optional[Path], Optional[Path], Optional[Path]]:
161
+ model_path, config_path, model_item = self.manager.download_model(model_name)
162
+ if "fairseq" in model_name or (model_item is not None and isinstance(model_item["model_url"], list)):
163
+ # return model directory if there are multiple files
164
+ # we assume that the model knows how to load itself
165
+ return None, None, model_path
166
+ if model_item.get("default_vocoder") is None:
167
+ return model_path, config_path, None
168
+ if vocoder_name is None:
169
+ vocoder_name = model_item["default_vocoder"]
170
+ vocoder_path, vocoder_config_path, _ = self.manager.download_model(vocoder_name)
171
+ # A local vocoder model will take precedence if specified via vocoder_path
172
+ if self.vocoder_path is None or self.vocoder_config_path is None:
173
+ self.vocoder_path = vocoder_path
174
+ self.vocoder_config_path = vocoder_config_path
175
+ return model_path, config_path, None
176
+
177
+ def load_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None:
178
+ """Load one of the 🐸TTS models by name.
179
+
180
+ Args:
181
+ model_name (str): Model name to load. You can list models by ```tts.models```.
182
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
183
+ """
184
+ self.load_tts_model_by_name(model_name, vocoder_name, gpu=gpu)
185
+
186
+ def load_vc_model_by_name(self, model_name: str, *, gpu: bool = False) -> None:
187
+ """Load one of the voice conversion models by name.
188
+
189
+ Args:
190
+ model_name (str): Model name to load. You can list models by ```tts.models```.
191
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
192
+ """
193
+ self.model_name = model_name
194
+ model_path, config_path, model_dir = self.download_model_by_name(model_name)
195
+ self.voice_converter = Synthesizer(
196
+ vc_checkpoint=model_path, vc_config=config_path, model_dir=model_dir, use_cuda=gpu
197
+ )
198
+
199
+ def load_tts_model_by_name(self, model_name: str, vocoder_name: Optional[str] = None, *, gpu: bool = False) -> None:
200
+ """Load one of 🐸TTS models by name.
201
+
202
+ Args:
203
+ model_name (str): Model name to load. You can list models by ```tts.models```.
204
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
205
+
206
+ TODO: Add tests
207
+ """
208
+ self.synthesizer = None
209
+ self.model_name = model_name
210
+
211
+ model_path, config_path, model_dir = self.download_model_by_name(model_name, vocoder_name)
212
+
213
+ # init synthesizer
214
+ # None values are fetch from the model
215
+ self.synthesizer = Synthesizer(
216
+ tts_checkpoint=model_path,
217
+ tts_config_path=config_path,
218
+ tts_speakers_file=None,
219
+ tts_languages_file=None,
220
+ vocoder_checkpoint=self.vocoder_path,
221
+ vocoder_config=self.vocoder_config_path,
222
+ encoder_checkpoint=self.encoder_path,
223
+ encoder_config=self.encoder_config_path,
224
+ model_dir=model_dir,
225
+ use_cuda=gpu,
226
+ )
227
+
228
+ def load_tts_model_by_path(self, model_path: str, config_path: str, *, gpu: bool = False) -> None:
229
+ """Load a model from a path.
230
+
231
+ Args:
232
+ model_path (str): Path to the model checkpoint.
233
+ config_path (str): Path to the model config.
234
+ vocoder_path (str, optional): Path to the vocoder checkpoint. Defaults to None.
235
+ vocoder_config (str, optional): Path to the vocoder config. Defaults to None.
236
+ gpu (bool, optional): Enable/disable GPU. Some models might be too slow on CPU. Defaults to False.
237
+ """
238
+
239
+ self.synthesizer = Synthesizer(
240
+ tts_checkpoint=model_path,
241
+ tts_config_path=config_path,
242
+ tts_speakers_file=self.speakers_file_path,
243
+ tts_languages_file=self.language_ids_file_path,
244
+ vocoder_checkpoint=self.vocoder_path,
245
+ vocoder_config=self.vocoder_config_path,
246
+ encoder_checkpoint=self.encoder_path,
247
+ encoder_config=self.encoder_config_path,
248
+ use_cuda=gpu,
249
+ )
250
+
251
+ def _check_arguments(
252
+ self,
253
+ speaker: Optional[str] = None,
254
+ language: Optional[str] = None,
255
+ speaker_wav: Optional[str] = None,
256
+ emotion: Optional[str] = None,
257
+ speed: Optional[float] = None,
258
+ **kwargs,
259
+ ) -> None:
260
+ """Check if the arguments are valid for the model."""
261
+ # check for the coqui tts models
262
+ if self.is_multi_speaker and (speaker is None and speaker_wav is None):
263
+ raise ValueError("Model is multi-speaker but no `speaker` is provided.")
264
+ if self.is_multi_lingual and language is None:
265
+ raise ValueError("Model is multi-lingual but no `language` is provided.")
266
+ if not self.is_multi_speaker and speaker is not None and "voice_dir" not in kwargs:
267
+ raise ValueError("Model is not multi-speaker but `speaker` is provided.")
268
+ if not self.is_multi_lingual and language is not None:
269
+ raise ValueError("Model is not multi-lingual but `language` is provided.")
270
+ if emotion is not None and speed is not None:
271
+ raise ValueError("Emotion and speed can only be used with Coqui Studio models. Which is discontinued.")
272
+
273
+ def tts(
274
+ self,
275
+ text: str,
276
+ speaker: str = None,
277
+ language: str = None,
278
+ speaker_wav: str = None,
279
+ emotion: str = None,
280
+ speed: float = None,
281
+ split_sentences: bool = True,
282
+ **kwargs,
283
+ ):
284
+ """Convert text to speech.
285
+
286
+ Args:
287
+ text (str):
288
+ Input text to synthesize.
289
+ speaker (str, optional):
290
+ Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
291
+ `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
292
+ language (str): Language of the text. If None, the default language of the speaker is used. Language is only
293
+ supported by `XTTS` model.
294
+ speaker_wav (str, optional):
295
+ Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
296
+ Defaults to None.
297
+ emotion (str, optional):
298
+ Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
299
+ speed (float, optional):
300
+ Speed factor to use for 🐸Coqui Studio models, between 0 and 2.0. If None, Studio models use 1.0.
301
+ Defaults to None.
302
+ split_sentences (bool, optional):
303
+ Split text into sentences, synthesize them separately and concatenate the file audio.
304
+ Setting it False uses more VRAM and may hit model-specific text length or VRAM limits. Only
305
+ applicable to the 🐸TTS models. Defaults to True.
306
+ kwargs (dict, optional):
307
+ Additional arguments for the model.
308
+ """
309
+ self._check_arguments(
310
+ speaker=speaker, language=language, speaker_wav=speaker_wav, emotion=emotion, speed=speed, **kwargs
311
+ )
312
+ wav = self.synthesizer.tts(
313
+ text=text,
314
+ speaker_name=speaker,
315
+ language_name=language,
316
+ speaker_wav=speaker_wav,
317
+ split_sentences=split_sentences,
318
+ **kwargs,
319
+ )
320
+ return wav
321
+
322
+ def tts_to_file(
323
+ self,
324
+ text: str,
325
+ speaker: str = None,
326
+ language: str = None,
327
+ speaker_wav: str = None,
328
+ emotion: str = None,
329
+ speed: float = 1.0,
330
+ pipe_out=None,
331
+ file_path: str = "output.wav",
332
+ split_sentences: bool = True,
333
+ **kwargs,
334
+ ) -> str:
335
+ """Convert text to speech.
336
+
337
+ Args:
338
+ text (str):
339
+ Input text to synthesize.
340
+ speaker (str, optional):
341
+ Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
342
+ `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
343
+ language (str, optional):
344
+ Language code for multi-lingual models. You can check whether loaded model is multi-lingual
345
+ `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
346
+ speaker_wav (str, optional):
347
+ Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
348
+ Defaults to None.
349
+ emotion (str, optional):
350
+ Emotion to use for 🐸Coqui Studio models. If None, Studio models use "Neutral". Defaults to None.
351
+ speed (float, optional):
352
+ Speed factor to use for 🐸Coqui Studio models, between 0.0 and 2.0. Defaults to 1.0.
353
+ pipe_out (BytesIO, optional):
354
+ Additional output stream the generated wav is also written to, e.g. for shell piping. Defaults to None.
355
+ file_path (str, optional):
356
+ Output file path. Defaults to "output.wav".
357
+ split_sentences (bool, optional):
358
+ Split text into sentences, synthesize them separately and concatenate the file audio.
359
+ Setting it False uses more VRAM and may hit model-specific text length or VRAM limits. Only
360
+ applicable to the 🐸TTS models. Defaults to True.
361
+ kwargs (dict, optional):
362
+ Additional arguments for the model.
363
+ """
364
+ self._check_arguments(speaker=speaker, language=language, speaker_wav=speaker_wav, **kwargs)
365
+
366
+ wav = self.tts(
367
+ text=text,
368
+ speaker=speaker,
369
+ language=language,
370
+ speaker_wav=speaker_wav,
371
+ split_sentences=split_sentences,
372
+ **kwargs,
373
+ )
374
+ self.synthesizer.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
375
+ return file_path
376
+
377
+ def voice_conversion(
378
+ self,
379
+ source_wav: str,
380
+ target_wav: str,
381
+ ):
382
+ """Voice conversion with FreeVC. Convert source wav to target speaker.
383
+
384
+ Args:
385
+ source_wav (str):
386
+ Path to the source wav file.
387
+ target_wav (str):
388
+ Path to the target wav file.
389
+ """
390
+ if self.voice_converter is None:
391
+ msg = "The selected model does not support voice conversion."
392
+ raise RuntimeError(msg)
393
+ return self.voice_converter.voice_conversion(source_wav=source_wav, target_wav=target_wav)
394
+
395
+ def voice_conversion_to_file(
396
+ self,
397
+ source_wav: str,
398
+ target_wav: str,
399
+ file_path: str = "output.wav",
400
+ pipe_out=None,
401
+ ) -> str:
402
+ """Voice conversion with FreeVC. Convert source wav to target speaker.
403
+
404
+ Args:
405
+ source_wav (str):
406
+ Path to the source wav file.
407
+ target_wav (str):
408
+ Path to the target wav file.
409
+ file_path (str, optional):
410
+ Output file path. Defaults to "output.wav".
411
+ pipe_out (BytesIO, optional):
412
+ Flag to stdout the generated TTS wav file for shell pipe.
413
+ """
414
+ wav = self.voice_conversion(source_wav=source_wav, target_wav=target_wav)
415
+ self.voice_converter.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
416
+ return file_path
417
+
418
+ def tts_with_vc(
419
+ self,
420
+ text: str,
421
+ language: str = None,
422
+ speaker_wav: str = None,
423
+ speaker: str = None,
424
+ split_sentences: bool = True,
425
+ ):
426
+ """Convert text to speech with voice conversion.
427
+
428
+ It combines TTS with voice conversion to approximate voice cloning.
429
+
430
+ - Convert text to speech with tts.
431
+ - Convert the output wav to target speaker with voice conversion.
432
+
433
+ Args:
434
+ text (str):
435
+ Input text to synthesize.
436
+ language (str, optional):
437
+ Language code for multi-lingual models. You can check whether loaded model is multi-lingual
438
+ `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
439
+ speaker_wav (str, optional):
440
+ Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
441
+ Defaults to None.
442
+ speaker (str, optional):
443
+ Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
444
+ `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
445
+ split_sentences (bool, optional):
446
+ Split text into sentences, synthesize them separately and concatenate the file audio.
447
+ Setting it False uses more VRAM and may hit model-specific text length or VRAM limits. Only
448
+ applicable to the 🐸TTS models. Defaults to True.
449
+ """
450
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
451
+ # Lazy code... save it to a temp file to resample it while reading it for VC
452
+ self.tts_to_file(
453
+ text=text, speaker=speaker, language=language, file_path=fp.name, split_sentences=split_sentences
454
+ )
455
+ if self.voice_converter is None:
456
+ self.load_vc_model_by_name("voice_conversion_models/multilingual/vctk/freevc24")
457
+ wav = self.voice_converter.voice_conversion(source_wav=fp.name, target_wav=speaker_wav)
458
+ return wav
459
+
460
+ def tts_with_vc_to_file(
461
+ self,
462
+ text: str,
463
+ language: str = None,
464
+ speaker_wav: str = None,
465
+ file_path: str = "output.wav",
466
+ speaker: str = None,
467
+ split_sentences: bool = True,
468
+ pipe_out=None,
469
+ ) -> str:
470
+ """Convert text to speech with voice conversion and save to file.
471
+
472
+ Check `tts_with_vc` for more details.
473
+
474
+ Args:
475
+ text (str):
476
+ Input text to synthesize.
477
+ language (str, optional):
478
+ Language code for multi-lingual models. You can check whether loaded model is multi-lingual
479
+ `tts.is_multi_lingual` and list available languages by `tts.languages`. Defaults to None.
480
+ speaker_wav (str, optional):
481
+ Path to a reference wav file to use for voice cloning with supporting models like YourTTS.
482
+ Defaults to None.
483
+ file_path (str, optional):
484
+ Output file path. Defaults to "output.wav".
485
+ speaker (str, optional):
486
+ Speaker name for multi-speaker. You can check whether loaded model is multi-speaker by
487
+ `tts.is_multi_speaker` and list speakers by `tts.speakers`. Defaults to None.
488
+ split_sentences (bool, optional):
489
+ Split text into sentences, synthesize them separately and concatenate the file audio.
490
+ Setting it False uses more VRAM and may hit model-specific text length or VRAM limits. Only
491
+ applicable to the 🐸TTS models. Defaults to True.
492
+ pipe_out (BytesIO, optional):
493
+ Flag to stdout the generated TTS wav file for shell pipe.
494
+ """
495
+ wav = self.tts_with_vc(
496
+ text=text, language=language, speaker_wav=speaker_wav, speaker=speaker, split_sentences=split_sentences
497
+ )
498
+ self.voice_converter.save_wav(wav=wav, path=file_path, pipe_out=pipe_out)
499
+ return file_path
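
A minimal usage sketch of the API methods above. The model ids and file paths are illustrative placeholders, and the `TTS` constructor arguments follow the upstream Coqui API (the constructor itself is not part of this hunk):

```python
# Sketch only: model names and wav paths below are placeholders.
from TTS.api import TTS

# Multi-lingual, multi-speaker model with voice cloning support.
tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2")
tts.tts_to_file(
    text="Hello world.",
    speaker_wav="reference_speaker.wav",  # reference clip for voice cloning
    language="en",
    file_path="speech.wav",
)

# Stand-alone voice conversion with FreeVC.
vc = TTS("voice_conversion_models/multilingual/vctk/freevc24")
vc.voice_conversion_to_file(
    source_wav="source.wav",
    target_wav="target_speaker.wav",
    file_path="converted.wav",
)
```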
TTS/bin/__init__.py ADDED
File without changes
TTS/bin/collect_env_info.py ADDED
@@ -0,0 +1,49 @@
1
+ """Get detailed info about the working environment."""
2
+
3
+ import json
4
+ import os
5
+ import platform
6
+ import sys
7
+
8
+ import numpy
9
+ import torch
10
+
11
+ import TTS
12
+
13
+ sys.path += [os.path.abspath(".."), os.path.abspath(".")]
14
+
15
+
16
+ def system_info():
17
+ return {
18
+ "OS": platform.system(),
19
+ "architecture": platform.architecture(),
20
+ "version": platform.version(),
21
+ "processor": platform.processor(),
22
+ "python": platform.python_version(),
23
+ }
24
+
25
+
26
+ def cuda_info():
27
+ return {
28
+ "GPU": [torch.cuda.get_device_name(i) for i in range(torch.cuda.device_count())],
29
+ "available": torch.cuda.is_available(),
30
+ "version": torch.version.cuda,
31
+ }
32
+
33
+
34
+ def package_info():
35
+ return {
36
+ "numpy": numpy.__version__,
37
+ "PyTorch_version": torch.__version__,
38
+ "PyTorch_debug": torch.version.debug,
39
+ "TTS": TTS.__version__,
40
+ }
41
+
42
+
43
+ def main():
44
+ details = {"System": system_info(), "CUDA": cuda_info(), "Packages": package_info()}
45
+ print(json.dumps(details, indent=4, sort_keys=True))
46
+
47
+
48
+ if __name__ == "__main__":
49
+ main()
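
The helpers above can also be imported and used piecemeal; a small sketch (the dict keys match the functions defined above):

```python
# Sketch: reuse the environment helpers above without the CLI.
from TTS.bin.collect_env_info import cuda_info, package_info, system_info

print(system_info()["python"])     # Python version string
print(cuda_info()["available"])    # whether torch sees a CUDA device
print(package_info()["TTS"])       # installed TTS version
```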
TTS/bin/compute_attention_masks.py ADDED
@@ -0,0 +1,170 @@
1
+ import argparse
2
+ import importlib
3
+ import logging
4
+ import os
5
+ import sys
6
+ from argparse import RawTextHelpFormatter
7
+
8
+ import numpy as np
9
+ import torch
10
+ from torch.utils.data import DataLoader
11
+ from tqdm import tqdm
12
+ from trainer.io import load_checkpoint
13
+
14
+ from TTS.config import load_config
15
+ from TTS.tts.datasets.TTSDataset import TTSDataset
16
+ from TTS.tts.models import setup_model
17
+ from TTS.tts.utils.text.characters import make_symbols, phonemes, symbols
18
+ from TTS.utils.audio import AudioProcessor
19
+ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
20
+
21
+ if __name__ == "__main__":
22
+ setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
23
+
24
+ # pylint: disable=bad-option-value
25
+ parser = argparse.ArgumentParser(
26
+ description="""Extract attention masks from trained Tacotron/Tacotron2 models.
27
+ These masks can be used for different purposes including training a TTS model with a Duration Predictor.\n\n"""
28
+ """Each attention mask is written to the same path as the input wav file with ".npy" file extension.
29
+ (e.g. path/bla.wav (wav file) --> path/bla.npy (attention mask))\n"""
30
+ """
31
+ Example run:
32
+ CUDA_VISIBLE_DEVICE="0" python TTS/bin/compute_attention_masks.py
33
+ --model_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/checkpoint_200000.pth
34
+ --config_path /data/rw/home/Models/ljspeech-dcattn-December-14-2020_11+10AM-9d0e8c7/config.json
35
+ --dataset_metafile metadata.csv
36
+ --data_path /root/LJSpeech-1.1/
37
+ --batch_size 32
38
+ --dataset ljspeech
39
+ --use_cuda
40
+ """,
41
+ formatter_class=RawTextHelpFormatter,
42
+ )
43
+ parser.add_argument("--model_path", type=str, required=True, help="Path to Tacotron/Tacotron2 model file ")
44
+ parser.add_argument(
45
+ "--config_path",
46
+ type=str,
47
+ required=True,
48
+ help="Path to Tacotron/Tacotron2 config file.",
49
+ )
50
+ parser.add_argument(
51
+ "--dataset",
52
+ type=str,
53
+ default="",
54
+ required=True,
55
+ help="Target dataset processor name from TTS.tts.dataset.preprocess.",
56
+ )
57
+
58
+ parser.add_argument(
59
+ "--dataset_metafile",
60
+ type=str,
61
+ default="",
62
+ required=True,
63
+ help="Dataset metafile inclusing file paths with transcripts.",
64
+ )
65
+ parser.add_argument("--data_path", type=str, default="", help="Defines the data path. It overwrites config.json.")
66
+ parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, default=False, help="enable/disable cuda.")
67
+
68
+ parser.add_argument(
69
+ "--batch_size", default=16, type=int, help="Batch size for the model. Use batch_size=1 if you have no CUDA."
70
+ )
71
+ args = parser.parse_args()
72
+
73
+ C = load_config(args.config_path)
74
+ ap = AudioProcessor(**C.audio)
75
+
76
+ # if the vocabulary was passed, replace the default
77
+ if "characters" in C.keys():
78
+ symbols, phonemes = make_symbols(**C.characters) # noqa: F811
79
+
80
+ # load the model
81
+ num_chars = len(phonemes) if C.use_phonemes else len(symbols)
82
+ # TODO: handle multi-speaker
83
+ model = setup_model(C)
84
+ model, _ = load_checkpoint(model, args.model_path, use_cuda=args.use_cuda, eval=True)
85
+
86
+ # data loader
87
+ preprocessor = importlib.import_module("TTS.tts.datasets.formatters")
88
+ preprocessor = getattr(preprocessor, args.dataset)
89
+ meta_data = preprocessor(args.data_path, args.dataset_metafile)
90
+ dataset = TTSDataset(
91
+ model.decoder.r,
92
+ C.text_cleaner,
93
+ compute_linear_spec=False,
94
+ ap=ap,
95
+ meta_data=meta_data,
96
+ characters=C.characters if "characters" in C.keys() else None,
97
+ add_blank=C["add_blank"] if "add_blank" in C.keys() else False,
98
+ use_phonemes=C.use_phonemes,
99
+ phoneme_cache_path=C.phoneme_cache_path,
100
+ phoneme_language=C.phoneme_language,
101
+ enable_eos_bos=C.enable_eos_bos_chars,
102
+ )
103
+
104
+ dataset.sort_and_filter_items(C.get("sort_by_audio_len", default=False))
105
+ loader = DataLoader(
106
+ dataset,
107
+ batch_size=args.batch_size,
108
+ num_workers=4,
109
+ collate_fn=dataset.collate_fn,
110
+ shuffle=False,
111
+ drop_last=False,
112
+ )
113
+
114
+ # compute attentions
115
+ file_paths = []
116
+ with torch.inference_mode():
117
+ for data in tqdm(loader):
118
+ # setup input data
119
+ text_input = data[0]
120
+ text_lengths = data[1]
121
+ linear_input = data[3]
122
+ mel_input = data[4]
123
+ mel_lengths = data[5]
124
+ stop_targets = data[6]
125
+ item_idxs = data[7]
126
+
127
+ # dispatch data to GPU
128
+ if args.use_cuda:
129
+ text_input = text_input.cuda()
130
+ text_lengths = text_lengths.cuda()
131
+ mel_input = mel_input.cuda()
132
+ mel_lengths = mel_lengths.cuda()
133
+
134
+ model_outputs = model.forward(text_input, text_lengths, mel_input)
135
+
136
+ alignments = model_outputs["alignments"].detach()
137
+ for idx, alignment in enumerate(alignments):
138
+ item_idx = item_idxs[idx]
139
+ # interpolate if r > 1
140
+ alignment = (
141
+ torch.nn.functional.interpolate(
142
+ alignment.transpose(0, 1).unsqueeze(0),
143
+ size=None,
144
+ scale_factor=model.decoder.r,
145
+ mode="nearest",
146
+ align_corners=None,
147
+ recompute_scale_factor=None,
148
+ )
149
+ .squeeze(0)
150
+ .transpose(0, 1)
151
+ )
152
+ # remove paddings
153
+ alignment = alignment[: mel_lengths[idx], : text_lengths[idx]].cpu().numpy()
154
+ # set file paths
155
+ wav_file_name = os.path.basename(item_idx)
156
+ align_file_name = os.path.splitext(wav_file_name)[0] + "_attn.npy"
157
+ file_path = item_idx.replace(wav_file_name, align_file_name)
158
+ # save output
159
+ wav_file_abs_path = os.path.abspath(item_idx)
160
+ file_abs_path = os.path.abspath(file_path)
161
+ file_paths.append([wav_file_abs_path, file_abs_path])
162
+ np.save(file_path, alignment)
163
+
164
+ # ourput metafile
165
+ metafile = os.path.join(args.data_path, "metadata_attn_mask.txt")
166
+
167
+ with open(metafile, "w", encoding="utf-8") as f:
168
+ for p in file_paths:
169
+ f.write(f"{p[0]}|{p[1]}\n")
170
+ print(f" >> Metafile created: {metafile}")
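
The script saves one `*_attn.npy` next to each wav and writes `metadata_attn_mask.txt` with `wav_path|attn_path` lines; a sketch of reading them back (paths are placeholders):

```python
# Sketch: load one attention mask listed in the metafile written above.
import numpy as np

with open("metadata_attn_mask.txt", encoding="utf-8") as f:
    wav_path, attn_path = f.readline().strip().split("|")

attn = np.load(attn_path)
print(wav_path, attn.shape)  # (mel_frames, text_length) after padding removal
```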
TTS/bin/compute_embeddings.py ADDED
@@ -0,0 +1,202 @@
1
+ import argparse
2
+ import logging
3
+ import os
4
+ import sys
5
+ from argparse import RawTextHelpFormatter
6
+
7
+ import torch
8
+ from tqdm import tqdm
9
+
10
+ from TTS.config import load_config
11
+ from TTS.config.shared_configs import BaseDatasetConfig
12
+ from TTS.tts.datasets import load_tts_samples
13
+ from TTS.tts.utils.managers import save_file
14
+ from TTS.tts.utils.speakers import SpeakerManager
15
+ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
16
+
17
+
18
+ def compute_embeddings(
19
+ model_path,
20
+ config_path,
21
+ output_path,
22
+ old_speakers_file=None,
23
+ old_append=False,
24
+ config_dataset_path=None,
25
+ formatter_name=None,
26
+ dataset_name=None,
27
+ dataset_path=None,
28
+ meta_file_train=None,
29
+ meta_file_val=None,
30
+ disable_cuda=False,
31
+ no_eval=False,
32
+ ):
33
+ use_cuda = torch.cuda.is_available() and not disable_cuda
34
+
35
+ if config_dataset_path is not None:
36
+ c_dataset = load_config(config_dataset_path)
37
+ meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=not no_eval)
38
+ else:
39
+ c_dataset = BaseDatasetConfig()
40
+ c_dataset.formatter = formatter_name
41
+ c_dataset.dataset_name = dataset_name
42
+ c_dataset.path = dataset_path
43
+ if meta_file_train is not None:
44
+ c_dataset.meta_file_train = meta_file_train
45
+ if meta_file_val is not None:
46
+ c_dataset.meta_file_val = meta_file_val
47
+ meta_data_train, meta_data_eval = load_tts_samples(c_dataset, eval_split=not no_eval)
48
+
49
+ if meta_data_eval is None:
50
+ samples = meta_data_train
51
+ else:
52
+ samples = meta_data_train + meta_data_eval
53
+
54
+ encoder_manager = SpeakerManager(
55
+ encoder_model_path=model_path,
56
+ encoder_config_path=config_path,
57
+ d_vectors_file_path=old_speakers_file,
58
+ use_cuda=use_cuda,
59
+ )
60
+
61
+ class_name_key = encoder_manager.encoder_config.class_name_key
62
+
63
+ # compute speaker embeddings
64
+ if old_speakers_file is not None and old_append:
65
+ speaker_mapping = encoder_manager.embeddings
66
+ else:
67
+ speaker_mapping = {}
68
+
69
+ for fields in tqdm(samples):
70
+ class_name = fields[class_name_key]
71
+ audio_file = fields["audio_file"]
72
+ embedding_key = fields["audio_unique_name"]
73
+
74
+ # Only update the speaker name when the embedding is already in the old file.
75
+ if embedding_key in speaker_mapping:
76
+ speaker_mapping[embedding_key]["name"] = class_name
77
+ continue
78
+
79
+ if old_speakers_file is not None and embedding_key in encoder_manager.clip_ids:
80
+ # get the embedding from the old file
81
+ embedd = encoder_manager.get_embedding_by_clip(embedding_key)
82
+ else:
83
+ # extract the embedding
84
+ embedd = encoder_manager.compute_embedding_from_clip(audio_file)
85
+
86
+ # create speaker_mapping if target dataset is defined
87
+ speaker_mapping[embedding_key] = {}
88
+ speaker_mapping[embedding_key]["name"] = class_name
89
+ speaker_mapping[embedding_key]["embedding"] = embedd
90
+
91
+ if speaker_mapping:
92
+ # save speaker_mapping if target dataset is defined
93
+ if os.path.isdir(output_path):
94
+ mapping_file_path = os.path.join(output_path, "speakers.pth")
95
+ else:
96
+ mapping_file_path = output_path
97
+
98
+ if os.path.dirname(mapping_file_path) != "":
99
+ os.makedirs(os.path.dirname(mapping_file_path), exist_ok=True)
100
+
101
+ save_file(speaker_mapping, mapping_file_path)
102
+ print("Speaker embeddings saved at:", mapping_file_path)
103
+
104
+
105
+ if __name__ == "__main__":
106
+ setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
107
+
108
+ parser = argparse.ArgumentParser(
109
+ description="""Compute embedding vectors for each audio file in a dataset and store them keyed by `{dataset_name}#{file_path}` in a .pth file\n\n"""
110
+ """
111
+ Example runs:
112
+ python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --config_dataset_path dataset_config.json
113
+
114
+ python TTS/bin/compute_embeddings.py --model_path speaker_encoder_model.pth --config_path speaker_encoder_config.json --formatter_name coqui --dataset_path /path/to/vctk/dataset --dataset_name my_vctk --meta_file_train /path/to/vctk/metafile_train.csv --meta_file_val /path/to/vctk/metafile_eval.csv
115
+ """,
116
+ formatter_class=RawTextHelpFormatter,
117
+ )
118
+ parser.add_argument(
119
+ "--model_path",
120
+ type=str,
121
+ help="Path to model checkpoint file. It defaults to the released speaker encoder.",
122
+ default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/model_se.pth.tar",
123
+ )
124
+ parser.add_argument(
125
+ "--config_path",
126
+ type=str,
127
+ help="Path to model config file. It defaults to the released speaker encoder config.",
128
+ default="https://github.com/coqui-ai/TTS/releases/download/speaker_encoder_model/config_se.json",
129
+ )
130
+ parser.add_argument(
131
+ "--config_dataset_path",
132
+ type=str,
133
+ help="Path to dataset config file. You either need to provide this or `formatter_name`, `dataset_name` and `dataset_path` arguments.",
134
+ default=None,
135
+ )
136
+ parser.add_argument(
137
+ "--output_path",
138
+ type=str,
139
+ help="Path for output `pth` or `json` file.",
140
+ default="speakers.pth",
141
+ )
142
+ parser.add_argument(
143
+ "--old_file",
144
+ type=str,
145
+ help="The old existing embedding file, from which the embeddings will be directly loaded for already computed audio clips.",
146
+ default=None,
147
+ )
148
+ parser.add_argument(
149
+ "--old_append",
150
+ help="Append new audio clip embeddings to the old embedding file, generate a new non-duplicated merged embedding file. Default False",
151
+ default=False,
152
+ action="store_true",
153
+ )
154
+ parser.add_argument("--disable_cuda", action="store_true", help="Flag to disable cuda.", default=False)
155
+ parser.add_argument("--no_eval", help="Do not compute eval?. Default False", default=False, action="store_true")
156
+ parser.add_argument(
157
+ "--formatter_name",
158
+ type=str,
159
+ help="Name of the formatter to use. You either need to provide this or `config_dataset_path`",
160
+ default=None,
161
+ )
162
+ parser.add_argument(
163
+ "--dataset_name",
164
+ type=str,
165
+ help="Name of the dataset to use. You either need to provide this or `config_dataset_path`",
166
+ default=None,
167
+ )
168
+ parser.add_argument(
169
+ "--dataset_path",
170
+ type=str,
171
+ help="Path to the dataset. You either need to provide this or `config_dataset_path`",
172
+ default=None,
173
+ )
174
+ parser.add_argument(
175
+ "--meta_file_train",
176
+ type=str,
177
+ help="Path to the train meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
178
+ default=None,
179
+ )
180
+ parser.add_argument(
181
+ "--meta_file_val",
182
+ type=str,
183
+ help="Path to the evaluation meta file. If not set, dataset formatter uses the default metafile if it is defined in the formatter. You either need to provide this or `config_dataset_path`",
184
+ default=None,
185
+ )
186
+ args = parser.parse_args()
187
+
188
+ compute_embeddings(
189
+ args.model_path,
190
+ args.config_path,
191
+ args.output_path,
192
+ old_speakers_file=args.old_file,
193
+ old_append=args.old_append,
194
+ config_dataset_path=args.config_dataset_path,
195
+ formatter_name=args.formatter_name,
196
+ dataset_name=args.dataset_name,
197
+ dataset_path=args.dataset_path,
198
+ meta_file_train=args.meta_file_train,
199
+ meta_file_val=args.meta_file_val,
200
+ disable_cuda=args.disable_cuda,
201
+ no_eval=args.no_eval,
202
+ )
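
`compute_embeddings()` can also be called programmatically; the paths below are placeholders:

```python
# Sketch: compute speaker embeddings for a dataset described by a dataset config.
from TTS.bin.compute_embeddings import compute_embeddings

compute_embeddings(
    model_path="speaker_encoder_model.pth",
    config_path="speaker_encoder_config.json",
    output_path="speakers.pth",
    config_dataset_path="dataset_config.json",
)
```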
TTS/bin/compute_statistics.py ADDED
@@ -0,0 +1,106 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import argparse
5
+ import glob
6
+ import logging
7
+ import os
8
+ import sys
9
+ from typing import Optional
10
+
11
+ import numpy as np
12
+ from tqdm import tqdm
13
+
14
+ # from TTS.utils.io import load_config
15
+ from TTS.config import load_config
16
+ from TTS.tts.datasets import load_tts_samples
17
+ from TTS.utils.audio import AudioProcessor
18
+ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
19
+
20
+
21
+ def parse_args(arg_list: Optional[list[str]]) -> tuple[argparse.Namespace, list[str]]:
22
+ parser = argparse.ArgumentParser(description="Compute mean and variance of spectrogtram features.")
23
+ parser.add_argument("config_path", type=str, help="TTS config file path to define audio processin parameters.")
24
+ parser.add_argument("out_path", type=str, help="save path (directory and filename).")
25
+ parser.add_argument(
26
+ "--data_path",
27
+ type=str,
28
+ required=False,
29
+ help="folder including the target set of wavs overriding dataset config.",
30
+ )
31
+ return parser.parse_known_args(arg_list)
32
+
33
+
34
+ def main(arg_list: Optional[list[str]] = None):
35
+ """Run preprocessing process."""
36
+ setup_logger("TTS", level=logging.INFO, stream=sys.stderr, formatter=ConsoleFormatter())
37
+ args, overrides = parse_args(arg_list)
38
+
39
+ CONFIG = load_config(args.config_path)
40
+ CONFIG.parse_known_args(overrides, relaxed_parser=True)
41
+
42
+ # load config
43
+ CONFIG.audio.signal_norm = False # do not apply earlier normalization
44
+ CONFIG.audio.stats_path = None # discard pre-defined stats
45
+
46
+ # load audio processor
47
+ ap = AudioProcessor(**CONFIG.audio.to_dict())
48
+
49
+ # load the meta data of target dataset
50
+ if args.data_path:
51
+ dataset_items = glob.glob(os.path.join(args.data_path, "**", "*.wav"), recursive=True)
52
+ else:
53
+ dataset_items = load_tts_samples(CONFIG.datasets)[0] # take only train data
54
+ print(f" > There are {len(dataset_items)} files.")
55
+
56
+ mel_sum = 0
57
+ mel_square_sum = 0
58
+ linear_sum = 0
59
+ linear_square_sum = 0
60
+ N = 0
61
+ for item in tqdm(dataset_items):
62
+ # compute features
63
+ wav = ap.load_wav(item if isinstance(item, str) else item["audio_file"])
64
+ linear = ap.spectrogram(wav)
65
+ mel = ap.melspectrogram(wav)
66
+
67
+ # compute stats
68
+ N += mel.shape[1]
69
+ mel_sum += mel.sum(1)
70
+ linear_sum += linear.sum(1)
71
+ mel_square_sum += (mel**2).sum(axis=1)
72
+ linear_square_sum += (linear**2).sum(axis=1)
73
+
74
+ mel_mean = mel_sum / N
75
+ mel_scale = np.sqrt(mel_square_sum / N - mel_mean**2)
76
+ linear_mean = linear_sum / N
77
+ linear_scale = np.sqrt(linear_square_sum / N - linear_mean**2)
78
+
79
+ output_file_path = args.out_path
80
+ stats = {}
81
+ stats["mel_mean"] = mel_mean
82
+ stats["mel_std"] = mel_scale
83
+ stats["linear_mean"] = linear_mean
84
+ stats["linear_std"] = linear_scale
85
+
86
+ print(f" > Avg mel spec mean: {mel_mean.mean()}")
87
+ print(f" > Avg mel spec scale: {mel_scale.mean()}")
88
+ print(f" > Avg linear spec mean: {linear_mean.mean()}")
89
+ print(f" > Avg linear spec scale: {linear_scale.mean()}")
90
+
91
+ # set default config values for mean-var scaling
92
+ CONFIG.audio.stats_path = output_file_path
93
+ CONFIG.audio.signal_norm = True
94
+ # remove redundant values
95
+ del CONFIG.audio.max_norm
96
+ del CONFIG.audio.min_level_db
97
+ del CONFIG.audio.symmetric_norm
98
+ del CONFIG.audio.clip_norm
99
+ stats["audio_config"] = CONFIG.audio.to_dict()
100
+ np.save(output_file_path, stats, allow_pickle=True)
101
+ print(f" > stats saved to {output_file_path}")
102
+ sys.exit(0)
103
+
104
+
105
+ if __name__ == "__main__":
106
+ main()
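
The output is a pickled dict saved with `np.save`; a sketch of loading it (the filename is a placeholder):

```python
# Sketch: inspect the statistics file produced above.
import numpy as np

stats = np.load("scale_stats.npy", allow_pickle=True).item()
print(stats.keys())             # mel_mean, mel_std, linear_mean, linear_std, audio_config
print(stats["mel_mean"].shape)  # one value per mel band
```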
TTS/bin/eval_encoder.py ADDED
@@ -0,0 +1,93 @@
1
+ import argparse
2
+ import logging
3
+ import sys
4
+ from argparse import RawTextHelpFormatter
5
+
6
+ import torch
7
+ from tqdm import tqdm
8
+
9
+ from TTS.config import load_config
10
+ from TTS.tts.datasets import load_tts_samples
11
+ from TTS.tts.utils.speakers import SpeakerManager
12
+ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
13
+
14
+
15
+ def compute_encoder_accuracy(dataset_items, encoder_manager):
16
+ class_name_key = encoder_manager.encoder_config.class_name_key
17
+ map_classid_to_classname = getattr(encoder_manager.encoder_config, "map_classid_to_classname", None)
18
+
19
+ class_acc_dict = {}
20
+
21
+ # compute embeddings for all wav_files
22
+ for item in tqdm(dataset_items):
23
+ class_name = item[class_name_key]
24
+ wav_file = item["audio_file"]
25
+
26
+ # extract the embedding
27
+ embedd = encoder_manager.compute_embedding_from_clip(wav_file)
28
+ if encoder_manager.encoder_criterion is not None and map_classid_to_classname is not None:
29
+ embedding = torch.FloatTensor(embedd).unsqueeze(0)
30
+ if encoder_manager.use_cuda:
31
+ embedding = embedding.cuda()
32
+
33
+ class_id = encoder_manager.encoder_criterion.softmax.inference(embedding).item()
34
+ predicted_label = map_classid_to_classname[str(class_id)]
35
+ else:
36
+ predicted_label = None
37
+
38
+ if class_name is not None and predicted_label is not None:
39
+ is_equal = int(class_name == predicted_label)
40
+ if class_name not in class_acc_dict:
41
+ class_acc_dict[class_name] = [is_equal]
42
+ else:
43
+ class_acc_dict[class_name].append(is_equal)
44
+ else:
45
+ raise RuntimeError("Error: class_name or/and predicted_label are None")
46
+
47
+ acc_avg = 0
48
+ for key, values in class_acc_dict.items():
49
+ acc = sum(values) / len(values)
50
+ print("Class", key, "Accuracy:", acc)
51
+ acc_avg += acc
52
+
53
+ print("Average Accuracy:", acc_avg / len(class_acc_dict))
54
+
55
+
56
+ if __name__ == "__main__":
57
+ setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
58
+
59
+ parser = argparse.ArgumentParser(
60
+ description="""Compute the accuracy of the encoder.\n\n"""
61
+ """
62
+ Example runs:
63
+ python TTS/bin/eval_encoder.py emotion_encoder_model.pth emotion_encoder_config.json dataset_config.json
64
+ """,
65
+ formatter_class=RawTextHelpFormatter,
66
+ )
67
+ parser.add_argument("model_path", type=str, help="Path to model checkpoint file.")
68
+ parser.add_argument(
69
+ "config_path",
70
+ type=str,
71
+ help="Path to model config file.",
72
+ )
73
+
74
+ parser.add_argument(
75
+ "config_dataset_path",
76
+ type=str,
77
+ help="Path to dataset config file.",
78
+ )
79
+ parser.add_argument("--use_cuda", action=argparse.BooleanOptionalAction, help="flag to set cuda.", default=True)
80
+ parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)
81
+
82
+ args = parser.parse_args()
83
+
84
+ c_dataset = load_config(args.config_dataset_path)
85
+
86
+ meta_data_train, meta_data_eval = load_tts_samples(c_dataset.datasets, eval_split=args.eval)
87
+ items = meta_data_train + meta_data_eval
88
+
89
+ enc_manager = SpeakerManager(
90
+ encoder_model_path=args.model_path, encoder_config_path=args.config_path, use_cuda=args.use_cuda
91
+ )
92
+
93
+ compute_encoder_accuracy(items, enc_manager)
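
The same evaluation can be scripted directly with the pieces above; all paths are placeholders:

```python
# Sketch: evaluate a trained encoder on a dataset config, mirroring the CLI above.
from TTS.bin.eval_encoder import compute_encoder_accuracy
from TTS.config import load_config
from TTS.tts.datasets import load_tts_samples
from TTS.tts.utils.speakers import SpeakerManager

c_dataset = load_config("dataset_config.json")
train_items, eval_items = load_tts_samples(c_dataset.datasets, eval_split=True)

manager = SpeakerManager(
    encoder_model_path="encoder_model.pth",
    encoder_config_path="encoder_config.json",
    use_cuda=False,
)
compute_encoder_accuracy(train_items + eval_items, manager)
```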
TTS/bin/extract_tts_spectrograms.py ADDED
@@ -0,0 +1,305 @@
1
+ #!/usr/bin/env python3
2
+ """Extract Mel spectrograms with teacher forcing."""
3
+
4
+ import argparse
5
+ import logging
6
+ import sys
7
+ from pathlib import Path
8
+ from typing import Optional
9
+
10
+ import numpy as np
11
+ import torch
12
+ from torch.utils.data import DataLoader
13
+ from tqdm import tqdm
14
+ from trainer.generic_utils import count_parameters
15
+
16
+ from TTS.config import load_config
17
+ from TTS.tts.configs.shared_configs import BaseTTSConfig
18
+ from TTS.tts.datasets import TTSDataset, load_tts_samples
19
+ from TTS.tts.models import setup_model
20
+ from TTS.tts.models.base_tts import BaseTTS
21
+ from TTS.tts.utils.speakers import SpeakerManager
22
+ from TTS.tts.utils.text.tokenizer import TTSTokenizer
23
+ from TTS.utils.audio import AudioProcessor
24
+ from TTS.utils.audio.numpy_transforms import quantize
25
+ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
26
+
27
+ use_cuda = torch.cuda.is_available()
28
+
29
+
30
+ def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace:
31
+ parser = argparse.ArgumentParser()
32
+ parser.add_argument("--config_path", type=str, help="Path to config file for training.", required=True)
33
+ parser.add_argument("--checkpoint_path", type=str, help="Model file to be restored.", required=True)
34
+ parser.add_argument("--output_path", type=str, help="Path to save mel specs", required=True)
35
+ parser.add_argument("--debug", default=False, action="store_true", help="Save audio files for debug")
36
+ parser.add_argument("--save_audio", default=False, action="store_true", help="Save audio files")
37
+ parser.add_argument("--quantize_bits", type=int, default=0, help="Save quantized audio files if non-zero")
38
+ parser.add_argument("--eval", action=argparse.BooleanOptionalAction, help="compute eval.", default=True)
39
+ return parser.parse_args(arg_list)
40
+
41
+
42
+ def setup_loader(config: BaseTTSConfig, ap: AudioProcessor, r, speaker_manager: SpeakerManager, samples) -> DataLoader:
43
+ tokenizer, _ = TTSTokenizer.init_from_config(config)
44
+ dataset = TTSDataset(
45
+ outputs_per_step=r,
46
+ compute_linear_spec=False,
47
+ samples=samples,
48
+ tokenizer=tokenizer,
49
+ ap=ap,
50
+ batch_group_size=0,
51
+ min_text_len=config.min_text_len,
52
+ max_text_len=config.max_text_len,
53
+ min_audio_len=config.min_audio_len,
54
+ max_audio_len=config.max_audio_len,
55
+ phoneme_cache_path=config.phoneme_cache_path,
56
+ precompute_num_workers=0,
57
+ use_noise_augment=False,
58
+ speaker_id_mapping=speaker_manager.name_to_id if config.use_speaker_embedding else None,
59
+ d_vector_mapping=speaker_manager.embeddings if config.use_d_vector_file else None,
60
+ )
61
+
62
+ if config.use_phonemes and config.compute_input_seq_cache:
63
+ # precompute phonemes to have a better estimate of sequence lengths.
64
+ dataset.compute_input_seq(config.num_loader_workers)
65
+ dataset.preprocess_samples()
66
+
67
+ return DataLoader(
68
+ dataset,
69
+ batch_size=config.batch_size,
70
+ shuffle=False,
71
+ collate_fn=dataset.collate_fn,
72
+ drop_last=False,
73
+ sampler=None,
74
+ num_workers=config.num_loader_workers,
75
+ pin_memory=False,
76
+ )
77
+
78
+
79
+ def set_filename(wav_path: str, out_path: Path) -> tuple[Path, Path, Path, Path]:
80
+ wav_name = Path(wav_path).stem
81
+ (out_path / "quant").mkdir(exist_ok=True, parents=True)
82
+ (out_path / "mel").mkdir(exist_ok=True, parents=True)
83
+ (out_path / "wav_gl").mkdir(exist_ok=True, parents=True)
84
+ (out_path / "wav").mkdir(exist_ok=True, parents=True)
85
+ wavq_path = out_path / "quant" / wav_name
86
+ mel_path = out_path / "mel" / wav_name
87
+ wav_gl_path = out_path / "wav_gl" / f"{wav_name}.wav"
88
+ out_wav_path = out_path / "wav" / f"{wav_name}.wav"
89
+ return wavq_path, mel_path, wav_gl_path, out_wav_path
90
+
91
+
92
+ def format_data(data):
93
+ # setup input data
94
+ text_input = data["token_id"]
95
+ text_lengths = data["token_id_lengths"]
96
+ mel_input = data["mel"]
97
+ mel_lengths = data["mel_lengths"]
98
+ item_idx = data["item_idxs"]
99
+ d_vectors = data["d_vectors"]
100
+ speaker_ids = data["speaker_ids"]
101
+ attn_mask = data["attns"]
102
+ avg_text_length = torch.mean(text_lengths.float())
103
+ avg_spec_length = torch.mean(mel_lengths.float())
104
+
105
+ # dispatch data to GPU
106
+ if use_cuda:
107
+ text_input = text_input.cuda(non_blocking=True)
108
+ text_lengths = text_lengths.cuda(non_blocking=True)
109
+ mel_input = mel_input.cuda(non_blocking=True)
110
+ mel_lengths = mel_lengths.cuda(non_blocking=True)
111
+ if speaker_ids is not None:
112
+ speaker_ids = speaker_ids.cuda(non_blocking=True)
113
+ if d_vectors is not None:
114
+ d_vectors = d_vectors.cuda(non_blocking=True)
115
+ if attn_mask is not None:
116
+ attn_mask = attn_mask.cuda(non_blocking=True)
117
+ return (
118
+ text_input,
119
+ text_lengths,
120
+ mel_input,
121
+ mel_lengths,
122
+ speaker_ids,
123
+ d_vectors,
124
+ avg_text_length,
125
+ avg_spec_length,
126
+ attn_mask,
127
+ item_idx,
128
+ )
129
+
130
+
131
+ @torch.inference_mode()
132
+ def inference(
133
+ model_name: str,
134
+ model: BaseTTS,
135
+ ap: AudioProcessor,
136
+ text_input,
137
+ text_lengths,
138
+ mel_input,
139
+ mel_lengths,
140
+ speaker_ids=None,
141
+ d_vectors=None,
142
+ ) -> np.ndarray:
143
+ if model_name == "glow_tts":
144
+ speaker_c = None
145
+ if speaker_ids is not None:
146
+ speaker_c = speaker_ids
147
+ elif d_vectors is not None:
148
+ speaker_c = d_vectors
149
+ outputs = model.inference_with_MAS(
150
+ text_input,
151
+ text_lengths,
152
+ mel_input,
153
+ mel_lengths,
154
+ aux_input={"d_vectors": speaker_c, "speaker_ids": speaker_ids},
155
+ )
156
+ model_output = outputs["model_outputs"]
157
+ return model_output.detach().cpu().numpy()
158
+
159
+ if "tacotron" in model_name:
160
+ aux_input = {"speaker_ids": speaker_ids, "d_vectors": d_vectors}
161
+ outputs = model(text_input, text_lengths, mel_input, mel_lengths, aux_input)
162
+ postnet_outputs = outputs["model_outputs"]
163
+ # normalize tacotron output
164
+ if model_name == "tacotron":
165
+ mel_specs = []
166
+ postnet_outputs = postnet_outputs.data.cpu().numpy()
167
+ for b in range(postnet_outputs.shape[0]):
168
+ postnet_output = postnet_outputs[b]
169
+ mel_specs.append(torch.FloatTensor(ap.out_linear_to_mel(postnet_output.T).T))
170
+ return torch.stack(mel_specs).cpu().numpy()
171
+ if model_name == "tacotron2":
172
+ return postnet_outputs.detach().cpu().numpy()
173
+ msg = f"Model not supported: {model_name}"
174
+ raise ValueError(msg)
175
+
176
+
177
+ def extract_spectrograms(
178
+ model_name: str,
179
+ data_loader: DataLoader,
180
+ model: BaseTTS,
181
+ ap: AudioProcessor,
182
+ output_path: Path,
183
+ quantize_bits: int = 0,
184
+ save_audio: bool = False,
185
+ debug: bool = False,
186
+ metadata_name: str = "metadata.txt",
187
+ ) -> None:
188
+ model.eval()
189
+ export_metadata = []
190
+ for _, data in tqdm(enumerate(data_loader), total=len(data_loader)):
191
+ # format data
192
+ (
193
+ text_input,
194
+ text_lengths,
195
+ mel_input,
196
+ mel_lengths,
197
+ speaker_ids,
198
+ d_vectors,
199
+ _,
200
+ _,
201
+ _,
202
+ item_idx,
203
+ ) = format_data(data)
204
+
205
+ model_output = inference(
206
+ model_name,
207
+ model,
208
+ ap,
209
+ text_input,
210
+ text_lengths,
211
+ mel_input,
212
+ mel_lengths,
213
+ speaker_ids,
214
+ d_vectors,
215
+ )
216
+
217
+ for idx in range(text_input.shape[0]):
218
+ wav_file_path = item_idx[idx]
219
+ wav = ap.load_wav(wav_file_path)
220
+ wavq_path, mel_path, wav_gl_path, wav_path = set_filename(wav_file_path, output_path)
221
+
222
+ # quantize and save wav
223
+ if quantize_bits > 0:
224
+ wavq = quantize(wav, quantize_bits)
225
+ np.save(wavq_path, wavq)
226
+
227
+ # save TTS mel
228
+ mel = model_output[idx]
229
+ mel_length = mel_lengths[idx]
230
+ mel = mel[:mel_length, :].T
231
+ np.save(mel_path, mel)
232
+
233
+ export_metadata.append([wav_file_path, mel_path])
234
+ if save_audio:
235
+ ap.save_wav(wav, wav_path)
236
+
237
+ if debug:
238
+ print("Audio for debug saved at:", wav_gl_path)
239
+ wav = ap.inv_melspectrogram(mel)
240
+ ap.save_wav(wav, wav_gl_path)
241
+
242
+ with (output_path / metadata_name).open("w") as f:
243
+ for data in export_metadata:
244
+ f.write(f"{data[0] / data[1]}.npy\n")
245
+
246
+
247
+ def main(arg_list: Optional[list[str]] = None) -> None:
248
+ setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
249
+ args = parse_args(arg_list)
250
+ config = load_config(args.config_path)
251
+ config.audio.trim_silence = False
252
+
253
+ # Audio processor
254
+ ap = AudioProcessor(**config.audio)
255
+
256
+ # load data instances
257
+ meta_data_train, meta_data_eval = load_tts_samples(
258
+ config.datasets,
259
+ eval_split=args.eval,
260
+ eval_split_max_size=config.eval_split_max_size,
261
+ eval_split_size=config.eval_split_size,
262
+ )
263
+
264
+ # use eval and training partitions
265
+ meta_data = meta_data_train + meta_data_eval
266
+
267
+ # init speaker manager
268
+ if config.use_speaker_embedding:
269
+ speaker_manager = SpeakerManager(data_items=meta_data)
270
+ elif config.use_d_vector_file:
271
+ speaker_manager = SpeakerManager(d_vectors_file_path=config.d_vector_file)
272
+ else:
273
+ speaker_manager = None
274
+
275
+ # setup model
276
+ model = setup_model(config)
277
+
278
+ # restore model
279
+ model.load_checkpoint(config, args.checkpoint_path, eval=True)
280
+
281
+ if use_cuda:
282
+ model.cuda()
283
+
284
+ num_params = count_parameters(model)
285
+ print(f"\n > Model has {num_params} parameters", flush=True)
286
+ # set r
287
+ r = 1 if config.model.lower() == "glow_tts" else model.decoder.r
288
+ own_loader = setup_loader(config, ap, r, speaker_manager, meta_data)
289
+
290
+ extract_spectrograms(
291
+ config.model.lower(),
292
+ own_loader,
293
+ model,
294
+ ap,
295
+ Path(args.output_path),
296
+ quantize_bits=args.quantize_bits,
297
+ save_audio=args.save_audio,
298
+ debug=args.debug,
299
+ metadata_name="metadata.txt",
300
+ )
301
+ sys.exit(0)
302
+
303
+
304
+ if __name__ == "__main__":
305
+ main()
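
Outputs land in `mel/`, `quant/`, `wav/` and `wav_gl/` subfolders of `--output_path`, and `metadata.txt` lists `wav_path|mel_path.npy` pairs (with the metafile line fixed above); a consumption sketch with placeholder paths:

```python
# Sketch: iterate over the teacher-forced mel spectrograms extracted above.
import numpy as np

with open("output/metadata.txt", encoding="utf-8") as f:
    for line in f:
        wav_path, mel_path = line.strip().split("|")
        mel = np.load(mel_path)  # saved transposed: (n_mel_channels, frames)
        print(wav_path, mel.shape)
```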
TTS/bin/find_unique_chars.py ADDED
@@ -0,0 +1,41 @@
1
+ """Find all the unique characters in a dataset"""
2
+
3
+ import argparse
4
+ import logging
5
+ import sys
6
+ from argparse import RawTextHelpFormatter
7
+
8
+ from TTS.config import load_config
9
+ from TTS.tts.datasets import find_unique_chars, load_tts_samples
10
+ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
11
+
12
+
13
+ def main():
14
+ setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
15
+
16
+ # pylint: disable=bad-option-value
17
+ parser = argparse.ArgumentParser(
18
+ description="""Find all the unique characters or phonemes in a dataset.\n\n"""
19
+ """
20
+ Example runs:
21
+
22
+ python TTS/bin/find_unique_chars.py --config_path config.json
23
+ """,
24
+ formatter_class=RawTextHelpFormatter,
25
+ )
26
+ parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
27
+ args = parser.parse_args()
28
+
29
+ c = load_config(args.config_path)
30
+
31
+ # load all datasets
32
+ train_items, eval_items = load_tts_samples(
33
+ c.datasets, eval_split=True, eval_split_max_size=c.eval_split_max_size, eval_split_size=c.eval_split_size
34
+ )
35
+
36
+ items = train_items + eval_items
37
+ find_unique_chars(items)
38
+
39
+
40
+ if __name__ == "__main__":
41
+ main()
TTS/bin/find_unique_phonemes.py ADDED
@@ -0,0 +1,88 @@
1
+ """Find all the unique phonemes in a dataset."""
2
+
3
+ import argparse
4
+ import logging
5
+ import multiprocessing
6
+ import sys
7
+ from argparse import RawTextHelpFormatter
8
+ from typing import Optional
9
+
10
+ from tqdm.contrib.concurrent import process_map
11
+
12
+ from TTS.config import load_config
13
+ from TTS.tts.datasets import load_tts_samples
14
+ from TTS.tts.utils.text.phonemizers import Gruut
15
+ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
16
+
17
+
18
+ def compute_phonemes(item: dict) -> set[str]:
19
+ text = item["text"]
20
+ ph = phonemizer.phonemize(text).replace("|", "")
21
+ return set(ph)
22
+
23
+
24
+ def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace:
25
+ parser = argparse.ArgumentParser(
26
+ description="""Find all the unique characters or phonemes in a dataset.\n\n"""
27
+ """
28
+ Example runs:
29
+
30
+ python TTS/bin/find_unique_phonemes.py --config_path config.json
31
+ """,
32
+ formatter_class=RawTextHelpFormatter,
33
+ )
34
+ parser.add_argument("--config_path", type=str, help="Path to dataset config file.", required=True)
35
+ return parser.parse_args(arg_list)
36
+
37
+
38
+ def main(arg_list: Optional[list[str]] = None) -> None:
39
+ setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
40
+ global phonemizer
41
+ args = parse_args(arg_list)
42
+ config = load_config(args.config_path)
43
+
44
+ # load all datasets
45
+ train_items, eval_items = load_tts_samples(
46
+ config.datasets,
47
+ eval_split=True,
48
+ eval_split_max_size=config.eval_split_max_size,
49
+ eval_split_size=config.eval_split_size,
50
+ )
51
+ items = train_items + eval_items
52
+ print("Num items:", len(items))
53
+
54
+ language_list = [item["language"] for item in items]
55
+ is_lang_def = all(language_list)
56
+
57
+ if not config.phoneme_language or not is_lang_def:
58
+ msg = "Phoneme language must be defined in config."
59
+ raise ValueError(msg)
60
+
61
+ if language_list.count(language_list[0]) != len(language_list):
62
+ msg = (
63
+ "Currently, just one phoneme language per config file is supported !! "
64
+ "Please split the dataset config into different configs and run it individually for each language !!"
65
+ )
66
+ raise ValueError(msg)
67
+
68
+ phonemizer = Gruut(language=language_list[0], keep_puncs=True)
69
+
70
+ phonemes = process_map(compute_phonemes, items, max_workers=multiprocessing.cpu_count(), chunksize=15)
71
+ phones = []
72
+ for ph in phonemes:
73
+ phones.extend(ph)
74
+
75
+ phones = set(phones)
76
+ lower_phones = filter(lambda c: c.islower(), phones)
77
+ phones_force_lower = [c.lower() for c in phones]
78
+ phones_force_lower = set(phones_force_lower)
79
+
80
+ print(f" > Number of unique phonemes: {len(phones)}")
81
+ print(f" > Unique phonemes: {''.join(sorted(phones))}")
82
+ print(f" > Unique lower phonemes: {''.join(sorted(lower_phones))}")
83
+ print(f" > Unique all forced to lower phonemes: {''.join(sorted(phones_force_lower))}")
84
+ sys.exit(0)
85
+
86
+
87
+ if __name__ == "__main__":
88
+ main()
TTS/bin/remove_silence_using_vad.py ADDED
@@ -0,0 +1,129 @@
1
+ import argparse
2
+ import glob
3
+ import logging
4
+ import multiprocessing
5
+ import os
6
+ import pathlib
7
+ import sys
8
+
9
+ import torch
10
+ from tqdm import tqdm
11
+
12
+ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
13
+ from TTS.utils.vad import get_vad_model_and_utils, remove_silence
14
+
15
+ torch.set_num_threads(1)
16
+
17
+
18
+ def adjust_path_and_remove_silence(audio_path):
19
+ output_path = audio_path.replace(os.path.join(args.input_dir, ""), os.path.join(args.output_dir, ""))
20
+ # ignore if the file exists
21
+ if os.path.exists(output_path) and not args.force:
22
+ return output_path, False
23
+
24
+ # create all directory structure
25
+ pathlib.Path(output_path).parent.mkdir(parents=True, exist_ok=True)
26
+ # remove the silence and save the audio
27
+ output_path, is_speech = remove_silence(
28
+ model_and_utils,
29
+ audio_path,
30
+ output_path,
31
+ trim_just_beginning_and_end=args.trim_just_beginning_and_end,
32
+ use_cuda=args.use_cuda,
33
+ )
34
+ return output_path, is_speech
35
+
36
+
37
+ def preprocess_audios():
38
+ files = sorted(glob.glob(os.path.join(args.input_dir, args.glob), recursive=True))
39
+ print("> Number of files: ", len(files))
40
+ if not args.force:
41
+ print("> Ignoring files that already exist in the output directory.")
42
+
43
+ if args.trim_just_beginning_and_end:
44
+ print("> Trimming just the beginning and the end with nonspeech parts.")
45
+ else:
46
+ print("> Trimming all nonspeech parts.")
47
+
48
+ filtered_files = []
49
+ if files:
50
+ # create threads
51
+ # num_threads = multiprocessing.cpu_count()
52
+ # process_map(adjust_path_and_remove_silence, files, max_workers=num_threads, chunksize=15)
53
+
54
+ if args.num_processes > 1:
55
+ with multiprocessing.Pool(processes=args.num_processes) as pool:
56
+ results = list(
57
+ tqdm(
58
+ pool.imap_unordered(adjust_path_and_remove_silence, files),
59
+ total=len(files),
60
+ desc="Processing audio files",
61
+ )
62
+ )
63
+ for output_path, is_speech in results:
64
+ if not is_speech:
65
+ filtered_files.append(output_path)
66
+ else:
67
+ for f in tqdm(files):
68
+ output_path, is_speech = adjust_path_and_remove_silence(f)
69
+ if not is_speech:
70
+ filtered_files.append(output_path)
71
+
72
+ # write files that do not have speech
73
+ with open(os.path.join(args.output_dir, "filtered_files.txt"), "w", encoding="utf-8") as f:
74
+ for file in filtered_files:
75
+ f.write(str(file) + "\n")
76
+ else:
77
+ print("> No files Found !")
78
+
79
+
80
+ if __name__ == "__main__":
81
+ setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
82
+
83
+ parser = argparse.ArgumentParser(
84
+ description="python TTS/bin/remove_silence_using_vad.py -i=VCTK-Corpus/ -o=VCTK-Corpus-removed-silence/ -g=wav48_silence_trimmed/*/*_mic1.flac --trim_just_beginning_and_end"
85
+ )
86
+ parser.add_argument("-i", "--input_dir", type=str, help="Dataset root dir", required=True)
87
+ parser.add_argument("-o", "--output_dir", type=str, help="Output Dataset dir", default="")
88
+ parser.add_argument("-f", "--force", default=False, action="store_true", help="Force overwriting existing output files")
89
+ parser.add_argument(
90
+ "-g",
91
+ "--glob",
92
+ type=str,
93
+ default="**/*.wav",
94
+ help="Path in glob format to access wavs from input_dir. ex: wav48/*/*.wav",
95
+ )
96
+ parser.add_argument(
97
+ "-t",
98
+ "--trim_just_beginning_and_end",
99
+ action=argparse.BooleanOptionalAction,
100
+ default=True,
101
+ help="If True this script will trim just the beginning and end nonspeech parts. If False all nonspeech parts will be trimmed.",
102
+ )
103
+ parser.add_argument(
104
+ "-c",
105
+ "--use_cuda",
106
+ action=argparse.BooleanOptionalAction,
107
+ default=False,
108
+ help="If True use cuda",
109
+ )
110
+ parser.add_argument(
111
+ "--use_onnx",
112
+ action=argparse.BooleanOptionalAction,
113
+ default=False,
114
+ help="If True use onnx",
115
+ )
116
+ parser.add_argument(
117
+ "--num_processes",
118
+ type=int,
119
+ default=1,
120
+ help="Number of processes to use",
121
+ )
122
+ args = parser.parse_args()
123
+
124
+ if args.output_dir == "":
125
+ args.output_dir = args.input_dir
126
+
127
+ # load the model and utils
128
+ model_and_utils = get_vad_model_and_utils(use_cuda=args.use_cuda, use_onnx=args.use_onnx)
129
+ preprocess_audios()
TTS/bin/resample.py ADDED
@@ -0,0 +1,90 @@
1
+ import argparse
2
+ import glob
3
+ import os
4
+ from argparse import RawTextHelpFormatter
5
+ from multiprocessing import Pool
6
+ from shutil import copytree
7
+
8
+ import librosa
9
+ import soundfile as sf
10
+ from tqdm import tqdm
11
+
12
+
13
+ def resample_file(func_args):
14
+ filename, output_sr = func_args
15
+ y, sr = librosa.load(filename, sr=output_sr)
16
+ sf.write(filename, y, sr)
17
+
18
+
19
+ def resample_files(input_dir, output_sr, output_dir=None, file_ext="wav", n_jobs=10):
20
+ if output_dir:
21
+ print("Recursively copying the input folder...")
22
+ copytree(input_dir, output_dir)
23
+ input_dir = output_dir
24
+
25
+ print("Resampling the audio files...")
26
+ audio_files = glob.glob(os.path.join(input_dir, f"**/*.{file_ext}"), recursive=True)
27
+ print(f"Found {len(audio_files)} files...")
28
+ audio_files = list(zip(audio_files, len(audio_files) * [output_sr]))
29
+ with Pool(processes=n_jobs) as p:
30
+ with tqdm(total=len(audio_files)) as pbar:
31
+ for _, _ in enumerate(p.imap_unordered(resample_file, audio_files)):
32
+ pbar.update()
33
+
34
+ print("Done !")
35
+
36
+
37
+ if __name__ == "__main__":
38
+ parser = argparse.ArgumentParser(
39
+ description="""Resample a folder recursively with librosa
40
+ Can be used in place or create a copy of the folder as an output.\n\n
41
+ Example run:
42
+ python TTS/bin/resample.py
43
+ --input_dir /root/LJSpeech-1.1/
44
+ --output_sr 22050
45
+ --output_dir /root/resampled_LJSpeech-1.1/
46
+ --file_ext wav
47
+ --n_jobs 24
48
+ """,
49
+ formatter_class=RawTextHelpFormatter,
50
+ )
51
+
52
+ parser.add_argument(
53
+ "--input_dir",
54
+ type=str,
55
+ default=None,
56
+ required=True,
57
+ help="Path of the folder containing the audio files to resample",
58
+ )
59
+
60
+ parser.add_argument(
61
+ "--output_sr",
62
+ type=int,
63
+ default=22050,
64
+ required=False,
65
+ help="Sample rate to which the audio files should be resampled",
66
+ )
67
+
68
+ parser.add_argument(
69
+ "--output_dir",
70
+ type=str,
71
+ default=None,
72
+ required=False,
73
+ help="Path of the destination folder. If not defined, the operation is done in place",
74
+ )
75
+
76
+ parser.add_argument(
77
+ "--file_ext",
78
+ type=str,
79
+ default="wav",
80
+ required=False,
81
+ help="Extension of the audio files to resample",
82
+ )
83
+
84
+ parser.add_argument(
85
+ "--n_jobs", type=int, default=None, help="Number of threads to use, by default it uses all cores"
86
+ )
87
+
88
+ args = parser.parse_args()
89
+
90
+ resample_files(args.input_dir, args.output_sr, args.output_dir, args.file_ext, args.n_jobs)
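
`resample_files()` can also be driven from Python; the directories below are placeholders:

```python
# Sketch: copy a dataset and resample the copy to 22.05 kHz using the helper above.
from TTS.bin.resample import resample_files

resample_files(
    input_dir="/data/LJSpeech-1.1",
    output_sr=22050,
    output_dir="/data/LJSpeech-1.1-22k",  # omit to resample in place
    file_ext="wav",
    n_jobs=8,
)
```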
TTS/bin/synthesize.py ADDED
@@ -0,0 +1,438 @@
1
+ #!/usr/bin/env python3
2
+
3
+ """Command line interface."""
4
+
5
+ import argparse
6
+ import contextlib
7
+ import logging
8
+ import sys
9
+ from argparse import RawTextHelpFormatter
10
+ from typing import Optional
11
+
12
+ # pylint: disable=redefined-outer-name, unused-argument
13
+ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
14
+
15
+ logger = logging.getLogger(__name__)
16
+
17
+ description = """
18
+ Synthesize speech on the command line.
19
+
20
+ You can either use your trained model or choose a model from the provided list.
21
+
22
+ - List provided models:
23
+
24
+ ```sh
25
+ tts --list_models
26
+ ```
27
+
28
+ - Get model information. Use the names obtained from `--list_models`.
29
+ ```sh
30
+ tts --model_info_by_name "<model_type>/<language>/<dataset>/<model_name>"
31
+ ```
32
+ For example:
33
+ ```sh
34
+ tts --model_info_by_name tts_models/tr/common-voice/glow-tts
35
+ tts --model_info_by_name vocoder_models/en/ljspeech/hifigan_v2
36
+ ```
37
+
38
+ #### Single speaker models
39
+
40
+ - Run TTS with the default model (`tts_models/en/ljspeech/tacotron2-DDC`):
41
+
42
+ ```sh
43
+ tts --text "Text for TTS" --out_path output/path/speech.wav
44
+ ```
45
+
46
+ - Run TTS and pipe out the generated TTS wav file data:
47
+
48
+ ```sh
49
+ tts --text "Text for TTS" --pipe_out --out_path output/path/speech.wav | aplay
50
+ ```
51
+
52
+ - Run a TTS model with its default vocoder model:
53
+
54
+ ```sh
55
+ tts --text "Text for TTS" \\
56
+ --model_name "<model_type>/<language>/<dataset>/<model_name>" \\
57
+ --out_path output/path/speech.wav
58
+ ```
59
+
60
+ For example:
61
+
62
+ ```sh
63
+ tts --text "Text for TTS" \\
64
+ --model_name "tts_models/en/ljspeech/glow-tts" \\
65
+ --out_path output/path/speech.wav
66
+ ```
67
+
68
+ - Run with specific TTS and vocoder models from the list. Note that not every vocoder is compatible with every TTS model.
69
+
70
+ ```sh
71
+ tts --text "Text for TTS" \\
72
+ --model_name "<model_type>/<language>/<dataset>/<model_name>" \\
73
+ --vocoder_name "<model_type>/<language>/<dataset>/<model_name>" \\
74
+ --out_path output/path/speech.wav
75
+ ```
76
+
77
+ For example:
78
+
79
+ ```sh
80
+ tts --text "Text for TTS" \\
81
+ --model_name "tts_models/en/ljspeech/glow-tts" \\
82
+ --vocoder_name "vocoder_models/en/ljspeech/univnet" \\
83
+ --out_path output/path/speech.wav
84
+ ```
85
+
86
+ - Run your own TTS model (using Griffin-Lim Vocoder):
87
+
88
+ ```sh
89
+ tts --text "Text for TTS" \\
90
+ --model_path path/to/model.pth \\
91
+ --config_path path/to/config.json \\
92
+ --out_path output/path/speech.wav
93
+ ```
94
+
95
+ - Run your own TTS and Vocoder models:
96
+
97
+ ```sh
98
+ tts --text "Text for TTS" \\
99
+ --model_path path/to/model.pth \\
100
+ --config_path path/to/config.json \\
101
+ --out_path output/path/speech.wav \\
102
+ --vocoder_path path/to/vocoder.pth \\
103
+ --vocoder_config_path path/to/vocoder_config.json
104
+ ```
105
+
106
+ #### Multi-speaker models
107
+
108
+ - List the available speakers and choose a `<speaker_id>` among them:
109
+
110
+ ```sh
111
+ tts --model_name "<language>/<dataset>/<model_name>" --list_speaker_idxs
112
+ ```
113
+
114
+ - Run the multi-speaker TTS model with the target speaker ID:
115
+
116
+ ```sh
117
+ tts --text "Text for TTS." --out_path output/path/speech.wav \\
118
+ --model_name "<language>/<dataset>/<model_name>" --speaker_idx <speaker_id>
119
+ ```
120
+
121
+ - Run your own multi-speaker TTS model:
122
+
123
+ ```sh
124
+ tts --text "Text for TTS" --out_path output/path/speech.wav \\
125
+ --model_path path/to/model.pth --config_path path/to/config.json \\
126
+ --speakers_file_path path/to/speaker.json --speaker_idx <speaker_id>
127
+ ```
128
+
129
+ #### Voice conversion models
130
+
131
+ ```sh
132
+ tts --out_path output/path/speech.wav --model_name "<language>/<dataset>/<model_name>" \\
133
+ --source_wav <path/to/speaker/wav> --target_wav <path/to/reference/wav>
134
+ ```
135
+ """
136
+
137
+
138
+ def parse_args(arg_list: Optional[list[str]]) -> argparse.Namespace:
139
+ """Parse arguments."""
140
+ parser = argparse.ArgumentParser(
141
+ description=description.replace(" ```\n", ""),
142
+ formatter_class=RawTextHelpFormatter,
143
+ )
144
+
145
+ parser.add_argument(
146
+ "--list_models",
147
+ action="store_true",
148
+ help="list available pre-trained TTS and vocoder models.",
149
+ )
150
+
151
+ parser.add_argument(
152
+ "--model_info_by_idx",
153
+ type=str,
154
+ default=None,
155
+ help="model info using query format: <model_type>/<model_query_idx>",
156
+ )
157
+
158
+ parser.add_argument(
159
+ "--model_info_by_name",
160
+ type=str,
161
+ default=None,
162
+ help="model info using query format: <model_type>/<language>/<dataset>/<model_name>",
163
+ )
164
+
165
+ parser.add_argument("--text", type=str, default=None, help="Text to generate speech.")
166
+
167
+ # Args for running pre-trained TTS models.
168
+ parser.add_argument(
169
+ "--model_name",
170
+ type=str,
171
+ default="tts_models/en/ljspeech/tacotron2-DDC",
172
+ help="Name of one of the pre-trained TTS models in format <language>/<dataset>/<model_name>",
173
+ )
174
+ parser.add_argument(
175
+ "--vocoder_name",
176
+ type=str,
177
+ default=None,
178
+ help="Name of one of the pre-trained vocoder models in format <language>/<dataset>/<model_name>",
179
+ )
180
+
181
+ # Args for running custom models
182
+ parser.add_argument("--config_path", default=None, type=str, help="Path to model config file.")
183
+ parser.add_argument(
184
+ "--model_path",
185
+ type=str,
186
+ default=None,
187
+ help="Path to model file.",
188
+ )
189
+ parser.add_argument(
190
+ "--out_path",
191
+ type=str,
192
+ default="tts_output.wav",
193
+ help="Output wav file path.",
194
+ )
195
+ parser.add_argument("--use_cuda", action="store_true", help="Run model on CUDA.")
196
+ parser.add_argument("--device", type=str, help="Device to run model on.", default="cpu")
197
+ parser.add_argument(
198
+ "--vocoder_path",
199
+ type=str,
200
+ help="Path to vocoder model file. If it is not defined, the model uses Griffin-Lim as the vocoder. Make sure the vocoder library (e.g. WaveRNN) is installed beforehand.",
201
+ default=None,
202
+ )
203
+ parser.add_argument("--vocoder_config_path", type=str, help="Path to vocoder model config file.", default=None)
204
+ parser.add_argument(
205
+ "--encoder_path",
206
+ type=str,
207
+ help="Path to speaker encoder model file.",
208
+ default=None,
209
+ )
210
+ parser.add_argument("--encoder_config_path", type=str, help="Path to speaker encoder config file.", default=None)
211
+ parser.add_argument(
212
+ "--pipe_out",
213
+ help="stdout the generated TTS wav file for shell pipe.",
214
+ action="store_true",
215
+ )
216
+
217
+ # args for multi-speaker synthesis
218
+ parser.add_argument("--speakers_file_path", type=str, help="JSON file for multi-speaker model.", default=None)
219
+ parser.add_argument("--language_ids_file_path", type=str, help="JSON file for multi-lingual model.", default=None)
220
+ parser.add_argument(
221
+ "--speaker_idx",
222
+ type=str,
223
+ help="Target speaker ID for a multi-speaker TTS model.",
224
+ default=None,
225
+ )
226
+ parser.add_argument(
227
+ "--language_idx",
228
+ type=str,
229
+ help="Target language ID for a multi-lingual TTS model.",
230
+ default=None,
231
+ )
232
+ parser.add_argument(
233
+ "--speaker_wav",
234
+ nargs="+",
235
+ help="wav file(s) to condition a multi-speaker TTS model with a Speaker Encoder. You can give multiple file paths; the d_vector is computed as their average.",
236
+ default=None,
237
+ )
238
+ parser.add_argument("--gst_style", help="Wav path file for GST style reference.", default=None)
239
+ parser.add_argument(
240
+ "--capacitron_style_wav", type=str, help="Wav path file for Capacitron prosody reference.", default=None
241
+ )
242
+ parser.add_argument("--capacitron_style_text", type=str, help="Transcription of the reference.", default=None)
243
+ parser.add_argument(
244
+ "--list_speaker_idxs",
245
+ help="List available speaker ids for the defined multi-speaker model.",
246
+ action="store_true",
247
+ )
248
+ parser.add_argument(
249
+ "--list_language_idxs",
250
+ help="List available language ids for the defined multi-lingual model.",
251
+ action="store_true",
252
+ )
253
+ # aux args
254
+ parser.add_argument(
255
+ "--reference_wav",
256
+ type=str,
257
+ help="Reference wav file to convert into the voice of the speaker_idx or speaker_wav",
258
+ default=None,
259
+ )
260
+ parser.add_argument(
261
+ "--reference_speaker_idx",
262
+ type=str,
263
+ help="Speaker ID of the reference_wav speaker (if not provided, the embedding will be computed using the Speaker Encoder).",
264
+ default=None,
265
+ )
266
+ parser.add_argument(
267
+ "--progress_bar",
268
+ action=argparse.BooleanOptionalAction,
269
+ help="Show a progress bar for the model download.",
270
+ default=True,
271
+ )
272
+
273
+ # voice conversion args
274
+ parser.add_argument(
275
+ "--source_wav",
276
+ type=str,
277
+ default=None,
278
+ help="Original audio file to convert into the voice of the target_wav",
279
+ )
280
+ parser.add_argument(
281
+ "--target_wav",
282
+ type=str,
283
+ default=None,
284
+ help="Target audio file providing the voice that the source_wav is converted into",
285
+ )
286
+
287
+ parser.add_argument(
288
+ "--voice_dir",
289
+ type=str,
290
+ default=None,
291
+ help="Voice dir for tortoise model",
292
+ )
293
+
294
+ args = parser.parse_args(arg_list)
295
+
296
+ # print the description if either text or list_models is not set
297
+ check_args = [
298
+ args.text,
299
+ args.list_models,
300
+ args.list_speaker_idxs,
301
+ args.list_language_idxs,
302
+ args.reference_wav,
303
+ args.model_info_by_idx,
304
+ args.model_info_by_name,
305
+ args.source_wav,
306
+ args.target_wav,
307
+ ]
308
+ if not any(check_args):
309
+ parser.parse_args(["-h"])
310
+ return args
311
+
312
+
313
+ def main(arg_list: Optional[list[str]] = None) -> None:
314
+ """Entry point for `tts` command line interface."""
315
+ args = parse_args(arg_list)
316
+ stream = sys.stderr if args.pipe_out else sys.stdout
317
+ setup_logger("TTS", level=logging.INFO, stream=stream, formatter=ConsoleFormatter())
318
+
319
+ pipe_out = sys.stdout if args.pipe_out else None
320
+
321
+ with contextlib.redirect_stdout(None if args.pipe_out else sys.stdout):
322
+ # Late-import to make things load faster
323
+ from TTS.api import TTS
324
+ from TTS.utils.manage import ModelManager
325
+
326
+ # load model manager
327
+ manager = ModelManager(models_file=TTS.get_models_file_path(), progress_bar=args.progress_bar)
328
+
329
+ tts_path = None
330
+ tts_config_path = None
331
+ speakers_file_path = None
332
+ language_ids_file_path = None
333
+ vocoder_path = None
334
+ vocoder_config_path = None
335
+ encoder_path = None
336
+ encoder_config_path = None
337
+ vc_path = None
338
+ vc_config_path = None
339
+ model_dir = None
340
+
341
+ # 1) List pre-trained TTS models
342
+ if args.list_models:
343
+ manager.list_models()
344
+ sys.exit(0)
345
+
346
+ # 2) Info about pre-trained TTS models (without loading a model)
347
+ if args.model_info_by_idx:
348
+ model_query = args.model_info_by_idx
349
+ manager.model_info_by_idx(model_query)
350
+ sys.exit(0)
351
+
352
+ if args.model_info_by_name:
353
+ model_query_full_name = args.model_info_by_name
354
+ manager.model_info_by_full_name(model_query_full_name)
355
+ sys.exit(0)
356
+
357
+ # 3) Load a model for further info or TTS/VC
358
+ device = args.device
359
+ if args.use_cuda:
360
+ device = "cuda"
361
+ # A local model will take precedence if specified via --model_path
362
+ model_name = args.model_name if args.model_path is None else None
363
+ api = TTS(
364
+ model_name=model_name,
365
+ model_path=args.model_path,
366
+ config_path=args.config_path,
367
+ vocoder_name=args.vocoder_name,
368
+ vocoder_path=args.vocoder_path,
369
+ vocoder_config_path=args.vocoder_config_path,
370
+ encoder_path=args.encoder_path,
371
+ encoder_config_path=args.encoder_config_path,
372
+ speakers_file_path=args.speakers_file_path,
373
+ language_ids_file_path=args.language_ids_file_path,
374
+ progress_bar=args.progress_bar,
375
+ ).to(device)
376
+
377
+ # query speaker ids of a multi-speaker model.
378
+ if args.list_speaker_idxs:
379
+ if not api.is_multi_speaker:
380
+ logger.info("Model only has a single speaker.")
381
+ sys.exit(0)
382
+ logger.info(
383
+ "Available speaker ids. (Set the --speaker_idx flag to one of these values to use the multi-speaker model.)"
384
+ )
385
+ logger.info(api.speakers)
386
+ sys.exit(0)
387
+
388
+ # query language ids of a multi-lingual model.
389
+ if args.list_language_idxs:
390
+ if not api.is_multi_lingual:
391
+ logger.info("Monolingual model.")
392
+ sys.exit(0)
393
+ logger.info(
394
+ "Available language ids. (Set the --language_idx flag to one of these values to use the multi-lingual model.)"
395
+ )
396
+ logger.info(api.languages)
397
+ sys.exit(0)
398
+
399
+ # check the arguments against a multi-speaker model.
400
+ if api.is_multi_speaker and (not args.speaker_idx and not args.speaker_wav):
401
+ logger.error(
402
+ "It looks like you are using a multi-speaker model. Define `--speaker_idx` to "
403
+ "select the target speaker. You can list the available speakers for this model with `--list_speaker_idxs`."
404
+ )
405
+ sys.exit(1)
406
+
407
+ # RUN THE SYNTHESIS
408
+ if args.text:
409
+ logger.info("Text: %s", args.text)
410
+
411
+ if args.text is not None:
412
+ api.tts_to_file(
413
+ text=args.text,
414
+ speaker=args.speaker_idx,
415
+ language=args.language_idx,
416
+ speaker_wav=args.speaker_wav,
417
+ pipe_out=pipe_out,
418
+ file_path=args.out_path,
419
+ reference_wav=args.reference_wav,
420
+ style_wav=args.capacitron_style_wav,
421
+ style_text=args.capacitron_style_text,
422
+ reference_speaker_name=args.reference_speaker_idx,
423
+ voice_dir=args.voice_dir,
424
+ )
425
+ logger.info("Saved TTS output to %s", args.out_path)
426
+ elif args.source_wav is not None and args.target_wav is not None:
427
+ api.voice_conversion_to_file(
428
+ source_wav=args.source_wav,
429
+ target_wav=args.target_wav,
430
+ file_path=args.out_path,
431
+ pipe_out=pipe_out,
432
+ )
433
+ logger.info("Saved VC output to %s", args.out_path)
434
+ sys.exit(0)
435
+
436
+
437
+ if __name__ == "__main__":
438
+ main()
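Because `main()` takes an optional argument list, the same CLI can be driven programmatically. A minimal sketch, with an illustrative model name and output path:

```python
# Minimal sketch: run the `tts` entry point with an explicit argument list
# instead of sys.argv.
from TTS.bin.synthesize import main

main([
    "--model_name", "tts_models/en/ljspeech/glow-tts",
    "--text", "Text for TTS",
    "--out_path", "output/path/speech.wav",
])
```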
TTS/bin/train_encoder.py ADDED
@@ -0,0 +1,340 @@
1
+ #!/usr/bin/env python3
2
+ # -*- coding: utf-8 -*-
3
+
4
+ import logging
5
+ import os
6
+ import sys
7
+ import time
8
+ import traceback
9
+ import warnings
10
+
11
+ import torch
12
+ from torch.utils.data import DataLoader
13
+ from trainer.generic_utils import count_parameters, remove_experiment_folder
14
+ from trainer.io import copy_model_files, save_best_model, save_checkpoint
15
+ from trainer.torch import NoamLR
16
+ from trainer.trainer_utils import get_optimizer
17
+
18
+ from TTS.encoder.dataset import EncoderDataset
19
+ from TTS.encoder.utils.generic_utils import setup_encoder_model
20
+ from TTS.encoder.utils.training import init_training
21
+ from TTS.encoder.utils.visual import plot_embeddings
22
+ from TTS.tts.datasets import load_tts_samples
23
+ from TTS.utils.audio import AudioProcessor
24
+ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
25
+ from TTS.utils.samplers import PerfectBatchSampler
26
+ from TTS.utils.training import check_update
27
+
28
+ torch.backends.cudnn.enabled = True
29
+ torch.backends.cudnn.benchmark = True
30
+ torch.manual_seed(54321)
31
+ use_cuda = torch.cuda.is_available()
32
+ num_gpus = torch.cuda.device_count()
33
+ print(" > Using CUDA: ", use_cuda)
34
+ print(" > Number of GPUs: ", num_gpus)
35
+
36
+
37
+ def setup_loader(ap: AudioProcessor, is_val: bool = False):
38
+ num_utter_per_class = c.num_utter_per_class if not is_val else c.eval_num_utter_per_class
39
+ num_classes_in_batch = c.num_classes_in_batch if not is_val else c.eval_num_classes_in_batch
40
+
41
+ dataset = EncoderDataset(
42
+ c,
43
+ ap,
44
+ meta_data_eval if is_val else meta_data_train,
45
+ voice_len=c.voice_len,
46
+ num_utter_per_class=num_utter_per_class,
47
+ num_classes_in_batch=num_classes_in_batch,
48
+ augmentation_config=c.audio_augmentation if not is_val else None,
49
+ use_torch_spec=c.model_params.get("use_torch_spec", False),
50
+ )
51
+ # get classes list
52
+ classes = dataset.get_class_list()
53
+
54
+ sampler = PerfectBatchSampler(
55
+ dataset.items,
56
+ classes,
57
+ batch_size=num_classes_in_batch * num_utter_per_class, # total batch size
58
+ num_classes_in_batch=num_classes_in_batch,
59
+ num_gpus=1,
60
+ shuffle=not is_val,
61
+ drop_last=True,
62
+ )
63
+
64
+ if len(classes) < num_classes_in_batch:
65
+ if is_val:
66
+ raise RuntimeError(
67
+ f"config.eval_num_classes_in_batch ({num_classes_in_batch}) needs to be <= {len(classes)} (the total number of classes in the eval dataset)!"
68
+ )
69
+ raise RuntimeError(
70
+ f"config.num_classes_in_batch ({num_classes_in_batch}) needs to be <= {len(classes)} (the total number of classes in the train dataset)!"
71
+ )
72
+
73
+ # set the classes to avoid getting a wrong class_id when the numbers of training and eval classes are not equal
74
+ if is_val:
75
+ dataset.set_classes(train_classes)
76
+
77
+ loader = DataLoader(
78
+ dataset,
79
+ num_workers=c.num_loader_workers,
80
+ batch_sampler=sampler,
81
+ collate_fn=dataset.collate_fn,
82
+ )
83
+
84
+ return loader, classes, dataset.get_map_classid_to_classname()
85
+
86
+
87
+ def evaluation(model, criterion, data_loader, global_step):
88
+ eval_loss = 0
89
+ for _, data in enumerate(data_loader):
90
+ with torch.inference_mode():
91
+ # setup input data
92
+ inputs, labels = data
93
+
94
+ # group samples of each class in the batch: the perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
95
+ labels = torch.transpose(
96
+ labels.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch), 0, 1
97
+ ).reshape(labels.shape)
98
+ inputs = torch.transpose(
99
+ inputs.view(c.eval_num_utter_per_class, c.eval_num_classes_in_batch, -1), 0, 1
100
+ ).reshape(inputs.shape)
101
+
102
+ # dispatch data to GPU
103
+ if use_cuda:
104
+ inputs = inputs.cuda(non_blocking=True)
105
+ labels = labels.cuda(non_blocking=True)
106
+
107
+ # forward pass model
108
+ outputs = model(inputs)
109
+
110
+ # loss computation
111
+ loss = criterion(
112
+ outputs.view(c.eval_num_classes_in_batch, outputs.shape[0] // c.eval_num_classes_in_batch, -1), labels
113
+ )
114
+
115
+ eval_loss += loss.item()
116
+
117
+ eval_avg_loss = eval_loss / len(data_loader)
118
+ # save stats
119
+ dashboard_logger.eval_stats(global_step, {"loss": eval_avg_loss})
120
+ try:
121
+ # plot the last batch in the evaluation
122
+ figures = {
123
+ "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
124
+ }
125
+ dashboard_logger.eval_figures(global_step, figures)
126
+ except ImportError:
127
+ warnings.warn("Install the `umap-learn` package to see embedding plots.")
128
+ return eval_avg_loss
129
+
130
+
131
+ def train(model, optimizer, scheduler, criterion, data_loader, eval_data_loader, global_step):
132
+ model.train()
133
+ best_loss = {"train_loss": None, "eval_loss": float("inf")}
134
+ avg_loader_time = 0
135
+ end_time = time.time()
136
+ for epoch in range(c.epochs):
137
+ tot_loss = 0
138
+ epoch_time = 0
139
+ for _, data in enumerate(data_loader):
140
+ start_time = time.time()
141
+
142
+ # setup input data
143
+ inputs, labels = data
144
+ # group samples of each class in the batch: the perfect sampler produces [3,2,1,3,2,1]; we need [3,3,2,2,1,1]
145
+ labels = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(
146
+ labels.shape
147
+ )
148
+ inputs = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(
149
+ inputs.shape
150
+ )
151
+ # ToDo: move it to a unit test
152
+ # labels_converted = torch.transpose(labels.view(c.num_utter_per_class, c.num_classes_in_batch), 0, 1).reshape(labels.shape)
153
+ # inputs_converted = torch.transpose(inputs.view(c.num_utter_per_class, c.num_classes_in_batch, -1), 0, 1).reshape(inputs.shape)
154
+ # idx = 0
155
+ # for j in range(0, c.num_classes_in_batch, 1):
156
+ # for i in range(j, len(labels), c.num_classes_in_batch):
157
+ # if not torch.all(labels[i].eq(labels_converted[idx])) or not torch.all(inputs[i].eq(inputs_converted[idx])):
158
+ # print("Invalid")
159
+ # print(labels)
160
+ # exit()
161
+ # idx += 1
162
+ # labels = labels_converted
163
+ # inputs = inputs_converted
164
+
165
+ loader_time = time.time() - end_time
166
+ global_step += 1
167
+
168
+ optimizer.zero_grad()
169
+
170
+ # dispatch data to GPU
171
+ if use_cuda:
172
+ inputs = inputs.cuda(non_blocking=True)
173
+ labels = labels.cuda(non_blocking=True)
174
+
175
+ # forward pass model
176
+ outputs = model(inputs)
177
+
178
+ # loss computation
179
+ loss = criterion(
180
+ outputs.view(c.num_classes_in_batch, outputs.shape[0] // c.num_classes_in_batch, -1), labels
181
+ )
182
+ loss.backward()
183
+ grad_norm, _ = check_update(model, c.grad_clip)
184
+ optimizer.step()
185
+
186
+ # setup lr
187
+ if c.lr_decay:
188
+ scheduler.step()
189
+
190
+ step_time = time.time() - start_time
191
+ epoch_time += step_time
192
+
193
+ # accumulate the total epoch loss
194
+ tot_loss += loss.item()
195
+
196
+ # Averaged Loader Time
197
+ num_loader_workers = c.num_loader_workers if c.num_loader_workers > 0 else 1
198
+ avg_loader_time = (
199
+ 1 / num_loader_workers * loader_time + (num_loader_workers - 1) / num_loader_workers * avg_loader_time
200
+ if avg_loader_time != 0
201
+ else loader_time
202
+ )
203
+ current_lr = optimizer.param_groups[0]["lr"]
204
+
205
+ if global_step % c.steps_plot_stats == 0:
206
+ # Plot Training Epoch Stats
207
+ train_stats = {
208
+ "loss": loss.item(),
209
+ "lr": current_lr,
210
+ "grad_norm": grad_norm,
211
+ "step_time": step_time,
212
+ "avg_loader_time": avg_loader_time,
213
+ }
214
+ dashboard_logger.train_epoch_stats(global_step, train_stats)
215
+ figures = {
216
+ "UMAP Plot": plot_embeddings(outputs.detach().cpu().numpy(), c.num_classes_in_batch),
217
+ }
218
+ dashboard_logger.train_figures(global_step, figures)
219
+
220
+ if global_step % c.print_step == 0:
221
+ print(
222
+ " | > Step:{} Loss:{:.5f} GradNorm:{:.5f} "
223
+ "StepTime:{:.2f} LoaderTime:{:.2f} AvGLoaderTime:{:.2f} LR:{:.6f}".format(
224
+ global_step, loss.item(), grad_norm, step_time, loader_time, avg_loader_time, current_lr
225
+ ),
226
+ flush=True,
227
+ )
228
+
229
+ if global_step % c.save_step == 0:
230
+ # save model
231
+ save_checkpoint(
232
+ c, model, optimizer, None, global_step, epoch, OUT_PATH, criterion=criterion.state_dict()
233
+ )
234
+
235
+ end_time = time.time()
236
+
237
+ print("")
238
+ print(
239
+ ">>> Epoch:{} AvgLoss: {:.5f} GradNorm:{:.5f} "
240
+ "EpochTime:{:.2f} AvGLoaderTime:{:.2f} ".format(
241
+ epoch, tot_loss / len(data_loader), grad_norm, epoch_time, avg_loader_time
242
+ ),
243
+ flush=True,
244
+ )
245
+ # evaluation
246
+ if c.run_eval:
247
+ model.eval()
248
+ eval_loss = evaluation(model, criterion, eval_data_loader, global_step)
249
+ print("\n\n")
250
+ print("--> EVAL PERFORMANCE")
251
+ print(
252
+ " | > Epoch:{} AvgLoss: {:.5f} ".format(epoch, eval_loss),
253
+ flush=True,
254
+ )
255
+ # save the best checkpoint
256
+ best_loss = save_best_model(
257
+ {"train_loss": None, "eval_loss": eval_loss},
258
+ best_loss,
259
+ c,
260
+ model,
261
+ optimizer,
262
+ None,
263
+ global_step,
264
+ epoch,
265
+ OUT_PATH,
266
+ criterion=criterion.state_dict(),
267
+ )
268
+ model.train()
269
+
270
+ return best_loss, global_step
271
+
272
+
273
+ def main(args): # pylint: disable=redefined-outer-name
274
+ # pylint: disable=global-variable-undefined
275
+ global meta_data_train
276
+ global meta_data_eval
277
+ global train_classes
278
+
279
+ ap = AudioProcessor(**c.audio)
280
+ model = setup_encoder_model(c)
281
+
282
+ optimizer = get_optimizer(c.optimizer, c.optimizer_params, c.lr, model)
283
+
284
+ # pylint: disable=redefined-outer-name
285
+ meta_data_train, meta_data_eval = load_tts_samples(c.datasets, eval_split=True)
286
+
287
+ train_data_loader, train_classes, map_classid_to_classname = setup_loader(ap, is_val=False)
288
+ if c.run_eval:
289
+ eval_data_loader, _, _ = setup_loader(ap, is_val=True)
290
+ else:
291
+ eval_data_loader = None
292
+
293
+ num_classes = len(train_classes)
294
+ criterion = model.get_criterion(c, num_classes)
295
+
296
+ if c.loss == "softmaxproto" and c.model != "speaker_encoder":
297
+ c.map_classid_to_classname = map_classid_to_classname
298
+ copy_model_files(c, OUT_PATH, new_fields={})
299
+
300
+ if args.restore_path:
301
+ criterion, args.restore_step = model.load_checkpoint(
302
+ c, args.restore_path, eval=False, use_cuda=use_cuda, criterion=criterion
303
+ )
304
+ print(" > Model restored from step %d" % args.restore_step, flush=True)
305
+ else:
306
+ args.restore_step = 0
307
+
308
+ if c.lr_decay:
309
+ scheduler = NoamLR(optimizer, warmup_steps=c.warmup_steps, last_epoch=args.restore_step - 1)
310
+ else:
311
+ scheduler = None
312
+
313
+ num_params = count_parameters(model)
314
+ print("\n > Model has {} parameters".format(num_params), flush=True)
315
+
316
+ if use_cuda:
317
+ model = model.cuda()
318
+ criterion.cuda()
319
+
320
+ global_step = args.restore_step
321
+ _, global_step = train(model, optimizer, scheduler, criterion, train_data_loader, eval_data_loader, global_step)
322
+
323
+
324
+ if __name__ == "__main__":
325
+ setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
326
+
327
+ args, c, OUT_PATH, AUDIO_PATH, c_logger, dashboard_logger = init_training()
328
+
329
+ try:
330
+ main(args)
331
+ except KeyboardInterrupt:
332
+ remove_experiment_folder(OUT_PATH)
333
+ try:
334
+ sys.exit(0)
335
+ except SystemExit:
336
+ os._exit(0) # pylint: disable=protected-access
337
+ except Exception: # pylint: disable=broad-except
338
+ remove_experiment_folder(OUT_PATH)
339
+ traceback.print_exc()
340
+ sys.exit(1)
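The view/transpose/reshape used in `train()` and `evaluation()` regroups the interleaved batches produced by `PerfectBatchSampler`. A small self-contained check of that trick, with batch sizes chosen only for illustration:

```python
# The sampler yields utterances interleaved by class, e.g. [3, 2, 1, 3, 2, 1];
# the transpose below regroups them per class: [3, 3, 2, 2, 1, 1].
import torch

num_utter_per_class, num_classes_in_batch = 2, 3
labels = torch.tensor([3, 2, 1, 3, 2, 1])
regrouped = torch.transpose(
    labels.view(num_utter_per_class, num_classes_in_batch), 0, 1
).reshape(labels.shape)
print(regrouped.tolist())  # [3, 3, 2, 2, 1, 1]
```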
TTS/bin/train_tts.py ADDED
@@ -0,0 +1,76 @@
1
+ import logging
2
+ import os
3
+ import sys
4
+ from dataclasses import dataclass, field
5
+
6
+ from trainer import Trainer, TrainerArgs
7
+
8
+ from TTS.config import load_config, register_config
9
+ from TTS.tts.datasets import load_tts_samples
10
+ from TTS.tts.models import setup_model
11
+ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
12
+
13
+
14
+ @dataclass
15
+ class TrainTTSArgs(TrainerArgs):
16
+ config_path: str = field(default=None, metadata={"help": "Path to the config file."})
17
+
18
+
19
+ def main():
20
+ """Run `tts` model training directly by a `config.json` file."""
21
+ setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
22
+
23
+ # init trainer args
24
+ train_args = TrainTTSArgs()
25
+ parser = train_args.init_argparse(arg_prefix="")
26
+
27
+ # override trainer args from command-line args
28
+ args, config_overrides = parser.parse_known_args()
29
+ train_args.parse_args(args)
30
+
31
+ # load config.json and register
32
+ if args.config_path or args.continue_path:
33
+ if args.config_path:
34
+ # init from a file
35
+ config = load_config(args.config_path)
36
+ if len(config_overrides) > 0:
37
+ config.parse_known_args(config_overrides, relaxed_parser=True)
38
+ elif args.continue_path:
39
+ # continue from a prev experiment
40
+ config = load_config(os.path.join(args.continue_path, "config.json"))
41
+ if len(config_overrides) > 0:
42
+ config.parse_known_args(config_overrides, relaxed_parser=True)
43
+ else:
44
+ # init from console args
45
+ from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
46
+
47
+ config_base = BaseTrainingConfig()
48
+ config_base.parse_known_args(config_overrides)
49
+ config = register_config(config_base.model)()
50
+
51
+ # load training samples
52
+ train_samples, eval_samples = load_tts_samples(
53
+ config.datasets,
54
+ eval_split=True,
55
+ eval_split_max_size=config.eval_split_max_size,
56
+ eval_split_size=config.eval_split_size,
57
+ )
58
+
59
+ # init the model from config
60
+ model = setup_model(config, train_samples + eval_samples)
61
+
62
+ # init the trainer and 🚀
63
+ trainer = Trainer(
64
+ train_args,
65
+ model.config,
66
+ config.output_path,
67
+ model=model,
68
+ train_samples=train_samples,
69
+ eval_samples=eval_samples,
70
+ parse_command_line_args=False,
71
+ )
72
+ trainer.fit()
73
+
74
+
75
+ if __name__ == "__main__":
76
+ main()
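A minimal sketch of starting a training run with this script; `main()` reads `sys.argv`, so the arguments are injected there (the config path is illustrative):

```python
# Minimal sketch: launch TTS model training from a config.json file.
import sys

from TTS.bin.train_tts import main

sys.argv = ["train_tts.py", "--config_path", "path/to/config.json"]
main()
```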
TTS/bin/train_vocoder.py ADDED
@@ -0,0 +1,84 @@
1
+ import logging
2
+ import os
3
+ import sys
4
+ from dataclasses import dataclass, field
5
+ from typing import Optional
6
+
7
+ from trainer import Trainer, TrainerArgs
8
+
9
+ from TTS.config import load_config, register_config
10
+ from TTS.utils.audio import AudioProcessor
11
+ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
12
+ from TTS.vocoder.datasets.preprocess import load_wav_data, load_wav_feat_data
13
+ from TTS.vocoder.models import setup_model
14
+
15
+
16
+ @dataclass
17
+ class TrainVocoderArgs(TrainerArgs):
18
+ config_path: str = field(default=None, metadata={"help": "Path to the config file."})
19
+
20
+
21
+ def main(arg_list: Optional[list[str]] = None):
22
+ """Run `tts` model training directly by a `config.json` file."""
23
+ setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
24
+
25
+ # init trainer args
26
+ train_args = TrainVocoderArgs()
27
+ parser = train_args.init_argparse(arg_prefix="")
28
+
29
+ # override trainer args from command-line args
30
+ args, config_overrides = parser.parse_known_args(arg_list)
31
+ train_args.parse_args(args)
32
+
33
+ # load config.json and register
34
+ if args.config_path or args.continue_path:
35
+ if args.config_path:
36
+ # init from a file
37
+ config = load_config(args.config_path)
38
+ if len(config_overrides) > 0:
39
+ config.parse_known_args(config_overrides, relaxed_parser=True)
40
+ elif args.continue_path:
41
+ # continue from a prev experiment
42
+ config = load_config(os.path.join(args.continue_path, "config.json"))
43
+ if len(config_overrides) > 0:
44
+ config.parse_known_args(config_overrides, relaxed_parser=True)
45
+ else:
46
+ # init from console args
47
+ from TTS.config.shared_configs import BaseTrainingConfig # pylint: disable=import-outside-toplevel
48
+
49
+ config_base = BaseTrainingConfig()
50
+ config_base.parse_known_args(config_overrides)
51
+ config = register_config(config_base.model)()
52
+
53
+ # load training samples
54
+ if "feature_path" in config and config.feature_path:
55
+ # load pre-computed features
56
+ print(f" > Loading features from: {config.feature_path}")
57
+ eval_samples, train_samples = load_wav_feat_data(config.data_path, config.feature_path, config.eval_split_size)
58
+ else:
59
+ # load raw wav files
60
+ eval_samples, train_samples = load_wav_data(config.data_path, config.eval_split_size)
61
+
62
+ # setup audio processor
63
+ ap = AudioProcessor(**config.audio)
64
+
65
+ # init the model from config
66
+ model = setup_model(config)
67
+
68
+ # init the trainer and 🚀
69
+ trainer = Trainer(
70
+ train_args,
71
+ config,
72
+ config.output_path,
73
+ model=model,
74
+ train_samples=train_samples,
75
+ eval_samples=eval_samples,
76
+ training_assets={"audio_processor": ap},
77
+ parse_command_line_args=False,
78
+ )
79
+ trainer.fit()
80
+ sys.exit(0)
81
+
82
+
83
+ if __name__ == "__main__":
84
+ main()
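Unlike `train_tts.py`, this entry point accepts an explicit argument list, so a minimal sketch can pass it directly (the config path is illustrative):

```python
# Minimal sketch: launch vocoder training from a config.json file.
from TTS.bin.train_vocoder import main

main(["--config_path", "path/to/vocoder_config.json"])
```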
TTS/bin/tune_wavegrad.py ADDED
@@ -0,0 +1,108 @@
1
+ """Search for a good WaveGrad noise schedule for a given number of inference iterations."""
2
+
3
+ import argparse
4
+ import logging
5
+ import sys
6
+ from itertools import product as cartesian_product
7
+
8
+ import numpy as np
9
+ import torch
10
+ from torch.utils.data import DataLoader
11
+ from tqdm import tqdm
12
+
13
+ from TTS.config import load_config
14
+ from TTS.utils.audio import AudioProcessor
15
+ from TTS.utils.generic_utils import ConsoleFormatter, setup_logger
16
+ from TTS.vocoder.datasets.preprocess import load_wav_data
17
+ from TTS.vocoder.datasets.wavegrad_dataset import WaveGradDataset
18
+ from TTS.vocoder.models import setup_model
19
+
20
+ if __name__ == "__main__":
21
+ setup_logger("TTS", level=logging.INFO, stream=sys.stdout, formatter=ConsoleFormatter())
22
+
23
+ parser = argparse.ArgumentParser()
24
+ parser.add_argument("--model_path", type=str, help="Path to model checkpoint.")
25
+ parser.add_argument("--config_path", type=str, help="Path to model config file.")
26
+ parser.add_argument("--data_path", type=str, help="Path to data directory.")
27
+ parser.add_argument("--output_path", type=str, help="path for output file including file name and extension.")
28
+ parser.add_argument(
29
+ "--num_iter",
30
+ type=int,
31
+ help="Number of model inference iterations to optimize the noise schedule for.",
32
+ )
33
+ parser.add_argument("--use_cuda", action="store_true", help="enable CUDA.")
34
+ parser.add_argument("--num_samples", type=int, default=1, help="Number of data samples used for inference.")
35
+ parser.add_argument(
36
+ "--search_depth",
37
+ type=int,
38
+ default=3,
39
+ help="Search granularity. Increasing this increases the run-time exponentially.",
40
+ )
41
+
42
+ # load config
43
+ args = parser.parse_args()
44
+ config = load_config(args.config_path)
45
+
46
+ # setup audio processor
47
+ ap = AudioProcessor(**config.audio)
48
+
49
+ # load dataset
50
+ _, train_data = load_wav_data(args.data_path, 0)
51
+ train_data = train_data[: args.num_samples]
52
+ dataset = WaveGradDataset(
53
+ ap=ap,
54
+ items=train_data,
55
+ seq_len=-1,
56
+ hop_len=ap.hop_length,
57
+ pad_short=config.pad_short,
58
+ conv_pad=config.conv_pad,
59
+ is_training=True,
60
+ return_segments=False,
61
+ use_noise_augment=False,
62
+ use_cache=False,
63
+ )
64
+ loader = DataLoader(
65
+ dataset,
66
+ batch_size=1,
67
+ shuffle=False,
68
+ collate_fn=dataset.collate_full_clips,
69
+ drop_last=False,
70
+ num_workers=config.num_loader_workers,
71
+ pin_memory=False,
72
+ )
73
+
74
+ # setup the model
75
+ model = setup_model(config)
76
+ if args.use_cuda:
77
+ model.cuda()
78
+
79
+ # setup optimization parameters
80
+ base_values = sorted(10 * np.random.uniform(size=args.search_depth))
81
+ print(f" > base values: {base_values}")
82
+ exponents = 10 ** np.linspace(-6, -1, num=args.num_iter)
83
+ best_error = float("inf")
84
+ best_schedule = None # pylint: disable=C0103
85
+ total_search_iter = len(base_values) ** args.num_iter
86
+ for base in tqdm(cartesian_product(base_values, repeat=args.num_iter), total=total_search_iter):
87
+ beta = exponents * base
88
+ model.compute_noise_level(beta)
89
+ for data in loader:
90
+ mel, audio = data
91
+ y_hat = model.inference(mel.cuda() if args.use_cuda else mel)
92
+
93
+ if args.use_cuda:
94
+ y_hat = y_hat.cpu()
95
+ y_hat = y_hat.numpy()
96
+
97
+ mel_hat = []
98
+ for i in range(y_hat.shape[0]):
99
+ m = ap.melspectrogram(y_hat[i, 0])[:, :-1]
100
+ mel_hat.append(torch.from_numpy(m))
101
+
102
+ mel_hat = torch.stack(mel_hat)
103
+ mse = torch.sum((mel - mel_hat) ** 2).mean()
104
+ if mse.item() < best_error:
105
+ best_error = mse.item()
106
+ best_schedule = {"beta": beta}
107
+ print(f" > Found a better schedule. - MSE: {mse.item()}")
108
+ np.save(args.output_path, best_schedule)
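The best schedule is saved as a pickled dict, so applying it later looks roughly like the sketch below. The file name is illustrative, and restoring the WaveGrad checkpoint weights is left to the surrounding inference code:

```python
# Minimal sketch: load the saved {"beta": ...} schedule and set it on a WaveGrad
# model before inference (np.save on a dict requires allow_pickle=True to load).
import numpy as np

from TTS.config import load_config
from TTS.vocoder.models import setup_model

config = load_config("path/to/wavegrad_config.json")
model = setup_model(config)
# ... restore model weights here as in your inference pipeline ...

best_schedule = np.load("best_noise_schedule.npy", allow_pickle=True).item()
model.compute_noise_level(best_schedule["beta"])
```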
TTS/config/__init__.py ADDED
@@ -0,0 +1,139 @@
1
+ import json
2
+ import os
3
+ import re
4
+ from typing import Any, Dict, Union
5
+
6
+ import fsspec
7
+ import yaml
8
+ from coqpit import Coqpit
9
+
10
+ from TTS.config.shared_configs import *
11
+ from TTS.utils.generic_utils import find_module
12
+
13
+
14
+ def read_json_with_comments(json_path):
15
+ """for backward compat."""
16
+ # fallback to json
17
+ with fsspec.open(json_path, "r", encoding="utf-8") as f:
18
+ input_str = f.read()
19
+ # handle comments but not urls with //
20
+ input_str = re.sub(
21
+ r"(\"(?:[^\"\\]|\\.)*\")|(/\*(?:.|[\\n\\r])*?\*/)|(//.*)", lambda m: m.group(1) or m.group(2) or "", input_str
22
+ )
23
+ return json.loads(input_str)
24
+
25
+
26
+ def register_config(model_name: str) -> Coqpit:
27
+ """Find the right config for the given model name.
28
+
29
+ Args:
30
+ model_name (str): Model name.
31
+
32
+ Raises:
33
+ ModuleNotFoundError: No matching config for the model name.
34
+
35
+ Returns:
36
+ Coqpit: config class.
37
+ """
38
+ config_class = None
39
+ config_name = model_name + "_config"
40
+
41
+ # TODO: fix this
42
+ if model_name == "xtts":
43
+ from TTS.tts.configs.xtts_config import XttsConfig
44
+
45
+ config_class = XttsConfig
46
+ paths = ["TTS.tts.configs", "TTS.vocoder.configs", "TTS.encoder.configs", "TTS.vc.configs"]
47
+ for path in paths:
48
+ try:
49
+ config_class = find_module(path, config_name)
50
+ except ModuleNotFoundError:
51
+ pass
52
+ if config_class is None:
53
+ raise ModuleNotFoundError(f" [!] Config for {model_name} cannot be found.")
54
+ return config_class
55
+
56
+
57
+ def _process_model_name(config_dict: Dict) -> str:
58
+ """Format the model name as expected. It is a band-aid for the old `vocoder` model names.
59
+
60
+ Args:
61
+ config_dict (Dict): A dictionary including the config fields.
62
+
63
+ Returns:
64
+ str: Formatted model name.
65
+ """
66
+ model_name = config_dict["model"] if "model" in config_dict else config_dict["generator_model"]
67
+ model_name = model_name.replace("_generator", "").replace("_discriminator", "")
68
+ return model_name
69
+
70
+
71
+ def load_config(config_path: Union[str, os.PathLike[Any]]) -> Coqpit:
72
+ """Import `json` or `yaml` files as TTS configs. First, load the input file as a `dict` and check the model name
73
+ to find the corresponding Config class. Then initialize the Config.
74
+
75
+ Args:
76
+ config_path (str): path to the config file.
77
+
78
+ Raises:
79
+ TypeError: given config file has an unknown type.
80
+
81
+ Returns:
82
+ Coqpit: TTS config object.
83
+ """
84
+ config_path = str(config_path)
85
+ config_dict = {}
86
+ ext = os.path.splitext(config_path)[1]
87
+ if ext in (".yml", ".yaml"):
88
+ with fsspec.open(config_path, "r", encoding="utf-8") as f:
89
+ data = yaml.safe_load(f)
90
+ elif ext == ".json":
91
+ try:
92
+ with fsspec.open(config_path, "r", encoding="utf-8") as f:
93
+ data = json.load(f)
94
+ except json.decoder.JSONDecodeError:
95
+ # backwards compat.
96
+ data = read_json_with_comments(config_path)
97
+ else:
98
+ raise TypeError(f" [!] Unknown config file type {ext}")
99
+ config_dict.update(data)
100
+ model_name = _process_model_name(config_dict)
101
+ config_class = register_config(model_name.lower())
102
+ config = config_class()
103
+ config.from_dict(config_dict)
104
+ return config
105
+
106
+
107
+ def check_config_and_model_args(config, arg_name, value):
108
+ """Check the given argument in `config.model_args`, if it exists, or in `config` for
109
+ the given value.
110
+
111
+ Return False if the argument does not exist in `config.model_args` or `config`.
112
+ This is to patch up the compatibility between models with and without `model_args`.
113
+
114
+ TODO: Remove this in the future with a unified approach.
115
+ """
116
+ if hasattr(config, "model_args"):
117
+ if arg_name in config.model_args:
118
+ return config.model_args[arg_name] == value
119
+ if hasattr(config, arg_name):
120
+ return config[arg_name] == value
121
+ return False
122
+
123
+
124
+ def get_from_config_or_model_args(config, arg_name):
125
+ """Get the given argument from `config.model_args` if it exists, otherwise from `config`."""
126
+ if hasattr(config, "model_args"):
127
+ if arg_name in config.model_args:
128
+ return config.model_args[arg_name]
129
+ return config[arg_name]
130
+
131
+
132
+ def get_from_config_or_model_args_with_default(config, arg_name, def_val):
133
+ """Get the given argument from `config.model_args` if it exists, otherwise from `config`, falling back to `def_val`."""
134
+ if hasattr(config, "model_args"):
135
+ if arg_name in config.model_args:
136
+ return config.model_args[arg_name]
137
+ if hasattr(config, arg_name):
138
+ return config[arg_name]
139
+ return def_val
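A minimal sketch of the helpers above; the config path and the queried field name are illustrative:

```python
# Minimal sketch: load a config and read a field that may live either at the
# top level or under `model_args`.
from TTS.config import load_config, get_from_config_or_model_args_with_default

config = load_config("path/to/config.json")
use_d_vectors = get_from_config_or_model_args_with_default(config, "use_d_vector_file", False)
print(type(config).__name__, use_d_vectors)
```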
TTS/config/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (4.32 kB). View file
 
TTS/config/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (6.83 kB). View file
 
TTS/config/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (4.26 kB). View file
 
TTS/config/__pycache__/shared_configs.cpython-310.pyc ADDED
Binary file (9.52 kB). View file
 
TTS/config/__pycache__/shared_configs.cpython-311.pyc ADDED
Binary file (11.9 kB). View file
 
TTS/config/__pycache__/shared_configs.cpython-39.pyc ADDED
Binary file (9.52 kB). View file
 
TTS/config/shared_configs.py ADDED
@@ -0,0 +1,268 @@
1
+ from dataclasses import asdict, dataclass
2
+ from typing import List
3
+
4
+ from coqpit import Coqpit, check_argument
5
+ from trainer import TrainerConfig
6
+
7
+
8
+ @dataclass
9
+ class BaseAudioConfig(Coqpit):
10
+ """Base config to define audio processing parameters. It is used to initialize
11
+ ```TTS.utils.audio.AudioProcessor.```
12
+
13
+ Args:
14
+ fft_size (int):
15
+ Number of STFT frequency bins, i.e. the size of the linear spectrogram frame. Defaults to 1024.
16
+
17
+ win_length (int):
18
+ Each frame of audio is windowed by window of length ```win_length``` and then padded with zeros to match
19
+ ```fft_size```. Defaults to 1024.
20
+
21
+ hop_length (int):
22
+ Number of audio samples between adjacent STFT columns. Defaults to 256.
23
+
24
+ frame_shift_ms (int):
25
+ Set ```hop_length``` based on milliseconds and sampling rate.
26
+
27
+ frame_length_ms (int):
28
+ Set ```win_length``` based on milliseconds and sampling rate.
29
+
30
+ stft_pad_mode (str):
31
+ Padding method used in STFT. 'reflect' or 'center'. Defaults to 'reflect'.
32
+
33
+ sample_rate (int):
34
+ Audio sampling rate. Defaults to 22050.
35
+
36
+ resample (bool):
37
+ Enable / Disable resampling audio to ```sample_rate```. Defaults to ```False```.
38
+
39
+ preemphasis (float):
40
+ Preemphasis coefficient. Defaults to 0.0.
41
+
42
+ ref_level_db (int):
43
+ Reference dB level to rebase the audio signal and ignore levels below it. 20 dB is assumed to be the sound of air.
44
+ Defaults to 20.
45
+
46
+ do_sound_norm (bool):
47
+ Enable / Disable sound normalization to reconcile the volume differences among samples. Defaults to False.
48
+
49
+ log_func (str):
50
+ Numpy log function used for amplitude to DB conversion. Defaults to 'np.log10'.
51
+
52
+ do_trim_silence (bool):
53
+ Enable / Disable trimming silences at the beginning and the end of the audio clip. Defaults to ```True```.
54
+
55
+ do_amp_to_db_linear (bool, optional):
56
+ enable/disable amplitude to dB conversion of linear spectrograms. Defaults to True.
57
+
58
+ do_amp_to_db_mel (bool, optional):
59
+ enable/disable amplitude to dB conversion of mel spectrograms. Defaults to True.
60
+
61
+ pitch_fmax (float, optional):
62
+ Maximum frequency of the F0 frames. Defaults to ```640```.
63
+
64
+ pitch_fmin (float, optional):
65
+ Minimum frequency of the F0 frames. Defaults to ```1```.
66
+
67
+ trim_db (int):
68
+ Silence threshold used for silence trimming. Defaults to 45.
69
+
70
+ do_rms_norm (bool, optional):
71
+ enable/disable RMS volume normalization when loading an audio file. Defaults to False.
72
+
73
+ db_level (int, optional):
74
+ dB level used for rms normalization. The range is -99 to 0. Defaults to None.
75
+
76
+ power (float):
77
+ Exponent used for expanding spectrogram levels before running Griffin-Lim. It helps to reduce the
78
+ artifacts in the synthesized voice. Defaults to 1.5.
79
+
80
+ griffin_lim_iters (int):
81
+ Number of Griffin-Lim iterations. Defaults to 60.
82
+
83
+ num_mels (int):
84
+ Number of mel-basis filters that defines the number of bands in each mel-spectrogram frame. Defaults to 80.
85
+
86
+ mel_fmin (float): Min frequency level used for the mel-basis filters. ~50 for male and ~95 for female voices.
87
+ It needs to be adjusted for a dataset. Defaults to 0.
88
+
89
+ mel_fmax (float):
90
+ Max frequency level used for the mel-basis filters. It needs to be adjusted for a dataset.
91
+
92
+ spec_gain (int):
93
+ Gain applied when converting amplitude to DB. Defaults to 20.
94
+
95
+ signal_norm (bool):
96
+ enable/disable signal normalization. Defaults to True.
97
+
98
+ min_level_db (int):
99
+ minimum db threshold for the computed melspectrograms. Defaults to -100.
100
+
101
+ symmetric_norm (bool):
102
+ enable/disable symmetric normalization. If set True normalization is performed in the range [-k, k] else
103
+ [0, k], Defaults to True.
104
+
105
+ max_norm (float):
106
+ ```k``` defining the normalization range. Defaults to 4.0.
107
+
108
+ clip_norm (bool):
109
+ enable/disable clipping the out-of-range values in the normalized audio signal. Defaults to True.
110
+
111
+ stats_path (str):
112
+ Path to the computed stats file. Defaults to None.
113
+ """
114
+
115
+ # stft parameters
116
+ fft_size: int = 1024
117
+ win_length: int = 1024
118
+ hop_length: int = 256
119
+ frame_shift_ms: int = None
120
+ frame_length_ms: int = None
121
+ stft_pad_mode: str = "reflect"
122
+ # audio processing parameters
123
+ sample_rate: int = 22050
124
+ resample: bool = False
125
+ preemphasis: float = 0.0
126
+ ref_level_db: int = 20
127
+ do_sound_norm: bool = False
128
+ log_func: str = "np.log10"
129
+ # silence trimming
130
+ do_trim_silence: bool = True
131
+ trim_db: int = 45
132
+ # rms volume normalization
133
+ do_rms_norm: bool = False
134
+ db_level: float = None
135
+ # griffin-lim params
136
+ power: float = 1.5
137
+ griffin_lim_iters: int = 60
138
+ # mel-spec params
139
+ num_mels: int = 80
140
+ mel_fmin: float = 0.0
141
+ mel_fmax: float = None
142
+ spec_gain: int = 20
143
+ do_amp_to_db_linear: bool = True
144
+ do_amp_to_db_mel: bool = True
145
+ # f0 params
146
+ pitch_fmax: float = 640.0
147
+ pitch_fmin: float = 1.0
148
+ # normalization params
149
+ signal_norm: bool = True
150
+ min_level_db: int = -100
151
+ symmetric_norm: bool = True
152
+ max_norm: float = 4.0
153
+ clip_norm: bool = True
154
+ stats_path: str = None
155
+
156
+ def check_values(
157
+ self,
158
+ ):
159
+ """Check config fields"""
160
+ c = asdict(self)
161
+ check_argument("num_mels", c, restricted=True, min_val=10, max_val=2056)
162
+ check_argument("fft_size", c, restricted=True, min_val=128, max_val=4058)
163
+ check_argument("sample_rate", c, restricted=True, min_val=512, max_val=100000)
164
+ check_argument(
165
+ "frame_length_ms",
166
+ c,
167
+ restricted=True,
168
+ min_val=10,
169
+ max_val=1000,
170
+ alternative="win_length",
171
+ )
172
+ check_argument("frame_shift_ms", c, restricted=True, min_val=1, max_val=1000, alternative="hop_length")
173
+ check_argument("preemphasis", c, restricted=True, min_val=0, max_val=1)
174
+ check_argument("min_level_db", c, restricted=True, min_val=-1000, max_val=10)
175
+ check_argument("ref_level_db", c, restricted=True, min_val=0, max_val=1000)
176
+ check_argument("power", c, restricted=True, min_val=1, max_val=5)
177
+ check_argument("griffin_lim_iters", c, restricted=True, min_val=10, max_val=1000)
178
+
179
+ # normalization parameters
180
+ check_argument("signal_norm", c, restricted=True)
181
+ check_argument("symmetric_norm", c, restricted=True)
182
+ check_argument("max_norm", c, restricted=True, min_val=0.1, max_val=1000)
183
+ check_argument("clip_norm", c, restricted=True)
184
+ check_argument("mel_fmin", c, restricted=True, min_val=0.0, max_val=1000)
185
+ check_argument("mel_fmax", c, restricted=True, min_val=500.0, allow_none=True)
186
+ check_argument("spec_gain", c, restricted=True, min_val=1, max_val=100)
187
+ check_argument("do_trim_silence", c, restricted=True)
188
+ check_argument("trim_db", c, restricted=True)
189
+
190
+
191
+ @dataclass
192
+ class BaseDatasetConfig(Coqpit):
193
+ """Base config for TTS datasets.
194
+
195
+ Args:
196
+ formatter (str):
197
+ Formatter name that defines used formatter in ```TTS.tts.datasets.formatter```. Defaults to `""`.
198
+
199
+ dataset_name (str):
200
+ Unique name for the dataset. Defaults to `""`.
201
+
202
+ path (str):
203
+ Root path to the dataset files. Defaults to `""`.
204
+
205
+ meta_file_train (str):
206
+ Name of the dataset meta file. Or a list of speakers to be ignored at training for multi-speaker datasets.
207
+ Defaults to `""`.
208
+
209
+ ignored_speakers (List):
210
+ List of speakers IDs that are not used at the training. Default None.
211
+
212
+ language (str):
213
+ Language code of the dataset. If defined, it overrides `phoneme_language`. Defaults to `""`.
214
+
215
+ phonemizer (str):
216
+ Phonemizer used for that dataset's language. By default it uses `DEF_LANG_TO_PHONEMIZER`. Defaults to `""`.
217
+
218
+ meta_file_val (str):
219
+ Name of the dataset meta file that defines the instances used at validation.
220
+
221
+ meta_file_attn_mask (str):
222
+ Path to the file that lists the attention mask files used with models that require attention masks to
223
+ train the duration predictor.
224
+ """
225
+
226
+ formatter: str = ""
227
+ dataset_name: str = ""
228
+ path: str = ""
229
+ meta_file_train: str = ""
230
+ ignored_speakers: List[str] = None
231
+ language: str = ""
232
+ phonemizer: str = ""
233
+ meta_file_val: str = ""
234
+ meta_file_attn_mask: str = ""
235
+
236
+ def check_values(
237
+ self,
238
+ ):
239
+ """Check config fields"""
240
+ c = asdict(self)
241
+ check_argument("formatter", c, restricted=True)
242
+ check_argument("path", c, restricted=True)
243
+ check_argument("meta_file_train", c, restricted=True)
244
+ check_argument("meta_file_val", c, restricted=False)
245
+ check_argument("meta_file_attn_mask", c, restricted=False)
246
+
247
+
248
+ @dataclass
249
+ class BaseTrainingConfig(TrainerConfig):
250
+ """Base config to define the basic 🐸TTS training parameters that are shared
251
+ among all the models. It is based on ```Trainer.TrainingConfig```.
252
+
253
+ Args:
254
+ model (str):
255
+ Name of the model that is used in the training.
256
+
257
+ num_loader_workers (int):
258
+ Number of workers for training time dataloader.
259
+
260
+ num_eval_loader_workers (int):
261
+ Number of workers for evaluation time dataloader.
262
+ """
263
+
264
+ model: str = None
265
+ # dataloading
266
+ num_loader_workers: int = 0
267
+ num_eval_loader_workers: int = 0
268
+ use_noise_augment: bool = False
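A minimal sketch of instantiating the shared configs above; the formatter name, dataset path, and metadata file are illustrative:

```python
# Minimal sketch: build audio and dataset configs and validate the audio fields.
from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig

audio_config = BaseAudioConfig(sample_rate=22050, num_mels=80, do_trim_silence=True)
audio_config.check_values()

dataset_config = BaseDatasetConfig(
    formatter="ljspeech",
    dataset_name="ljspeech",
    path="/data/LJSpeech-1.1/",
    meta_file_train="metadata.csv",
    language="en",
)
dataset_config.check_values()
```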
TTS/demos/xtts_ft_demo/requirements.txt ADDED
@@ -0,0 +1,2 @@
1
+ faster_whisper==0.9.0
2
+ gradio==4.7.1
TTS/demos/xtts_ft_demo/utils/formatter.py ADDED
@@ -0,0 +1,161 @@
1
+ import gc
2
+ import os
3
+
4
+ import pandas
5
+ import torch
6
+ import torchaudio
7
+ from faster_whisper import WhisperModel
8
+ from tqdm import tqdm
9
+
10
+ # torch.set_num_threads(1)
11
+ from TTS.tts.layers.xtts.tokenizer import multilingual_cleaners
12
+
13
+ torch.set_num_threads(16)
14
+
15
+ audio_types = (".wav", ".mp3", ".flac")
16
+
17
+
18
+ def list_audios(basePath, contains=None):
19
+ # return the set of files that are valid
20
+ return list_files(basePath, validExts=audio_types, contains=contains)
21
+
22
+
23
+ def list_files(basePath, validExts=None, contains=None):
24
+ # loop over the directory structure
25
+ for rootDir, dirNames, filenames in os.walk(basePath):
26
+ # loop over the filenames in the current directory
27
+ for filename in filenames:
28
+ # if the contains string is not none and the filename does not contain
29
+ # the supplied string, then ignore the file
30
+ if contains is not None and filename.find(contains) == -1:
31
+ continue
32
+
33
+ # determine the file extension of the current file
34
+ ext = filename[filename.rfind(".") :].lower()
35
+
36
+ # check to see if the file is an audio and should be processed
37
+ if validExts is None or ext.endswith(validExts):
38
+ # construct the path to the audio and yield it
39
+ audioPath = os.path.join(rootDir, filename)
40
+ yield audioPath
41
+
42
+
43
+ def format_audio_list(
44
+ audio_files,
45
+ target_language="en",
46
+ out_path=None,
47
+ buffer=0.2,
48
+ eval_percentage=0.15,
49
+ speaker_name="coqui",
50
+ gradio_progress=None,
51
+ ):
52
+ audio_total_size = 0
53
+ # make sure that the output directory exists
54
+ os.makedirs(out_path, exist_ok=True)
55
+
56
+ # Loading Whisper
57
+ device = "cuda" if torch.cuda.is_available() else "cpu"
58
+
59
+ print("Loading Whisper Model!")
60
+ asr_model = WhisperModel("large-v2", device=device, compute_type="float16")
61
+
62
+ metadata = {"audio_file": [], "text": [], "speaker_name": []}
63
+
64
+ if gradio_progress is not None:
65
+ tqdm_object = gradio_progress.tqdm(audio_files, desc="Formatting...")
66
+ else:
67
+ tqdm_object = tqdm(audio_files)
68
+
69
+ for audio_path in tqdm_object:
70
+ wav, sr = torchaudio.load(audio_path)
71
+ # stereo to mono if needed
72
+ if wav.size(0) != 1:
73
+ wav = torch.mean(wav, dim=0, keepdim=True)
74
+
75
+ wav = wav.squeeze()
76
+ audio_total_size += wav.size(-1) / sr
77
+
78
+ segments, _ = asr_model.transcribe(audio_path, word_timestamps=True, language=target_language)
79
+ segments = list(segments)
80
+ i = 0
81
+ sentence = ""
82
+ sentence_start = None
83
+ first_word = True
84
+ # add all segment words to a single list
85
+ words_list = []
86
+ for _, segment in enumerate(segments):
87
+ words = list(segment.words)
88
+ words_list.extend(words)
89
+
90
+ # process each word
91
+ for word_idx, word in enumerate(words_list):
92
+ if first_word:
93
+ sentence_start = word.start
94
+ # If it is the first sentence, add a buffer or use the beginning of the file
95
+ if word_idx == 0:
96
+ sentence_start = max(sentence_start - buffer, 0) # Add buffer to the sentence start
97
+ else:
98
+ # get previous sentence end
99
+ previous_word_end = words_list[word_idx - 1].end
100
+ # add a buffer or use the middle of the silence between the previous sentence and the current one
101
+ sentence_start = max(sentence_start - buffer, (previous_word_end + sentence_start) / 2)
102
+
103
+ sentence = word.word
104
+ first_word = False
105
+ else:
106
+ sentence += word.word
107
+
108
+ if word.word[-1] in ["!", ".", "?"]:
109
+ sentence = sentence[1:]
110
+ # Expand number and abbreviations plus normalization
111
+ sentence = multilingual_cleaners(sentence, target_language)
112
+ audio_file_name, _ = os.path.splitext(os.path.basename(audio_path))
113
+
114
+ audio_file = f"wavs/{audio_file_name}_{str(i).zfill(8)}.wav"
115
+
116
+ # Check for the next word's existence
117
+ if word_idx + 1 < len(words_list):
118
+ next_word_start = words_list[word_idx + 1].start
119
+ else:
120
+ # If there are no more words, this is the last sentence, so use the audio length as the next word start
121
+ next_word_start = (wav.shape[0] - 1) / sr
122
+
123
+ # Average the current word end and next word start
124
+ word_end = min((word.end + next_word_start) / 2, word.end + buffer)
125
+
126
+ absoulte_path = os.path.join(out_path, audio_file)
127
+ os.makedirs(os.path.dirname(absoulte_path), exist_ok=True)
128
+ i += 1
129
+ first_word = True
130
+
131
+ audio = wav[int(sr * sentence_start) : int(sr * word_end)].unsqueeze(0)
132
+ # if the audio is too short ignore it (i.e < 0.33 seconds)
133
+ if audio.size(-1) >= sr / 3:
134
+ torchaudio.save(absoulte_path, audio, sr)
135
+ else:
136
+ continue
137
+
138
+ metadata["audio_file"].append(audio_file)
139
+ metadata["text"].append(sentence)
140
+ metadata["speaker_name"].append(speaker_name)
141
+
142
+ df = pandas.DataFrame(metadata)
143
+ df = df.sample(frac=1)
144
+ num_val_samples = int(len(df) * eval_percentage)
145
+
146
+ df_eval = df[:num_val_samples]
147
+ df_train = df[num_val_samples:]
148
+
149
+ df_train = df_train.sort_values("audio_file")
150
+ train_metadata_path = os.path.join(out_path, "metadata_train.csv")
151
+ df_train.to_csv(train_metadata_path, sep="|", index=False)
152
+
153
+ eval_metadata_path = os.path.join(out_path, "metadata_eval.csv")
154
+ df_eval = df_eval.sort_values("audio_file")
155
+ df_eval.to_csv(eval_metadata_path, sep="|", index=False)
156
+
157
+ # deallocate VRAM and RAM
158
+ del asr_model, df_train, df_eval, df, metadata
159
+ gc.collect()
160
+
161
+ return train_metadata_path, eval_metadata_path, audio_total_size
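For reference, here is a minimal sketch of calling `format_audio_list` outside the Gradio demo, mirroring the call made in `xtts_demo.py` below; the audio file paths are hypothetical, and any parameters not passed here (e.g. the buffer, eval split percentage, and speaker name) are assumed to keep the defaults defined earlier in `formatter.py`.

```python
# Minimal usage sketch for format_audio_list (paths are hypothetical).
from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list

audio_files = ["/data/voice/clip_01.wav", "/data/voice/clip_02.wav"]
train_csv, eval_csv, total_seconds = format_audio_list(
    audio_files,
    target_language="en",
    out_path="/tmp/xtts_ft/dataset",
    gradio_progress=None,  # without Gradio, a plain tqdm progress bar is used
)
print(train_csv, eval_csv, total_seconds)  # metadata CSV paths and total audio length in seconds
```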
TTS/demos/xtts_ft_demo/utils/gpt_train.py ADDED
@@ -0,0 +1,172 @@
1
+ import gc
2
+ import os
3
+
4
+ from trainer import Trainer, TrainerArgs
5
+
6
+ from TTS.config.shared_configs import BaseDatasetConfig
7
+ from TTS.tts.datasets import load_tts_samples
8
+ from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig
9
+ from TTS.tts.models.xtts import XttsAudioConfig
10
+ from TTS.utils.manage import ModelManager
11
+
12
+
13
+ def train_gpt(language, num_epochs, batch_size, grad_acumm, train_csv, eval_csv, output_path, max_audio_length=255995):
14
+ # Logging parameters
15
+ RUN_NAME = "GPT_XTTS_FT"
16
+ PROJECT_NAME = "XTTS_trainer"
17
+ DASHBOARD_LOGGER = "tensorboard"
18
+ LOGGER_URI = None
19
+
20
+ # Set here the path where the checkpoints will be saved. Default: ./run/training/
21
+ OUT_PATH = os.path.join(output_path, "run", "training")
22
+
23
+ # Training Parameters
24
+ OPTIMIZER_WD_ONLY_ON_WEIGHTS = True # set to False for multi-GPU training
25
+ START_WITH_EVAL = False # if True it will start with evaluation
26
+ BATCH_SIZE = batch_size # set here the batch size
27
+ GRAD_ACUMM_STEPS = grad_acumm # set here the grad accumulation steps
28
+
29
+ # Define here the dataset that you want to use for fine-tuning.
30
+ config_dataset = BaseDatasetConfig(
31
+ formatter="coqui",
32
+ dataset_name="ft_dataset",
33
+ path=os.path.dirname(train_csv),
34
+ meta_file_train=train_csv,
35
+ meta_file_val=eval_csv,
36
+ language=language,
37
+ )
38
+
39
+ # Add here the configs of the datasets
40
+ DATASETS_CONFIG_LIST = [config_dataset]
41
+
42
+ # Define the path where XTTS v2.0.1 files will be downloaded
43
+ CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
44
+ os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
45
+
46
+ # DVAE files
47
+ DVAE_CHECKPOINT_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/dvae.pth"
48
+ MEL_NORM_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/mel_stats.pth"
49
+
50
+ # Set the path to the downloaded files
51
+ DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
52
+ MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
53
+
54
+ # download DVAE files if needed
55
+ if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
56
+ print(" > Downloading DVAE files!")
57
+ ModelManager._download_model_files(
58
+ [MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
59
+ )
60
+
61
+ # Download XTTS v2.0 checkpoint if needed
62
+ TOKENIZER_FILE_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/vocab.json"
63
+ XTTS_CHECKPOINT_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/model.pth"
64
+ XTTS_CONFIG_LINK = "https://huggingface.co/coqui/XTTS-v2/resolve/main/config.json"
65
+
66
+ # XTTS transfer learning parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune.
67
+ TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK)) # vocab.json file
68
+ XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK)) # model.pth file
69
+ XTTS_CONFIG_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CONFIG_LINK)) # config.json file
70
+
71
+ # download XTTS v2.0 files if needed
72
+ if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
73
+ print(" > Downloading XTTS v2.0 files!")
74
+ ModelManager._download_model_files(
75
+ [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK, XTTS_CONFIG_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
76
+ )
77
+
78
+ # init args and config
79
+ model_args = GPTArgs(
80
+ max_conditioning_length=132300, # 6 secs
81
+ min_conditioning_length=66150, # 3 secs
82
+ debug_loading_failures=False,
83
+ max_wav_length=max_audio_length, # ~11.6 seconds
84
+ max_text_length=200,
85
+ mel_norm_file=MEL_NORM_FILE,
86
+ dvae_checkpoint=DVAE_CHECKPOINT,
87
+ xtts_checkpoint=XTTS_CHECKPOINT, # checkpoint path of the model that you want to fine-tune
88
+ tokenizer_file=TOKENIZER_FILE,
89
+ gpt_num_audio_tokens=1026,
90
+ gpt_start_audio_token=1024,
91
+ gpt_stop_audio_token=1025,
92
+ gpt_use_masking_gt_prompt_approach=True,
93
+ gpt_use_perceiver_resampler=True,
94
+ )
95
+ # define audio config
96
+ audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
97
+ # training parameters config
98
+ config = GPTTrainerConfig(
99
+ epochs=num_epochs,
100
+ output_path=OUT_PATH,
101
+ model_args=model_args,
102
+ run_name=RUN_NAME,
103
+ project_name=PROJECT_NAME,
104
+ run_description="""
105
+ GPT XTTS training
106
+ """,
107
+ dashboard_logger=DASHBOARD_LOGGER,
108
+ logger_uri=LOGGER_URI,
109
+ audio=audio_config,
110
+ batch_size=BATCH_SIZE,
111
+ batch_group_size=48,
112
+ eval_batch_size=BATCH_SIZE,
113
+ num_loader_workers=8,
114
+ eval_split_max_size=256,
115
+ print_step=50,
116
+ plot_step=100,
117
+ log_model_step=100,
118
+ save_step=1000,
119
+ save_n_checkpoints=1,
120
+ save_checkpoints=True,
121
+ # target_loss="loss",
122
+ print_eval=False,
123
+ # Optimizer values like Tortoise; PyTorch implementation with modifications so that WD is not applied to non-weight parameters.
124
+ optimizer="AdamW",
125
+ optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
126
+ optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
127
+ lr=5e-06, # learning rate
128
+ lr_scheduler="MultiStepLR",
129
+ # adjusted accordingly for the new step scheme
130
+ lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
131
+ test_sentences=[],
132
+ )
133
+
134
+ # init the model from config
135
+ model = GPTTrainer.init_from_config(config)
136
+
137
+ # load training samples
138
+ train_samples, eval_samples = load_tts_samples(
139
+ DATASETS_CONFIG_LIST,
140
+ eval_split=True,
141
+ eval_split_max_size=config.eval_split_max_size,
142
+ eval_split_size=config.eval_split_size,
143
+ )
144
+
145
+ # init the trainer and 🚀
146
+ trainer = Trainer(
147
+ TrainerArgs(
148
+ restore_path=None, # the XTTS checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it with the Trainer's restore_path parameter
149
+ skip_train_epoch=False,
150
+ start_with_eval=START_WITH_EVAL,
151
+ grad_accum_steps=GRAD_ACUMM_STEPS,
152
+ ),
153
+ config,
154
+ output_path=OUT_PATH,
155
+ model=model,
156
+ train_samples=train_samples,
157
+ eval_samples=eval_samples,
158
+ )
159
+ trainer.fit()
160
+
161
+ # get the longest text audio file to use as speaker reference
162
+ samples_len = [len(item["text"].split(" ")) for item in train_samples]
163
+ longest_text_idx = samples_len.index(max(samples_len))
164
+ speaker_ref = train_samples[longest_text_idx]["audio_file"]
165
+
166
+ trainer_out_path = trainer.output_path
167
+
168
+ # deallocate VRAM and RAM
169
+ del model, trainer, train_samples, eval_samples
170
+ gc.collect()
171
+
172
+ return XTTS_CONFIG_FILE, XTTS_CHECKPOINT, TOKENIZER_FILE, trainer_out_path, speaker_ref
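Below is a minimal sketch of calling `train_gpt` directly, mirroring how `train_model` in `xtts_demo.py` invokes it; the CSV and output paths are placeholders, and `max_audio_length` is given in waveform samples at 22050 Hz, as the demo computes it.

```python
# Minimal usage sketch for train_gpt (paths are placeholders).
from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt

config_path, xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(
    language="en",
    num_epochs=10,
    batch_size=4,
    grad_acumm=1,
    train_csv="/tmp/xtts_ft/dataset/metadata_train.csv",
    eval_csv="/tmp/xtts_ft/dataset/metadata_eval.csv",
    output_path="/tmp/xtts_ft/",
    max_audio_length=int(11 * 22050),  # 11 seconds expressed in samples at 22050 Hz
)
```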
TTS/demos/xtts_ft_demo/xtts_demo.py ADDED
@@ -0,0 +1,433 @@
1
+ import argparse
2
+ import logging
3
+ import os
4
+ import sys
5
+ import tempfile
6
+ import traceback
7
+
8
+ import gradio as gr
9
+ import torch
10
+ import torchaudio
11
+
12
+ from TTS.demos.xtts_ft_demo.utils.formatter import format_audio_list
13
+ from TTS.demos.xtts_ft_demo.utils.gpt_train import train_gpt
14
+ from TTS.tts.configs.xtts_config import XttsConfig
15
+ from TTS.tts.models.xtts import Xtts
16
+
17
+
18
+ def clear_gpu_cache():
19
+ # clear the GPU cache
20
+ if torch.cuda.is_available():
21
+ torch.cuda.empty_cache()
22
+
23
+
24
+ XTTS_MODEL = None
25
+
26
+
27
+ def load_model(xtts_checkpoint, xtts_config, xtts_vocab):
28
+ global XTTS_MODEL
29
+ clear_gpu_cache()
30
+ if not xtts_checkpoint or not xtts_config or not xtts_vocab:
31
+ return "You need to run the previous steps or manually set the `XTTS checkpoint path`, `XTTS config path`, and `XTTS vocab path` fields !!"
32
+ config = XttsConfig()
33
+ config.load_json(xtts_config)
34
+ XTTS_MODEL = Xtts.init_from_config(config)
35
+ print("Loading XTTS model! ")
36
+ XTTS_MODEL.load_checkpoint(config, checkpoint_path=xtts_checkpoint, vocab_path=xtts_vocab, use_deepspeed=False)
37
+ if torch.cuda.is_available():
38
+ XTTS_MODEL.cuda()
39
+
40
+ print("Model Loaded!")
41
+ return "Model Loaded!"
42
+
43
+
44
+ def run_tts(lang, tts_text, speaker_audio_file):
45
+ if XTTS_MODEL is None or not speaker_audio_file:
46
+ return "You need to run the previous step to load the model !!", None, None
47
+
48
+ gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
49
+ audio_path=speaker_audio_file,
50
+ gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
51
+ max_ref_length=XTTS_MODEL.config.max_ref_len,
52
+ sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
53
+ )
54
+ out = XTTS_MODEL.inference(
55
+ text=tts_text,
56
+ language=lang,
57
+ gpt_cond_latent=gpt_cond_latent,
58
+ speaker_embedding=speaker_embedding,
59
+ temperature=XTTS_MODEL.config.temperature, # Add custom parameters here
60
+ length_penalty=XTTS_MODEL.config.length_penalty,
61
+ repetition_penalty=XTTS_MODEL.config.repetition_penalty,
62
+ top_k=XTTS_MODEL.config.top_k,
63
+ top_p=XTTS_MODEL.config.top_p,
64
+ )
65
+
66
+ with tempfile.NamedTemporaryFile(suffix=".wav", delete=False) as fp:
67
+ out["wav"] = torch.tensor(out["wav"]).unsqueeze(0)
68
+ out_path = fp.name
69
+ torchaudio.save(out_path, out["wav"], 24000)
70
+
71
+ return "Speech generated !", out_path, speaker_audio_file
72
+
73
+
74
+ # define a logger to redirect
75
+ class Logger:
76
+ def __init__(self, filename="log.out"):
77
+ self.log_file = filename
78
+ self.terminal = sys.stdout
79
+ self.log = open(self.log_file, "w")
80
+
81
+ def write(self, message):
82
+ self.terminal.write(message)
83
+ self.log.write(message)
84
+
85
+ def flush(self):
86
+ self.terminal.flush()
87
+ self.log.flush()
88
+
89
+ def isatty(self):
90
+ return False
91
+
92
+
93
+ # redirect stdout and stderr to a file
94
+ sys.stdout = Logger()
95
+ sys.stderr = sys.stdout
96
+
97
+
98
+ # logging.basicConfig(stream=sys.stdout, level=logging.INFO)
99
+
100
+ logging.basicConfig(
101
+ level=logging.INFO, format="%(asctime)s [%(levelname)s] %(message)s", handlers=[logging.StreamHandler(sys.stdout)]
102
+ )
103
+
104
+
105
+ def read_logs():
106
+ sys.stdout.flush()
107
+ with open(sys.stdout.log_file, "r") as f:
108
+ return f.read()
109
+
110
+
111
+ if __name__ == "__main__":
112
+ parser = argparse.ArgumentParser(
113
+ description="""XTTS fine-tuning demo\n\n"""
114
+ """
115
+ Example runs:
116
+ python3 TTS/demos/xtts_ft_demo/xtts_demo.py --port
117
+ """,
118
+ formatter_class=argparse.RawTextHelpFormatter,
119
+ )
120
+ parser.add_argument(
121
+ "--port",
122
+ type=int,
123
+ help="Port to run the gradio demo. Default: 5003",
124
+ default=5003,
125
+ )
126
+ parser.add_argument(
127
+ "--out_path",
128
+ type=str,
129
+ help="Output path (where data and checkpoints will be saved). Default: /tmp/xtts_ft/",
130
+ default="/tmp/xtts_ft/",
131
+ )
132
+
133
+ parser.add_argument(
134
+ "--num_epochs",
135
+ type=int,
136
+ help="Number of epochs to train. Default: 10",
137
+ default=10,
138
+ )
139
+ parser.add_argument(
140
+ "--batch_size",
141
+ type=int,
142
+ help="Batch size. Default: 4",
143
+ default=4,
144
+ )
145
+ parser.add_argument(
146
+ "--grad_acumm",
147
+ type=int,
148
+ help="Grad accumulation steps. Default: 1",
149
+ default=1,
150
+ )
151
+ parser.add_argument(
152
+ "--max_audio_length",
153
+ type=int,
154
+ help="Max permitted audio size in seconds. Default: 11",
155
+ default=11,
156
+ )
157
+
158
+ args = parser.parse_args()
159
+
160
+ with gr.Blocks() as demo:
161
+ with gr.Tab("1 - Data processing"):
162
+ out_path = gr.Textbox(
163
+ label="Output path (where data and checkpoints will be saved):",
164
+ value=args.out_path,
165
+ )
166
+ # upload_file = gr.Audio(
167
+ # sources="upload",
168
+ # label="Select here the audio files that you want to use for XTTS training!",
169
+ # type="filepath",
170
+ # )
171
+ upload_file = gr.File(
172
+ file_count="multiple",
173
+ label="Select here the audio files that you want to use for XTTS training (Supported formats: wav, mp3, and flac)",
174
+ )
175
+ lang = gr.Dropdown(
176
+ label="Dataset Language",
177
+ value="en",
178
+ choices=[
179
+ "en",
180
+ "es",
181
+ "fr",
182
+ "de",
183
+ "it",
184
+ "pt",
185
+ "pl",
186
+ "tr",
187
+ "ru",
188
+ "nl",
189
+ "cs",
190
+ "ar",
191
+ "zh",
192
+ "hu",
193
+ "ko",
194
+ "ja",
195
+ "hi",
196
+ ],
197
+ )
198
+ progress_data = gr.Label(label="Progress:")
199
+ logs = gr.Textbox(
200
+ label="Logs:",
201
+ interactive=False,
202
+ )
203
+ demo.load(read_logs, None, logs, every=1)
204
+
205
+ prompt_compute_btn = gr.Button(value="Step 1 - Create dataset")
206
+
207
+ def preprocess_dataset(audio_path, language, out_path, progress=gr.Progress(track_tqdm=True)):
208
+ clear_gpu_cache()
209
+ out_path = os.path.join(out_path, "dataset")
210
+ os.makedirs(out_path, exist_ok=True)
211
+ if audio_path is None:
212
+ return (
213
+ "You should provide one or more audio files! If you already did, the upload is probably not finished yet!",
214
+ "",
215
+ "",
216
+ )
217
+ else:
218
+ try:
219
+ train_meta, eval_meta, audio_total_size = format_audio_list(
220
+ audio_path, target_language=language, out_path=out_path, gradio_progress=progress
221
+ )
222
+ except:
223
+ traceback.print_exc()
224
+ error = traceback.format_exc()
225
+ return (
226
+ f"The data processing was interrupted due to an error! Please check the console for the full error message.\n Error summary: {error}",
227
+ "",
228
+ "",
229
+ )
230
+
231
+ clear_gpu_cache()
232
+
233
+ # if audio total len is less than 2 minutes raise an error
234
+ if audio_total_size < 120:
235
+ message = "The sum of the duration of the audios that you provided should be at least 2 minutes!"
236
+ print(message)
237
+ return message, "", ""
238
+
239
+ print("Dataset Processed!")
240
+ return "Dataset Processed!", train_meta, eval_meta
241
+
242
+ with gr.Tab("2 - Fine-tuning XTTS Encoder"):
243
+ train_csv = gr.Textbox(
244
+ label="Train CSV:",
245
+ )
246
+ eval_csv = gr.Textbox(
247
+ label="Eval CSV:",
248
+ )
249
+ num_epochs = gr.Slider(
250
+ label="Number of epochs:",
251
+ minimum=1,
252
+ maximum=100,
253
+ step=1,
254
+ value=args.num_epochs,
255
+ )
256
+ batch_size = gr.Slider(
257
+ label="Batch size:",
258
+ minimum=2,
259
+ maximum=512,
260
+ step=1,
261
+ value=args.batch_size,
262
+ )
263
+ grad_acumm = gr.Slider(
264
+ label="Grad accumulation steps:",
265
+ minimum=2,
266
+ maximum=128,
267
+ step=1,
268
+ value=args.grad_acumm,
269
+ )
270
+ max_audio_length = gr.Slider(
271
+ label="Max permitted audio size in seconds:",
272
+ minimum=2,
273
+ maximum=20,
274
+ step=1,
275
+ value=args.max_audio_length,
276
+ )
277
+ progress_train = gr.Label(label="Progress:")
278
+ logs_tts_train = gr.Textbox(
279
+ label="Logs:",
280
+ interactive=False,
281
+ )
282
+ demo.load(read_logs, None, logs_tts_train, every=1)
283
+ train_btn = gr.Button(value="Step 2 - Run the training")
284
+
285
+ def train_model(
286
+ language, train_csv, eval_csv, num_epochs, batch_size, grad_acumm, output_path, max_audio_length
287
+ ):
288
+ clear_gpu_cache()
289
+ if not train_csv or not eval_csv:
290
+ return (
291
+ "You need to run the data processing step or manually set the `Train CSV` and `Eval CSV` fields!",
292
+ "",
293
+ "",
294
+ "",
295
+ "",
296
+ )
297
+ try:
298
+ # convert seconds to waveform frames
299
+ max_audio_length = int(max_audio_length * 22050)
300
+ config_path, original_xtts_checkpoint, vocab_file, exp_path, speaker_wav = train_gpt(
301
+ language,
302
+ num_epochs,
303
+ batch_size,
304
+ grad_acumm,
305
+ train_csv,
306
+ eval_csv,
307
+ output_path=output_path,
308
+ max_audio_length=max_audio_length,
309
+ )
310
+ except:
311
+ traceback.print_exc()
312
+ error = traceback.format_exc()
313
+ return (
314
+ f"The training was interrupted due to an error! Please check the console for the full error message.\n Error summary: {error}",
315
+ "",
316
+ "",
317
+ "",
318
+ "",
319
+ )
320
+
321
+ # copy the original files to avoid issues with parameter changes
322
+ os.system(f"cp {config_path} {exp_path}")
323
+ os.system(f"cp {vocab_file} {exp_path}")
324
+
325
+ ft_xtts_checkpoint = os.path.join(exp_path, "best_model.pth")
326
+ print("Model training done!")
327
+ clear_gpu_cache()
328
+ return "Model training done!", config_path, vocab_file, ft_xtts_checkpoint, speaker_wav
329
+
330
+ with gr.Tab("3 - Inference"):
331
+ with gr.Row():
332
+ with gr.Column() as col1:
333
+ xtts_checkpoint = gr.Textbox(
334
+ label="XTTS checkpoint path:",
335
+ value="",
336
+ )
337
+ xtts_config = gr.Textbox(
338
+ label="XTTS config path:",
339
+ value="",
340
+ )
341
+
342
+ xtts_vocab = gr.Textbox(
343
+ label="XTTS vocab path:",
344
+ value="",
345
+ )
346
+ progress_load = gr.Label(label="Progress:")
347
+ load_btn = gr.Button(value="Step 3 - Load Fine-tuned XTTS model")
348
+
349
+ with gr.Column() as col2:
350
+ speaker_reference_audio = gr.Textbox(
351
+ label="Speaker reference audio:",
352
+ value="",
353
+ )
354
+ tts_language = gr.Dropdown(
355
+ label="Language",
356
+ value="en",
357
+ choices=[
358
+ "en",
359
+ "es",
360
+ "fr",
361
+ "de",
362
+ "it",
363
+ "pt",
364
+ "pl",
365
+ "tr",
366
+ "ru",
367
+ "nl",
368
+ "cs",
369
+ "ar",
370
+ "zh",
371
+ "hu",
372
+ "ko",
373
+ "ja",
374
+ "hi",
375
+ ],
376
+ )
377
+ tts_text = gr.Textbox(
378
+ label="Input Text.",
379
+ value="This model sounds really good and above all, it's reasonably fast.",
380
+ )
381
+ tts_btn = gr.Button(value="Step 4 - Inference")
382
+
383
+ with gr.Column() as col3:
384
+ progress_gen = gr.Label(label="Progress:")
385
+ tts_output_audio = gr.Audio(label="Generated Audio.")
386
+ reference_audio = gr.Audio(label="Reference audio used.")
387
+
388
+ prompt_compute_btn.click(
389
+ fn=preprocess_dataset,
390
+ inputs=[
391
+ upload_file,
392
+ lang,
393
+ out_path,
394
+ ],
395
+ outputs=[
396
+ progress_data,
397
+ train_csv,
398
+ eval_csv,
399
+ ],
400
+ )
401
+
402
+ train_btn.click(
403
+ fn=train_model,
404
+ inputs=[
405
+ lang,
406
+ train_csv,
407
+ eval_csv,
408
+ num_epochs,
409
+ batch_size,
410
+ grad_acumm,
411
+ out_path,
412
+ max_audio_length,
413
+ ],
414
+ outputs=[progress_train, xtts_config, xtts_vocab, xtts_checkpoint, speaker_reference_audio],
415
+ )
416
+
417
+ load_btn.click(
418
+ fn=load_model,
419
+ inputs=[xtts_checkpoint, xtts_config, xtts_vocab],
420
+ outputs=[progress_load],
421
+ )
422
+
423
+ tts_btn.click(
424
+ fn=run_tts,
425
+ inputs=[
426
+ tts_language,
427
+ tts_text,
428
+ speaker_reference_audio,
429
+ ],
430
+ outputs=[progress_gen, tts_output_audio, reference_audio],
431
+ )
432
+
433
+ demo.launch(share=True, debug=False, server_port=args.port, server_name="0.0.0.0")
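As a rough illustration of the Tab 3 flow without the UI, the sketch below calls `load_model` and `run_tts` directly; all paths are placeholders, and note that importing `xtts_demo` also installs the stdout `Logger` defined above (so a `log.out` file is created in the working directory).

```python
# Minimal sketch of the inference flow (Tab 3) without Gradio; paths are placeholders.
from TTS.demos.xtts_ft_demo.xtts_demo import load_model, run_tts

load_model(
    xtts_checkpoint="/tmp/xtts_ft/run/training/best_model.pth",
    xtts_config="/tmp/xtts_ft/run/training/config.json",
    xtts_vocab="/tmp/xtts_ft/run/training/XTTS_v2.0_original_model_files/vocab.json",
)
status, wav_path, ref_path = run_tts(
    lang="en",
    tts_text="This model sounds really good and above all, it's reasonably fast.",
    speaker_audio_file="/tmp/xtts_ft/dataset/wavs/clip_01_00000000.wav",
)
print(status, wav_path)  # "Speech generated !" and the path of the temporary wav file
```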
TTS/encoder/.DS_Store ADDED
Binary file (6.15 kB). View file
 
TTS/encoder/README.md ADDED
@@ -0,0 +1,18 @@
1
+ ### Speaker Encoder
2
+
3
+ This is an implementation of https://arxiv.org/abs/1710.10467. This model can be used for voice and speaker embedding.
4
+
5
+ With the code here you can generate d-vectors for both multi-speaker and single-speaker TTS datasets, then visualise and explore them along with the associated audio files in an interactive chart.
6
+
7
+ Below is an example showing embedding results of various speakers. You can generate the same plot with the provided notebook as demonstrated in [this video](https://youtu.be/KW3oO7JVa7Q).
8
+
9
+ ![](umap.png)
10
+
11
+ Download a pretrained model from [Released Models](https://github.com/mozilla/TTS/wiki/Released-Models) page.
12
+
13
+ To run the code, you need to follow the same flow as in TTS.
14
+
15
+ - Define 'config.json' for your needs. Note that the audio parameters should match your TTS model.
16
+ - Example training call ```python speaker_encoder/train.py --config_path speaker_encoder/config.json --data_path ~/Data/Libri-TTS/train-clean-360```
17
+ - Generate embedding vectors ```python speaker_encoder/compute_embeddings.py --use_cuda /model/path/best_model.pth model/config/path/config.json dataset/path/ output_path``` . This code parses all .wav files at the given dataset path and generates the same folder structure under the output path with the generated embedding files.
18
+ - Watch training on Tensorboard as in TTS
TTS/encoder/__init__.py ADDED
File without changes
TTS/encoder/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (160 Bytes). View file
 
TTS/encoder/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (204 Bytes). View file
 
TTS/encoder/__pycache__/__init__.cpython-39.pyc ADDED
Binary file (165 Bytes). View file
 
TTS/encoder/__pycache__/losses.cpython-310.pyc ADDED
Binary file (7.88 kB). View file
 
TTS/encoder/__pycache__/losses.cpython-311.pyc ADDED
Binary file (13.9 kB). View file
 
TTS/encoder/__pycache__/losses.cpython-39.pyc ADDED
Binary file (7.91 kB). View file
 
TTS/encoder/configs/base_encoder_config.py ADDED
@@ -0,0 +1,61 @@
1
+ from dataclasses import asdict, dataclass, field
2
+ from typing import Dict, List
3
+
4
+ from coqpit import MISSING
5
+
6
+ from TTS.config.shared_configs import BaseAudioConfig, BaseDatasetConfig, BaseTrainingConfig
7
+
8
+
9
+ @dataclass
10
+ class BaseEncoderConfig(BaseTrainingConfig):
11
+ """Defines parameters for a Generic Encoder model."""
12
+
13
+ model: str = None
14
+ audio: BaseAudioConfig = field(default_factory=BaseAudioConfig)
15
+ datasets: List[BaseDatasetConfig] = field(default_factory=lambda: [BaseDatasetConfig()])
16
+ # model params
17
+ model_params: Dict = field(
18
+ default_factory=lambda: {
19
+ "model_name": "lstm",
20
+ "input_dim": 80,
21
+ "proj_dim": 256,
22
+ "lstm_dim": 768,
23
+ "num_lstm_layers": 3,
24
+ "use_lstm_with_projection": True,
25
+ }
26
+ )
27
+
28
+ audio_augmentation: Dict = field(default_factory=lambda: {})
29
+
30
+ # training params
31
+ epochs: int = 10000
32
+ loss: str = "angleproto"
33
+ grad_clip: float = 3.0
34
+ lr: float = 0.0001
35
+ optimizer: str = "radam"
36
+ optimizer_params: Dict = field(default_factory=lambda: {"betas": [0.9, 0.999], "weight_decay": 0})
37
+ lr_decay: bool = False
38
+ warmup_steps: int = 4000
39
+
40
+ # logging params
41
+ tb_model_param_stats: bool = False
42
+ steps_plot_stats: int = 10
43
+ save_step: int = 1000
44
+ print_step: int = 20
45
+ run_eval: bool = False
46
+
47
+ # data loader
48
+ num_classes_in_batch: int = MISSING
49
+ num_utter_per_class: int = MISSING
50
+ eval_num_classes_in_batch: int = None
51
+ eval_num_utter_per_class: int = None
52
+
53
+ num_loader_workers: int = MISSING
54
+ voice_len: float = 1.6
55
+
56
+ def check_values(self):
57
+ super().check_values()
58
+ c = asdict(self)
59
+ assert (
60
+ c["model_params"]["input_dim"] == self.audio.num_mels
61
+ ), " [!] model input dimension must be equal to the melspectrogram dimension."
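A minimal sketch of instantiating `BaseEncoderConfig`: the `MISSING` fields (`num_classes_in_batch`, `num_utter_per_class`, `num_loader_workers`) must be supplied, and the example assumes the default `BaseAudioConfig` (80 mel bands) so that it matches the default `input_dim`; the model name is hypothetical.

```python
# Minimal sketch; assumes the default BaseAudioConfig with num_mels=80.
from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig

config = BaseEncoderConfig(
    model="lstm_encoder",      # hypothetical model name
    num_classes_in_batch=32,   # classes (e.g. speakers) per batch
    num_utter_per_class=4,     # utterances sampled per class
    num_loader_workers=4,
)
# check_values() enforces the assertion above: input_dim must equal audio.num_mels.
config.check_values()
```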
TTS/encoder/configs/emotion_encoder_config.py ADDED
@@ -0,0 +1,12 @@
1
+ from dataclasses import dataclass
2
+
3
+ from TTS.encoder.configs.base_encoder_config import BaseEncoderConfig
4
+
5
+
6
+ @dataclass
7
+ class EmotionEncoderConfig(BaseEncoderConfig):
8
+ """Defines parameters for Emotion Encoder model."""
9
+
10
+ model: str = "emotion_encoder"
11
+ map_classid_to_classname: dict = None
12
+ class_name_key: str = "emotion_name"
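And a minimal sketch of the emotion variant, which reuses the same required fields and only changes the model name and class-label key; the id-to-name mapping below is hypothetical.

```python
# Minimal sketch; the emotion id-to-name mapping is hypothetical.
from TTS.encoder.configs.emotion_encoder_config import EmotionEncoderConfig

config = EmotionEncoderConfig(
    num_classes_in_batch=4,    # emotions per batch
    num_utter_per_class=8,
    num_loader_workers=4,
    map_classid_to_classname={0: "neutral", 1: "happy", 2: "sad", 3: "angry"},
)
print(config.model)           # "emotion_encoder"
print(config.class_name_key)  # "emotion_name"
```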