Spaces:
Running
Running
sync models
Browse files
- app/models.py +67 -4
app/models.py
CHANGED
|
@@ -1,3 +1,4 @@
|
|
|
|
|
| 1 |
from gradio_client import handle_file
|
| 2 |
|
| 3 |
# Models to include in the leaderboard, only include models that users can vote on
|
|
@@ -48,6 +49,16 @@ AVAILABLE_MODELS = {
|
|
| 48 |
|
| 49 |
# IMS-Toucan
|
| 50 |
# 'Flux9665/MassivelyMultilingualTTS': 'Flux9665/MassivelyMultilingualTTS', # 5.1
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
|
| 52 |
# HF TTS w issues
|
| 53 |
'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
|
|
@@ -168,7 +179,7 @@ HF_SPACES = {
|
|
| 168 |
'function': '/predict',
|
| 169 |
'text_param_index': 0,
|
| 170 |
'return_audio_index': 0,
|
| 171 |
-
'
|
| 172 |
'series': 'Edge TTS',
|
| 173 |
},
|
| 174 |
|
|
@@ -218,6 +229,34 @@ HF_SPACES = {
|
|
| 218 |
'is_zero_gpu_space': True,
|
| 219 |
'series': 'StyleTTS',
|
| 220 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 221 |
}
|
| 222 |
|
| 223 |
# for zero-shot TTS - voice sample used by XTTS (11 seconds)
|
|
@@ -317,8 +356,10 @@ OVERRIDE_INPUTS = {
|
|
| 317 |
'mrfakename/E2-F5-TTS': {
|
| 318 |
0: DEFAULT_VOICE_SAMPLE, # voice sample
|
| 319 |
1: DEFAULT_VOICE_TRANSCRIPT, # transcript of sample (< 15 seconds required)
|
| 320 |
-
3:
|
| 321 |
-
|
|
|
|
|
|
|
| 322 |
},
|
| 323 |
|
| 324 |
# IMS-Toucan
|
|
@@ -337,6 +378,28 @@ OVERRIDE_INPUTS = {
|
|
| 337 |
2: 'en-us', # lang
|
| 338 |
3: 8, # lngsteps
|
| 339 |
},
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 340 |
}
|
| 341 |
|
| 342 |
|
|
@@ -385,7 +448,7 @@ def make_link_to_space(model_name, for_leaderboard=False):
|
|
| 385 |
try:
|
| 386 |
if(
|
| 387 |
for_leaderboard
|
| 388 |
-
and HF_SPACES[model_name]['
|
| 389 |
):
|
| 390 |
model_basename += ' π'
|
| 391 |
title += '; π = online only or proprietary'
|
|
|
|
| 1 |
+
import os
|
| 2 |
from gradio_client import handle_file
|
| 3 |
|
| 4 |
# Models to include in the leaderboard, only include models that users can vote on
|
|
|
|
| 49 |
|
| 50 |
# IMS-Toucan
|
| 51 |
# 'Flux9665/MassivelyMultilingualTTS': 'Flux9665/MassivelyMultilingualTTS', # 5.1
|
| 52 |
+
# StyleTTS v2
|
| 53 |
+
# 'Pendrokar/style-tts-2': 'Pendrokar/style-tts-2', # more votes in OG arena; emotionless
|
| 54 |
+
# StyleTTS kokoro
|
| 55 |
+
'hexgrad/kokoro': 'hexgrad/kokoro',
|
| 56 |
+
|
| 57 |
+
# MaskGCT (by Amphion)
|
| 58 |
+
# DEMANDS 300 seconds of ZeroGPU
|
| 59 |
+
# 'amphion/maskgct': 'amphion/maskgct',
|
| 60 |
+
# default ZeroGPU borrow time
|
| 61 |
+
'Svngoku/maskgct-audio-lab': 'Svngoku/maskgct-audio-lab',
|
| 62 |
|
| 63 |
# HF TTS w issues
|
| 64 |
'LeeSangHoon/HierSpeech_TTS': 'LeeSangHoon/HierSpeech_TTS', # irresponsive to exclamation marks # 4.29
|
|
|
|
| 179 |
'function': '/predict',
|
| 180 |
'text_param_index': 0,
|
| 181 |
'return_audio_index': 0,
|
| 182 |
+
'is_closed_source': True,
|
| 183 |
'series': 'Edge TTS',
|
| 184 |
},
|
| 185 |
|
|
|
|
| 229 |
'is_zero_gpu_space': True,
|
| 230 |
'series': 'StyleTTS',
|
| 231 |
},
|
| 232 |
+
|
| 233 |
+
# StyleTTS v2 kokoro fine tune
|
| 234 |
+
'hexgrad/kokoro': {
|
| 235 |
+
'name': 'StyleTTS Kokoro',
|
| 236 |
+
'function': '/generate',
|
| 237 |
+
'text_param_index': 0,
|
| 238 |
+
'return_audio_index': 0,
|
| 239 |
+
'is_zero_gpu_space': True,
|
| 240 |
+
'series': 'StyleTTS',
|
| 241 |
+
},
|
| 242 |
+
|
| 243 |
+
# MaskGCT (by Amphion)
|
| 244 |
+
'amphion/maskgct': {
|
| 245 |
+
'name': 'MaskGCT',
|
| 246 |
+
'function': '/predict',
|
| 247 |
+
'text_param_index': 1,
|
| 248 |
+
'return_audio_index': 0,
|
| 249 |
+
'is_zero_gpu_space': True,
|
| 250 |
+
'series': 'MaskGCT',
|
| 251 |
+
},
|
| 252 |
+
'Svngoku/maskgct-audio-lab': {
|
| 253 |
+
'name': 'MaskGCT',
|
| 254 |
+
'function': '/predict',
|
| 255 |
+
'text_param_index': 1,
|
| 256 |
+
'return_audio_index': 0,
|
| 257 |
+
'is_zero_gpu_space': True,
|
| 258 |
+
'series': 'MaskGCT',
|
| 259 |
+
},
|
| 260 |
}
|
| 261 |
|
| 262 |
# for zero-shot TTS - voice sample used by XTTS (11 seconds)
|
|
|
|
| 356 |
'mrfakename/E2-F5-TTS': {
|
| 357 |
0: DEFAULT_VOICE_SAMPLE, # voice sample
|
| 358 |
1: DEFAULT_VOICE_TRANSCRIPT, # transcript of sample (< 15 seconds required)
|
| 359 |
+
3: False, # cleanup silence
|
| 360 |
+
4: 0.15, #crossfade
|
| 361 |
+
5: 32, #nfe_slider
|
| 362 |
+
6: 1, #speed
|
| 363 |
},
|
| 364 |
|
| 365 |
# IMS-Toucan
|
|
|
|
| 378 |
2: 'en-us', # lang
|
| 379 |
3: 8, # lngsteps
|
| 380 |
},
|
| 381 |
+
|
| 382 |
+
# StyleTTS 2 kokoro
|
| 383 |
+
'hexgrad/kokoro': {
|
| 384 |
+
1: "af", #voice
|
| 385 |
+
2: None, #ps
|
| 386 |
+
3: 1, #speed
|
| 387 |
+
4: 3000, #trim
|
| 388 |
+
5: False, #use_gpu; fast enough multithreaded on CPU
|
| 389 |
+
6: os.getenv('KOKORO'), #sk
|
| 390 |
+
},
|
| 391 |
+
|
| 392 |
+
# MaskGCT (by Amphion)
|
| 393 |
+
'amphion/maskgct': {
|
| 394 |
+
0: DEFAULT_VOICE_SAMPLE, #prompt_wav
|
| 395 |
+
2: -1, #target_len
|
| 396 |
+
3: 25, #n_timesteps
|
| 397 |
+
},
|
| 398 |
+
'Svngoku/maskgct-audio-lab': {
|
| 399 |
+
0: DEFAULT_VOICE_SAMPLE, #prompt_wav
|
| 400 |
+
2: -1, #target_len
|
| 401 |
+
3: 25, #n_timesteps
|
| 402 |
+
},
|
| 403 |
}
|
| 404 |
|
| 405 |
|
|
|
|
| 448 |
try:
|
| 449 |
if(
|
| 450 |
for_leaderboard
|
| 451 |
+
and HF_SPACES[model_name]['is_closed_source']
|
| 452 |
):
|
| 453 |
model_basename += ' π'
|
| 454 |
title += '; π = online only or proprietary'
|