Spaces:
Running
Running
ZeroGPU XTTS
Browse files
- app/models.py +26 -12
- test_tts_xtts.py +18 -11
app/models.py
CHANGED
|
@@ -26,6 +26,7 @@ AVAILABLE_MODELS = {
|
|
| 26 |
# '<keyname>':'<Space URL>'
|
| 27 |
# gradio version that works with most spaces: 4.29
|
| 28 |
# 'coqui/xtts': 'coqui/xtts', # 4.29 4.32; extra_headers error appears for 5.13+
|
|
|
|
| 29 |
# 'collabora/WhisperSpeech': 'collabora/WhisperSpeech', # 4.32 4.36.1
|
| 30 |
#'myshell-ai/OpenVoice': 'myshell-ai/OpenVoice', # same devs as MeloTTS, which scores higher # extra_headers error appears for 5.13+
|
| 31 |
#'myshell-ai/OpenVoiceV2': 'myshell-ai/OpenVoiceV2', # same devs as MeloTTS, which scores higher # extra_headers error appears for 5.13+
|
|
@@ -109,13 +110,21 @@ AVAILABLE_MODELS = {
|
|
| 109 |
|
| 110 |
HF_SPACES = {
|
| 111 |
# XTTS v2
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 112 |
'coqui/xtts': {
|
| 113 |
'name': 'XTTS v2',
|
| 114 |
-
'function': '1',
|
| 115 |
-
'text_param_index': 0,
|
| 116 |
-
'return_audio_index': 1,
|
| 117 |
'series': 'XTTS',
|
| 118 |
-
'emoji': '😩', # old gradio
|
| 119 |
},
|
| 120 |
|
| 121 |
# WhisperSpeech
|
|
@@ -238,7 +247,8 @@ HF_SPACES = {
|
|
| 238 |
'return_audio_index': 0,
|
| 239 |
'is_closed_source': True,
|
| 240 |
'series': 'Edge TTS',
|
| 241 |
-
'emoji': '…', # NOTE(review): original emoji glyph lost in page extraction — confirm against repo history
|
|
|
|
| 242 |
},
|
| 243 |
|
| 244 |
# Fish Speech
|
|
@@ -468,13 +478,17 @@ DEFAULT_VOICE_PROMPT = "female voice; very clear audio"
|
|
| 468 |
|
| 469 |
# Older gradio spaces use unnamed parameters, both types are valid
|
| 470 |
OVERRIDE_INPUTS = {
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 471 |
'coqui/xtts': {
|
| 472 |
-
1: 'en',
| 473 |
-
2: DEFAULT_VOICE_SAMPLE_STR, # voice sample
|
| 474 |
-
3: None, # mic voice sample
|
| 475 |
-
4: False, #use_mic
|
| 476 |
-
5: False, #cleanup_reference
|
| 477 |
-
6: False, #auto_detect
|
| 478 |
},
|
| 479 |
'collabora/WhisperSpeech': {
|
| 480 |
1: DEFAULT_VOICE_SAMPLE, # voice sample
|
|
@@ -866,7 +880,7 @@ def make_link_to_space(model_name, for_leaderboard=False):
|
|
| 866 |
emoji = HF_SPACES[model_name]['emoji']
|
| 867 |
except:
|
| 868 |
pass
|
| 869 |
-
return emoji +' <a target="_blank" style="'+ style +'" title="'+ title +'" href="'+ space_link +'">'+ model_basename +'</a>'
|
| 870 |
|
| 871 |
# otherwise just return without emoji
|
| 872 |
return '<span style="'+ style +'" title="'+ title +'" href="'+ space_link +'">'+ model_name +'</span>'
|
|
|
|
| 26 |
# '<keyname>':'<Space URL>'
|
| 27 |
# gradio version that works with most spaces: 4.29
|
| 28 |
# 'coqui/xtts': 'coqui/xtts', # 4.29 4.32; extra_headers error appears for 5.13+
|
| 29 |
+
'coqui/xtts': 'tonyassi/voice-clone', # ZeroGPU clone
|
| 30 |
# 'collabora/WhisperSpeech': 'collabora/WhisperSpeech', # 4.32 4.36.1
|
| 31 |
#'myshell-ai/OpenVoice': 'myshell-ai/OpenVoice', # same devs as MeloTTS, which scores higher # extra_headers error appears for 5.13+
|
| 32 |
#'myshell-ai/OpenVoiceV2': 'myshell-ai/OpenVoiceV2', # same devs as MeloTTS, which scores higher # extra_headers error appears for 5.13+
|
|
|
|
| 110 |
|
| 111 |
HF_SPACES = {
|
| 112 |
# XTTS v2
|
| 113 |
+
# 'coqui/xtts': {
|
| 114 |
+
# 'name': 'XTTS v2',
|
| 115 |
+
# 'function': '1',
|
| 116 |
+
# 'text_param_index': 0,
|
| 117 |
+
# 'return_audio_index': 1,
|
| 118 |
+
# 'series': 'XTTS',
|
| 119 |
+
# 'emoji': '😩', # old gradio
|
| 120 |
+
# },
|
| 121 |
+
# tonyassi ZeroGPU XTTS v2
|
| 122 |
'coqui/xtts': {
|
| 123 |
'name': 'XTTS v2',
|
| 124 |
+
'function': '/predict',
|
| 125 |
+
'text_param_index': 'text',
|
| 126 |
+
'return_audio_index': 0,
|
| 127 |
'series': 'XTTS',
|
|
|
|
| 128 |
},
|
| 129 |
|
| 130 |
# WhisperSpeech
|
|
|
|
| 247 |
'return_audio_index': 0,
|
| 248 |
'is_closed_source': True,
|
| 249 |
'series': 'Edge TTS',
|
| 250 |
+
'emoji': '', # api disabled
|
| 251 |
+
'space_link': 'innoai/Edge-TTS-Text-to-Speech', # API disabled
|
| 252 |
},
|
| 253 |
|
| 254 |
# Fish Speech
|
|
|
|
| 478 |
|
| 479 |
# Older gradio spaces use unnamed parameters, both types are valid
|
| 480 |
OVERRIDE_INPUTS = {
|
| 481 |
+
# 'coqui/xtts': {
|
| 482 |
+
# 1: 'en',
|
| 483 |
+
# 2: DEFAULT_VOICE_SAMPLE_STR, # voice sample
|
| 484 |
+
# 3: None, # mic voice sample
|
| 485 |
+
# 4: False, #use_mic
|
| 486 |
+
# 5: False, #cleanup_reference
|
| 487 |
+
# 6: False, #auto_detect
|
| 488 |
+
# },
|
| 489 |
+
# tonyassi ZeroGPU space of XTTS:
|
| 490 |
'coqui/xtts': {
|
| 491 |
+
'audio': DEFAULT_VOICE_SAMPLE, # voice sample
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 492 |
},
|
| 493 |
'collabora/WhisperSpeech': {
|
| 494 |
1: DEFAULT_VOICE_SAMPLE, # voice sample
|
|
|
|
| 880 |
emoji = HF_SPACES[model_name]['emoji']
|
| 881 |
except:
|
| 882 |
pass
|
| 883 |
+
return (emoji +' <a target="_blank" style="'+ style +'" title="'+ title +'" href="'+ space_link +'">'+ model_basename +'</a>').strip()
|
| 884 |
|
| 885 |
# otherwise just return without emoji
|
| 886 |
return '<span style="'+ style +'" title="'+ title +'" href="'+ space_link +'">'+ model_name +'</span>'
|
test_tts_xtts.py
CHANGED
|
@@ -1,17 +1,24 @@
|
|
| 1 |
import os
|
| 2 |
-
from gradio_client import Client
|
| 3 |
|
| 4 |
-
client = Client("coqui/xtts", hf_token=os.getenv('HF_TOKEN'))
|
|
|
|
| 5 |
endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
|
| 6 |
# print(endpoints)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 7 |
result = client.predict(
|
| 8 |
-
"Quick test.", # str in 'What should I say!? (max 512 characters).' Textbox component
|
| 9 |
-
'en', #lang
|
| 10 |
-
'https://cdn-uploads.huggingface.co/production/uploads/63d52e0c4e5642795617f668/V6-rMmI-P59DA4leWDIcK.wav', # voice sample
| 11 |
-
None, # mic voice sample
|
| 12 |
-
False, #use_mic
|
| 13 |
-
False, #cleanup_reference
|
| 14 |
-
False, #auto_detect
|
| 15 |
-
True, #ToS
|
| 16 |
-
fn_index=1
|
| 17 |
)
|
|
|
|
| 1 |
import os
|
| 2 |
+
from gradio_client import Client, handle_file
|
| 3 |
|
| 4 |
+
# client = Client("coqui/xtts", hf_token=os.getenv('HF_TOKEN'), headers={})
|
| 5 |
+
client = Client("tonyassi/voice-clone", hf_token=os.getenv('HF_TOKEN'), headers={})
|
| 6 |
endpoints = client.view_api(all_endpoints=True, print_info=False, return_format='dict')
|
| 7 |
# print(endpoints)
|
| 8 |
+
# result = client.predict(
|
| 9 |
+
# "Quick test.", # str in 'What should I say!? (max 512 characters).' Textbox component
|
| 10 |
+
# 'en', #lang
|
| 11 |
+
# 'https://cdn-uploads.huggingface.co/production/uploads/63d52e0c4e5642795617f668/V6-rMmI-P59DA4leWDIcK.wav', # voice sample
|
| 12 |
+
# None, # mic voice sample
|
| 13 |
+
# False, #use_mic
|
| 14 |
+
# False, #cleanup_reference
|
| 15 |
+
# False, #auto_detect
|
| 16 |
+
# True, #ToS
|
| 17 |
+
# fn_index=1
|
| 18 |
+
# )
|
| 19 |
+
# tony's space
|
| 20 |
result = client.predict(
|
| 21 |
+
text="Quick test.", # str in 'What should I say!? (max 512 characters).' Textbox component
|
| 22 |
+
audio=handle_file('https://cdn-uploads.huggingface.co/production/uploads/63d52e0c4e5642795617f668/V6-rMmI-P59DA4leWDIcK.wav'), # voice sample
|
| 23 |
+
api_name="/predict"
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 24 |
)
|