File size: 2,047 Bytes
a5c86e8
802577d
 
a5c86e8
1d2e2ec
 
5457abc
1d2e2ec
 
 
a5c86e8
9b2107c
 
 
 
 
9ec632b
9b2107c
802577d
 
9b2107c
802577d
 
 
 
 
 
 
 
9b2107c
802577d
 
 
 
 
 
 
 
 
9b2107c
 
802577d
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
import os
import subprocess
import sys

import spaces

# Run the setup.py install command before the remaining imports execute.
# sys.executable is used so the package is installed with the interpreter that
# is running this script, not whichever "python" happens to be first on PATH
# (falls back to "python" if the interpreter path cannot be determined).
# NOTE(review): invoking `setup.py install` directly is deprecated by
# setuptools; consider `pip install .` if the deployment allows it.
try:
    subprocess.run([sys.executable or 'python', 'setup.py', 'install', '--user'], check=True)
except (subprocess.CalledProcessError, OSError) as e:
    # Best effort: report the failure (non-zero exit status, or the interpreter
    # could not be launched at all) and let the script continue.
    print(f"Installation failed with error: {e}")
else:
    print("Installation successful.")

import gradio as gr
import torch
from TTS.api import TTS

# Select the compute device: "cuda" when torch reports CUDA as available,
# otherwise "cpu".
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

# Module-level handle for the TTS model; it starts as None and is filled in
# by initialize_tts(), which runs inside a GPU-decorated function.
tts = None

@spaces.GPU(duration=120)  # Voice cloning can take longer than default 60s
def initialize_tts():
    """Return the shared TTS model, creating it on first use.

    The model lives in the module-level ``tts`` variable. If that variable
    is already set it is returned unchanged; otherwise the
    ``tts_models/multilingual/multi-dataset/xtts_v2`` model is constructed,
    ``.to(device)`` is applied, and the result is stored in ``tts``.

    Returns:
        The object held in the module-level ``tts`` variable.
    """
    global tts
    # Guard clause: reuse the model if it has already been created.
    if tts is not None:
        return tts
    tts = TTS("tts_models/multilingual/multi-dataset/xtts_v2").to(device)
    return tts

@spaces.GPU(duration=120)
def voice_clone(text: str, speaker_wav: str, language: str):
    """Synthesize ``text`` using ``speaker_wav`` as the voice reference.

    Args:
        text: Text to synthesize. Must contain at least one non-whitespace
            character.
        speaker_wav: File path of the reference audio sample.
        language: Language code forwarded to the TTS model.

    Returns:
        Path of the generated audio file (``outputs/output.wav``).

    Raises:
        gr.Error: If the text is blank, no reference audio was supplied, or
            no language was selected.
    """
    global tts

    # Reject incomplete submissions up front with a message the UI can show,
    # instead of letting None/empty values fail somewhere inside the model.
    if not text or not text.strip():
        raise gr.Error("Please enter the text to synthesize.")
    if not speaker_wav:
        raise gr.Error("Please upload a voice sample.")
    if not language:
        raise gr.Error("Please select a language.")

    # Initialize TTS if not already done
    if tts is None:
        tts = initialize_tts()

    # Create output directory if it doesn't exist
    os.makedirs("outputs", exist_ok=True)
    # NOTE(review): every call writes to this same path, so overlapping
    # requests would overwrite each other's output -- confirm that the queue
    # serializes calls to this function.
    output_path = os.path.join("outputs", "output.wav")

    # Run TTS
    print("Speaker wav:", speaker_wav)
    tts.tts_to_file(
        text=text,
        speaker_wav=speaker_wav,
        language=language,
        file_path=output_path,
    )
    return output_path

# Create Gradio interface.
# The components are built first (in the same order they appear in the UI)
# and then wired into the Interface: three inputs map positionally onto
# voice_clone(text, speaker_wav, language), and one audio output receives the
# file path it returns.
text_input = gr.Textbox(lines=2, placeholder="Enter the text...", label="Text")
speaker_input = gr.Audio(type="filepath", label="Upload audio file")
language_input = gr.Radio(
    ['ru', 'en', 'zh-cn', 'ja', 'de', 'fr', 'it', 'pt', 'pl', 'tr', 'ko', 'nl', 'cs', 'ar', 'es', 'hu'],
    label="language",
)
generated_audio = gr.Audio(type="filepath", label="Generated audio file")

iface = gr.Interface(
    fn=voice_clone,
    theme="Nymbo/Nymbo_Theme",
    inputs=[text_input, speaker_input, language_input],
    outputs=generated_audio,
    title="Voice Cloning",
    description="Upload a voice sample and enter text to clone the voice. Processing may take 1-2 minutes.",
)

# Launch with queue enabled for better handling of GPU resources
iface.queue().launch()