Spaces:
Paused
Paused
v1
Browse files- Dockerfile +24 -0
- app.py +350 -0
- docker-compose.yml +40 -0
- requirements.txt +17 -0
- xtts.py +192 -0
Dockerfile
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GPU image with CUDA 12.1 toolchain; provides the torch stack used by requirements.txt.
FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-devel
ARG DEBIAN_FRONTEND=noninteractive

# Audio tooling (sox/ffmpeg/libsndfile) plus build dependencies for TTS/deepspeed.
RUN apt-get update && \
    apt-get install --no-install-recommends -y sox libsox-fmt-all curl wget gcc git git-lfs build-essential libaio-dev libsndfile1 ssh ffmpeg && \
    apt-get clean && apt-get -y autoremove

WORKDIR /app
COPY requirements.txt .
# NOTE(review): the legacy resolver flag is presumably needed by the pinned git
# TTS dependency — confirm before removing it.
RUN python -m pip install --use-deprecated=legacy-resolver -r requirements.txt \
    && python -m pip cache purge

# Download the unidic Japanese dictionary used by the unidic package.
RUN python -m unidic download
RUN mkdir -p /app/tts_models

COPY xtts.py .
COPY app.py .

# Uncomment (set to 1) if you have an older card — relaxes the CUDA driver requirement.
#ENV NVIDIA_DISABLE_REQUIRE=1

ENV NUM_THREADS=2
EXPOSE 80
CMD ["python","app.py"]
|
app.py
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import base64
|
| 3 |
+
import tempfile
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
from os.path import abspath
|
| 7 |
+
import zipfile
|
| 8 |
+
import random
|
| 9 |
+
import xtts
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
DO_CHECK = os.getenv('DO_CHECK', '1')
OUTPUT = "./demo_outputs"
cloned_speakers = {}

print("Preparing file structure...")
# First run: create the output tree. Otherwise reload previously cloned speakers.
if not os.path.exists(OUTPUT):
    # makedirs creates OUTPUT and the subdirectory in one call.
    os.makedirs(os.path.join(OUTPUT, "cloned_speakers"))
    os.makedirs(os.path.join(OUTPUT, "generated_audios"))
elif os.path.exists(os.path.join(OUTPUT, "cloned_speakers")):
    print("Loading existing cloned speakers...")
    for file in os.listdir(os.path.join(OUTPUT, "cloned_speakers")):
        if file.endswith(".json"):
            # Speaker name is the file name without the ".json" suffix.
            with open(os.path.join(OUTPUT, "cloned_speakers", file), "r") as fp:
                cloned_speakers[file[:-5]] = json.load(fp)
    print("Available cloned speakers:", ", ".join(cloned_speakers.keys()))

AUDIOS_DIR = os.path.join("demo_outputs", "generated_audios")
ZIP_DIR = os.path.join("zip_outputs")

print("Checking zip at", ZIP_DIR)
if not os.path.exists(ZIP_DIR):
    os.mkdir(ZIP_DIR)


try:
    print("Getting metadata from server ...")
    # NOTE: "LANUGAGES" is a typo, but the name is referenced elsewhere in the
    # module, so it is kept for compatibility.
    LANUGAGES = xtts.get_languages()
    print("Available languages:", ", ".join(LANUGAGES))
    STUDIO_SPEAKERS = xtts.get_speakers()
    print("Available studio speakers:", ", ".join(STUDIO_SPEAKERS.keys()))
except Exception as e:
    # Narrowed from a bare `except:` and chained so the root cause is not hidden.
    raise Exception("Please make sure the server is running first.") from e
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def ExtractVars(input_string):
    """Parse ``!key=value`` directive lines out of *input_string*.

    Lines whose stripped form starts with ``!`` are treated as directives and
    removed from the text; every other non-empty line is kept.

    Returns a tuple ``(directives, remaining_text)``. The directives dict is
    pre-seeded with the keys the TTS pipeline understands (prefix, name,
    speaker, num) so lookups never raise KeyError.
    """
    result_dict = {
        'prefix': None,
        'name': '',
        'speaker': None,
        'num': None,
    }

    # Lines that are not directives, with blank lines dropped.
    filtered_lines = []

    for line in input_string.split('\n'):
        stripped = line.strip()
        if stripped.startswith('!'):
            # Split on the FIRST '=' only, so values may themselves contain
            # '=' (the original split('=') raised and silently dropped them).
            key, sep, value = stripped[1:].partition('=')
            if sep:
                result_dict[key.strip()] = value.strip()
            # Malformed directives (no '=') are ignored, as before.
        elif stripped:
            filtered_lines.append(line)

    return result_dict, '\n'.join(filtered_lines)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def FindSpeakerByName(name, speakerType):
    """Look up a speaker by exact name, falling back to a first-word match.

    Searches STUDIO_SPEAKERS when *speakerType* is "Studio", otherwise the
    cloned-speaker registry.

    Returns ``(key, embeddings)`` on success and ``(None, None)`` when no
    speaker matches. (The original fell off the end and implicitly returned
    ``None``, which made callers crash on tuple unpacking.)
    """
    srcItems = STUDIO_SPEAKERS if speakerType == "Studio" else cloned_speakers

    for key, value in srcItems.items():
        if key == name:
            return key, value
        # Allow addressing e.g. "Dionisio Schuyler" as just "Dionisio".
        if key.split(" ")[0] == name:
            return key, value

    return None, None
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names):
    """Compute speaker embeddings from a reference audio and persist them.

    Saves the embeddings as JSON under OUTPUT/cloned_speakers, registers the
    new speaker in the in-memory dict, and returns updated widget values
    (including a refreshed dropdown of cloned-speaker choices).
    """
    # Context manager so the uploaded file handle is always closed
    # (the original leaked the file object returned by open()).
    with open(upload_file, "rb") as audio_fp:
        embeddings = xtts.predict_speaker(audio_fp)
    with open(os.path.join(OUTPUT, "cloned_speakers", clone_speaker_name + ".json"), "w") as fp:
        json.dump(embeddings, fp)
    cloned_speakers[clone_speaker_name] = embeddings
    cloned_speaker_names.append(clone_speaker_name)
    return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown(choices=cloned_speaker_names)
|
| 107 |
+
|
| 108 |
+
def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature,
        speed, top_p, top_k, AllFileList, progress=gr.Progress()):
    """Synthesize one audio file per '---'-separated part of *text*.

    Each part may carry `!key=value` directive lines (see ExtractVars):
    prefix (file-name prefix, sticky), name (file-name suffix), num (sequence
    number), speaker (per-part speaker override). Generated WAVs are written
    to AUDIOS_DIR, mirrored into the shared *AllFileList* state, and returned
    as a refreshed gr.Dropdown.

    Raises ValueError when a requested speaker cannot be found.
    """
    # Validate that the default speaker exists before doing any work
    # (a per-line "!speaker=" directive may still override it below).
    embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom]

    # "---" splits the input into independently synthesized parts.
    lines = text.split("---")
    print("Total parts:", len(lines))

    audioNum = 0

    DefaultPrefix = next(tempfile._get_candidate_names())
    CurrentPrefix = DefaultPrefix

    AudioList = []
    for line in progress.tqdm(lines, desc="Gerando fala..."):
        audioNum += 1

        textVars, cleanLine = ExtractVars(line)

        # "!prefix=" changes the file-name prefix for this and following parts.
        if textVars['prefix']:
            CurrentPrefix = textVars['prefix']

        audioName = textVars['name']
        if audioName:
            audioName = '_' + audioName

        num = textVars['num']
        if not num:
            num = audioNum

        path = CurrentPrefix + "_n_" + str(num) + audioName + ".wav"

        print("Generating audio for line", num, 'sequence', audioNum)

        speaker = textVars['speaker']
        if not speaker:
            speaker = speaker_name_studio if speaker_type == 'Studio' else speaker_name_custom

        # FindSpeakerByName may not find a match; fail with a clear error
        # instead of crashing on tuple unpacking / None concatenation.
        found = FindSpeakerByName(speaker, speaker_type)
        if not found or found[0] is None:
            raise ValueError("InvalidSpeaker: " + str(speaker))
        speakerName, embeddings = found

        ipts = xtts.TTSInputs(
            speaker_embedding=embeddings["speaker_embedding"],
            gpt_cond_latent=embeddings["gpt_cond_latent"],
            text=cleanLine,
            language=lang,
            temperature=temperature,
            speed=speed,
            top_k=top_k,
            top_p=top_p
        )

        # predict_speech returns base64-encoded WAV bytes.
        generated_audio = xtts.predict_speech(ipts)

        print("Audio generated.. Saving to", path)
        generated_audio_path = os.path.join(AUDIOS_DIR, path)
        with open(generated_audio_path, "wb") as fp:
            fp.write(base64.b64decode(generated_audio))
        AudioList.append(generated_audio_path)

    # Mirror the results into the shared gr.State list in place.
    AllFileList.clear()
    AllFileList.extend(AudioList)

    return gr.Dropdown(
        label="Generated Audios",
        choices=list(AudioList),
        value=AudioList[0]
    )
|
| 185 |
+
|
| 186 |
+
def get_file_content(f):
    """Return the first element of *f*, or None when it is empty."""
    return f[0] if f else None
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def UpdateFileList(DirListState):
    """Refresh *DirListState* in place with the current contents of AUDIOS_DIR."""
    # Slice assignment mutates the shared gr.State list, same as clear+extend.
    DirListState[:] = os.listdir(AUDIOS_DIR)
|
| 196 |
+
|
| 197 |
+
def audio_list_update(d):
    """Resolve the selected dropdown entry to an absolute path for gr.Audio."""
    return abspath(d)
|
| 200 |
+
|
| 201 |
+
def ZipAndDownload(files):
    """Zip *files* into a uniquely named archive under ZIP_DIR.

    Returns an HTML fallback link; the page-side MutationObserver (see the
    `js` snippet) picks the link up and triggers the download automatically.
    """
    DefaultPrefix = next(tempfile._get_candidate_names())
    zipFile = abspath(os.path.join(ZIP_DIR, DefaultPrefix + ".zip"))

    with zipfile.ZipFile(zipFile, 'w') as zipMe:
        for file in files:
            print("Zipping", file)
            # Archive under the base name only, so the zip has a flat layout.
            zipMe.write(abspath(file), os.path.basename(file), compress_type=zipfile.ZIP_DEFLATED)

    print("Pronto", zipFile)

    # Fixed user-facing typo ("If donwload dont starts").
    return '<a href="/file=' + zipFile + '">If the download does not start, click here</a>'
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
# JavaScript injected into the page: observes the hidden "DonwloadLink" HTML
# element and navigates to any <a> that appears inside it, auto-starting the
# zip download produced by ZipAndDownload.
js = """
function DetectDownloadLink(){
    console.log('Configuring AutoDonwloadObservr...');
    let hiddenLink = document.getElementById("DonwloadLink");
    let onChange= function(mutations){

        for (const mutation of mutations) {
            if (mutation.type !== 'childList')
                continue;

            for (const addedNode of mutation.addedNodes) {
                if (addedNode.nodeName === 'A') {
                    location.href = addedNode.href;
                }
            }

        }
    }

    let config = { attributes: true, childList: true, subtree: true, attributeFilter: ["href"] }
    let obs = new MutationObserver(onChange);
    obs.observe(hiddenLink, config);
}
"""

# Gradio UI: a TTS tab (speaker/sampling controls, text input, generated-audio
# player and bulk download) and a speaker-cloning tab.
with gr.Blocks(js=js) as demo:
    defaultSpeaker = "Dionisio Schuyler"
    # Shared state: known cloned-speaker names, and the files generated by the
    # most recent tts() run (consumed by the Download All button).
    cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
    AllFileList = gr.State(list([]))


    with gr.Tab("TTS"):
        with gr.Column() as row4:
            with gr.Row() as col4:
                speaker_name_studio = gr.Dropdown(
                    label="Studio speaker",
                    choices=STUDIO_SPEAKERS.keys(),
                    value=defaultSpeaker if defaultSpeaker in STUDIO_SPEAKERS.keys() else None,
                )
                speaker_name_custom = gr.Dropdown(
                    label="Cloned speaker",
                    choices=cloned_speaker_names.value,
                    value=cloned_speaker_names.value[0] if len(cloned_speaker_names.value) != 0 else None,
                )
                speaker_type = gr.Dropdown(label="Speaker type", choices=["Studio", "Cloned"], value="Studio")
        # Sampling controls passed straight through to xtts.TTSInputs.
        with gr.Column() as rowAdvanced:
            with gr.Row() as rowAdvanced:
                temperature = gr.Slider(0.00, 1.00, 0.5, step=0.05, label="Temperature", info="Choose between 0 and 1")
                top_p = gr.Slider(0.00, 1.00, 0.8, step=0.05, label="TOP P", info="Choose between 0 and 1")
                top_k = gr.Number(label="TOP K",value=50)
                speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
        with gr.Column() as col2:
            lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
            text = gr.Textbox(label="text",lines=4, value="A quick brown fox jumps over the lazy dog.")
            tts_button = gr.Button(value="TTS")
        with gr.Column() as col3:
            # FileList = gr.FileExplorer(
            #         glob="*.wav",
            #         # value=["themes/utils"],
            #         ignore_glob="**/__init__.py",
            #         root_dir=AUDIOS_DIR,
            #         interactive = True,
            #         value=DirectoryList.value
            # )

            # Placeholder choices; tts() returns a refreshed Dropdown on click.
            AudioList = gr.Dropdown(
                label="Generated Audios",
                choices=['a','b']
                ,interactive=True
            )

            generated_audio = gr.Audio(label="Audio Play", autoplay=True)
            AudioList.change(fn=audio_list_update, inputs=[AudioList], outputs=[generated_audio])

            # Hidden HTML target for the zip link; rendered after the button so
            # the MutationObserver in `js` can auto-start the download.
            dummyHtml = gr.HTML(elem_id = "DonwloadLink", render = False);
            downloadAll = gr.DownloadButton("Download All Files")
            downloadAll.click(ZipAndDownload, inputs=[AllFileList], outputs=[dummyHtml]);
            dummyHtml.render();


    with gr.Tab("Clone a new speaker"):
        with gr.Column() as col1:
            upload_file = gr.Audio(label="Upload reference audio", type="filepath")
            clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
            clone_button = gr.Button(value="Clone speaker")

        clone_button.click(
            fn=clone_speaker,
            inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
            outputs=[upload_file, clone_speaker_name, cloned_speaker_names, speaker_name_custom],
        )

    tts_button.click(
        fn=tts,
        inputs=[text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
                ,speed,top_p,top_k,AllFileList
                ],
        outputs=[AudioList],
    )
|
| 318 |
+
|
| 319 |
+
# Optional startup self-test (disable with DO_CHECK=0): synthesize a short
# sentence with a random studio speaker to verify the model is loaded and the
# full TTS path works before serving the UI.
if __name__ == "__main__" and DO_CHECK == "1":
    print("Warming up server... Checking server healthy...")

    # Any studio speaker will do; we only need some valid embeddings.
    speakerName, embs = random.choice(list(STUDIO_SPEAKERS.items()));

    print("Testing with", speakerName);

    ipts = xtts.TTSInputs(
        speaker_embedding=embs["speaker_embedding"],
        gpt_cond_latent=embs["gpt_cond_latent"],
        text="This is a warmup request.",
        language="en",
        temperature=0.5,
        speed=1.0,
        top_k=50,
        top_p=0.8
    )

    # Raises if inference fails; result is ignored.
    resp = xtts.predict_speech(ipts)

    print(" TEST OK")
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
if __name__ == "__main__":
    print("STARTING...")
    # Bind on all interfaces, port 80 (the Dockerfile EXPOSEs this port).
    # ZIP_DIR must be whitelisted so the generated zip can be served via /file=.
    demo.launch(
        share=False,
        debug=False,
        server_port=80,
        server_name="0.0.0.0",
        allowed_paths=[ZIP_DIR]
    )
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: webui-docker

volumes:
  # Named volume caching the downloaded XTTS model across container rebuilds.
  # NOTE(review): "servel" looks like a typo for "server", but renaming would
  # orphan existing volume data — kept as-is.
  servel-model-root:

services:

  xtts:
    build:
      context: .
      dockerfile: Dockerfile
    environment:
      # Accept the Coqui model license non-interactively.
      COQUI_TOS_AGREED: 1
      # Matches the volume target below, so xtts.py finds the cached model.
      CUSTOM_MODEL_PATH: /root/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2
    ports:
      # Host 3000 -> Gradio app on container port 80.
      - 3000:80
    expose:
      - 80
    volumes:
      - type: volume
        source: servel-model-root
        target: /root/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2
    stdin_open: true # docker run -i
    tty: true # docker run -t
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: wget --no-verbose --tries=1 http://localhost || exit 1
      interval: 5s
      timeout: 30s
      retries: 3
      # Model download/load can take several minutes on first start.
      start_period: 5m
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
torchvision
torchaudio
gradio
numpy
# Pinned Coqui TTS commit providing the XTTS v2 model code.
TTS @ git+https://github.com/coqui-ai/TTS@fa28f99f1508b5b5366539b2149963edcb80ba62
uvicorn[standard]==0.23.2
deepspeed
pydantic
python-multipart==0.0.6
typing-extensions>=4.8.0
# Japanese text processing (cutlet/mecab/unidic) used by XTTS for "ja".
cutlet
mecab-python3==1.0.6
unidic-lite==1.0.8
unidic==1.1.0
|
| 16 |
+
|
| 17 |
+
|
xtts.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import io
|
| 3 |
+
import os
|
| 4 |
+
import tempfile
|
| 5 |
+
import wave
|
| 6 |
+
import torch
|
| 7 |
+
import numpy as np
|
| 8 |
+
from typing import List
|
| 9 |
+
from pydantic import BaseModel
|
| 10 |
+
|
| 11 |
+
from TTS.tts.configs.xtts_config import XttsConfig
|
| 12 |
+
from TTS.tts.models.xtts import Xtts
|
| 13 |
+
from TTS.utils.generic_utils import get_user_data_dir
|
| 14 |
+
from TTS.utils.manage import ModelManager
|
| 15 |
+
|
| 16 |
+
# Limit CPU thread usage (NUM_THREADS env var, defaults to all cores).
torch.set_num_threads(int(os.environ.get("NUM_THREADS", os.cpu_count())))
# Select device: CUDA unless USE_CPU=1.
device = torch.device("cuda" if os.environ.get("USE_CPU", "0") == "0" else "cpu")
# BUGFIX: the original compared a torch.device to the string "cuda", which is
# not equal, so this guard could never fire. Compare the device *type* instead.
if device.type == "cuda" and not torch.cuda.is_available():
    raise RuntimeError("CUDA device unavailable, please use Dockerfile.cpu instead.")

# A custom model directory (with config.json) takes priority over downloading.
custom_model_path = os.environ.get("CUSTOM_MODEL_PATH", "/app/tts_models")

if os.path.exists(custom_model_path) and os.path.isfile(custom_model_path + "/config.json"):
    model_path = custom_model_path
    print("Loading custom model from", model_path, flush=True)
else:
    print("Loading default model", flush=True)
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    print("Downloading XTTS Model:", model_name, flush=True)
    ModelManager().download_model(model_name)
    model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
    print("XTTS Model downloaded", flush=True)

print("Loading XTTS", flush=True)
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)
# Same fix as above: use device.type so deepspeed is actually enabled on GPU
# (the original string comparison always evaluated to False).
model.load_checkpoint(config, checkpoint_dir=model_path, eval=True, use_deepspeed=device.type == "cuda")
model.to(device)
print("XTTS Loaded.", flush=True)

print("Running XTTS Server ...", flush=True)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# @app.post("/clone_speaker")
def predict_speaker(wav_file):
    """Compute conditioning inputs from a reference audio file.

    *wav_file* is a binary file-like object. Returns a dict with
    "gpt_cond_latent" and "speaker_embedding" as plain (half-precision) lists.
    """
    # get_conditioning_latents expects a file *path*, so spool the upload to a
    # real temporary file. The original wrote an undeleted temp file into the
    # current working directory; mkstemp + finally cleans up reliably.
    fd, temp_audio_name = tempfile.mkstemp(suffix=".wav")
    try:
        with os.fdopen(fd, "wb") as temp:
            temp.write(wav_file.read())
        with torch.inference_mode():
            gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
                temp_audio_name
            )
    finally:
        os.remove(temp_audio_name)
    return {
        "gpt_cond_latent": gpt_cond_latent.cpu().squeeze().half().tolist(),
        "speaker_embedding": speaker_embedding.cpu().squeeze().half().tolist(),
    }
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def postprocess(wav):
    """Convert a model waveform (tensor, or list of tensors) to int16 PCM.

    Returns a numpy array of shape (1, n_samples) with values clipped to
    [-1, 1] before scaling to the int16 range.
    """
    if isinstance(wav, list):
        wav = torch.cat(wav, dim=0)
    samples = wav.clone().detach().cpu().numpy()
    # Add a leading channel axis.
    samples = samples[None, : int(samples.shape[0])]
    clipped = np.clip(samples, -1, 1)
    return (clipped * 32767).astype(np.int16)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def encode_audio_common(
    frame_input, encode_base64=True, sample_rate=24000, sample_width=2, channels=1
):
    """Wrap raw PCM frames in a WAV container.

    Returns the WAV bytes base64-encoded as str when *encode_base64* is true,
    otherwise the raw WAV bytes.
    """
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as out:
        out.setnchannels(channels)
        out.setsampwidth(sample_width)
        out.setframerate(sample_rate)
        out.writeframes(frame_input)

    buffer.seek(0)
    if not encode_base64:
        return buffer.read()
    return base64.b64encode(buffer.getbuffer()).decode("utf-8")
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class StreamingInputs(BaseModel):
    """Request schema for the (currently commented-out) streaming TTS endpoint."""
    # Flattened speaker embedding vector.
    speaker_embedding: List[float]
    # GPT conditioning latent; reshaped to (-1, 1024) by the consumer.
    gpt_cond_latent: List[List[float]]
    text: str
    language: str
    # Emit a WAV header before the first chunk.
    add_wav_header: bool = True
    # NOTE(review): declared as str and int()-converted by the (disabled)
    # consumer — presumably to tolerate form/query input; confirm before
    # changing the type to int.
    stream_chunk_size: str = "20"
|
| 98 |
+
|
| 99 |
+
#
|
| 100 |
+
#def predict_streaming_generator(parsed_input: dict = Body(...)):
|
| 101 |
+
# speaker_embedding = torch.tensor(parsed_input.speaker_embedding).unsqueeze(0).unsqueeze(-1)
|
| 102 |
+
# gpt_cond_latent = torch.tensor(parsed_input.gpt_cond_latent).reshape((-1, 1024)).unsqueeze(0)
|
| 103 |
+
# text = parsed_input.text
|
| 104 |
+
# language = parsed_input.language
|
| 105 |
+
#
|
| 106 |
+
# stream_chunk_size = int(parsed_input.stream_chunk_size)
|
| 107 |
+
# add_wav_header = parsed_input.add_wav_header
|
| 108 |
+
#
|
| 109 |
+
#
|
| 110 |
+
# chunks = model.inference_stream(
|
| 111 |
+
# text,
|
| 112 |
+
# language,
|
| 113 |
+
# gpt_cond_latent,
|
| 114 |
+
# speaker_embedding,
|
| 115 |
+
# stream_chunk_size=stream_chunk_size,
|
| 116 |
+
# enable_text_splitting=True
|
| 117 |
+
# )
|
| 118 |
+
#
|
| 119 |
+
# for i, chunk in enumerate(chunks):
|
| 120 |
+
# chunk = postprocess(chunk)
|
| 121 |
+
# if i == 0 and add_wav_header:
|
| 122 |
+
# yield encode_audio_common(b"", encode_base64=False)
|
| 123 |
+
# yield chunk.tobytes()
|
| 124 |
+
# else:
|
| 125 |
+
# yield chunk.tobytes()
|
| 126 |
+
#
|
| 127 |
+
#
|
| 128 |
+
## @app.post("/tts_stream")
|
| 129 |
+
#def predict_streaming_endpoint(parsed_input: StreamingInputs):
|
| 130 |
+
# return StreamingResponse(
|
| 131 |
+
# predict_streaming_generator(parsed_input),
|
| 132 |
+
# media_type="audio/wav",
|
| 133 |
+
# )
|
| 134 |
+
|
| 135 |
+
class TTSInputs(BaseModel):
    """Request schema for predict_speech (one-shot, non-streaming TTS)."""
    # Flattened speaker embedding vector.
    speaker_embedding: List[float]
    # GPT conditioning latent; reshaped to (-1, 1024) by predict_speech.
    gpt_cond_latent: List[List[float]]
    text: str
    language: str
    # Sampling parameters forwarded to Xtts.inference.
    temperature: float
    speed: float
    top_k: int
    top_p: float
|
| 144 |
+
|
| 145 |
+
# @app.post("/tts")
def predict_speech(parsed_input: TTSInputs):
    """Synthesize speech for *parsed_input* and return it as base64 WAV text."""
    # Rebuild tensors from the plain-list payload.
    speaker_embedding = torch.tensor(parsed_input.speaker_embedding).unsqueeze(0).unsqueeze(-1)
    # Latents arrive as nested lists; restore the (1, T, 1024) shape.
    gpt_cond_latent = torch.tensor(parsed_input.gpt_cond_latent).reshape((-1, 1024)).unsqueeze(0)

    length_penalty = 1.0
    repetition_penalty = 2.0

    # Pass sampling options by keyword: the original passed them positionally,
    # which silently misassigns values if Xtts.inference's parameter order
    # differs (e.g. `speed` landing on `do_sample`).
    out = model.inference(
        parsed_input.text,
        parsed_input.language,
        gpt_cond_latent,
        speaker_embedding,
        temperature=parsed_input.temperature,
        length_penalty=length_penalty,
        repetition_penalty=repetition_penalty,
        top_k=parsed_input.top_k,
        top_p=parsed_input.top_p,
        speed=parsed_input.speed,
    )

    # Convert the float waveform to int16 PCM and wrap it in a WAV container.
    wav = postprocess(torch.tensor(out["wav"]))

    return encode_audio_common(wav.tobytes())
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
# @app.get("/studio_speakers")
def get_speakers():
    """Return the built-in studio speakers as JSON-serializable lists.

    Each entry maps a speaker name to its half-precision speaker embedding
    and GPT conditioning latent; returns {} when the model has no
    speaker manager.
    """
    manager = getattr(model, "speaker_manager", None)
    if manager is None or not hasattr(manager, "speakers"):
        return {}
    result = {}
    for name in manager.speakers.keys():
        data = manager.speakers[name]
        result[name] = {
            "speaker_embedding": data["speaker_embedding"].cpu().squeeze().half().tolist(),
            "gpt_cond_latent": data["gpt_cond_latent"].cpu().squeeze().half().tolist(),
        }
    return result
|
| 189 |
+
|
| 190 |
+
# @app.get("/languages")
def get_languages():
    """Return the language codes supported by the loaded XTTS config."""
    return config.languages
|