Spaces:
Paused
Paused
v1
Browse files- Dockerfile +24 -0
- app.py +350 -0
- docker-compose.yml +40 -0
- requirements.txt +17 -0
- xtts.py +192 -0
Dockerfile
ADDED
|
@@ -0,0 +1,24 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
# GPU image with CUDA 12.1 toolchain; provides the torch stack used by requirements.txt.
FROM pytorch/pytorch:2.1.0-cuda12.1-cudnn8-devel
ARG DEBIAN_FRONTEND=noninteractive

# Audio tooling (sox/ffmpeg/libsndfile) plus build dependencies for TTS/deepspeed.
RUN apt-get update && \
    apt-get install --no-install-recommends -y sox libsox-fmt-all curl wget gcc git git-lfs build-essential libaio-dev libsndfile1 ssh ffmpeg && \
    apt-get clean && apt-get -y autoremove

WORKDIR /app
COPY requirements.txt .
# NOTE(review): the legacy resolver flag is presumably needed by the pinned git
# TTS dependency — confirm before removing it.
RUN python -m pip install --use-deprecated=legacy-resolver -r requirements.txt \
    && python -m pip cache purge

# Download the unidic Japanese dictionary used by the unidic package.
RUN python -m unidic download
RUN mkdir -p /app/tts_models

COPY xtts.py .
COPY app.py .

# Uncomment (set to 1) if you have an older card — relaxes the CUDA driver requirement.
#ENV NVIDIA_DISABLE_REQUIRE=1

ENV NUM_THREADS=2
EXPOSE 80
CMD ["python","app.py"]
|
app.py
ADDED
|
@@ -0,0 +1,350 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import gradio as gr
|
| 2 |
+
import base64
|
| 3 |
+
import tempfile
|
| 4 |
+
import json
|
| 5 |
+
import os
|
| 6 |
+
from os.path import abspath
|
| 7 |
+
import zipfile
|
| 8 |
+
import random
|
| 9 |
+
import xtts
|
| 10 |
+
|
| 11 |
+
|
| 12 |
+
DO_CHECK = os.getenv('DO_CHECK', '1')
OUTPUT = "./demo_outputs"
cloned_speakers = {}

print("Preparing file structure...")
# First run: create the output tree. Otherwise reload previously cloned speakers.
if not os.path.exists(OUTPUT):
    # makedirs creates OUTPUT and the subdirectory in one call.
    os.makedirs(os.path.join(OUTPUT, "cloned_speakers"))
    os.makedirs(os.path.join(OUTPUT, "generated_audios"))
elif os.path.exists(os.path.join(OUTPUT, "cloned_speakers")):
    print("Loading existing cloned speakers...")
    for file in os.listdir(os.path.join(OUTPUT, "cloned_speakers")):
        if file.endswith(".json"):
            # Speaker name is the file name without the ".json" suffix.
            with open(os.path.join(OUTPUT, "cloned_speakers", file), "r") as fp:
                cloned_speakers[file[:-5]] = json.load(fp)
    print("Available cloned speakers:", ", ".join(cloned_speakers.keys()))

AUDIOS_DIR = os.path.join("demo_outputs", "generated_audios")
ZIP_DIR = os.path.join("zip_outputs")

print("Checking zip at", ZIP_DIR)
if not os.path.exists(ZIP_DIR):
    os.mkdir(ZIP_DIR)


try:
    print("Getting metadata from server ...")
    # NOTE: "LANUGAGES" is a typo, but the name is referenced elsewhere in the
    # module, so it is kept for compatibility.
    LANUGAGES = xtts.get_languages()
    print("Available languages:", ", ".join(LANUGAGES))
    STUDIO_SPEAKERS = xtts.get_speakers()
    print("Available studio speakers:", ", ".join(STUDIO_SPEAKERS.keys()))
except Exception as e:
    # Narrowed from a bare `except:` and chained so the root cause is not hidden.
    raise Exception("Please make sure the server is running first.") from e
|
| 45 |
+
|
| 46 |
+
|
| 47 |
+
def ExtractVars(input_string):
    """Parse ``!key=value`` directive lines out of *input_string*.

    Lines whose stripped form starts with ``!`` are treated as directives and
    removed from the text; every other non-empty line is kept.

    Returns a tuple ``(directives, remaining_text)``. The directives dict is
    pre-seeded with the keys the TTS pipeline understands (prefix, name,
    speaker, num) so lookups never raise KeyError.
    """
    result_dict = {
        'prefix': None,
        'name': '',
        'speaker': None,
        'num': None,
    }

    # Lines that are not directives, with blank lines dropped.
    filtered_lines = []

    for line in input_string.split('\n'):
        stripped = line.strip()
        if stripped.startswith('!'):
            # Split on the FIRST '=' only, so values may themselves contain
            # '=' (the original split('=') raised and silently dropped them).
            key, sep, value = stripped[1:].partition('=')
            if sep:
                result_dict[key.strip()] = value.strip()
            # Malformed directives (no '=') are ignored, as before.
        elif stripped:
            filtered_lines.append(line)

    return result_dict, '\n'.join(filtered_lines)
|
| 85 |
+
|
| 86 |
+
|
| 87 |
+
def FindSpeakerByName(name, speakerType):
    """Look up a speaker by exact name, falling back to a first-word match.

    Searches STUDIO_SPEAKERS when *speakerType* is "Studio", otherwise the
    cloned-speaker registry.

    Returns ``(key, embeddings)`` on success and ``(None, None)`` when no
    speaker matches. (The original fell off the end and implicitly returned
    ``None``, which made callers crash on tuple unpacking.)
    """
    srcItems = STUDIO_SPEAKERS if speakerType == "Studio" else cloned_speakers

    for key, value in srcItems.items():
        if key == name:
            return key, value
        # Allow addressing e.g. "Dionisio Schuyler" as just "Dionisio".
        if key.split(" ")[0] == name:
            return key, value

    return None, None
|
| 98 |
+
|
| 99 |
+
|
| 100 |
+
def clone_speaker(upload_file, clone_speaker_name, cloned_speaker_names):
    """Compute speaker embeddings from a reference audio and persist them.

    Saves the embeddings as JSON under OUTPUT/cloned_speakers, registers the
    new speaker in the in-memory dict, and returns updated widget values
    (including a refreshed dropdown of cloned-speaker choices).
    """
    # Context manager so the uploaded file handle is always closed
    # (the original leaked the file object returned by open()).
    with open(upload_file, "rb") as audio_fp:
        embeddings = xtts.predict_speaker(audio_fp)
    with open(os.path.join(OUTPUT, "cloned_speakers", clone_speaker_name + ".json"), "w") as fp:
        json.dump(embeddings, fp)
    cloned_speakers[clone_speaker_name] = embeddings
    cloned_speaker_names.append(clone_speaker_name)
    return upload_file, clone_speaker_name, cloned_speaker_names, gr.Dropdown(choices=cloned_speaker_names)
|
| 107 |
+
|
| 108 |
+
def tts(text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature,
        speed, top_p, top_k, AllFileList, progress=gr.Progress()):
    """Synthesize one audio file per '---'-separated part of *text*.

    Each part may carry `!key=value` directive lines (see ExtractVars):
    prefix (file-name prefix, sticky), name (file-name suffix), num (sequence
    number), speaker (per-part speaker override). Generated WAVs are written
    to AUDIOS_DIR, mirrored into the shared *AllFileList* state, and returned
    as a refreshed gr.Dropdown.

    Raises ValueError when a requested speaker cannot be found.
    """
    # Validate that the default speaker exists before doing any work
    # (a per-line "!speaker=" directive may still override it below).
    embeddings = STUDIO_SPEAKERS[speaker_name_studio] if speaker_type == 'Studio' else cloned_speakers[speaker_name_custom]

    # "---" splits the input into independently synthesized parts.
    lines = text.split("---")
    print("Total parts:", len(lines))

    audioNum = 0

    DefaultPrefix = next(tempfile._get_candidate_names())
    CurrentPrefix = DefaultPrefix

    AudioList = []
    for line in progress.tqdm(lines, desc="Gerando fala..."):
        audioNum += 1

        textVars, cleanLine = ExtractVars(line)

        # "!prefix=" changes the file-name prefix for this and following parts.
        if textVars['prefix']:
            CurrentPrefix = textVars['prefix']

        audioName = textVars['name']
        if audioName:
            audioName = '_' + audioName

        num = textVars['num']
        if not num:
            num = audioNum

        path = CurrentPrefix + "_n_" + str(num) + audioName + ".wav"

        print("Generating audio for line", num, 'sequence', audioNum)

        speaker = textVars['speaker']
        if not speaker:
            speaker = speaker_name_studio if speaker_type == 'Studio' else speaker_name_custom

        # FindSpeakerByName may not find a match; fail with a clear error
        # instead of crashing on tuple unpacking / None concatenation.
        found = FindSpeakerByName(speaker, speaker_type)
        if not found or found[0] is None:
            raise ValueError("InvalidSpeaker: " + str(speaker))
        speakerName, embeddings = found

        ipts = xtts.TTSInputs(
            speaker_embedding=embeddings["speaker_embedding"],
            gpt_cond_latent=embeddings["gpt_cond_latent"],
            text=cleanLine,
            language=lang,
            temperature=temperature,
            speed=speed,
            top_k=top_k,
            top_p=top_p
        )

        # predict_speech returns base64-encoded WAV bytes.
        generated_audio = xtts.predict_speech(ipts)

        print("Audio generated.. Saving to", path)
        generated_audio_path = os.path.join(AUDIOS_DIR, path)
        with open(generated_audio_path, "wb") as fp:
            fp.write(base64.b64decode(generated_audio))
        AudioList.append(generated_audio_path)

    # Mirror the results into the shared gr.State list in place.
    AllFileList.clear()
    AllFileList.extend(AudioList)

    return gr.Dropdown(
        label="Generated Audios",
        choices=list(AudioList),
        value=AudioList[0]
    )
|
| 185 |
+
|
| 186 |
+
def get_file_content(f):
    """Return the first element of *f*, or None when it is empty."""
    return f[0] if f else None
|
| 191 |
+
|
| 192 |
+
|
| 193 |
+
def UpdateFileList(DirListState):
    """Refresh *DirListState* in place with the current contents of AUDIOS_DIR."""
    # Slice assignment mutates the shared gr.State list, same as clear+extend.
    DirListState[:] = os.listdir(AUDIOS_DIR)
|
| 196 |
+
|
| 197 |
+
def audio_list_update(d):
    """Resolve the selected dropdown entry to an absolute path for gr.Audio."""
    return abspath(d)
|
| 200 |
+
|
| 201 |
+
def ZipAndDownload(files):
    """Zip *files* into a uniquely named archive under ZIP_DIR.

    Returns an HTML fallback link; the page-side MutationObserver (see the
    `js` snippet) picks the link up and triggers the download automatically.
    """
    DefaultPrefix = next(tempfile._get_candidate_names())
    zipFile = abspath(os.path.join(ZIP_DIR, DefaultPrefix + ".zip"))

    with zipfile.ZipFile(zipFile, 'w') as zipMe:
        for file in files:
            print("Zipping", file)
            # Archive under the base name only, so the zip has a flat layout.
            zipMe.write(abspath(file), os.path.basename(file), compress_type=zipfile.ZIP_DEFLATED)

    print("Pronto", zipFile)

    # Fixed user-facing typo ("If donwload dont starts").
    return '<a href="/file=' + zipFile + '">If the download does not start, click here</a>'
|
| 217 |
+
|
| 218 |
+
|
| 219 |
+
# JavaScript injected into the page: observes the hidden "DonwloadLink" HTML
# element and navigates to any <a> that appears inside it, auto-starting the
# zip download produced by ZipAndDownload.
js = """
function DetectDownloadLink(){
    console.log('Configuring AutoDonwloadObservr...');
    let hiddenLink = document.getElementById("DonwloadLink");
    let onChange= function(mutations){

        for (const mutation of mutations) {
            if (mutation.type !== 'childList')
                continue;

            for (const addedNode of mutation.addedNodes) {
                if (addedNode.nodeName === 'A') {
                    location.href = addedNode.href;
                }
            }

        }
    }

    let config = { attributes: true, childList: true, subtree: true, attributeFilter: ["href"] }
    let obs = new MutationObserver(onChange);
    obs.observe(hiddenLink, config);
}
"""

# Gradio UI: a TTS tab (speaker/sampling controls, text input, generated-audio
# player and bulk download) and a speaker-cloning tab.
with gr.Blocks(js=js) as demo:
    defaultSpeaker = "Dionisio Schuyler"
    # Shared state: known cloned-speaker names, and the files generated by the
    # most recent tts() run (consumed by the Download All button).
    cloned_speaker_names = gr.State(list(cloned_speakers.keys()))
    AllFileList = gr.State(list([]))


    with gr.Tab("TTS"):
        with gr.Column() as row4:
            with gr.Row() as col4:
                speaker_name_studio = gr.Dropdown(
                    label="Studio speaker",
                    choices=STUDIO_SPEAKERS.keys(),
                    value=defaultSpeaker if defaultSpeaker in STUDIO_SPEAKERS.keys() else None,
                )
                speaker_name_custom = gr.Dropdown(
                    label="Cloned speaker",
                    choices=cloned_speaker_names.value,
                    value=cloned_speaker_names.value[0] if len(cloned_speaker_names.value) != 0 else None,
                )
                speaker_type = gr.Dropdown(label="Speaker type", choices=["Studio", "Cloned"], value="Studio")
        # Sampling controls passed straight through to xtts.TTSInputs.
        with gr.Column() as rowAdvanced:
            with gr.Row() as rowAdvanced:
                temperature = gr.Slider(0.00, 1.00, 0.5, step=0.05, label="Temperature", info="Choose between 0 and 1")
                top_p = gr.Slider(0.00, 1.00, 0.8, step=0.05, label="TOP P", info="Choose between 0 and 1")
                top_k = gr.Number(label="TOP K",value=50)
                speed = gr.Slider(0.00, 1000.00, 1.0, step=0.1, label="Speed", info="Speed (0 to 1000)")
        with gr.Column() as col2:
            lang = gr.Dropdown(label="Language", choices=LANUGAGES, value="pt")
            text = gr.Textbox(label="text",lines=4, value="A quick brown fox jumps over the lazy dog.")
            tts_button = gr.Button(value="TTS")
        with gr.Column() as col3:
            # FileList = gr.FileExplorer(
            #         glob="*.wav",
            #         # value=["themes/utils"],
            #         ignore_glob="**/__init__.py",
            #         root_dir=AUDIOS_DIR,
            #         interactive = True,
            #         value=DirectoryList.value
            # )

            # Placeholder choices; tts() returns a refreshed Dropdown on click.
            AudioList = gr.Dropdown(
                label="Generated Audios",
                choices=['a','b']
                ,interactive=True
            )

            generated_audio = gr.Audio(label="Audio Play", autoplay=True)
            AudioList.change(fn=audio_list_update, inputs=[AudioList], outputs=[generated_audio])

            # Hidden HTML target for the zip link; rendered after the button so
            # the MutationObserver in `js` can auto-start the download.
            dummyHtml = gr.HTML(elem_id = "DonwloadLink", render = False);
            downloadAll = gr.DownloadButton("Download All Files")
            downloadAll.click(ZipAndDownload, inputs=[AllFileList], outputs=[dummyHtml]);
            dummyHtml.render();


    with gr.Tab("Clone a new speaker"):
        with gr.Column() as col1:
            upload_file = gr.Audio(label="Upload reference audio", type="filepath")
            clone_speaker_name = gr.Textbox(label="Speaker name", value="default_speaker")
            clone_button = gr.Button(value="Clone speaker")

        clone_button.click(
            fn=clone_speaker,
            inputs=[upload_file, clone_speaker_name, cloned_speaker_names],
            outputs=[upload_file, clone_speaker_name, cloned_speaker_names, speaker_name_custom],
        )

    tts_button.click(
        fn=tts,
        inputs=[text, speaker_type, speaker_name_studio, speaker_name_custom, lang, temperature
                ,speed,top_p,top_k,AllFileList
                ],
        outputs=[AudioList],
    )
|
| 318 |
+
|
| 319 |
+
# Optional startup self-test (disable with DO_CHECK=0): synthesize a short
# sentence with a random studio speaker to verify the model is loaded and the
# full TTS path works before serving the UI.
if __name__ == "__main__" and DO_CHECK == "1":
    print("Warming up server... Checking server healthy...")

    # Any studio speaker will do; we only need some valid embeddings.
    speakerName, embs = random.choice(list(STUDIO_SPEAKERS.items()));

    print("Testing with", speakerName);

    ipts = xtts.TTSInputs(
        speaker_embedding=embs["speaker_embedding"],
        gpt_cond_latent=embs["gpt_cond_latent"],
        text="This is a warmup request.",
        language="en",
        temperature=0.5,
        speed=1.0,
        top_k=50,
        top_p=0.8
    )

    # Raises if inference fails; result is ignored.
    resp = xtts.predict_speech(ipts)

    print(" TEST OK")
|
| 340 |
+
|
| 341 |
+
|
| 342 |
+
if __name__ == "__main__":
    print("STARTING...")
    # Bind on all interfaces, port 80 (the Dockerfile EXPOSEs this port).
    # ZIP_DIR must be whitelisted so the generated zip can be served via /file=.
    demo.launch(
        share=False,
        debug=False,
        server_port=80,
        server_name="0.0.0.0",
        allowed_paths=[ZIP_DIR]
    )
|
docker-compose.yml
ADDED
|
@@ -0,0 +1,40 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
name: webui-docker

volumes:
  # Named volume caching the downloaded XTTS model across container rebuilds.
  # NOTE(review): "servel" looks like a typo for "server", but renaming would
  # orphan existing volume data — kept as-is.
  servel-model-root:

services:

  xtts:
    build:
      context: .
      dockerfile: Dockerfile
    environment:
      # Accept the Coqui model license non-interactively.
      COQUI_TOS_AGREED: 1
      # Matches the volume target below, so xtts.py finds the cached model.
      CUSTOM_MODEL_PATH: /root/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2
    ports:
      # Host 3000 -> Gradio app on container port 80.
      - 3000:80
    expose:
      - 80
    volumes:
      - type: volume
        source: servel-model-root
        target: /root/.local/share/tts/tts_models--multilingual--multi-dataset--xtts_v2
    stdin_open: true # docker run -i
    tty: true # docker run -t
    deploy:
      resources:
        reservations:
          devices:
            - driver: nvidia
              count: all
              capabilities: [gpu]
    healthcheck:
      test: wget --no-verbose --tries=1 http://localhost || exit 1
      interval: 5s
      timeout: 30s
      retries: 3
      # Model download/load can take several minutes on first start.
      start_period: 5m
|
| 38 |
+
|
| 39 |
+
|
| 40 |
+
|
requirements.txt
ADDED
|
@@ -0,0 +1,17 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
torch
torchvision
torchaudio
gradio
numpy
# Pinned Coqui TTS commit providing the XTTS v2 model code.
TTS @ git+https://github.com/coqui-ai/TTS@fa28f99f1508b5b5366539b2149963edcb80ba62
uvicorn[standard]==0.23.2
deepspeed
pydantic
python-multipart==0.0.6
typing-extensions>=4.8.0
# Japanese text processing (cutlet/mecab/unidic) used by XTTS for "ja".
cutlet
mecab-python3==1.0.6
unidic-lite==1.0.8
unidic==1.1.0
|
| 16 |
+
|
| 17 |
+
|
xtts.py
ADDED
|
@@ -0,0 +1,192 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1 |
+
import base64
|
| 2 |
+
import io
|
| 3 |
+
import os
|
| 4 |
+
import tempfile
|
| 5 |
+
import wave
|
| 6 |
+
import torch
|
| 7 |
+
import numpy as np
|
| 8 |
+
from typing import List
|
| 9 |
+
from pydantic import BaseModel
|
| 10 |
+
|
| 11 |
+
from TTS.tts.configs.xtts_config import XttsConfig
|
| 12 |
+
from TTS.tts.models.xtts import Xtts
|
| 13 |
+
from TTS.utils.generic_utils import get_user_data_dir
|
| 14 |
+
from TTS.utils.manage import ModelManager
|
| 15 |
+
|
| 16 |
+
# Limit CPU thread usage (NUM_THREADS env var, defaults to all cores).
torch.set_num_threads(int(os.environ.get("NUM_THREADS", os.cpu_count())))
# Select device: CUDA unless USE_CPU=1.
device = torch.device("cuda" if os.environ.get("USE_CPU", "0") == "0" else "cpu")
# BUGFIX: the original compared a torch.device to the string "cuda", which is
# not equal, so this guard could never fire. Compare the device *type* instead.
if device.type == "cuda" and not torch.cuda.is_available():
    raise RuntimeError("CUDA device unavailable, please use Dockerfile.cpu instead.")

# A custom model directory (with config.json) takes priority over downloading.
custom_model_path = os.environ.get("CUSTOM_MODEL_PATH", "/app/tts_models")

if os.path.exists(custom_model_path) and os.path.isfile(custom_model_path + "/config.json"):
    model_path = custom_model_path
    print("Loading custom model from", model_path, flush=True)
else:
    print("Loading default model", flush=True)
    model_name = "tts_models/multilingual/multi-dataset/xtts_v2"
    print("Downloading XTTS Model:", model_name, flush=True)
    ModelManager().download_model(model_name)
    model_path = os.path.join(get_user_data_dir("tts"), model_name.replace("/", "--"))
    print("XTTS Model downloaded", flush=True)

print("Loading XTTS", flush=True)
config = XttsConfig()
config.load_json(os.path.join(model_path, "config.json"))
model = Xtts.init_from_config(config)
# Same fix as above: use device.type so deepspeed is actually enabled on GPU
# (the original string comparison always evaluated to False).
model.load_checkpoint(config, checkpoint_dir=model_path, eval=True, use_deepspeed=device.type == "cuda")
model.to(device)
print("XTTS Loaded.", flush=True)

print("Running XTTS Server ...", flush=True)
|
| 43 |
+
|
| 44 |
+
|
| 45 |
+
|
| 46 |
+
# @app.post("/clone_speaker")
def predict_speaker(wav_file):
    """Compute conditioning inputs from a reference audio file.

    *wav_file* is a binary file-like object. Returns a dict with
    "gpt_cond_latent" and "speaker_embedding" as plain (half-precision) lists.
    """
    # get_conditioning_latents expects a file *path*, so spool the upload to a
    # real temporary file. The original wrote an undeleted temp file into the
    # current working directory; mkstemp + finally cleans up reliably.
    fd, temp_audio_name = tempfile.mkstemp(suffix=".wav")
    try:
        with os.fdopen(fd, "wb") as temp:
            temp.write(wav_file.read())
        with torch.inference_mode():
            gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(
                temp_audio_name
            )
    finally:
        os.remove(temp_audio_name)
    return {
        "gpt_cond_latent": gpt_cond_latent.cpu().squeeze().half().tolist(),
        "speaker_embedding": speaker_embedding.cpu().squeeze().half().tolist(),
    }
|
| 59 |
+
|
| 60 |
+
|
| 61 |
+
def postprocess(wav):
    """Convert a model waveform (tensor, or list of tensors) to int16 PCM.

    Returns a numpy array of shape (1, n_samples) with values clipped to
    [-1, 1] before scaling to the int16 range.
    """
    if isinstance(wav, list):
        wav = torch.cat(wav, dim=0)
    samples = wav.clone().detach().cpu().numpy()
    # Add a leading channel axis.
    samples = samples[None, : int(samples.shape[0])]
    clipped = np.clip(samples, -1, 1)
    return (clipped * 32767).astype(np.int16)
|
| 70 |
+
|
| 71 |
+
|
| 72 |
+
def encode_audio_common(
    frame_input, encode_base64=True, sample_rate=24000, sample_width=2, channels=1
):
    """Wrap raw PCM frames in a WAV container.

    Returns the WAV bytes base64-encoded as str when *encode_base64* is true,
    otherwise the raw WAV bytes.
    """
    buffer = io.BytesIO()
    with wave.open(buffer, "wb") as out:
        out.setnchannels(channels)
        out.setsampwidth(sample_width)
        out.setframerate(sample_rate)
        out.writeframes(frame_input)

    buffer.seek(0)
    if not encode_base64:
        return buffer.read()
    return base64.b64encode(buffer.getbuffer()).decode("utf-8")
|
| 89 |
+
|
| 90 |
+
|
| 91 |
+
class StreamingInputs(BaseModel):
    """Request schema for the (currently commented-out) streaming TTS endpoint."""
    # Flattened speaker embedding vector.
    speaker_embedding: List[float]
    # GPT conditioning latent; reshaped to (-1, 1024) by the consumer.
    gpt_cond_latent: List[List[float]]
    text: str
    language: str
    # Emit a WAV header before the first chunk.
    add_wav_header: bool = True
    # NOTE(review): declared as str and int()-converted by the (disabled)
    # consumer — presumably to tolerate form/query input; confirm before
    # changing the type to int.
    stream_chunk_size: str = "20"
|
| 98 |
+
|
| 99 |
+
#
|
| 100 |
+
#def predict_streaming_generator(parsed_input: dict = Body(...)):
|
| 101 |
+
# speaker_embedding = torch.tensor(parsed_input.speaker_embedding).unsqueeze(0).unsqueeze(-1)
|
| 102 |
+
# gpt_cond_latent = torch.tensor(parsed_input.gpt_cond_latent).reshape((-1, 1024)).unsqueeze(0)
|
| 103 |
+
# text = parsed_input.text
|
| 104 |
+
# language = parsed_input.language
|
| 105 |
+
#
|
| 106 |
+
# stream_chunk_size = int(parsed_input.stream_chunk_size)
|
| 107 |
+
# add_wav_header = parsed_input.add_wav_header
|
| 108 |
+
#
|
| 109 |
+
#
|
| 110 |
+
# chunks = model.inference_stream(
|
| 111 |
+
# text,
|
| 112 |
+
# language,
|
| 113 |
+
# gpt_cond_latent,
|
| 114 |
+
# speaker_embedding,
|
| 115 |
+
# stream_chunk_size=stream_chunk_size,
|
| 116 |
+
# enable_text_splitting=True
|
| 117 |
+
# )
|
| 118 |
+
#
|
| 119 |
+
# for i, chunk in enumerate(chunks):
|
| 120 |
+
# chunk = postprocess(chunk)
|
| 121 |
+
# if i == 0 and add_wav_header:
|
| 122 |
+
# yield encode_audio_common(b"", encode_base64=False)
|
| 123 |
+
# yield chunk.tobytes()
|
| 124 |
+
# else:
|
| 125 |
+
# yield chunk.tobytes()
|
| 126 |
+
#
|
| 127 |
+
#
|
| 128 |
+
## @app.post("/tts_stream")
|
| 129 |
+
#def predict_streaming_endpoint(parsed_input: StreamingInputs):
|
| 130 |
+
# return StreamingResponse(
|
| 131 |
+
# predict_streaming_generator(parsed_input),
|
| 132 |
+
# media_type="audio/wav",
|
| 133 |
+
# )
|
| 134 |
+
|
| 135 |
+
class TTSInputs(BaseModel):
    """Request schema for predict_speech (one-shot, non-streaming TTS)."""
    # Flattened speaker embedding vector.
    speaker_embedding: List[float]
    # GPT conditioning latent; reshaped to (-1, 1024) by predict_speech.
    gpt_cond_latent: List[List[float]]
    text: str
    language: str
    # Sampling parameters forwarded to Xtts.inference.
    temperature: float
    speed: float
    top_k: int
    top_p: float
|
| 144 |
+
|
| 145 |
+
# @app.post("/tts")
def predict_speech(parsed_input: TTSInputs):
    """Synthesize speech for *parsed_input* and return it as base64 WAV text."""
    # Rebuild tensors from the plain-list payload.
    speaker_embedding = torch.tensor(parsed_input.speaker_embedding).unsqueeze(0).unsqueeze(-1)
    # Latents arrive as nested lists; restore the (1, T, 1024) shape.
    gpt_cond_latent = torch.tensor(parsed_input.gpt_cond_latent).reshape((-1, 1024)).unsqueeze(0)

    length_penalty = 1.0
    repetition_penalty = 2.0

    # Pass sampling options by keyword: the original passed them positionally,
    # which silently misassigns values if Xtts.inference's parameter order
    # differs (e.g. `speed` landing on `do_sample`).
    out = model.inference(
        parsed_input.text,
        parsed_input.language,
        gpt_cond_latent,
        speaker_embedding,
        temperature=parsed_input.temperature,
        length_penalty=length_penalty,
        repetition_penalty=repetition_penalty,
        top_k=parsed_input.top_k,
        top_p=parsed_input.top_p,
        speed=parsed_input.speed,
    )

    # Convert the float waveform to int16 PCM and wrap it in a WAV container.
    wav = postprocess(torch.tensor(out["wav"]))

    return encode_audio_common(wav.tobytes())
|
| 175 |
+
|
| 176 |
+
|
| 177 |
+
# @app.get("/studio_speakers")
def get_speakers():
    """Return the built-in studio speakers as JSON-serializable lists.

    Each entry maps a speaker name to its half-precision speaker embedding
    and GPT conditioning latent; returns {} when the model has no
    speaker manager.
    """
    manager = getattr(model, "speaker_manager", None)
    if manager is None or not hasattr(manager, "speakers"):
        return {}
    result = {}
    for name in manager.speakers.keys():
        data = manager.speakers[name]
        result[name] = {
            "speaker_embedding": data["speaker_embedding"].cpu().squeeze().half().tolist(),
            "gpt_cond_latent": data["gpt_cond_latent"].cpu().squeeze().half().tolist(),
        }
    return result
|
| 189 |
+
|
| 190 |
+
# @app.get("/languages")
def get_languages():
    """Return the language codes supported by the loaded XTTS config."""
    return config.languages
|