Upload 22 files

- .gitattributes +2 -0
- lib/__init__.py +57 -0
- lib/__pycache__/__init__.cpython-312.pyc +0 -0
- lib/__pycache__/conf.cpython-312.pyc +0 -0
- lib/__pycache__/functions.cpython-312.pyc +3 -0
- lib/__pycache__/lang.cpython-312.pyc +3 -0
- lib/__pycache__/models.cpython-312.pyc +0 -0
- lib/classes/__pycache__/background_detector.cpython-312.pyc +0 -0
- lib/classes/__pycache__/tts_manager.cpython-312.pyc +0 -0
- lib/classes/__pycache__/voice_extractor.cpython-312.pyc +0 -0
- lib/classes/argos_translator.py +122 -0
- lib/classes/background_detector.py +37 -0
- lib/classes/redirect_console.py +51 -0
- lib/classes/tts_engines/.template.py +232 -0
- lib/classes/tts_engines/common/audio_filters.py +107 -0
- lib/classes/tts_engines/common/utils.py +57 -0
- lib/classes/tts_engines/coqui.py +810 -0
- lib/classes/tts_manager.py +37 -0
- lib/classes/voice_extractor.py +286 -0
- lib/conf.py +78 -0
- lib/functions.py +0 -0
- lib/lang.py +0 -0
- lib/models.py +493 -0
.gitattributes
CHANGED
@@ -47,3 +47,5 @@ ebook2audiobook.egg-info/assets/gui_1.png filter=lfs diff=lfs merge=lfs -text
 ebook2audiobook.egg-info/assets/gui_2.png filter=lfs diff=lfs merge=lfs -text
 ebook2audiobook.egg-info/assets/gui_3.png filter=lfs diff=lfs merge=lfs -text
 ebook2audiobook.egg-info/assets/Rainy_Day_voice_Demo.mp4 filter=lfs diff=lfs merge=lfs -text
+lib/__pycache__/functions.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
+lib/__pycache__/lang.cpython-312.pyc filter=lfs diff=lfs merge=lfs -text
lib/__init__.py
ADDED
@@ -0,0 +1,57 @@
+from .models import (
+    TTS_ENGINES, TTS_VOICE_CONVERSION, TTS_SML, default_fine_tuned, default_tts_engine,
+    default_engine_settings, default_vc_model, default_voice_detection_model,
+    loaded_tts, max_custom_model, max_custom_voices,
+    max_tts_in_memory, max_upload_size, models, os, voices_dir
+)
+
+from .conf import (
+    FULL_DOCKER, NATIVE, audiobooks_cli_dir, audiobooks_gradio_dir,
+    audiobooks_host_dir, debug_mode, default_audio_proc_samplerate,
+    default_audio_proc_format, default_device, default_gpu_wiki,
+    default_output_format, device_list, ebook_formats,
+    ebooks_dir, interface_component_options, interface_concurrency_limit,
+    interface_host, interface_port, interface_shared_tmp_expire,
+    max_python_version, min_python_version, models_dir, os,
+    output_formats, platform, prog_version, python_env_dir,
+    requirements_file, tmp_dir, tmp_expire, tts_dir, voice_formats,
+    voices_dir, default_output_split, default_output_split_hours
+)
+
+from .lang import (
+    abbreviations_mapping, chapter_word_mapping, default_language_code,
+    roman_numbers_tuples, emojis_list, install_info, language_mapping,
+    language_math_phonemes, language_clock, language_tts, os, punctuation_list,
+    punctuation_list_set, punctuation_split_hard, punctuation_split_hard_set,
+    punctuation_split_soft, punctuation_split_soft_set, punctuation_switch,
+    specialchars_mapping, specialchars_remove, year_to_decades_languages
+)
+
+__all__ = [
+    # from models
+    "TTS_ENGINES", "TTS_VOICE_CONVERSION", "TTS_SML", "default_fine_tuned", "default_tts_engine",
+    "default_engine_settings", "default_vc_model", "default_voice_detection_model",
+    "loaded_tts", "max_custom_model",
+    "max_custom_voices", "max_tts_in_memory", "max_upload_size",
+    "models", "os", "voices_dir",
+
+    # from conf
+    "FULL_DOCKER", "NATIVE", "audiobooks_cli_dir", "audiobooks_gradio_dir",
+    "audiobooks_host_dir", "debug_mode", "default_audio_proc_samplerate",
+    "default_audio_proc_format", "default_device", "default_gpu_wiki",
+    "default_output_format", "device_list", "ebook_formats", "ebooks_dir",
+    "interface_component_options", "interface_concurrency_limit",
+    "interface_host", "interface_port", "interface_shared_tmp_expire",
+    "max_python_version", "min_python_version", "models_dir", "os",
+    "output_formats", "platform", "prog_version", "python_env_dir",
+    "requirements_file", "tmp_dir", "tmp_expire", "tts_dir",
+    "voice_formats", "voices_dir", "default_output_split", "default_output_split_hours",
+
+    # from lang
+    "abbreviations_mapping", "chapter_word_mapping", "default_language_code",
+    "roman_numbers_tuples", "emojis_list", "install_info", "language_mapping",
+    "language_math_phonemes", "language_clock", "language_tts", "os", "punctuation_list",
+    "punctuation_list_set", "punctuation_split_hard", "punctuation_split_hard_set",
+    "punctuation_split_soft", "punctuation_split_soft_set", "punctuation_switch",
+    "specialchars_mapping", "specialchars_remove", "year_to_decades_languages"
+]
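
For context: this `__init__.py` turns `lib` into a single import hub, which is why the engine files below can do `from lib import *` and pick up configuration, model tables, and language data at once. A minimal sketch of that consumption pattern, assuming the package is on the path and the default engine has an 'internal' fine-tune entry (as XTTSv2 does in coqui.py below):

    from lib import TTS_ENGINES, models, default_tts_engine

    # Resolve the HF repo configured for the default engine's 'internal' fine-tune.
    print(models[default_tts_engine]['internal']['repo'])
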
lib/__pycache__/__init__.cpython-312.pyc
ADDED
Binary file (2.64 kB).

lib/__pycache__/conf.cpython-312.pyc
ADDED
Binary file (4.98 kB).
lib/__pycache__/functions.cpython-312.pyc
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:59b1809dd2e4e86864d8ff51fbdade7548389b92cd6f3b24d9e9a54235eb0de2
+size 236223
lib/__pycache__/lang.cpython-312.pyc
ADDED
@@ -0,0 +1,3 @@
+version https://git-lfs.github.com/spec/v1
+oid sha256:ff1e8d413d7881648a9aa7ffae42617ebc430ee61b2523706c9eb8315889c86e
+size 228874
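
Each pointer above is the standard three-line Git LFS stanza (spec version, content SHA-256, byte size) standing in for the real blob. A tiny parsing sketch using the `lang.pyc` pointer shown here:

    pointer = (
        "version https://git-lfs.github.com/spec/v1\n"
        "oid sha256:ff1e8d413d7881648a9aa7ffae42617ebc430ee61b2523706c9eb8315889c86e\n"
        "size 228874"
    )
    # Each line is "key value"; split once so the value stays intact.
    fields = dict(line.split(' ', 1) for line in pointer.splitlines())
    print(fields['oid'], fields['size'])  # prints the sha256 oid and "228874"
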
lib/__pycache__/models.cpython-312.pyc
ADDED
Binary file (20.8 kB).

lib/classes/__pycache__/background_detector.cpython-312.pyc
ADDED
Binary file (2.32 kB).

lib/classes/__pycache__/tts_manager.cpython-312.pyc
ADDED
Binary file (2.15 kB).

lib/classes/__pycache__/voice_extractor.cpython-312.pyc
ADDED
Binary file (14.3 kB).
lib/classes/argos_translator.py
ADDED
@@ -0,0 +1,122 @@
+import os
+import tempfile
+import argostranslate.package
+import argostranslate.translate
+
+from iso639 import languages
+from lib.conf import models_dir
+from lib.lang import language_mapping
+
+# NOTE: source_lang and target_lang must be iso639-1 (2 letters)
+
+class ArgosTranslator:
+
+    def __init__(self, neural_machine="argostranslate"):
+        self.neural_machine = neural_machine
+        self.translation = None
+
+    def get_language_iso3(self, lang_iso1):
+        lang = lang_iso1
+        try:
+            lang_array = languages.get(part1=lang_iso1)
+            if lang_array:
+                lang = lang_array.part3
+        except Exception:
+            pass
+        return lang
+
+    def get_all_sources_lang(self):
+        available_packages = argostranslate.package.get_available_packages()
+        return sorted(set(pkg.from_code for pkg in available_packages))
+
+    def get_all_targets_lang(self, source_lang):
+        available_packages = argostranslate.package.get_available_packages()
+        list_iso1 = sorted(set(pkg.to_code for pkg in available_packages if pkg.from_code == source_lang))
+        language_translate_mapping = {}
+        for iso1 in list_iso1:
+            try:
+                iso3 = self.get_language_iso3(iso1)
+                if iso3 in language_mapping:
+                    language_translate_mapping[iso3] = dict(language_mapping[iso3])
+                    language_translate_mapping[iso3]["iso1"] = iso1
+            except KeyError:
+                pass
+        language_translate_options = [
+            (
+                f"{details['name']} - {details['native_name']}" if details['name'] != details['native_name'] else details['name'],
+                lang
+            )
+            for lang, details in language_translate_mapping.items()
+        ]
+        return language_translate_options
+
+    def get_all_target_packages(self, source_lang):
+        available_packages = argostranslate.package.get_available_packages()
+        return [pkg for pkg in available_packages if pkg.from_code == source_lang]
+
+    def is_package_installed(self, source_lang, target_lang):
+        try:
+            installed_languages = argostranslate.translate.get_installed_languages()
+            source_language = next((lang for lang in installed_languages if lang.code == source_lang), None)
+            target_language = next((lang for lang in installed_languages if lang.code == target_lang), None)
+            return source_language is not None and target_language is not None
+        except Exception as e:
+            error = f'is_package_installed() error: {e}'
+            print(error)
+            return False
+
+    def download_and_install_argos_package(self, source_lang, target_lang):
+        try:
+            if self.is_package_installed(source_lang, target_lang):
+                msg = f"Package for translation from {source_lang} to {target_lang} is already installed."
+                print(msg)
+                return msg, True
+            available_packages = self.get_all_target_packages(source_lang)
+            target_package = None
+            for pkg in available_packages:
+                if pkg.from_code == source_lang and pkg.to_code == target_lang:
+                    target_package = pkg
+                    break
+            if target_package:
+                with tempfile.TemporaryDirectory() as tmpdirname:
+                    print(f"Downloading package for translation from {source_lang} to {target_lang}...")
+                    package_path = target_package.download()
+                    argostranslate.package.install_from_path(package_path)
+                    print(f"Package installed for translation from {source_lang} to {target_lang}")
+                    return None, True
+            else:
+                msg = f"No available package found for translation from {source_lang} to {target_lang}."
+                return msg, False
+        except Exception as e:
+            error = f'download_and_install_argos_package() error: {e}'
+            return error, False
+
+    def process(self, text):
+        try:
+            return self.translation.translate(text), True
+        except Exception as e:
+            error = f'ArgosTranslator.process() error: {e}'
+            return error, False
+
+    def start(self, source_lang, target_lang):
+        try:
+            if self.neural_machine != "argostranslate":
+                error = f"Neural machine '{self.neural_machine}' is not supported."
+                return error, False
+            error = None
+            status = True
+            if not self.is_package_installed(source_lang, target_lang):
+                error, status = self.download_and_install_argos_package(source_lang, target_lang)
+            if status:
+                installed_languages = argostranslate.translate.get_installed_languages()
+                source_language = next((lang for lang in installed_languages if lang.code == source_lang), None)
+                target_language = next((lang for lang in installed_languages if lang.code == target_lang), None)
+                if not source_language or not target_language:
+                    error = f"Translation languages not installed: {source_lang} to {target_lang}"
+                    return error, False
+                self.translation = source_language.get_translation(target_language)
+                return None, True
+            return error, status
+        except Exception as e:
+            error = f'ArgosTranslator.start() error: {e}'
+            return error, False
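
A short usage sketch of the class above, assuming the argostranslate package for the pair is downloadable; `start()` installs it on first use, and both methods return an `(error_or_result, status)` pair:

    translator = ArgosTranslator()
    error, ok = translator.start('en', 'fr')  # iso639-1 codes, per the NOTE above
    if ok:
        translated, ok = translator.process("The quick brown fox.")
        print(translated if ok else f"translation failed: {translated}")
    else:
        print(error)
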
lib/classes/background_detector.py
ADDED
@@ -0,0 +1,37 @@
+import os
+import numpy as np
+import librosa
+
+from pyannote.audio import Model
+from pyannote.audio.pipelines import VoiceActivityDetection
+from lib.conf import tts_dir
+from lib.models import default_voice_detection_model
+
+class BackgroundDetector:
+
+    def __init__(self, wav_file: str):
+        self.wav_file = wav_file
+        model = Model.from_pretrained(default_voice_detection_model, cache_dir=tts_dir)
+        self.pipeline = VoiceActivityDetection(segmentation=model)
+        hyper_params = {
+            # onset/offset activation thresholds
+            "onset": 0.5, "offset": 0.5,
+            # remove speech regions shorter than that many seconds.
+            "min_duration_on": 0.0,
+            # fill non-speech regions shorter than that many seconds.
+            "min_duration_off": 0.0
+        }
+        self.pipeline.instantiate(hyper_params)
+
+    def detect(self, vad_ratio_thresh: float = 0.05):
+        diarization = self.pipeline(self.wav_file)
+        speech_segments = [(s.start, s.end) for s in diarization.get_timeline()]
+        total_duration = librosa.get_duration(path=self.wav_file)
+        speech_time = sum(end - start for start, end in speech_segments)
+        non_speech_ratio = 1 - (speech_time / total_duration)
+        status = non_speech_ratio > vad_ratio_thresh
+        report = {
+            'non_speech_ratio': non_speech_ratio,
+            'background_detected': status
+        }
+        return status, report
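
A usage sketch, assuming a local `voice.wav` (hypothetical file) and that the pyannote segmentation model named by `default_voice_detection_model` can be fetched; `detect()` flags the file when more than `vad_ratio_thresh` of its duration is non-speech:

    detector = BackgroundDetector('voice.wav')  # hypothetical input file
    has_background, report = detector.detect(vad_ratio_thresh=0.05)
    print(report)  # e.g. {'non_speech_ratio': 0.12, 'background_detected': True}
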
lib/classes/redirect_console.py
ADDED
@@ -0,0 +1,51 @@
+from queue import Queue, Empty
+import time
+import logging
+
+
+class RedirectConsole:
+    def __init__(self, log_buffer: Queue, real_output):
+        self.log_buffer = log_buffer    # Queue buffer for the log
+        self.real_output = real_output  # Real terminal (sys.__stdout__ or sys.__stderr__)
+
+        # Setup for transformers logging
+        self.setup_transformers_logger()
+
+    def write(self, message: str):
+        # Write to the real terminal
+        self.real_output.write(message)
+        self.real_output.flush()
+
+        # Write to the log buffer
+        self.log_buffer.put(message)
+
+    def flush(self):
+        self.real_output.flush()
+
+    def isatty(self) -> bool:
+        return self.real_output.isatty()
+
+    def poll_logs(self, stop_event):
+        logs = ""
+        errors = ""
+        while not stop_event.is_set() or not self.log_buffer.empty():
+            try:
+                # Read logs from the buffer without blocking
+                log = self.log_buffer.get_nowait()
+                if "An error occurred" in log:
+                    errors += log  # Capture error messages separately
+                logs += log
+            except Empty:
+                pass  # No logs in the buffer
+            yield logs, errors  # Yield updated logs and errors
+            time.sleep(0.1)  # Prevent tight looping
+
+    def setup_transformers_logger(self):
+        # Configure the `transformers` logger
+        transformers_logger = logging.getLogger("transformers")
+        transformers_logger.setLevel(logging.WARNING)  # Capture warnings and above
+
+        # Create a handler that writes to this instance
+        handler = logging.StreamHandler(self)
+        handler.setFormatter(logging.Formatter("%(message)s"))  # Simplified format
+        transformers_logger.addHandler(handler)
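
A sketch of how this redirector is presumably wired up (the wiring itself is not part of this commit): swap `sys.stdout` for an instance, then drain `poll_logs()` until a stop event is set:

    import sys
    import threading
    from queue import Queue

    buf = Queue()
    console = RedirectConsole(buf, sys.__stdout__)
    sys.stdout = console          # mirror every print into the queue as well
    stop = threading.Event()

    print("hello")                # reaches both the terminal and the buffer
    stop.set()                    # with the event set, poll_logs() drains and exits
    for logs, errors in console.poll_logs(stop):
        pass                      # a UI would render `logs` incrementally here
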
lib/classes/tts_engines/.template.py
ADDED
@@ -0,0 +1,232 @@
+import hashlib
+import math
+import os
+import shutil
+import subprocess
+import tempfile
+import threading
+import uuid
+
+import numpy as np
+import regex as re
+import soundfile as sf
+import torch
+import torchaudio
+
+from huggingface_hub import hf_hub_download
+from pathlib import Path
+from pprint import pprint
+
+from lib import *
+from lib.classes.tts_engines.common.utils import unload_tts, append_sentence2vtt
+from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
+
+#import logging
+#logging.basicConfig(level=logging.DEBUG)
+
+lock = threading.Lock()
+
+class Coqui:
+
+    def __init__(self, session):
+        try:
+            self.session = session
+            self.cache_dir = tts_dir
+            self.speakers_path = None
+            self.tts_key = f"{self.session['tts_engine']}-{self.session['fine_tuned']}"
+            self.tts_vc_key = default_vc_model.rsplit('/', 1)[-1]
+            self.is_bf16 = True if self.session['device'] == 'cuda' and torch.cuda.is_bf16_supported() == True else False
+            self.npz_path = None
+            self.npz_data = None
+            self.sentences_total_time = 0.0
+            self.sentence_idx = 1
+            self.params = {TTS_ENGINES['NEW_TTS']: {}}
+            self.params[self.session['tts_engine']]['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
+            self.vtt_path = os.path.join(self.session['process_dir'], os.path.splitext(self.session['final_name'])[0] + '.vtt')
+            self.resampler_cache = {}
+            self.audio_segments = []
+            self._build()
+        except Exception as e:
+            error = f'__init__() error: {e}'
+            print(error)
+            return None
+
+    def _build(self):
+        try:
+            tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
+            if not tts:
+                if self.session['tts_engine'] == TTS_ENGINES['NEW_TTS']:
+                    if self.session['custom_model'] is not None:
+                        msg = f"{self.session['tts_engine']} custom model not implemented yet!"
+                        print(msg)
+                        return False
+                    else:
+                        model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
+                        tts = self._load_api(self.tts_key, model_path, self.session['device'])
+            return (loaded_tts.get(self.tts_key) or {}).get('engine', False)
+        except Exception as e:
+            error = f'build() error: {e}'
+            print(error)
+            return False
+
+    def _load_api(self, key, model_path, device):
+        global lock
+        try:
+            if key in loaded_tts.keys():
+                return loaded_tts[key]['engine']
+            unload_tts(device, [self.tts_key, self.tts_vc_key])
+            with lock:
+                tts = NEW_TTS(model_path)       # placeholder: replace with the engine's constructor
+                if tts:
+                    if device == 'cuda':
+                        NEW_TTS.WITH_CUDA       # placeholder: move the engine to CUDA
+                    else:
+                        NEW_TTS.WITHOUT_CUDA    # placeholder: keep the engine on CPU
+                    loaded_tts[key] = {"engine": tts, "config": None}
+                    msg = f'{model_path} Loaded!'
+                    print(msg)
+                    return tts
+                else:
+                    error = 'TTS engine could not be created!'
+                    print(error)
+        except Exception as e:
+            error = f'_load_api() error: {e}'
+            print(error)
+        return False
+
+    def _load_checkpoint(self, **kwargs):
+        global lock
+        try:
+            key = kwargs.get('key')
+            if key in loaded_tts.keys():
+                return loaded_tts[key]['engine']
+            tts_engine = kwargs.get('tts_engine')
+            device = kwargs.get('device')
+            unload_tts(device, [self.tts_key])
+            with lock:
+                checkpoint_dir = kwargs.get('checkpoint_dir')
+                tts = NEW_TTS.LOAD_CHECKPOINT(  # placeholder: replace with the engine's checkpoint loader
+                    config,                     # placeholder: the engine's config object
+                    checkpoint_dir=checkpoint_dir,
+                    eval=True
+                )
+                if tts:
+                    if device == 'cuda':
+                        NEW_TTS.WITH_CUDA       # placeholder: move the engine to CUDA
+                    else:
+                        NEW_TTS.WITHOUT_CUDA    # placeholder: keep the engine on CPU
+                    loaded_tts[key] = {"engine": tts, "config": config}
+                    msg = f'{tts_engine} Loaded!'
+                    print(msg)
+                    return tts
+                else:
+                    error = 'TTS engine could not be created!'
+                    print(error)
+        except Exception as e:
+            error = f'_load_checkpoint() error: {e}'
+            print(error)
+            return False
+
+    def _tensor_type(self, audio_data):
+        if isinstance(audio_data, torch.Tensor):
+            return audio_data
+        elif isinstance(audio_data, np.ndarray):
+            return torch.from_numpy(audio_data).float()
+        elif isinstance(audio_data, list):
+            return torch.tensor(audio_data, dtype=torch.float32)
+        else:
+            raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
+
+    def _get_resampler(self, orig_sr, target_sr):
+        key = (orig_sr, target_sr)
+        if key not in self.resampler_cache:
+            self.resampler_cache[key] = torchaudio.transforms.Resample(
+                orig_freq=orig_sr, new_freq=target_sr
+            )
+        return self.resampler_cache[key]
+
+    def _resample_wav(self, wav_path, expected_sr):
+        waveform, orig_sr = torchaudio.load(wav_path)
+        if orig_sr == expected_sr and waveform.size(0) == 1:
+            return wav_path
+        if waveform.size(0) > 1:
+            waveform = waveform.mean(dim=0, keepdim=True)
+        if orig_sr != expected_sr:
+            resampler = self._get_resampler(orig_sr, expected_sr)
+            waveform = resampler(waveform)
+        wav_tensor = waveform.squeeze(0)
+        wav_numpy = wav_tensor.cpu().numpy()
+        tmp_fh = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
+        tmp_path = tmp_fh.name
+        tmp_fh.close()
+        sf.write(tmp_path, wav_numpy, expected_sr, subtype="PCM_16")
+        return tmp_path
+
+    def convert(self, sentence_number, sentence):
+        global xtts_builtin_speakers_list
+        try:
+            speaker = None
+            audio_data = False
+            trim_audio_buffer = 0.004
+            settings = self.params[self.session['tts_engine']]
+            final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_number}.{default_audio_proc_format}')
+            sentence = sentence.strip()
+            settings['voice_path'] = (
+                self.session['voice'] if self.session['voice'] is not None
+                else os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], 'ref.wav') if self.session['custom_model'] is not None
+                else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
+            )
+            if settings['voice_path'] is not None:
+                speaker = re.sub(r'\.wav$', '', os.path.basename(settings['voice_path']))
+            tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
+            if tts:
+                if sentence[-1].isalnum():
+                    sentence = f'{sentence} —'
+                if sentence == TTS_SML['break']:
+                    break_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(0.3, 0.6) * 100) / 100)))  # 0.3 to 0.6 seconds
+                    self.audio_segments.append(break_tensor.clone())
+                    return True
+                elif sentence == TTS_SML['pause']:
+                    pause_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(1.0, 1.8) * 100) / 100)))  # 1.0 to 1.8 seconds
+                    self.audio_segments.append(pause_tensor.clone())
+                    return True
+                else:
+                    if self.session['tts_engine'] == TTS_ENGINES['NEW_TTS']:
+                        audio_sentence = NEW_TTS.CONVERT()  # placeholder: must return torch.Tensor, list, tuple, or np.ndarray
+                    if is_audio_data_valid(audio_sentence):
+                        sourceTensor = self._tensor_type(audio_sentence)
+                        audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                        if sentence[-1].isalnum() or sentence[-1] == '—':
+                            audio_tensor = trim_audio(audio_tensor.squeeze(), settings['samplerate'], 0.003, trim_audio_buffer).unsqueeze(0)
+                        self.audio_segments.append(audio_tensor)
+                        if not re.search(r'\w$', sentence, flags=re.UNICODE):
+                            break_tensor = torch.zeros(1, int(settings['samplerate'] * (int(np.random.uniform(0.3, 0.6) * 100) / 100)))
+                            self.audio_segments.append(break_tensor.clone())
+                        if self.audio_segments:
+                            audio_tensor = torch.cat(self.audio_segments, dim=-1)
+                            start_time = self.sentences_total_time
+                            duration = audio_tensor.shape[-1] / settings['samplerate']
+                            end_time = start_time + duration
+                            self.sentences_total_time = end_time
+                            sentence_obj = {
+                                "start": start_time,
+                                "end": end_time,
+                                "text": sentence,
+                                "resume_check": self.sentence_idx
+                            }
+                            self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
+                        if self.sentence_idx:
+                            torchaudio.save(final_sentence_file, audio_tensor, settings['samplerate'], format=default_audio_proc_format)
+                            del audio_tensor
+                        self.audio_segments = []
+                        if os.path.exists(final_sentence_file):
+                            return True
+                        else:
+                            error = f"Cannot create {final_sentence_file}"
+                            print(error)
+            else:
+                error = f"convert() error: {self.session['tts_engine']} is None"
+                print(error)
+        except Exception as e:
+            error = f'Coqui.convert(): {e}'
+            raise ValueError(e)
+        return False
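
One detail worth noting in the template: `_get_resampler` memoizes one `torchaudio.transforms.Resample` per `(orig_sr, target_sr)` pair, so repeated sentence conversions reuse the same resampling transform instead of rebuilding it. A standalone sketch of that caching pattern:

    import torch
    import torchaudio

    resampler_cache = {}

    def get_resampler(orig_sr, target_sr):
        # Build the transform once per rate pair, then reuse it.
        key = (orig_sr, target_sr)
        if key not in resampler_cache:
            resampler_cache[key] = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=target_sr)
        return resampler_cache[key]

    wave = torch.randn(1, 44100)                  # one second of mono audio at 44.1 kHz
    resampled = get_resampler(44100, 24000)(wave)
    print(resampled.shape)                        # torch.Size([1, 24000])
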
lib/classes/tts_engines/common/audio_filters.py
ADDED
@@ -0,0 +1,107 @@
+import numpy as np
+import torch
+import subprocess
+import shutil
+
+from scipy.io import wavfile as wav
+from scipy.signal import find_peaks
+
+def detect_gender(voice_path):
+    try:
+        samplerate, signal = wav.read(voice_path)
+        # Convert stereo to mono if needed
+        if len(signal.shape) > 1:
+            signal = np.mean(signal, axis=1)
+        # Compute FFT
+        fft_spectrum = np.abs(np.fft.fft(signal))
+        freqs = np.fft.fftfreq(len(fft_spectrum), d=1/samplerate)
+        # Consider only positive frequencies
+        positive_freqs = freqs[:len(freqs)//2]
+        positive_magnitude = fft_spectrum[:len(fft_spectrum)//2]
+        # Find peaks in frequency spectrum
+        peaks, _ = find_peaks(positive_magnitude, height=np.max(positive_magnitude) * 0.2)
+        if len(peaks) == 0:
+            return None
+        # Find the first strong peak within the human voice range (75Hz - 300Hz)
+        for peak in peaks:
+            if 75 <= positive_freqs[peak] <= 300:
+                pitch = positive_freqs[peak]
+                gender = "female" if pitch > 135 else "male"
+                return gender
+        return None
+    except Exception as e:
+        error = f"detect_gender() error: {voice_path}: {e}"
+        print(error)
+        return None
+
+def trim_audio(audio_data, samplerate, silence_threshold=0.003, buffer_sec=0.005):
+    # Ensure audio_data is a PyTorch tensor
+    if isinstance(audio_data, list):
+        audio_data = torch.tensor(audio_data, dtype=torch.float32)  # Ensure dtype and always float32 for audio
+    if isinstance(audio_data, torch.Tensor):
+        if audio_data.ndim != 1:
+            error = "audio_data must be a 1D tensor (mono audio)."
+            raise ValueError(error)
+        if audio_data.is_cuda:
+            audio_data = audio_data.cpu()
+        # Detect non-silent indices
+        non_silent_indices = torch.where(audio_data.abs() > silence_threshold)[0]
+        if len(non_silent_indices) == 0:
+            return torch.tensor([], dtype=audio_data.dtype)  # Preserves dtype
+        # Calculate start and end trimming indices with buffer
+        start_index = max(non_silent_indices[0].item() - int(buffer_sec * samplerate), 0)
+        end_index = min(non_silent_indices[-1].item() + int(buffer_sec * samplerate), audio_data.size(0))  # Clamp end to signal length
+        trimmed_audio = audio_data[start_index:end_index]
+        return trimmed_audio
+    error = "audio_data must be a PyTorch tensor or a list of numerical values."
+    raise TypeError(error)
+
+def normalize_audio(input_file, output_file, samplerate):
+    filter_complex = (
+        'agate=threshold=-25dB:ratio=1.4:attack=10:release=250,'
+        'afftdn=nf=-70,'
+        'acompressor=threshold=-20dB:ratio=2:attack=80:release=200:makeup=1dB,'
+        'loudnorm=I=-14:TP=-3:LRA=7:linear=true,'
+        'equalizer=f=150:t=q:w=2:g=1,'
+        'equalizer=f=250:t=q:w=2:g=-3,'
+        'equalizer=f=3000:t=q:w=2:g=2,'
+        'equalizer=f=5500:t=q:w=2:g=-4,'
+        'equalizer=f=9000:t=q:w=2:g=-2,'
+        'highpass=f=63[audio]'
+    )
+    ffmpeg_cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', input_file]
+    ffmpeg_cmd += [
+        '-filter_complex', filter_complex,
+        '-map', '[audio]',
+        '-ar', str(samplerate),
+        '-y', output_file
+    ]
+    try:
+        subprocess.run(
+            ffmpeg_cmd,
+            env={},
+            stdout=subprocess.PIPE,
+            stderr=subprocess.PIPE,
+            encoding='utf-8',
+            errors='ignore',
+            check=True  # raise CalledProcessError on a non-zero exit so the handler below can fire
+        )
+        return True
+    except subprocess.CalledProcessError as e:
+        error = f"normalize_audio() error: {input_file}: {e}"
+        print(error)
+        return False
+
+def is_audio_data_valid(audio_data):
+    if audio_data is None:
+        return False
+    if isinstance(audio_data, torch.Tensor):
+        return audio_data.numel() > 0
+    if isinstance(audio_data, (list, tuple)):
+        return len(audio_data) > 0
+    try:
+        if isinstance(audio_data, np.ndarray):
+            return audio_data.size > 0
+    except ImportError:
+        pass
+    return False
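
A quick sketch exercising `trim_audio` on synthetic data; the silence padding around a tone is clipped back to within the 5 ms buffer (import path assumed from the repo layout above):

    import torch
    from lib.classes.tts_engines.common.audio_filters import trim_audio

    samplerate = 24000
    tone = torch.sin(torch.linspace(0, 2 * torch.pi * 440, samplerate))  # 1 s of 440 Hz
    padded = torch.cat([torch.zeros(samplerate), tone, torch.zeros(samplerate)])
    trimmed = trim_audio(padded, samplerate, silence_threshold=0.003, buffer_sec=0.005)
    print(padded.shape, trimmed.shape)  # roughly 3 s in, about 1 s plus two 5 ms buffers out
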
lib/classes/tts_engines/common/utils.py
ADDED
@@ -0,0 +1,57 @@
+import os
+import torch
+import regex as re
+import stanza
+
+from lib.models import loaded_tts, max_tts_in_memory, TTS_ENGINES
+
+def unload_tts(device, reserved_keys=None, tts_key=None):
+    try:
+        if len(loaded_tts) >= max_tts_in_memory:
+            if reserved_keys is None:
+                reserved_keys = []
+            if tts_key is not None:
+                if tts_key in loaded_tts.keys():
+                    del loaded_tts[tts_key]
+                    if device == 'cuda':
+                        torch.cuda.empty_cache()
+                        torch.cuda.ipc_collect()
+            else:
+                for key in list(loaded_tts.keys()):
+                    if key not in reserved_keys:
+                        del loaded_tts[key]
+    except Exception as e:
+        error = f'unload_tts() error: {e}'
+        print(error)
+        return False
+
+def append_sentence2vtt(sentence_obj, path):
+
+    def format_timestamp(seconds):
+        m, s = divmod(seconds, 60)
+        h, m = divmod(m, 60)
+        return f"{int(h):02}:{int(m):02}:{s:06.3f}"
+
+    try:
+        index = 1
+        if os.path.exists(path):
+            with open(path, "r", encoding="utf-8") as f:
+                lines = f.readlines()
+            for line in lines:
+                if "-->" in line:
+                    index += 1
+        if index > 1 and "resume_check" in sentence_obj and sentence_obj["resume_check"] < index:
+            return index  # Already written
+        if not os.path.exists(path):
+            with open(path, "w", encoding="utf-8") as f:
+                f.write("WEBVTT\n\n")
+        with open(path, "a", encoding="utf-8") as f:
+            start = format_timestamp(sentence_obj["start"])
+            end = format_timestamp(sentence_obj["end"])
+            text = re.sub(r'[\r\n]+', ' ', sentence_obj["text"]).strip()
+            f.write(f"{start} --> {end}\n{text}\n\n")
+        return index + 1
+    except Exception as e:
+        error = f'append_sentence2vtt() error: {e}'
+        print(error)
+        return False
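
A minimal sketch of the VTT helper in use; each call appends one cue and returns the next resume index (import path assumed from the repo layout above):

    from lib.classes.tts_engines.common.utils import append_sentence2vtt

    cue = {"start": 0.0, "end": 2.5, "text": "Chapter One.", "resume_check": 1}
    next_idx = append_sentence2vtt(cue, "sample.vtt")  # returns 2
    # sample.vtt now reads:
    # WEBVTT
    #
    # 00:00:00.000 --> 00:00:02.500
    # Chapter One.
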
lib/classes/tts_engines/coqui.py
ADDED
@@ -0,0 +1,810 @@
+import hashlib, math, os, shutil, subprocess, tempfile, threading, uuid
+import numpy as np, regex as re, soundfile as sf, torch, torchaudio
+
+from huggingface_hub import hf_hub_download
+from pathlib import Path
+from pprint import pprint
+
+from lib import *
+from lib.classes.tts_engines.common.utils import unload_tts, append_sentence2vtt
+from lib.classes.tts_engines.common.audio_filters import detect_gender, trim_audio, normalize_audio, is_audio_data_valid
+
+#import logging
+#logging.basicConfig(level=logging.DEBUG)
+
+lock = threading.Lock()
+xtts_builtin_speakers_list = None
+
+class Coqui:
+
+    def __init__(self, session):
+        try:
+            self.session = session
+            self.cache_dir = tts_dir
+            self.speakers_path = None
+            self.tts_key = f"{self.session['tts_engine']}-{self.session['fine_tuned']}"
+            self.tts_vc_key = default_vc_model.rsplit('/', 1)[-1]
+            self.is_bf16 = True if self.session['device'] == 'cuda' and torch.cuda.is_bf16_supported() == True else False
+            self.npz_path = None
+            self.npz_data = None
+            self.sentences_total_time = 0.0
+            self.sentence_idx = 1
+            self.params = {TTS_ENGINES['XTTSv2']: {"latent_embedding": {}}, TTS_ENGINES['BARK']: {}, TTS_ENGINES['VITS']: {"semitones": {}}, TTS_ENGINES['FAIRSEQ']: {"semitones": {}}, TTS_ENGINES['TACOTRON2']: {"semitones": {}}, TTS_ENGINES['YOURTTS']: {}}
+            self.params[self.session['tts_engine']]['samplerate'] = models[self.session['tts_engine']][self.session['fine_tuned']]['samplerate']
+            self.vtt_path = os.path.join(self.session['process_dir'], Path(self.session['final_name']).stem + '.vtt')
+            self.resampler_cache = {}
+            self.audio_segments = []
+            self._build()
+        except Exception as e:
+            error = f'__init__() error: {e}'
+            print(error)
+            return None
+
+    def _build(self):
+        try:
+            global xtts_builtin_speakers_list
+            load_zeroshot = True if self.session['tts_engine'] in [TTS_ENGINES['VITS'], TTS_ENGINES['FAIRSEQ'], TTS_ENGINES['TACOTRON2']] else False
+            tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
+            if not tts:
+                if xtts_builtin_speakers_list is None:
+                    self.speakers_path = hf_hub_download(repo_id=models[TTS_ENGINES['XTTSv2']]['internal']['repo'], filename=default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][4], cache_dir=self.cache_dir)
+                    xtts_builtin_speakers_list = torch.load(self.speakers_path)
+                if self.session['tts_engine'] == TTS_ENGINES['XTTSv2']:
+                    msg = f"Loading TTS {self.session['tts_engine']} model, it takes a while, please be patient..."
+                    print(msg)
+                    if self.session['custom_model'] is not None:
+                        config_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][0])
+                        checkpoint_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][1])
+                        vocab_path = os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][2])
+                        self.tts_key = f"{self.session['tts_engine']}-{self.session['custom_model']}"
+                        tts = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=self.session['device'])
+                    else:
+                        hf_repo = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
+                        if self.session['fine_tuned'] == 'internal':
+                            hf_sub = ''
+                            if self.speakers_path is None:
+                                self.speakers_path = hf_hub_download(repo_id=hf_repo, filename=default_engine_settings[TTS_ENGINES['XTTSv2']]['files'][4], cache_dir=self.cache_dir)
+                        else:
+                            hf_sub = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
+                        config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][0]}", cache_dir=self.cache_dir)
+                        checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][1]}", cache_dir=self.cache_dir)
+                        vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][2]}", cache_dir=self.cache_dir)
+                        tts = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=self.session['device'])
+                elif self.session['tts_engine'] == TTS_ENGINES['BARK']:
+                    if self.session['custom_model'] is not None:
+                        msg = f"{self.session['tts_engine']} custom model not implemented yet!"
+                        print(msg)
+                        return False
+                    else:
+                        hf_repo = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
+                        hf_sub = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
+                        text_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][0]}", cache_dir=self.cache_dir)
+                        coarse_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][1]}", cache_dir=self.cache_dir)
+                        fine_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[self.session['tts_engine']][self.session['fine_tuned']]['files'][2]}", cache_dir=self.cache_dir)
+                        checkpoint_dir = os.path.dirname(text_model_path)
+                        tts = self._load_checkpoint(tts_engine=self.session['tts_engine'], key=self.tts_key, checkpoint_dir=checkpoint_dir, device=self.session['device'])
+                elif self.session['tts_engine'] == TTS_ENGINES['VITS']:
+                    if self.session['custom_model'] is not None:
+                        msg = f"{self.session['tts_engine']} custom model not implemented yet!"
+                        print(msg)
+                        return False
+                    else:
+                        iso_dir = language_tts[self.session['tts_engine']][self.session['language']]
+                        sub_dict = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
+                        sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
+                        if sub is not None:
+                            self.params[self.session['tts_engine']]['samplerate'] = models[TTS_ENGINES['VITS']][self.session['fine_tuned']]['samplerate'][sub]
+                            model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
+                            msg = f"Loading TTS {model_path} model, it takes a while, please be patient..."
+                            print(msg)
+                            self.tts_key = model_path
+                            tts = self._load_api(self.tts_key, model_path, self.session['device'])
+                        else:
+                            msg = f"{self.session['tts_engine']} checkpoint for {self.session['language']} not found!"
+                            print(msg)
+                            return False
+                elif self.session['tts_engine'] == TTS_ENGINES['FAIRSEQ']:
+                    if self.session['custom_model'] is not None:
+                        msg = f"{self.session['tts_engine']} custom model not implemented yet!"
+                        print(msg)
+                        return False
+                    else:
+                        model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang]", self.session['language'])
+                        self.tts_key = model_path
+                        tts = self._load_api(self.tts_key, model_path, self.session['device'])
+                elif self.session['tts_engine'] == TTS_ENGINES['TACOTRON2']:
+                    if self.session['custom_model'] is not None:
+                        msg = f"{self.session['tts_engine']} custom model not implemented yet!"
+                        print(msg)
+                        return False
+                    else:
+                        iso_dir = language_tts[self.session['tts_engine']][self.session['language']]
+                        sub_dict = models[self.session['tts_engine']][self.session['fine_tuned']]['sub']
+                        sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
+                        if sub is None:
+                            iso_dir = self.session['language']
+                            sub = next((key for key, lang_list in sub_dict.items() if iso_dir in lang_list), None)
+                        if sub is not None:
+                            # look up the samplerate only once a sub-model is resolved (avoids a None key lookup)
+                            self.params[self.session['tts_engine']]['samplerate'] = models[TTS_ENGINES['TACOTRON2']][self.session['fine_tuned']]['samplerate'][sub]
+                            model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo'].replace("[lang_iso1]", iso_dir).replace("[xxx]", sub)
+                            msg = f"Loading TTS {model_path} model, it takes a while, please be patient..."
+                            print(msg)
+                            self.tts_key = model_path
+                            tts = self._load_api(self.tts_key, model_path, self.session['device'])
+                        else:
+                            msg = f"{self.session['tts_engine']} checkpoint for {self.session['language']} not found!"
+                            print(msg)
+                            return False
+                elif self.session['tts_engine'] == TTS_ENGINES['YOURTTS']:
+                    if self.session['custom_model'] is not None:
+                        msg = f"{self.session['tts_engine']} custom model not implemented yet!"
+                        print(msg)
+                        return False
+                    else:
+                        model_path = models[self.session['tts_engine']][self.session['fine_tuned']]['repo']
+                        tts = self._load_api(self.tts_key, model_path, self.session['device'])
+            if load_zeroshot:
+                tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
+                if not tts_vc:
+                    if self.session['voice'] is not None:
+                        msg = f"Loading TTS {self.tts_vc_key} zeroshot model, it takes a while, please be patient..."
+                        print(msg)
+                        tts_vc = self._load_api(self.tts_vc_key, default_vc_model, self.session['device'])
+            return (loaded_tts.get(self.tts_key) or {}).get('engine', False)
+        except Exception as e:
+            error = f'build() error: {e}'
+            print(error)
+            return False
+
+    def _load_api(self, key, model_path, device):
+        global lock
+        try:
+            if key in loaded_tts.keys():
+                return loaded_tts[key]['engine']
+            unload_tts(device, [self.tts_key, self.tts_vc_key])
+            from TTS.api import TTS as coquiAPI
+            with lock:
+                tts = coquiAPI(model_path)
+                if tts:
+                    if device == 'cuda':
+                        tts.cuda()
+                    else:
+                        tts.to(device)
+                    loaded_tts[key] = {"engine": tts, "config": None}
+                    msg = f'{model_path} Loaded!'
+                    print(msg)
+                    return tts
+                else:
+                    error = 'TTS engine could not be created!'
+                    print(error)
+        except Exception as e:
+            error = f'_load_api() error: {e}'
+            print(error)
+        return False
+
+    def _load_checkpoint(self, **kwargs):
+        global lock
+        try:
+            key = kwargs.get('key')
+            if key in loaded_tts.keys():
+                return loaded_tts[key]['engine']
+            tts_engine = kwargs.get('tts_engine')
+            device = kwargs.get('device')
+            unload_tts(device, [self.tts_key, self.tts_vc_key])
+            with lock:
+                if tts_engine == TTS_ENGINES['XTTSv2']:
+                    from TTS.tts.configs.xtts_config import XttsConfig
+                    from TTS.tts.models.xtts import Xtts
+                    checkpoint_path = kwargs.get('checkpoint_path')
+                    config_path = kwargs.get('config_path', None)
+                    vocab_path = kwargs.get('vocab_path', None)
+                    config = XttsConfig()
+                    config.models_dir = os.path.join("models", "tts")
+                    config.load_json(config_path)
+                    tts = Xtts.init_from_config(config)
+                    tts.load_checkpoint(
+                        config,
+                        checkpoint_path=checkpoint_path,
+                        vocab_path=vocab_path,
+                        use_deepspeed=default_engine_settings[TTS_ENGINES['XTTSv2']]['use_deepspeed'],
+                        eval=True
+                    )
+                elif tts_engine == TTS_ENGINES['BARK']:
+                    from TTS.tts.configs.bark_config import BarkConfig
+                    from TTS.tts.models.bark import Bark
+                    checkpoint_dir = kwargs.get('checkpoint_dir')
+                    config = BarkConfig()
+                    config.CACHE_DIR = self.cache_dir
+                    config.USE_SMALLER_MODELS = os.environ.get('SUNO_USE_SMALL_MODELS', '').lower() == 'true'
+                    tts = Bark.init_from_config(config)
+                    tts.load_checkpoint(
+                        config,
+                        checkpoint_dir=checkpoint_dir,
+                        eval=True
+                    )
+                if tts:
+                    if device == 'cuda':
+                        tts.cuda()
+                    else:
+                        tts.to(device)
+                    loaded_tts[key] = {"engine": tts, "config": config}
+                    msg = f'{tts_engine} Loaded!'
+                    print(msg)
+                    return tts
+                else:
+                    error = 'TTS engine could not be created!'
+                    print(error)
+        except Exception as e:
+            error = f'_load_checkpoint() error: {e}'
+            print(error)
+            return False
+
+    def _check_xtts_builtin_speakers(self, voice_path, speaker, device):
+        try:
+            voice_parts = Path(voice_path).parts
+            if self.session['language'] not in voice_parts and speaker not in default_engine_settings[TTS_ENGINES['BARK']]['voices'].keys() and self.session['language'] != 'eng':
+                if self.session['language'] in language_tts[TTS_ENGINES['XTTSv2']].keys():
+                    default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
+                    if os.path.exists(default_text_file):
+                        msg = f"Converting builtin eng voice to {self.session['language']}..."
+                        print(msg)
+                        tts_internal_key = f"{TTS_ENGINES['XTTSv2']}-internal"
+                        default_text = Path(default_text_file).read_text(encoding="utf-8")
+                        hf_repo = models[TTS_ENGINES['XTTSv2']]['internal']['repo']
+                        hf_sub = ''
+                        tts = (loaded_tts.get(tts_internal_key) or {}).get('engine', False)
+                        if not tts:
+                            for key in list(loaded_tts.keys()):
+                                unload_tts(device, None, key)
+                            config_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][0]}", cache_dir=self.cache_dir)
+                            checkpoint_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][1]}", cache_dir=self.cache_dir)
+                            vocab_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['XTTSv2']]['internal']['files'][2]}", cache_dir=self.cache_dir)
+                            tts = self._load_checkpoint(tts_engine=TTS_ENGINES['XTTSv2'], key=tts_internal_key, checkpoint_path=checkpoint_path, config_path=config_path, vocab_path=vocab_path, device=device)
+                        if tts:
+                            if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
+                                gpt_cond_latent, speaker_embedding = xtts_builtin_speakers_list[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
+                            else:
+                                gpt_cond_latent, speaker_embedding = tts.get_conditioning_latents(audio_path=[voice_path])
+                            fine_tuned_params = {
+                                key: cast_type(self.session[key])
+                                for key, cast_type in {
+                                    "temperature": float,
+                                    "length_penalty": float,
+                                    "num_beams": int,
+                                    "repetition_penalty": float,
+                                    "top_k": int,
+                                    "top_p": float,
+                                    "speed": float,
+                                    "enable_text_splitting": bool
+                                }.items()
+                                if self.session.get(key) is not None
+                            }
+                            with torch.no_grad():
+                                result = tts.inference(
+                                    text=default_text,
+                                    language=self.session['language_iso1'],
+                                    gpt_cond_latent=gpt_cond_latent,
+                                    speaker_embedding=speaker_embedding,
+                                    **fine_tuned_params
+                                )
+                            audio_data = result.get('wav')
+                            if audio_data is not None:
+                                audio_data = audio_data.tolist()
+                                sourceTensor = self._tensor_type(audio_data)
+                                audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
+                                lang_dir = 'con-' if self.session['language'] == 'con' else self.session['language']
+                                new_voice_path = re.sub(r'([\\/])eng([\\/])', rf'\1{lang_dir}\2', voice_path)
+                                proc_voice_path = new_voice_path.replace('.wav', '_temp.wav')
+                                torchaudio.save(proc_voice_path, audio_tensor, default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate'], format='wav')
+                                if normalize_audio(proc_voice_path, new_voice_path, default_audio_proc_samplerate):
+                                    del audio_data, sourceTensor, audio_tensor
+                                    if self.session['tts_engine'] != TTS_ENGINES['XTTSv2']:
+                                        del tts
+                                        unload_tts(device, None, tts_internal_key)
+                                    return new_voice_path
+                                else:
+                                    error = f'normalize_audio() error: {new_voice_path}'
+                            else:
+                                error = f'No audio waveform found in _check_xtts_builtin_speakers() result: {result}'
+                        else:
+                            error = f"_check_xtts_builtin_speakers() error: {TTS_ENGINES['XTTSv2']} is False"
+                    else:
+                        error = f'The translated {default_text_file} could not be found! Voice cloning file will stay in English.'
+                    print(error)
+                else:
+                    return voice_path
+            else:
+                return voice_path
+        except Exception as e:
+            error = f'_check_xtts_builtin_speakers() error: {e}'
+            print(error)
+        return False
+
+    def _check_bark_npz(self, voice_path, bark_dir, speaker, device):
+        try:
+            if self.session['language'] in language_tts[TTS_ENGINES['BARK']].keys():
+                npz_dir = os.path.join(bark_dir, speaker)
|
| 325 |
+
npz_file = os.path.join(npz_dir, f'{speaker}.npz')
|
| 326 |
+
if os.path.exists(npz_file):
|
| 327 |
+
return True
|
| 328 |
+
else:
|
| 329 |
+
os.makedirs(npz_dir, exist_ok=True)
|
| 330 |
+
tts_internal_key = f"{TTS_ENGINES['BARK']}-internal"
|
| 331 |
+
hf_repo = models[TTS_ENGINES['BARK']]['internal']['repo']
|
| 332 |
+
hf_sub = models[TTS_ENGINES['BARK']]['internal']['sub']
|
| 333 |
+
tts = (loaded_tts.get(tts_internal_key) or {}).get('engine', False)
|
| 334 |
+
if not tts:
|
| 335 |
+
for key in list(loaded_tts.keys()): unload_tts(device, None, key)
|
| 336 |
+
text_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['BARK']]['internal']['files'][0]}", cache_dir=self.cache_dir)
|
| 337 |
+
coarse_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['BARK']]['internal']['files'][1]}", cache_dir=self.cache_dir)
|
| 338 |
+
fine_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['BARK']]['internal']['files'][2]}", cache_dir=self.cache_dir)
|
| 339 |
+
checkpoint_dir = os.path.dirname(text_model_path)
|
| 340 |
+
tts = self._load_checkpoint(tts_engine=TTS_ENGINES['BARK'], key=tts_internal_key, checkpoint_dir=checkpoint_dir, device=device)
|
| 341 |
+
if tts:
|
| 342 |
+
voice_temp = os.path.splitext(npz_file)[0]+'.wav'
|
| 343 |
+
shutil.copy(voice_path, voice_temp)
|
| 344 |
+
default_text_file = os.path.join(voices_dir, self.session['language'], 'default.txt')
|
| 345 |
+
default_text = Path(default_text_file).read_text(encoding="utf-8")
|
| 346 |
+
fine_tuned_params = {
|
| 347 |
+
key: cast_type(self.session[key])
|
| 348 |
+
for key, cast_type in {
|
| 349 |
+
"text_temp": float,
|
| 350 |
+
"waveform_temp": float
|
| 351 |
+
}.items()
|
| 352 |
+
if self.session.get(key) is not None
|
| 353 |
+
}
|
| 354 |
+
with torch.no_grad():
|
| 355 |
+
torch.manual_seed(67878789)
|
| 356 |
+
audio_data = tts.synthesize(
|
| 357 |
+
default_text,
|
| 358 |
+
loaded_tts[tts_internal_key]['config'],
|
| 359 |
+
speaker_id=speaker,
|
| 360 |
+
voice_dirs=bark_dir,
|
| 361 |
+
silent=True,
|
| 362 |
+
**fine_tuned_params
|
| 363 |
+
)
|
| 364 |
+
os.remove(voice_temp)
|
| 365 |
+
del audio_data
|
| 366 |
+
if self.session['tts_engine'] != TTS_ENGINES['BARK']:
|
| 367 |
+
del tts
|
| 368 |
+
unload_tts(device, None, tts_internal_key)
|
| 369 |
+
msg = f"Saved NPZ file: {npz_file}"
|
| 370 |
+
print(msg)
|
| 371 |
+
return True
|
| 372 |
+
else:
|
| 373 |
+
error = f'_check_bark_npz() error: {tts_internal_key} is False'
|
| 374 |
+
print(error)
|
| 375 |
+
else:
|
| 376 |
+
return True
|
| 377 |
+
except Exception as e:
|
| 378 |
+
error = f'_check_bark_npz() error: {e}'
|
| 379 |
+
print(error)
|
| 380 |
+
return False
|
| 381 |
+
|
| 382 |
+
def _tensor_type(self, audio_data):
|
| 383 |
+
if isinstance(audio_data, torch.Tensor):
|
| 384 |
+
return audio_data
|
| 385 |
+
elif isinstance(audio_data, np.ndarray):
|
| 386 |
+
return torch.from_numpy(audio_data).float()
|
| 387 |
+
elif isinstance(audio_data, list):
|
| 388 |
+
return torch.tensor(audio_data, dtype=torch.float32)
|
| 389 |
+
else:
|
| 390 |
+
raise TypeError(f"Unsupported type for audio_data: {type(audio_data)}")
|
| 391 |
+
|
| 392 |
+
def _get_resampler(self, orig_sr, target_sr):
|
| 393 |
+
key = (orig_sr, target_sr)
|
| 394 |
+
if key not in self.resampler_cache:
|
| 395 |
+
self.resampler_cache[key] = torchaudio.transforms.Resample(
|
| 396 |
+
orig_freq=orig_sr, new_freq=target_sr
|
| 397 |
+
)
|
| 398 |
+
return self.resampler_cache[key]
|
| 399 |
+
|
| 400 |
+
def _resample_wav(self, wav_path, expected_sr):
|
| 401 |
+
waveform, orig_sr = torchaudio.load(wav_path)
|
| 402 |
+
if orig_sr == expected_sr and waveform.size(0) == 1:
|
| 403 |
+
return wav_path
|
| 404 |
+
if waveform.size(0) > 1:
|
| 405 |
+
waveform = waveform.mean(dim=0, keepdim=True)
|
| 406 |
+
if orig_sr != expected_sr:
|
| 407 |
+
resampler = self._get_resampler(orig_sr, expected_sr)
|
| 408 |
+
waveform = resampler(waveform)
|
| 409 |
+
wav_tensor = waveform.squeeze(0)
|
| 410 |
+
wav_numpy = wav_tensor.cpu().numpy()
|
| 411 |
+
tmp_fh = tempfile.NamedTemporaryFile(suffix=".wav", delete=False)
|
| 412 |
+
tmp_path = tmp_fh.name
|
| 413 |
+
tmp_fh.close()
|
| 414 |
+
sf.write(tmp_path, wav_numpy, expected_sr, subtype="PCM_16")
|
| 415 |
+
return tmp_path
|
| 416 |
+
|
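    # convert() below is the single per-sentence entry point used by
    # TTSManager: SML sentinels ('‡break‡', '‡pause‡') become silence
    # tensors, real text is dispatched to the engine selected in
    # self.session['tts_engine'], and the result is trimmed, appended
    # to the VTT subtitle file and saved as one numbered sentence file.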
    def convert(self, s_n, s):
        global xtts_builtin_speakers_list
        try:
            sentence_number = s_n
            sentence = s
            speaker = None
            audio_data = False
            trim_audio_buffer = 0.004
            settings = self.params[self.session['tts_engine']]
            final_sentence_file = os.path.join(self.session['chapters_dir_sentences'], f'{sentence_number}.{default_audio_proc_format}')
            settings['voice_path'] = (
                self.session['voice'] if self.session['voice'] is not None
                else os.path.join(self.session['custom_model_dir'], self.session['tts_engine'], self.session['custom_model'], 'ref.wav') if self.session['custom_model'] is not None
                else models[self.session['tts_engine']][self.session['fine_tuned']]['voice']
            )
            if settings['voice_path'] is not None:
                speaker = re.sub(r'\.wav$', '', os.path.basename(settings['voice_path']))
                if settings['voice_path'] not in default_engine_settings[TTS_ENGINES['BARK']]['voices'].keys() and os.path.basename(settings['voice_path']) != 'ref.wav':
                    self.session['voice'] = settings['voice_path'] = self._check_xtts_builtin_speakers(settings['voice_path'], speaker, self.session['device'])
                    if not settings['voice_path']:
                        msg = f"Could not create the builtin speaker selected voice in {self.session['language']}"
                        print(msg)
                        return False
            tts = (loaded_tts.get(self.tts_key) or {}).get('engine', False)
            if tts:
                if sentence == TTS_SML['break']:
                    silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
                    break_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time))  # 0.3 to 0.6 seconds
                    self.audio_segments.append(break_tensor.clone())
                    return True
                elif sentence == TTS_SML['pause']:
                    silence_time = int(np.random.uniform(1.0, 1.8) * 100) / 100
                    pause_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time))  # 1.0 to 1.8 seconds
                    self.audio_segments.append(pause_tensor.clone())
                    return True
                else:
                    if sentence[-1].isalnum():
                        sentence = f'{sentence} —'
                    if self.session['tts_engine'] == TTS_ENGINES['XTTSv2']:
                        trim_audio_buffer = 0.008
                        if settings['voice_path'] is not None and settings['voice_path'] in settings['latent_embedding'].keys():
                            settings['gpt_cond_latent'], settings['speaker_embedding'] = settings['latent_embedding'][settings['voice_path']]
                        else:
                            msg = 'Computing speaker latents...'
                            print(msg)
                            if speaker in default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'].keys():
                                settings['gpt_cond_latent'], settings['speaker_embedding'] = xtts_builtin_speakers_list[default_engine_settings[TTS_ENGINES['XTTSv2']]['voices'][speaker]].values()
                            else:
                                settings['gpt_cond_latent'], settings['speaker_embedding'] = tts.get_conditioning_latents(audio_path=[settings['voice_path']])
                            settings['latent_embedding'][settings['voice_path']] = settings['gpt_cond_latent'], settings['speaker_embedding']
                        fine_tuned_params = {
                            key: cast_type(self.session[key])
                            for key, cast_type in {
                                "temperature": float,
                                "length_penalty": float,
                                "num_beams": int,
                                "repetition_penalty": float,
                                "top_k": int,
                                "top_p": float,
                                "speed": float,
                                "enable_text_splitting": bool
                            }.items()
                            if self.session.get(key) is not None
                        }
                        with torch.no_grad():
                            result = tts.inference(
                                text=sentence.replace('.', ' —'),
                                language=self.session['language_iso1'],
                                gpt_cond_latent=settings['gpt_cond_latent'],
                                speaker_embedding=settings['speaker_embedding'],
                                **fine_tuned_params
                            )
                        audio_sentence = result.get('wav')
                        if is_audio_data_valid(audio_sentence):
                            audio_sentence = audio_sentence.tolist()
                    elif self.session['tts_engine'] == TTS_ENGINES['BARK']:
                        trim_audio_buffer = 0.002
                        '''
                        [laughter]
                        [laughs]
                        [sighs]
                        [music]
                        [gasps]
                        [clears throat]
                        — or ... for hesitations
                        ♪ for song lyrics
                        CAPITALIZATION for emphasis of a word
                        [MAN] and [WOMAN] to bias Bark toward male and female speakers, respectively
                        '''
                        if speaker in default_engine_settings[self.session['tts_engine']]['voices'].keys():
                            bark_dir = default_engine_settings[self.session['tts_engine']]['speakers_path']
                        else:
                            bark_dir = os.path.join(os.path.dirname(settings['voice_path']), 'bark')
                        if not self._check_bark_npz(settings['voice_path'], bark_dir, speaker, self.session['device']):
                            error = 'Could not create npz file!'
                            print(error)
                            return False
                        npz_file = os.path.join(bark_dir, speaker, f'{speaker}.npz')
                        fine_tuned_params = {
                            key: cast_type(self.session[key])
                            for key, cast_type in {
                                "text_temp": float,
                                "waveform_temp": float
                            }.items()
                            if self.session.get(key) is not None
                        }
                        if self.npz_path is None or self.npz_path != npz_file:
                            self.npz_path = npz_file
                            self.npz_data = np.load(self.npz_path, allow_pickle=True)
                        history_prompt = [
                            self.npz_data["semantic_prompt"],
                            self.npz_data["coarse_prompt"],
                            self.npz_data["fine_prompt"]
                        ]
                        with torch.no_grad():
                            torch.manual_seed(67878789)
                            audio_sentence, _ = tts.generate_audio(
                                sentence,
                                history_prompt=history_prompt,
                                silent=True,
                                **fine_tuned_params
                            )
                        if is_audio_data_valid(audio_sentence):
                            audio_sentence = audio_sentence.tolist()
                    elif self.session['tts_engine'] == TTS_ENGINES['VITS']:
                        speaker_argument = {}
                        if self.session['language'] == 'eng' and 'vctk/vits' in models[self.session['tts_engine']]['internal']['sub']:
                            if self.session['language'] in models[self.session['tts_engine']]['internal']['sub']['vctk/vits'] or self.session['language_iso1'] in models[self.session['tts_engine']]['internal']['sub']['vctk/vits']:
                                speaker_argument = {"speaker": 'p262'}
                        elif self.session['language'] == 'cat' and 'custom/vits' in models[self.session['tts_engine']]['internal']['sub']:
                            if self.session['language'] in models[self.session['tts_engine']]['internal']['sub']['custom/vits'] or self.session['language_iso1'] in models[self.session['tts_engine']]['internal']['sub']['custom/vits']:
                                speaker_argument = {"speaker": '09901'}
                        if settings['voice_path'] is not None:
                            proc_dir = os.path.join(self.session['voice_dir'], 'proc')
                            os.makedirs(proc_dir, exist_ok=True)
                            tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                            tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                            tts.tts_to_file(
                                text=sentence,
                                file_path=tmp_in_wav,
                                **speaker_argument
                            )
                            if settings['voice_path'] in settings['semitones'].keys():
                                semitones = settings['semitones'][settings['voice_path']]
                            else:
                                voice_path_gender = detect_gender(settings['voice_path'])
                                voice_builtin_gender = detect_gender(tmp_in_wav)
                                msg = f"Cloned voice seems to be {voice_path_gender}\nBuiltin voice seems to be {voice_builtin_gender}"
                                print(msg)
                                if voice_builtin_gender != voice_path_gender:
                                    semitones = -4 if voice_path_gender == 'male' else 4
                                    msg = 'Adapting builtin voice frequencies to the cloned voice...'
                                    print(msg)
                                else:
                                    semitones = 0
                                settings['semitones'][settings['voice_path']] = semitones
                            if semitones != 0:  # apply the pitch shift in either direction
                                try:
                                    cmd = [
                                        shutil.which('sox'), tmp_in_wav,
                                        "-r", str(settings['samplerate']), tmp_out_wav,
                                        "pitch", str(semitones * 100)  # sox pitch is given in cents
                                    ]
                                    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                                except subprocess.CalledProcessError as e:
                                    error = f"Subprocess error: {e.stderr}"
                                    print(error)
                                    DependencyError(e)
                                    return False
                                except FileNotFoundError as e:
                                    error = f"File not found: {e}"
                                    print(error)
                                    DependencyError(e)
                                    return False
                            else:
                                tmp_out_wav = tmp_in_wav
                            tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
                            if tts_vc:
                                settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_vc_key]['samplerate']
                                source_wav = self._resample_wav(tmp_out_wav, settings['samplerate'])
                                target_wav = self._resample_wav(settings['voice_path'], settings['samplerate'])
                                audio_sentence = tts_vc.voice_conversion(
                                    source_wav=source_wav,
                                    target_wav=target_wav
                                )
                            else:
                                error = f'Engine {self.tts_vc_key} is None'
                                print(error)
                                return False
                            if os.path.exists(tmp_in_wav):
                                os.remove(tmp_in_wav)
                            if os.path.exists(tmp_out_wav):
                                os.remove(tmp_out_wav)
                            if os.path.exists(source_wav):
                                os.remove(source_wav)
                        else:
                            audio_sentence = tts.tts(
                                text=sentence,
                                **speaker_argument
                            )
                    elif self.session['tts_engine'] == TTS_ENGINES['FAIRSEQ']:
                        speaker_argument = {}
                        not_supported_punc_pattern = re.compile(r"[.:—]")
                        if settings['voice_path'] is not None:
                            proc_dir = os.path.join(self.session['voice_dir'], 'proc')
                            os.makedirs(proc_dir, exist_ok=True)
                            tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                            tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                            tts.tts_to_file(
                                text=re.sub(not_supported_punc_pattern, ' ', sentence),
                                file_path=tmp_in_wav,
                                **speaker_argument
                            )
                            if settings['voice_path'] in settings['semitones'].keys():
                                semitones = settings['semitones'][settings['voice_path']]
                            else:
                                voice_path_gender = detect_gender(settings['voice_path'])
                                voice_builtin_gender = detect_gender(tmp_in_wav)
                                msg = f"Cloned voice seems to be {voice_path_gender}\nBuiltin voice seems to be {voice_builtin_gender}"
                                print(msg)
                                if voice_builtin_gender != voice_path_gender:
                                    semitones = -4 if voice_path_gender == 'male' else 4
                                    msg = 'Adapting builtin voice frequencies to the cloned voice...'
                                    print(msg)
                                else:
                                    semitones = 0
                                settings['semitones'][settings['voice_path']] = semitones
                            if semitones != 0:
                                try:
                                    cmd = [
                                        shutil.which('sox'), tmp_in_wav,
                                        "-r", str(settings['samplerate']), tmp_out_wav,
                                        "pitch", str(semitones * 100)
                                    ]
                                    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                                except subprocess.CalledProcessError as e:
                                    print(f"Subprocess error: {e.stderr}")
                                    DependencyError(e)
                                    return False
                                except FileNotFoundError as e:
                                    print(f"File not found: {e}")
                                    DependencyError(e)
                                    return False
                            else:
                                tmp_out_wav = tmp_in_wav
                            tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
                            if tts_vc:
                                settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_vc_key]['samplerate']
                                source_wav = self._resample_wav(tmp_out_wav, settings['samplerate'])
                                target_wav = self._resample_wav(settings['voice_path'], settings['samplerate'])
                                audio_sentence = tts_vc.voice_conversion(
                                    source_wav=source_wav,
                                    target_wav=target_wav
                                )
                            else:
                                error = f'Engine {self.tts_vc_key} is None'
                                print(error)
                                return False
                            if os.path.exists(tmp_in_wav):
                                os.remove(tmp_in_wav)
                            if os.path.exists(tmp_out_wav):
                                os.remove(tmp_out_wav)
                            if os.path.exists(source_wav):
                                os.remove(source_wav)
                        else:
                            audio_sentence = tts.tts(
                                text=re.sub(not_supported_punc_pattern, ' ', sentence),
                                **speaker_argument
                            )
                    elif self.session['tts_engine'] == TTS_ENGINES['TACOTRON2']:
                        speaker_argument = {}
                        not_supported_punc_pattern = re.compile(r'["—]')
                        if settings['voice_path'] is not None:
                            proc_dir = os.path.join(self.session['voice_dir'], 'proc')
                            os.makedirs(proc_dir, exist_ok=True)
                            tmp_in_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                            tmp_out_wav = os.path.join(proc_dir, f"{uuid.uuid4()}.wav")
                            tts.tts_to_file(
                                text=re.sub(not_supported_punc_pattern, '', sentence),
                                file_path=tmp_in_wav,
                                **speaker_argument
                            )
                            if settings['voice_path'] in settings['semitones'].keys():
                                semitones = settings['semitones'][settings['voice_path']]
                            else:
                                voice_path_gender = detect_gender(settings['voice_path'])
                                voice_builtin_gender = detect_gender(tmp_in_wav)
                                msg = f"Cloned voice seems to be {voice_path_gender}\nBuiltin voice seems to be {voice_builtin_gender}"
                                print(msg)
                                if voice_builtin_gender != voice_path_gender:
                                    semitones = -4 if voice_path_gender == 'male' else 4
                                    msg = 'Adapting builtin voice frequencies to the cloned voice...'
                                    print(msg)
                                else:
                                    semitones = 0
                                settings['semitones'][settings['voice_path']] = semitones
                            if semitones != 0:
                                try:
                                    cmd = [
                                        shutil.which('sox'), tmp_in_wav,
                                        "-r", str(settings['samplerate']), tmp_out_wav,
                                        "pitch", str(semitones * 100)
                                    ]
                                    subprocess.run(cmd, check=True, stdout=subprocess.DEVNULL, stderr=subprocess.DEVNULL)
                                except subprocess.CalledProcessError as e:
                                    error = f"Subprocess error: {e.stderr}"
                                    print(error)
                                    DependencyError(e)
                                    return False
                                except FileNotFoundError as e:
                                    error = f"File not found: {e}"
                                    print(error)
                                    DependencyError(e)
                                    return False
                            else:
                                tmp_out_wav = tmp_in_wav
                            tts_vc = (loaded_tts.get(self.tts_vc_key) or {}).get('engine', False)
                            if tts_vc:
                                settings['samplerate'] = TTS_VOICE_CONVERSION[self.tts_vc_key]['samplerate']
                                source_wav = self._resample_wav(tmp_out_wav, settings['samplerate'])
                                target_wav = self._resample_wav(settings['voice_path'], settings['samplerate'])
                                audio_sentence = tts_vc.voice_conversion(
                                    source_wav=source_wav,
                                    target_wav=target_wav
                                )
                            else:
                                error = f'Engine {self.tts_vc_key} is None'
                                print(error)
                                return False
                            if os.path.exists(tmp_in_wav):
                                os.remove(tmp_in_wav)
                            if os.path.exists(tmp_out_wav):
                                os.remove(tmp_out_wav)
                            if os.path.exists(source_wav):
                                os.remove(source_wav)
                        else:
                            audio_sentence = tts.tts(
                                text=re.sub(not_supported_punc_pattern, '', sentence),
                                **speaker_argument
                            )
                    elif self.session['tts_engine'] == TTS_ENGINES['YOURTTS']:
                        speaker_argument = {}
                        language = self.session['language_iso1'] if self.session['language_iso1'] == 'en' else 'fr-fr' if self.session['language_iso1'] == 'fr' else 'pt-br' if self.session['language_iso1'] == 'pt' else 'en'
                        if settings['voice_path'] is not None:
                            speaker_wav = settings['voice_path']
                            speaker_argument = {"speaker_wav": speaker_wav}
                        else:
                            voice_key = default_engine_settings[TTS_ENGINES['YOURTTS']]['voices']['ElectroMale-2']
                            speaker_argument = {"speaker": voice_key}
                        with torch.no_grad():
                            audio_sentence = tts.tts(
                                text=sentence.replace('—', '').strip(),
                                language=language,
                                **speaker_argument
                            )
                    if is_audio_data_valid(audio_sentence):
                        sourceTensor = self._tensor_type(audio_sentence)
                        audio_tensor = sourceTensor.clone().detach().unsqueeze(0).cpu()
                        if sentence[-1].isalnum() or sentence[-1] == '—':
                            audio_tensor = trim_audio(audio_tensor.squeeze(), settings['samplerate'], 0.003, trim_audio_buffer).unsqueeze(0)
                        self.audio_segments.append(audio_tensor)
                        if not re.search(r'\w$', sentence, flags=re.UNICODE):
                            silence_time = int(np.random.uniform(0.3, 0.6) * 100) / 100
                            break_tensor = torch.zeros(1, int(settings['samplerate'] * silence_time))
                            self.audio_segments.append(break_tensor.clone())
                        if self.audio_segments:
                            audio_tensor = torch.cat(self.audio_segments, dim=-1)
                            start_time = self.sentences_total_time
                            duration = round((audio_tensor.shape[-1] / settings['samplerate']), 2)
                            end_time = start_time + duration
                            self.sentences_total_time = end_time
                            sentence_obj = {
                                "start": start_time,
                                "end": end_time,
                                "text": sentence,
                                "resume_check": self.sentence_idx
                            }
                            self.sentence_idx = append_sentence2vtt(sentence_obj, self.vtt_path)
                        if self.sentence_idx:
                            torchaudio.save(final_sentence_file, audio_tensor, settings['samplerate'], format=default_audio_proc_format)
                            del audio_tensor
                        self.audio_segments = []
                        if os.path.exists(final_sentence_file):
                            return True
                        else:
                            error = f"Cannot create {final_sentence_file}"
                            print(error)
            else:
                error = f"convert() error: {self.session['tts_engine']} is None"
                print(error)
        except Exception as e:
            error = f'Coqui.convert(): {e}'
            raise ValueError(error)
        return False
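Note on the SML handling above: convert() never synthesizes the '‡break‡' and '‡pause‡' sentinels; it appends randomized silence tensors instead, which is what keeps natural gaps in the chapter audio. A minimal standalone sketch of that idea (the marker strings and duration ranges come from the code above; the samplerate argument is an assumed input):

import numpy as np
import torch

def sml_silence(marker, samplerate):
    # Durations mirror Coqui.convert(): a short inter-sentence break
    # versus a longer paragraph pause.
    lo, hi = (0.3, 0.6) if marker == '‡break‡' else (1.0, 1.8)
    seconds = round(float(np.random.uniform(lo, hi)), 2)
    # One mono channel of zeros, ready to append to audio_segments.
    return torch.zeros(1, int(samplerate * seconds))

print(sml_silence('‡break‡', 24000).shape)  # e.g. torch.Size([1, 11520])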
lib/classes/tts_manager.py
ADDED
@@ -0,0 +1,37 @@
import os

from lib.models import TTS_ENGINES

class TTSManager:
    def __init__(self, session):
        self.session = session
        self.tts = None
        self._build()

    def _build(self):
        if self.session['tts_engine'] in TTS_ENGINES.values():
            if self.session['tts_engine'] in [TTS_ENGINES['XTTSv2'], TTS_ENGINES['BARK'], TTS_ENGINES['VITS'], TTS_ENGINES['FAIRSEQ'], TTS_ENGINES['TACOTRON2'], TTS_ENGINES['YOURTTS']]:
                from lib.classes.tts_engines.coqui import Coqui
                self.tts = Coqui(self.session)
            #elif self.session['tts_engine'] in [TTS_ENGINES['NEW_TTS']]:
            #    from lib.classes.tts_engines.new_tts import NewTts
            #    self.tts = NewTts(self.session)
            if self.tts:
                return True
            else:
                error = 'TTS engine could not be created!'
                print(error)
        else:
            print('Other TTS engines coming soon!')
        return False

    def convert_sentence2audio(self, sentence_number, sentence):
        try:
            if self.session['tts_engine'] in TTS_ENGINES.values():
                return self.tts.convert(sentence_number, sentence)
            else:
                print('Other TTS engines coming soon!')
        except Exception as e:
            error = f'convert_sentence2audio(): {e}'
            raise ValueError(error)
        return False
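TTSManager is a thin factory: it checks session['tts_engine'] against TTS_ENGINES and, for the Coqui-backed engines, instantiates Coqui and proxies per-sentence calls to it. A self-contained sketch of the same dispatch pattern (DummyEngine is a hypothetical stand-in for the real Coqui class; the engine names are the ones defined in lib/models.py):

TTS_ENGINES = {"XTTSv2": "xtts", "BARK": "bark"}  # subset of lib/models.py

class DummyEngine:
    # Hypothetical stand-in for lib.classes.tts_engines.coqui.Coqui
    def convert(self, sentence_number, sentence):
        print(f'[{sentence_number}] {sentence}')
        return True

class MiniManager:
    def __init__(self, session):
        self.session = session
        self.tts = DummyEngine() if session['tts_engine'] in TTS_ENGINES.values() else None

    def convert_sentence2audio(self, sentence_number, sentence):
        return self.tts.convert(sentence_number, sentence) if self.tts else False

print(MiniManager({'tts_engine': 'xtts'}).convert_sentence2audio(1, 'Hello.'))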
lib/classes/voice_extractor.py
ADDED
@@ -0,0 +1,286 @@
import os
import numpy as np
import regex as re
import scipy.fftpack
import soundfile as sf
import subprocess
import shutil

from io import BytesIO
from pydub import AudioSegment, silence
from pydub.silence import detect_silence

from lib.conf import voice_formats, default_audio_proc_samplerate
from lib.models import TTS_ENGINES, models
from lib.classes.background_detector import BackgroundDetector

class VoiceExtractor:

    def __init__(self, session, voice_file, voice_name):
        self.wav_file = None
        self.session = session
        self.voice_file = voice_file
        self.voice_name = voice_name
        self.voice_track = 'vocals.wav'
        self.samplerate = models[session['tts_engine']][session['fine_tuned']]['samplerate']
        self.output_dir = self.session['voice_dir']
        self.demucs_dir = os.path.join(self.output_dir, 'htdemucs', voice_name)
        self.silence_threshold = -60

    def _validate_format(self):
        file_extension = os.path.splitext(self.voice_file)[1].lower()
        if file_extension in voice_formats:
            msg = 'Input file valid'
            return True, msg
        error = f'Unsupported file format: {file_extension}. Supported formats are: {", ".join(voice_formats)}'
        return False, error

    def _convert2wav(self):
        try:
            self.wav_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}.wav')
            ffmpeg_cmd = [
                shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', self.voice_file,
                '-ac', '1',
                '-y', self.wav_file
            ]
            process = subprocess.Popen(
                ffmpeg_cmd,
                env={},
                stdout=subprocess.PIPE,
                stderr=subprocess.STDOUT,
                text=True,
                universal_newlines=True,
                encoding='utf-8'
            )
            for line in process.stdout:
                print(line, end='')  # Print each line of stdout
            process.wait()
            if process.returncode != 0:
                error = f'_convert2wav(): process.returncode: {process.returncode}'
            elif not os.path.exists(self.wav_file) or os.path.getsize(self.wav_file) == 0:
                error = f'_convert2wav output error: {self.wav_file} was not created or is empty.'
            else:
                msg = 'Conversion to .wav format for processing successful'
                return True, msg
        except subprocess.CalledProcessError as e:
            error = f'_convert2wav() ffmpeg error: {e.stderr.decode()}'
            raise ValueError(error)
        except Exception as e:
            error = f'_convert2wav() error: {e}'
            raise ValueError(error)
        return False, error

    def _detect_background(self):
        try:
            msg = 'Detecting any background noise or music...'
            print(msg)
            detector = BackgroundDetector(wav_file=self.wav_file)
            status, report = detector.detect(vad_ratio_thresh=0.15)
            print(report)
            if status:
                msg = 'Background noise or music detected. Proceeding to voice extraction...'
            else:
                msg = 'No background noise or music detected. Skipping separation...'
            return True, status, msg
        except Exception as e:
            error = f'_detect_background() error: {e}'
            raise ValueError(error)
        return False, False, error

    def _demucs_voice(self):
        try:
            cmd = [
                "demucs",
                "--verbose",
                "--two-stems=vocals",
                "--out", self.output_dir,
                self.wav_file
            ]
            try:
                process = subprocess.run(cmd, check=True)
                self.voice_track = os.path.join(self.demucs_dir, self.voice_track)
                msg = 'Voice track isolation successful'
                return True, msg
            except subprocess.CalledProcessError as e:
                error = (
                    f'_demucs_voice() subprocess CalledProcessError error: {e.returncode}\n\n'
                    f'stdout: {e.output}\n\n'
                    f'stderr: {e.stderr}'
                )
                raise ValueError(error)
            except FileNotFoundError:
                error = '_demucs_voice() subprocess FileNotFoundError error: The "demucs" command was not found. Ensure it is installed and in PATH.'
                raise ValueError(error)
            except Exception as e:
                error = f'_demucs_voice() subprocess Exception error: {str(e)}'
                raise ValueError(error)
        except Exception as e:
            error = f'_demucs_voice() error: {e}'
            raise ValueError(error)
        return False, error

    def _remove_silences(self, audio, silence_threshold, min_silence_len=200, keep_silence=300):
        final_audio = AudioSegment.silent(duration=0)
        chunks = silence.split_on_silence(
            audio,
            min_silence_len=min_silence_len,
            silence_thresh=silence_threshold,
            keep_silence=keep_silence
        )
        for chunk in chunks:
            final_audio += chunk
        final_audio.export(self.voice_track, format='wav')

    def _trim_and_clean(self, silence_threshold, min_silence_len=200, chunk_size=100):
        try:
            audio = AudioSegment.from_file(self.voice_track)
            total_duration = len(audio)  # Total duration in milliseconds
            min_required_duration = 20000 if self.session['tts_engine'] == TTS_ENGINES['BARK'] else 12000
            msg = 'Removing long pauses...'
            print(msg)
            self._remove_silences(audio, silence_threshold)
            if total_duration <= min_required_duration:
                msg = f"Audio is only {total_duration/1000:.2f}s long; skipping audio trimming..."
                return True, msg
            else:
                if total_duration > (min_required_duration * 2):
                    msg = 'Audio longer than the max allowed. Proceeding to audio trimming...'
                    print(msg)
                    window = min_required_duration
                    hop = max(1, window // 4)
                    best_var = -float("inf")
                    best_start = 0
                    sr = audio.frame_rate
                    for start in range(0, total_duration - window + 1, hop):
                        chunk = audio[start : start + window]
                        samples = np.array(chunk.get_array_of_samples()).astype(float)
                        # 1) FFT + magnitude
                        spectrum = np.abs(scipy.fftpack.fft(samples))
                        # 2) turn into a probability distribution
                        p = spectrum / (np.sum(spectrum) + 1e-10)
                        # 3) spectral entropy
                        entropy = -np.sum(p * np.log2(p + 1e-10))
                        if entropy > best_var:
                            best_var = entropy
                            best_start = start
                    best_end = best_start + window
                    msg = (
                        f"Selected most-diverse-spectrum window "
                        f"{best_start/1000:.2f}s–{best_end/1000:.2f}s "
                        f"(@ entropy {best_var:.2f} bits)"
                    )
                    print(msg)
                    # 1) find all silent spans in the file
                    silence_spans = detect_silence(
                        audio,
                        min_silence_len=min_silence_len,
                        silence_thresh=silence_threshold
                    )
                    # silence_spans = [ [start_ms, end_ms], ... ]
                    # 2) snap best_start *backward* to the end of the last silence before it
                    prev_ends = [end for (start, end) in silence_spans if end <= best_start]
                    if prev_ends:
                        new_start = max(prev_ends)
                    else:
                        new_start = 0
                    # 3) snap best_end *forward* to the start of the first silence after it
                    next_starts = [start for (start, end) in silence_spans if start >= best_end]
                    if next_starts:
                        new_end = min(next_starts)
                    else:
                        new_end = total_duration
                    # 4) update the slice bounds
                    best_start, best_end = new_start, new_end
                else:
                    best_start = 0
                    best_end = total_duration
                trimmed_audio = audio[best_start:best_end]
                trimmed_audio.export(self.voice_track, format='wav')
                msg = 'Audio trimmed and cleaned!'
                return True, msg
        except Exception as e:
            error = f'_trim_and_clean() error: {e}'
            raise ValueError(error)

    def _normalize_audio(self):
        error = ''
        try:
            proc_voice_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}_proc.wav')
            final_voice_file = os.path.join(self.session['voice_dir'], f'{self.voice_name}.wav')
            ffmpeg_cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', self.voice_track]
            filter_complex = (
                'agate=threshold=-25dB:ratio=1.4:attack=10:release=250,'
                'afftdn=nf=-70,'
                'acompressor=threshold=-20dB:ratio=2:attack=80:release=200:makeup=1dB,'
                'loudnorm=I=-14:TP=-3:LRA=7:linear=true,'
                'equalizer=f=150:t=q:w=2:g=1,'
                'equalizer=f=250:t=q:w=2:g=-3,'
                'equalizer=f=3000:t=q:w=2:g=2,'
                'equalizer=f=5500:t=q:w=2:g=-4,'
                'equalizer=f=9000:t=q:w=2:g=-2,'
                'highpass=f=63[audio]'
            )
            ffmpeg_cmd += [
                '-filter_complex', filter_complex,
                '-map', '[audio]',
                '-ar', f'{default_audio_proc_samplerate}',
                '-y', proc_voice_file
            ]
            try:
                process = subprocess.Popen(
                    ffmpeg_cmd,
                    env={},
                    stdout=subprocess.PIPE,
                    stderr=subprocess.PIPE,
                    encoding='utf-8',
                    errors='ignore'
                )
                for line in process.stdout:
                    print(line, end='')  # Print each line of stdout
                process.wait()
                if process.returncode != 0:
                    error = f'_normalize_audio(): process.returncode: {process.returncode}'
                elif not os.path.exists(proc_voice_file) or os.path.getsize(proc_voice_file) == 0:
                    error = f'_normalize_audio() error: {proc_voice_file} was not created or is empty.'
                else:
                    os.replace(proc_voice_file, final_voice_file)
                    shutil.rmtree(self.demucs_dir, ignore_errors=True)
                    msg = 'Audio normalization successful!'
                    return True, msg
            except subprocess.CalledProcessError as e:
                error = f'_normalize_audio() ffmpeg.Error: {e.stderr.decode()}'
            except FileNotFoundError as e:
                error = f'_normalize_audio() FileNotFoundError: {e} Input file or FFmpeg PATH not found!'
        except Exception as e:
            error = f'_normalize_audio() error: {e}'
        return False, error

    def extract_voice(self):
        success = False
        msg = None
        try:
            success, msg = self._validate_format()
            print(msg)
            if success:
                success, msg = self._convert2wav()
                print(msg)
                if success:
                    success, status, msg = self._detect_background()
                    print(msg)
                    if success:
                        if status:
                            success, msg = self._demucs_voice()
                            print(msg)
                        else:
                            self.voice_track = self.wav_file
                        if success:
                            success, msg = self._trim_and_clean(self.silence_threshold)
                            print(msg)
                            if success:
                                success, msg = self._normalize_audio()
                                print(msg)
        except Exception as e:
            msg = f'extract_voice() error: {e}'
            raise ValueError(msg)
        shutil.rmtree(self.demucs_dir, ignore_errors=True)
        return success, msg
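The interesting step in _trim_and_clean() above is how it picks which slice of a long recording to keep: it slides a window across the audio and scores each position by spectral entropy, on the assumption that the highest-entropy window carries the most varied speech. A minimal standalone sketch of that scoring loop (window length and hop mirror the method above; the input is assumed to be a mono float array):

import numpy as np
import scipy.fftpack

def spectral_entropy(samples):
    # Magnitude spectrum -> probability distribution -> Shannon entropy in bits.
    spectrum = np.abs(scipy.fftpack.fft(samples))
    p = spectrum / (np.sum(spectrum) + 1e-10)
    return float(-np.sum(p * np.log2(p + 1e-10)))

def best_window_start(samples, sr, window_s=12.0):
    window = int(window_s * sr)
    hop = max(1, window // 4)
    starts = range(0, max(1, len(samples) - window + 1), hop)
    return max(starts, key=lambda s: spectral_entropy(samples[s:s + window]))

sr = 16000
samples = np.random.randn(30 * sr)  # stand-in for a decoded voice track
print(best_window_start(samples, sr) / sr, 'seconds in')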
lib/conf.py
ADDED
@@ -0,0 +1,78 @@
import os
import platform

tmp_dir = os.path.abspath('tmp')
tmp_expire = 7 # days

models_dir = os.path.abspath('models')
ebooks_dir = os.path.abspath('ebooks')
voices_dir = os.path.abspath('voices')
tts_dir = os.path.join(models_dir, 'tts')

os.environ['PYTHONUTF8'] = '1'
os.environ['PYTHONIOENCODING'] = 'utf-8'
os.environ['COQUI_TOS_AGREED'] = '1'
os.environ['CALIBRE_NO_NATIVE_FILEDIALOGS'] = '1'
os.environ['GRADIO_DEBUG'] = '1'
os.environ['DO_NOT_TRACK'] = 'true'
os.environ['CALIBRE_TEMP_DIR'] = tmp_dir
os.environ['CALIBRE_CACHE_DIRECTORY'] = tmp_dir
os.environ['HUGGINGFACE_HUB_CACHE'] = tts_dir
os.environ['HF_HOME'] = tts_dir
os.environ['HF_DATASETS_CACHE'] = tts_dir
os.environ['BARK_CACHE_DIR'] = tts_dir
os.environ['TTS_CACHE'] = tts_dir
os.environ['TORCH_HOME'] = tts_dir
os.environ['TTS_HOME'] = models_dir
os.environ['XDG_CACHE_HOME'] = models_dir
os.environ['STANZA_RESOURCES_DIR'] = os.path.join(models_dir, 'stanza')
os.environ['ARGOS_TRANSLATE_PACKAGE_PATH'] = os.path.join(models_dir, 'argostranslate')
os.environ['TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD'] = '1'
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
os.environ['SUNO_OFFLOAD_CPU'] = 'False' # BARK option: False needs a GPU
os.environ['SUNO_USE_SMALL_MODELS'] = 'False' # BARK option: False needs a GPU with VRAM > 4GB
if platform.system() == 'Windows':
    os.environ['ESPEAK_DATA_PATH'] = os.path.expandvars(r"%USERPROFILE%\scoop\apps\espeak-ng\current\eSpeak NG\espeak-ng-data")

prog_version = (lambda: open('VERSION.txt').read().strip())()

min_python_version = (3, 10)
max_python_version = (3, 12)

NATIVE = 'native'
FULL_DOCKER = 'full_docker'

debug_mode = True

device_list = ['cpu', 'gpu', 'mps']
default_device = 'cpu'
default_gpu_wiki = '<a href="https://github.com/DrewThomasson/ebook2audiobook/wiki/GPU-ISSUES">howto wiki</a>'

python_env_dir = os.path.abspath(os.path.join('.', 'python_env'))
requirements_file = os.path.abspath(os.path.join('.', 'requirements.txt'))

interface_host = '0.0.0.0'
interface_port = 7860
interface_shared_tmp_expire = 3 # in days
interface_concurrency_limit = 1 # or None for unlimited

interface_component_options = {
    "gr_tab_xtts_params": True,
    "gr_tab_bark_params": True,
    "gr_group_voice_file": True,
    "gr_group_custom_model": True
}

audiobooks_gradio_dir = os.path.abspath(os.path.join('audiobooks', 'gui', 'gradio'))
audiobooks_host_dir = os.path.abspath(os.path.join('audiobooks', 'gui', 'host'))
audiobooks_cli_dir = os.path.abspath(os.path.join('audiobooks', 'cli'))

ebook_formats = ['.epub', '.mobi', '.azw3', '.fb2', '.lrf', '.rb', '.snb', '.tcr', '.pdf', '.txt', '.rtf', '.doc', '.docx', '.html', '.odt', '.azw'] # Add or remove the formats you accept as input
voice_formats = ['.mp4', '.m4b', '.m4a', '.mp3', '.wav', '.aac', '.flac', '.alac', '.ogg', '.aiff', '.aif', '.wma', '.dsd', '.opus', '.pcmu', '.pcma', '.gsm'] # Add or remove the formats you accept as input
output_formats = ['aac', 'flac', 'mp3', 'm4b', 'm4a', 'mp4', 'mov', 'ogg', 'wav', 'webm']
default_audio_proc_samplerate = 24000
default_audio_proc_format = 'flac' # or 'mp3', 'aac', 'm4a', 'm4b', 'amr', '3gp', 'alac'. 'wav' is ok but limited to files < 4GB
default_output_format = 'm4b'
default_output_split = False
default_output_split_hours = '6' # if the final output exceeds output_split_hours * 2 hours, it is split into output_split_hours chunks plus the remainder, if any
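The default_output_split settings encode a simple rule rather than a schedule: nothing is split unless the finished audiobook runs longer than twice default_output_split_hours, and then it is cut into split-length parts plus whatever remains. A hypothetical helper showing that arithmetic (the real splitting lives in lib/functions.py, whose diff is not rendered below):

def split_points(total_hours, split_hours=6.0):
    # Only split when the output exceeds split_hours * 2, per the
    # comment in lib/conf.py; otherwise keep a single file.
    if total_hours <= split_hours * 2:
        return [(0.0, total_hours)]
    parts, start = [], 0.0
    while total_hours - start > split_hours:
        parts.append((start, start + split_hours))
        start += split_hours
    parts.append((start, total_hours))  # the remainder, "the end if any"
    return parts

print(split_points(13.5))  # [(0.0, 6.0), (6.0, 12.0), (12.0, 13.5)]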
lib/functions.py
ADDED
The diff for this file is too large to render.
See raw diff
lib/lang.py
ADDED
The diff for this file is too large to render.
See raw diff
lib/models.py
ADDED
@@ -0,0 +1,493 @@
import os

from lib.conf import tts_dir, voices_dir
loaded_tts = {}

TTS_ENGINES = {
    "XTTSv2": "xtts",
    "BARK": "bark",
    "VITS": "vits",
    "FAIRSEQ": "fairseq",
    "TACOTRON2": "tacotron",
    "YOURTTS": "yourtts"
}

TTS_VOICE_CONVERSION = {
    "freevc24": {"path": "voice_conversion_models/multilingual/vctk/freevc24", "samplerate": 24000},
    "knnvc": {"path": "voice_conversion_models/multilingual/multi-dataset/knnvc", "samplerate": 16000},
    "openvoice_v1": {"path": "voice_conversion_models/multilingual/multi-dataset/openvoice_v1", "samplerate": 22050},
    "openvoice_v2": {"path": "voice_conversion_models/multilingual/multi-dataset/openvoice_v2", "samplerate": 22050}
}

TTS_SML = {
    "break": "‡break‡",
    "pause": "‡pause‡",
    "###": "‡pause‡"
}
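# TTS_SML markers are inline sentinels matched against each incoming sentence;
# lib/classes/tts_engines/coqui.py replaces '‡break‡' with ~0.3-0.6 s of
# silence and '‡pause‡' with ~1.0-1.8 s instead of synthesizing them.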

default_tts_engine = TTS_ENGINES['XTTSv2']
default_fine_tuned = 'internal'
default_vc_model = TTS_VOICE_CONVERSION['knnvc']['path']
default_voice_detection_model = 'drewThomasson/segmentation'

max_tts_in_memory = 2 # TTS engines to keep in memory (1 tts engine ~= 4GB to 8GB RAM).
max_custom_model = 100
max_custom_voices = 1000
max_upload_size = '6GB'

default_engine_settings = {
    TTS_ENGINES['XTTSv2']: {
        "samplerate": 24000,
        "temperature": 0.75,
        "length_penalty": 1.0,
        "num_beams": 1,
        "repetition_penalty": 3.0,
        "top_k": 50,
        "top_p": 0.85,
        "speed": 1.0,
        "enable_text_splitting": False,
        # to enable deepspeed, you must install it first:
        # conda activate ./python_env (linux/mac) or .\python_env (windows)
        # pip install deepspeed
        # conda deactivate
        "use_deepspeed": False,
        "files": ['config.json', 'model.pth', 'vocab.json', 'ref.wav', 'speakers_xtts.pth'],
        "voices": {
            "ClaribelDervla": "Claribel Dervla", "DaisyStudious": "Daisy Studious", "GracieWise": "Gracie Wise",
            "TammieEma": "Tammie Ema", "AlisonDietlinde": "Alison Dietlinde", "AnaFlorence": "Ana Florence",
            "AnnmarieNele": "Annmarie Nele", "AsyaAnara": "Asya Anara", "BrendaStern": "Brenda Stern",
            "GittaNikolina": "Gitta Nikolina", "HenrietteUsha": "Henriette Usha", "SofiaHellen": "Sofia Hellen",
            "TammyGrit": "Tammy Grit", "TanjaAdelina": "Tanja Adelina", "VjollcaJohnnie": "Vjollca Johnnie",
            "AndrewChipper": "Andrew Chipper", "BadrOdhiambo": "Badr Odhiambo", "DionisioSchuyler": "Dionisio Schuyler",
            "RoystonMin": "Royston Min", "ViktorEka": "Viktor Eka", "AbrahanMack": "Abrahan Mack",
            "AddeMichal": "Adde Michal", "BaldurSanjin": "Baldur Sanjin", "CraigGutsy": "Craig Gutsy",
            "DamienBlack": "Damien Black", "GilbertoMathias": "Gilberto Mathias", "IlkinUrbano": "Ilkin Urbano",
            "KazuhikoAtallah": "Kazuhiko Atallah", "LudvigMilivoj": "Ludvig Milivoj", "SuadQasim": "Suad Qasim",
            "TorcullDiarmuid": "Torcull Diarmuid", "ViktorMenelaos": "Viktor Menelaos", "ZacharieAimilios": "Zacharie Aimilios",
            "NovaHogarth": "Nova Hogarth", "MajaRuoho": "Maja Ruoho", "UtaObando": "Uta Obando",
            "LidiyaSzekeres": "Lidiya Szekeres", "ChandraMacFarland": "Chandra MacFarland", "SzofiGranger": "Szofi Granger",
            "CamillaHolmström": "Camilla Holmström", "LilyaStainthorpe": "Lilya Stainthorpe", "ZofijaKendrick": "Zofija Kendrick",
            "NarelleMoon": "Narelle Moon", "BarboraMacLean": "Barbora MacLean", "AlexandraHisakawa": "Alexandra Hisakawa",
            "AlmaMaría": "Alma María", "RosemaryOkafor": "Rosemary Okafor", "IgeBehringer": "Ige Behringer",
            "FilipTraverse": "Filip Traverse", "DamjanChapman": "Damjan Chapman", "WulfCarlevaro": "Wulf Carlevaro",
            "AaronDreschner": "Aaron Dreschner", "KumarDahl": "Kumar Dahl", "EugenioMataracı": "Eugenio Mataracı",
            "FerranSimen": "Ferran Simen", "XavierHayasaka": "Xavier Hayasaka", "LuisMoray": "Luis Moray",
            "MarcosRudaski": "Marcos Rudaski"
        },
        "rating": {"GPU VRAM": 4, "CPU": 3, "RAM": 8, "Realism": 4}
    },
    TTS_ENGINES['BARK']: {
        "samplerate": 24000,
        "text_temp": 0.50,
        "waveform_temp": 0.50,
        "files": ["text_2.pt", "coarse_2.pt", "fine_2.pt"],
        "speakers_path": os.path.join(voices_dir, '__bark'),
        "voices": {
            "de_speaker_0": "Speaker 0", "de_speaker_1": "Speaker 1", "de_speaker_2": "Speaker 2",
            "de_speaker_3": "Speaker 3", "de_speaker_4": "Speaker 4", "de_speaker_5": "Speaker 5",
            "de_speaker_6": "Speaker 6", "de_speaker_7": "Speaker 7", "de_speaker_8": "Speaker 8",
            "de_speaker_9": "Speaker 9", "en_speaker_0": "Speaker 0", "en_speaker_1": "Speaker 1",
            "en_speaker_2": "Speaker 2", "en_speaker_3": "Speaker 3", "en_speaker_4": "Speaker 4",
            "en_speaker_5": "Speaker 5", "en_speaker_6": "Speaker 6", "en_speaker_7": "Speaker 7",
            "en_speaker_8": "Speaker 8", "en_speaker_9": "Speaker 9", "es_speaker_0": "Speaker 0",
            "es_speaker_1": "Speaker 1", "es_speaker_2": "Speaker 2", "es_speaker_3": "Speaker 3",
            "es_speaker_4": "Speaker 4", "es_speaker_5": "Speaker 5", "es_speaker_6": "Speaker 6",
            "es_speaker_7": "Speaker 7", "es_speaker_8": "Speaker 8", "es_speaker_9": "Speaker 9",
            "fr_speaker_0": "Speaker 0", "fr_speaker_1": "Speaker 1", "fr_speaker_2": "Speaker 2",
            "fr_speaker_3": "Speaker 3", "fr_speaker_4": "Speaker 4", "fr_speaker_5": "Speaker 5",
            "fr_speaker_6": "Speaker 6", "fr_speaker_7": "Speaker 7", "fr_speaker_8": "Speaker 8",
            "fr_speaker_9": "Speaker 9", "hi_speaker_0": "Speaker 0", "hi_speaker_1": "Speaker 1",
            "hi_speaker_2": "Speaker 2", "hi_speaker_3": "Speaker 3", "hi_speaker_4": "Speaker 4",
            "hi_speaker_5": "Speaker 5", "hi_speaker_6": "Speaker 6", "hi_speaker_7": "Speaker 7",
            "hi_speaker_8": "Speaker 8", "hi_speaker_9": "Speaker 9", "it_speaker_0": "Speaker 0",
            "it_speaker_1": "Speaker 1", "it_speaker_2": "Speaker 2", "it_speaker_3": "Speaker 3",
            "it_speaker_4": "Speaker 4", "it_speaker_5": "Speaker 5", "it_speaker_6": "Speaker 6",
            "it_speaker_7": "Speaker 7", "it_speaker_8": "Speaker 8", "it_speaker_9": "Speaker 9",
            "ja_speaker_0": "Speaker 0", "ja_speaker_1": "Speaker 1", "ja_speaker_2": "Speaker 2",
            "ja_speaker_3": "Speaker 3", "ja_speaker_4": "Speaker 4", "ja_speaker_5": "Speaker 5",
            "ja_speaker_6": "Speaker 6", "ja_speaker_7": "Speaker 7", "ja_speaker_8": "Speaker 8",
            "ja_speaker_9": "Speaker 9", "ko_speaker_0": "Speaker 0", "ko_speaker_1": "Speaker 1",
            "ko_speaker_2": "Speaker 2", "ko_speaker_3": "Speaker 3", "ko_speaker_4": "Speaker 4",
            "ko_speaker_5": "Speaker 5", "ko_speaker_6": "Speaker 6", "ko_speaker_7": "Speaker 7",
            "ko_speaker_8": "Speaker 8", "ko_speaker_9": "Speaker 9", "pl_speaker_0": "Speaker 0",
            "pl_speaker_1": "Speaker 1", "pl_speaker_2": "Speaker 2", "pl_speaker_3": "Speaker 3",
            "pl_speaker_4": "Speaker 4", "pl_speaker_5": "Speaker 5", "pl_speaker_6": "Speaker 6",
            "pl_speaker_7": "Speaker 7", "pl_speaker_8": "Speaker 8", "pl_speaker_9": "Speaker 9",
            "pt_speaker_0": "Speaker 0", "pt_speaker_1": "Speaker 1", "pt_speaker_2": "Speaker 2",
            "pt_speaker_3": "Speaker 3", "pt_speaker_4": "Speaker 4", "pt_speaker_5": "Speaker 5",
            "pt_speaker_6": "Speaker 6", "pt_speaker_7": "Speaker 7", "pt_speaker_8": "Speaker 8",
            "pt_speaker_9": "Speaker 9", "ru_speaker_0": "Speaker 0", "ru_speaker_1": "Speaker 1",
            "ru_speaker_2": "Speaker 2", "ru_speaker_3": "Speaker 3", "ru_speaker_4": "Speaker 4",
            "ru_speaker_5": "Speaker 5", "ru_speaker_6": "Speaker 6", "ru_speaker_7": "Speaker 7",
            "ru_speaker_8": "Speaker 8", "ru_speaker_9": "Speaker 9", "tr_speaker_0": "Speaker 0",
            "tr_speaker_1": "Speaker 1", "tr_speaker_2": "Speaker 2", "tr_speaker_3": "Speaker 3",
            "tr_speaker_4": "Speaker 4", "tr_speaker_5": "Speaker 5", "tr_speaker_6": "Speaker 6",
            "tr_speaker_7": "Speaker 7", "tr_speaker_8": "Speaker 8", "tr_speaker_9": "Speaker 9",
            "zh_speaker_0": "Speaker 0", "zh_speaker_1": "Speaker 1", "zh_speaker_2": "Speaker 2",
            "zh_speaker_3": "Speaker 3", "zh_speaker_4": "Speaker 4", "zh_speaker_5": "Speaker 5",
            "zh_speaker_6": "Speaker 6", "zh_speaker_7": "Speaker 7", "zh_speaker_8": "Speaker 8",
            "zh_speaker_9": "Speaker 9"
        },
        "rating": {"GPU VRAM": 4, "CPU": 1, "RAM": 16, "Realism": 3}
    },
    TTS_ENGINES['VITS']: {
        "samplerate": 22050,
        "files": ['config.json', 'model_file.pth', 'language_ids.json'],
        "voices": {},
        "rating": {"GPU VRAM": 2, "CPU": 3, "RAM": 4, "Realism": 2}
    },
    TTS_ENGINES['FAIRSEQ']: {
        "samplerate": 16000,
        "files": ['config.json', 'G_100000.pth', 'vocab.json'],
        "voices": {},
        "rating": {"GPU VRAM": 2, "CPU": 3, "RAM": 4, "Realism": 2}
    },
    TTS_ENGINES['TACOTRON2']: {
|
| 146 |
+
"samplerate": 22050,
|
| 147 |
+
"files": ['config.json', 'best_model.pth', 'vocoder_config.json', 'vocoder_model.pth'],
|
| 148 |
+
"voices": {},
|
| 149 |
+
"rating": {"GPU VRAM": 2, "CPU": 3, "RAM": 4, "Realism": 2}
|
| 150 |
+
},
|
| 151 |
+
TTS_ENGINES['YOURTTS']: {
|
| 152 |
+
"samplerate": 16000,
|
| 153 |
+
"files": ['config.json', 'model_file.pth'],
|
| 154 |
+
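        # Note: the trailing "\n" in the two pt speaker IDs below looks like a
        # typo but matches the speaker keys embedded in the upstream YourTTS
        # checkpoint, so it should not be stripped.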
"voices": {"Machinella-5": "female-en-5", "ElectroMale-2": "male-en-2", 'Machinella-4': 'female-pt-4\n', 'ElectroMale-3': 'male-pt-3\n'},
|
| 155 |
+
"rating": {"GPU VRAM": 1, "CPU": 5, "RAM": 4, "Realism": 1}
|
| 156 |
+
}
|
| 157 |
+
}
|
| 158 |
+
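# Registry of downloadable models per engine. Every entry follows the same
# schema: "lang" (ISO 639-3 code, or "multi"), "repo" (Hugging Face repo or
# Coqui model name), "sub" (sub-path or dataset map inside the repo), "voice"
# (reference wav for voice cloning, or None when the model ships built-in
# speakers), plus "files" and "samplerate" inherited from
# default_engine_settings. Illustrative lookup (hypothetical variable names):
#   entry = models[TTS_ENGINES['XTTSv2']]['MorganFreeman']
#   repo, sub, files = entry['repo'], entry['sub'], entry['files']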
models = {
    TTS_ENGINES['XTTSv2']: {
        "internal": {
            "lang": "multi",
            "repo": "coqui/XTTS-v2",
            "sub": "tts_models/multilingual/multi-dataset/xtts_v2/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'KumarDahl.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "AiExplained": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/AiExplained/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'AiExplained.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "AsmrRacoon": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/AsmrRacoon/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'AsmrRacoon.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "Awkwafina": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/Awkwafina/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'Awkwafina.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "BobOdenkirk": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/BobOdenkirk/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BobOdenkirk.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "BobRoss": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/BobRoss/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BobRoss.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "BrinaPalencia": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/BrinaPalencia/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'BrinaPalencia.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "BryanCranston": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/BryanCranston/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'BryanCranston.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "DavidAttenborough": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/DavidAttenborough/",
            "voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'DavidAttenborough.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "DeathPussInBoots": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/DeathPussInBoots/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'DeathPussInBoots.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "DermotCrowley": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/DermotCrowley/",
            "voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'DermotCrowley.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "EvaSeymour": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/EvaSeymour/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'EvaSeymour.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "GideonOfnirEldenRing": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/GideonOfnirEldenRing/",
            "voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'GideonOfnirEldenRing.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "GhostMW2": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/GhostMW2/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'GhostMW2.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "JhonButlerASMR": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/JhonButlerASMR/",
            "voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'JhonButlerASMR.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "JhonMulaney": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/JhonMulaney/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'JhonMulaney.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "JillRedfield": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/JillRedfield/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'JillRedfield.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "JuliaWhenlan": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/JuliaWhenlan/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'JuliaWhenlan.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "LeeHorsley": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/LeeHorsley/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'LeeHorsley.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "MelinaEldenRing": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/MelinaEldenRing/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'MelinaEldenRing.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "MorganFreeman": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/MorganFreeman/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'MorganFreeman.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "NeilGaiman": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/NeilGaiman/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'NeilGaiman.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "RainyDayHeadSpace": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/RainyDayHeadSpace/",
            "voice": os.path.join(voices_dir, 'eng', 'elder', 'male', 'RainyDayHeadSpace.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "RayPorter": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/RayPorter/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'RayPorter.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "RelaxForAWhile": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/RelaxForAWhile/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'RelaxForAWhile.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "RosamundPike": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/RosamundPike/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'RosamundPike.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "ScarlettJohansson": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/ScarlettJohansson/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'female', 'ScarlettJohansson.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "SladeTeenTitans": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/SladeTeenTitans/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'SladeTeenTitans.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "StanleyParable": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/StanleyParable/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'StanleyParable.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "WhisperSalemASMR": {
            "lang": "eng",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/eng/WhisperSalemASMR/",
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'WhisperSalemASMR.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        },
        "Konishev": {
            "lang": "rus",
            "repo": "drewThomasson/fineTunedTTSModels",
            "sub": "xtts-v2/rus/Konishev/",
            "voice": os.path.join(voices_dir, 'rus', 'adult', 'male', 'Konishev.wav'),
            "files": default_engine_settings[TTS_ENGINES['XTTSv2']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['XTTSv2']]['samplerate']
        }
    },
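    # Bark has no fine-tuned variants here: its voices are the preset speaker
    # prompts listed in default_engine_settings, so only the base checkpoint
    # is registered (alternative checkpoints are noted inline below).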
    TTS_ENGINES['BARK']: {
        "internal": {
            "lang": "multi",
            "repo": "erogol/bark", # suno/bark, rsxdalv/suno, tts_models/multilingual/multi-dataset/bark
            "sub": "", # {"big-bf16": "big-bf16/", "small-bf16": "small-bf16/", "big": "big/", "small": "small/"}
            "voice": os.path.join(voices_dir, 'eng', 'adult', 'male', 'KumarDahl.wav'),
            "files": default_engine_settings[TTS_ENGINES['BARK']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['BARK']]['samplerate']
        }
    },
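    # "[lang_iso1]" and "[xxx]" are placeholders in the Coqui model name: the
    # loader is expected to substitute the ISO 639-1 code and the matching
    # dataset key from "sub", e.g. "tts_models/de/thorsten/vits" for German.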
    TTS_ENGINES['VITS']: {
        "internal": {
            "lang": "multi",
            "repo": "tts_models/[lang_iso1]/[xxx]",
            "sub": {
                "css10/vits": ['es','hu','fi','fr','nl','ru','el'],
                "custom/vits": ['ca'],
                "custom/vits-female": ['bn', 'fa'],
                "cv/vits": ['bg','cs','da','et','ga','hr','lt','lv','mt','pt','ro','sk','sl','sv'],
                "mai/vits": ['uk'],
                "mai_female/vits": ['pl'],
                "mai_male/vits": ['it'],
                "openbible/vits": ['ewe','hau','lin','tw_akuapem','tw_asante','yor'],
                "vctk/vits": ['en'],
                "thorsten/vits": ['de']
            },
            "voice": None,
            "files": default_engine_settings[TTS_ENGINES['VITS']]['files'],
            "samplerate": {
                "css10/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
                "custom/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
                "custom/vits-female": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
                "cv/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
                "mai/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
                "mai_female/vits": 24000,
                "mai_male/vits": 16000,
                "openbible/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
                "vctk/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate'],
                "thorsten/vits": default_engine_settings[TTS_ENGINES['VITS']]['samplerate']
            }
        }
    },
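    # Fairseq (Meta MMS) models use one checkpoint per language: "[lang]" is
    # replaced with the ISO 639-3 code, e.g. "tts_models/eng/fairseq/vits".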
    TTS_ENGINES['FAIRSEQ']: {
        "internal": {
            "lang": "multi",
            "repo": "tts_models/[lang]/fairseq/vits",
            "sub": "",
            "voice": None,
            "files": default_engine_settings[TTS_ENGINES['FAIRSEQ']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['FAIRSEQ']]['samplerate']
        }
    },
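    # Tacotron2 is a two-stage pipeline: the "files" list in
    # default_engine_settings pairs each acoustic model (config.json /
    # best_model.pth) with its external vocoder (vocoder_config.json /
    # vocoder_model.pth).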
    TTS_ENGINES['TACOTRON2']: {
        "internal": {
            "lang": "multi",
            "repo": "tts_models/[lang_iso1]/[xxx]",
            "sub": {
                "mai/tacotron2-DDC": ['fr', 'es', 'nl'],
                "thorsten/tacotron2-DDC": ['de'],
                "kokoro/tacotron2-DDC": ['ja'],
                "ljspeech/tacotron2-DDC": ['en'],
                "baker/tacotron2-DDC-GST": ['zh-CN']
            },
            "voice": None,
            "files": default_engine_settings[TTS_ENGINES['TACOTRON2']]['files'],
            "samplerate": {
                "mai/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
                "thorsten/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
                "kokoro/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
                "ljspeech/tacotron2-DDC": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate'],
                "baker/tacotron2-DDC-GST": default_engine_settings[TTS_ENGINES['TACOTRON2']]['samplerate']
            },
        }
    },
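    # YourTTS relies on the multi-speaker table shipped with the checkpoint
    # (see the named speaker IDs in default_engine_settings), so "voice"
    # stays None here.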
    TTS_ENGINES['YOURTTS']: {
        "internal": {
            "lang": "multi",
            "repo": "tts_models/multilingual/multi-dataset/your_tts",
            "sub": "",
            "voice": None,
            "files": default_engine_settings[TTS_ENGINES['YOURTTS']]['files'],
            "samplerate": default_engine_settings[TTS_ENGINES['YOURTTS']]['samplerate']
        }
    }
}