# Usage

Clone the repository and install its dependencies:

```bash
git clone https://github.com/nguyenhoanganh2002/XTTSv2-Finetuning-for-New-Languages.git
cd XTTSv2-Finetuning-for-New-Languages
pip install -r requirements.txt
```

Download the model weights:

```python
from huggingface_hub import snapshot_download

# fetch the fine-tuned checkpoint, config, and vocab into model/
snapshot_download(repo_id="anhnh2002/vnTTS",
                  repo_type="model",
                  local_dir="model/")
```
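
As a quick sanity check, you can list the downloaded files; the paths used in the steps below (`model.pth`, `config.json`, `vocab.json`, and the reference clip `vi_man.wav`) should all be present:

```python
import os

# the later steps expect model.pth, config.json, vocab.json, and vi_man.wav here
print(sorted(os.listdir("model/")))
```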

Load the model:

```python
import torch
import torchaudio
from tqdm import tqdm
from underthesea import sent_tokenize
from vinorm import TTSnorm

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

device = "cuda:0"

# paths to the files downloaded above
xtts_checkpoint = "model/model.pth"
xtts_config = "model/config.json"
xtts_vocab = "model/vocab.json"

config = XttsConfig()
config.load_json(xtts_config)
XTTS_MODEL = Xtts.init_from_config(config)
XTTS_MODEL.load_checkpoint(config,
                           checkpoint_path=xtts_checkpoint,
                           vocab_path=xtts_vocab,
                           use_deepspeed=False)
XTTS_MODEL.to(device)
```
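
If a GPU is not available, one option (an assumption on our part, not part of the original snippet) is to select the device dynamically before moving the model:

```python
# minimal sketch: fall back to CPU when CUDA is unavailable (inference will be slow)
device = "cuda:0" if torch.cuda.is_available() else "cpu"
XTTS_MODEL.to(device)
```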

Preprocessing and chunking:

```python
def preprocess_text(text, language="vi"):
    # normalize Vietnamese text (numbers, dates, abbreviations, ...)
    if language == "vi":
        text = TTSnorm(text, unknown=False, lower=False, rule=True)

    # split text into sentences
    if language in ["ja", "zh-cn"]:
        sentences = text.split("。")
    else:
        sentences = sent_tokenize(text)

    # pack sentences into chunks of roughly 30 words
    chunks = []
    chunk_i = ""
    len_chunk_i = 0
    for sentence in sentences:
        chunk_i += " " + sentence
        len_chunk_i += len(sentence.split())
        if len_chunk_i > 30:
            chunks.append(chunk_i.strip())
            chunk_i = ""
            len_chunk_i = 0

    # merge a short trailing chunk (under 15 words) into the previous one
    if (len(chunks) > 0) and (len_chunk_i < 15):
        chunks[-1] += chunk_i
    else:
        chunks.append(chunk_i.strip())

    return chunks
```
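
For example (illustrative input, assuming the imports above), a short passage under 30 words comes back as a single chunk:

```python
sample = "Xin chào. Đây là một câu ví dụ."  # "Hello. This is an example sentence."
print(preprocess_text(sample, language="vi"))
# expected: a list with one normalized chunk, since the passage is under 30 words
```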

Generate latent embeddings for the speaker:

```python
speaker_audio_file = "model/vi_man.wav"

gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
    audio_path=speaker_audio_file,
    gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
    max_ref_length=XTTS_MODEL.config.max_ref_len,
    sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
)
```
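
To confirm the latents were computed, you can inspect the returned tensors (exact shapes depend on the checkpoint):

```python
# both outputs are torch tensors; shapes vary with the checkpoint's config
print(gpt_cond_latent.shape, speaker_embedding.shape)
```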

Inference:

```python
def tts(
    model: Xtts,
    text: str,
    language: str,
    gpt_cond_latent: torch.Tensor,
    speaker_embedding: torch.Tensor,
    verbose: bool = False,
):
    # normalize and split the input into ~30-word chunks
    chunks = preprocess_text(text, language)

    wav_chunks = []
    for chunk in tqdm(chunks):
        if chunk.strip() == "":
            continue
        if verbose:
            print(f"Synthesizing: {chunk}")
        wav_chunk = model.inference(
            text=chunk,
            language=language,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            length_penalty=1.0,
            repetition_penalty=10.0,
            top_k=10,
            top_p=0.5,
        )
        wav_chunks.append(torch.tensor(wav_chunk["wav"]))

    # concatenate the chunk waveforms into a single (1, n_samples) tensor
    out_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0).cpu()
    return out_wav

from IPython.display import Audio

audio = tts(
    model=XTTS_MODEL,
    text="Xin chào, tôi là một hệ thống chuyển đổi văn bản tiếng Việt thành giọng nói.",  # "Hello, I am a Vietnamese text-to-speech system."
    language="vi",
    gpt_cond_latent=gpt_cond_latent,
    speaker_embedding=speaker_embedding,
    verbose=True,
)

Audio(audio, rate=24000)
```
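
To keep the result, the `torchaudio` import from the loading step can write the waveform to disk; a minimal sketch (the filename is our choice, and 24 kHz matches the playback rate used above):

```python
# save the (1, n_samples) waveform as a 24 kHz WAV file
torchaudio.save("output.wav", audio, sample_rate=24000)
```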

# License

|
This project uses a model licensed under the Coqui Public Model License 1.0.0, which permits non-commercial use only. This includes personal research, testing, and charitable purposes. Commercial entities may use it for non-commercial research and evaluation. Revenue-generating activities are prohibited. Users must include the license terms when distributing the model or its outputs. For full details, please refer to: https://coqui.ai/cpml |