# Usage
Clone the repository and install the dependencies:
```bash
git clone https://github.com/nguyenhoanganh2002/XTTSv2-Finetuning-for-New-Languages.git
cd XTTSv2-Finetuning-for-New-Languages
pip install -r requirements.txt
```
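The requirements pin a specific Coqui TTS build; a quick import check confirms the installation succeeded (a minimal sketch, assuming the installed package exposes `__version__`, as Coqui TTS does):
```python
# Verify the Coqui TTS package installed by requirements.txt imports cleanly.
import TTS
print(TTS.__version__)
```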
Download the model weights from the Hugging Face Hub:
```python
from huggingface_hub import snapshot_download
snapshot_download(
    repo_id="anhnh2002/vnTTS",
    repo_type="model",
    local_dir="model/",
)
```
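As a quick sanity check, the target directory should now contain the files referenced in the next steps (exact contents depend on the repository):
```python
import os

# Expect at least model.pth, config.json, and vocab.json (used below).
print(sorted(os.listdir("model/")))
```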
Load the model:
```python
import torch
import torchaudio
from tqdm import tqdm
from underthesea import sent_tokenize
from vinorm import TTSnorm

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

device = "cuda:0"

# Paths to the checkpoint, config, and vocabulary downloaded above
xtts_checkpoint = "model/model.pth"
xtts_config = "model/config.json"
xtts_vocab = "model/vocab.json"

config = XttsConfig()
config.load_json(xtts_config)
XTTS_MODEL = Xtts.init_from_config(config)
XTTS_MODEL.load_checkpoint(
    config,
    checkpoint_path=xtts_checkpoint,
    vocab_path=xtts_vocab,
    use_deepspeed=False,
)
XTTS_MODEL.to(device)
```
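Since the model is only used for inference here, it can also be switched to evaluation mode (standard PyTorch practice; Xtts is a torch `nn.Module`):
```python
# Disable training-time behavior such as dropout during inference.
XTTS_MODEL.eval()
```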
Normalize the input text and split it into chunks of roughly 30 words:
```python
def preprocess_text(text, language="vi"):
    # normalize Vietnamese text (numbers, abbreviations, punctuation)
    if language == "vi":
        text = TTSnorm(text, unknown=False, lower=False, rule=True)
    # split text into sentences
    if language in ["ja", "zh-cn"]:
        sentences = text.split("。")
    else:
        sentences = sent_tokenize(text)
    # greedily merge sentences into chunks of just over 30 words
    chunks = []
    chunk_i = ""
    len_chunk_i = 0
    for sentence in sentences:
        chunk_i += " " + sentence
        len_chunk_i += len(sentence.split())
        if len_chunk_i > 30:
            chunks.append(chunk_i.strip())
            chunk_i = ""
            len_chunk_i = 0
    # fold a short trailing remainder into the previous chunk
    if (len(chunks) > 0) and (len_chunk_i < 15):
        chunks[-1] += chunk_i
    else:
        chunks.append(chunk_i)
    return chunks
```
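A short illustration of the chunking behavior (the sample text here is made up for demonstration):
```python
sample = (
    "Đây là câu thứ nhất. Đây là câu thứ hai. "  # "This is the first sentence. This is the second sentence."
    "Đây là câu thứ ba, dài hơn một chút."  # "This is the third sentence, a bit longer."
)
# Short sentences are merged until a chunk passes 30 words; a short
# trailing remainder is appended to the previous chunk instead.
print(preprocess_text(sample, language="vi"))
```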
Generate latent embeddings for the speaker from a reference audio clip:
```python
speaker_audio_file = "model/vi_man.wav"
gpt_cond_latent, speaker_embedding = XTTS_MODEL.get_conditioning_latents(
    audio_path=speaker_audio_file,
    gpt_cond_len=XTTS_MODEL.config.gpt_cond_len,
    max_ref_length=XTTS_MODEL.config.max_ref_len,
    sound_norm_refs=XTTS_MODEL.config.sound_norm_refs,
)
```
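Both returns are plain torch tensors; printing their shapes is a cheap way to confirm the reference audio was processed (exact dimensions depend on the model config):
```python
# Shapes depend on gpt_cond_len and the model configuration.
print(gpt_cond_latent.shape, speaker_embedding.shape)
```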
Run inference:
```python
def tts(
    model: Xtts,
    text: str,
    language: str,
    gpt_cond_latent: torch.Tensor,
    speaker_embedding: torch.Tensor,
    verbose: bool = False,
):
    # normalize and split the input into chunks
    chunks = preprocess_text(text, language)
    if verbose:
        print(chunks)
    wav_chunks = []
    for chunk in tqdm(chunks):
        if chunk.strip() == "":
            continue
        wav_chunk = model.inference(
            text=chunk,
            language=language,
            gpt_cond_latent=gpt_cond_latent,
            speaker_embedding=speaker_embedding,
            length_penalty=1.0,
            repetition_penalty=10.0,
            top_k=10,
            top_p=0.5,
        )
        wav_chunk["wav"] = torch.tensor(wav_chunk["wav"])
        wav_chunks.append(wav_chunk["wav"])
    # concatenate chunk waveforms into a single (1, n_samples) tensor
    out_wav = torch.cat(wav_chunks, dim=0).unsqueeze(0).cpu()
    return out_wav
from IPython.display import Audio

audio = tts(
    model=XTTS_MODEL,
    text="Xin chào, tôi là một hệ thống chuyển đổi văn bản tiếng Việt thành giọng nói.",  # "Hello, I am a Vietnamese text-to-speech system."
    language="vi",
    gpt_cond_latent=gpt_cond_latent,
    speaker_embedding=speaker_embedding,
    verbose=True,
)
Audio(audio, rate=24000)
```
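Outside a notebook, the waveform can also be written to disk with torchaudio (imported earlier); 24000 Hz matches the sample rate passed to `Audio` above:
```python
# `audio` has shape (1, n_samples); save it as a 24 kHz mono WAV file.
torchaudio.save("output.wav", audio, 24000)
```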
# License
This project uses a model licensed under the Coqui Public Model License 1.0.0, which permits non-commercial use only: personal research, testing, and charitable purposes. Commercial entities may use the model for non-commercial research and evaluation, but revenue-generating activities are prohibited. Users must include the license terms when distributing the model or its outputs. For full details, see: https://coqui.ai/cpml