Upload 6 files
- config.json +205 -0
- events.out.tfevents.1713874719.81c96a3a92b5.2617.0 +3 -0
- model.pth +3 -0
- trainer_0_log.txt +336 -0
- traing_xtts_mr.py +177 -0
- vocab.json +0 -0
config.json
ADDED
@@ -0,0 +1,205 @@
{
    "output_path": "/workspace/run/training",
    "logger_uri": null,
    "run_name": "GPT_XTTS_v2.0_LJSpeech_FT",
    "project_name": "XTTS_trainer",
    "run_description": "\n GPT XTTS training\n ",
    "print_step": 50,
    "plot_step": 100,
    "model_param_stats": false,
    "wandb_entity": null,
    "dashboard_logger": "tensorboard",
    "save_on_interrupt": true,
    "log_model_step": 1000,
    "save_step": 400,
    "save_n_checkpoints": 1,
    "save_checkpoints": true,
    "save_all_best": false,
    "save_best_after": 0,
    "target_loss": null,
    "print_eval": false,
    "test_delay_epochs": 0,
    "run_eval": true,
    "run_eval_steps": null,
    "distributed_backend": "nccl",
    "distributed_url": "tcp://localhost:54321",
    "mixed_precision": false,
    "precision": "fp16",
    "epochs": 1000,
    "batch_size": 3,
    "eval_batch_size": 3,
    "grad_clip": 0.0,
    "scheduler_after_epoch": true,
    "lr": 4e-06,
    "optimizer": "AdamW",
    "optimizer_params": {
        "betas": [
            0.9,
            0.96
        ],
        "eps": 1e-08,
        "weight_decay": 0.01
    },
    "lr_scheduler": "MultiStepLR",
    "lr_scheduler_params": {
        "milestones": [
            900000,
            2700000,
            5400000
        ],
        "gamma": 0.5,
        "last_epoch": -1
    },
    "use_grad_scaler": false,
    "allow_tf32": false,
    "cudnn_enable": true,
    "cudnn_deterministic": false,
    "cudnn_benchmark": false,
    "training_seed": 1,
    "model": "xtts",
    "num_loader_workers": 8,
    "num_eval_loader_workers": 0,
    "use_noise_augment": false,
    "audio": {
        "sample_rate": 22050,
        "output_sample_rate": 24000,
        "dvae_sample_rate": 22050
    },
    "use_phonemes": false,
    "phonemizer": null,
    "phoneme_language": null,
    "compute_input_seq_cache": false,
    "text_cleaner": null,
    "enable_eos_bos_chars": false,
    "test_sentences_file": "",
    "phoneme_cache_path": null,
    "characters": null,
    "add_blank": false,
    "batch_group_size": 48,
    "loss_masking": null,
    "min_audio_len": 1,
    "max_audio_len": Infinity,
    "min_text_len": 1,
    "max_text_len": Infinity,
    "compute_f0": false,
    "compute_energy": false,
    "compute_linear_spec": false,
    "precompute_num_workers": 0,
    "start_by_longest": false,
    "shuffle": false,
    "drop_last": false,
    "datasets": [
        {
            "formatter": "",
            "dataset_name": "",
            "path": "",
            "meta_file_train": "",
            "ignored_speakers": null,
            "language": "",
            "phonemizer": "",
            "meta_file_val": "",
            "meta_file_attn_mask": ""
        }
    ],
    "test_sentences": [
        {
            "text": "\u092f\u093e \u092e\u094b\u0939\u093f\u092e\u0947\u091a\u0947 \u0905\u0927\u093f\u0915\u0943\u0924 \u0927\u094b\u0930\u0923 \u092a\u094b\u0932\u0902\u0921\u0932\u093e \u0930\u0936\u093f\u092f\u093e\u091a\u094d\u092f\u093e \u0927\u094b\u0915\u094d\u092f\u093e\u092a\u093e\u0938\u0942\u0928 \u0935\u093e\u091a\u0935\u0923\u0947 \u0939\u0947 \u0939\u094b\u0924\u0947.",
            "speaker_wav": [
                "./audios/wavs/01_0000_01_1_est1.wav"
            ],
            "language": "hi"
        },
        {
            "text": "\u0905\u0936\u0940 \u0928\u0947\u092a\u094b\u0932\u093f\u092f\u0928\u0932\u093e \u0906\u0936\u093e \u0939\u094b\u0924\u0940 \u092a\u0930\u0902\u0924\u0941 \u0930\u0936\u093f\u092f\u0928 \u0938\u0948\u0928\u094d\u092f\u093e\u0928\u0947",
            "speaker_wav": [
                "./audios/wavs/01_0000_01_1_est1.wav"
            ],
            "language": "hi"
        }
    ],
    "eval_split_max_size": 256,
    "eval_split_size": 0.01,
    "use_speaker_weighted_sampler": false,
    "speaker_weighted_sampler_alpha": 1.0,
    "use_language_weighted_sampler": false,
    "language_weighted_sampler_alpha": 1.0,
    "use_length_weighted_sampler": false,
    "length_weighted_sampler_alpha": 1.0,
    "model_args": {
        "gpt_batch_size": 1,
        "enable_redaction": false,
        "kv_cache": true,
        "gpt_checkpoint": "",
        "clvp_checkpoint": null,
        "decoder_checkpoint": null,
        "num_chars": 255,
        "tokenizer_file": "/workspace/run/training/XTTS_v2.0_original_model_files/vocab.json",
        "gpt_max_audio_tokens": 605,
        "gpt_max_text_tokens": 402,
        "gpt_max_prompt_tokens": 70,
        "gpt_layers": 30,
        "gpt_n_model_channels": 1024,
        "gpt_n_heads": 16,
        "gpt_number_text_tokens": 6681,
        "gpt_start_text_token": 261,
        "gpt_stop_text_token": 0,
        "gpt_num_audio_tokens": 1026,
        "gpt_start_audio_token": 1024,
        "gpt_stop_audio_token": 1025,
        "gpt_code_stride_len": 1024,
        "gpt_use_masking_gt_prompt_approach": true,
        "gpt_use_perceiver_resampler": true,
        "input_sample_rate": 22050,
        "output_sample_rate": 24000,
        "output_hop_length": 256,
        "decoder_input_dim": 1024,
        "d_vector_dim": 512,
        "cond_d_vector_in_each_upsampling_layer": true,
        "duration_const": 102400,
        "min_conditioning_length": 66150,
        "max_conditioning_length": 132300,
        "gpt_loss_text_ce_weight": 0.01,
        "gpt_loss_mel_ce_weight": 1.0,
        "debug_loading_failures": false,
        "max_wav_length": 255995,
        "max_text_length": 200,
        "mel_norm_file": "/workspace/run/training/XTTS_v2.0_original_model_files/mel_stats.pth",
        "dvae_checkpoint": "/workspace/run/training/XTTS_v2.0_original_model_files/dvae.pth",
        "xtts_checkpoint": "/workspace/run/training/XTTS_v2.0_original_model_files/model.pth",
        "vocoder": ""
    },
    "model_dir": null,
    "languages": [
        "en",
        "es",
        "fr",
        "de",
        "it",
        "pt",
        "pl",
        "tr",
        "ru",
        "nl",
        "cs",
        "ar",
        "zh-cn",
        "hu",
        "ko",
        "ja",
        "hi"
    ],
    "temperature": 0.85,
    "length_penalty": 1.0,
    "repetition_penalty": 2.0,
    "top_k": 50,
    "top_p": 0.85,
    "num_gpt_outputs": 1,
    "gpt_cond_len": 12,
    "gpt_cond_chunk_len": 4,
    "max_ref_len": 10,
    "sound_norm_refs": false,
    "optimizer_wd_only_on_weights": true,
    "weighted_loss_attrs": {},
    "weighted_loss_multipliers": {},
    "github_branch": "inside_docker"
}
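The config above is the trainer config dump written by the run; the fine-tuned Marathi data is passed under the "hi" language tag, as in the test_sentences. Below is a minimal inference sketch, assuming the standard Coqui TTS XTTS loading API; the reference wav path and output handling are illustrative and not part of this upload.

# Minimal inference sketch (assumption: standard Coqui TTS XTTS API; not part of this upload).
import torch
from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

config = XttsConfig()
config.load_json("config.json")  # the config.json from this commit

model = Xtts.init_from_config(config)
model.load_checkpoint(
    config,
    checkpoint_path="model.pth",   # the fine-tuned checkpoint from this commit
    vocab_path="vocab.json",       # the tokenizer file from this commit
    use_deepspeed=False,
)
if torch.cuda.is_available():
    model.cuda()

# Marathi text is passed with the "hi" language tag, matching the training config above.
outputs = model.synthesize(
    "या मोहिमेचे अधिकृत धोरण पोलंडला रशियाच्या धोक्यापासून वाचवणे हे होते.",
    config,
    speaker_wav="reference_speaker.wav",  # illustrative path to a short reference clip
    gpt_cond_len=3,
    language="hi",
)
# outputs is expected to hold the generated 24 kHz waveform (commonly under the "wav" key).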
events.out.tfevents.1713874719.81c96a3a92b5.2617.0
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:883713381f9e5bfef98f894d881b258fcbc180034a7cdc6f439644eafb5f610c
size 20650
model.pth
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:0cede63ef409ebe9762e3c3b7e0743ccbd1ec0736de90f7cb8052003db141599
size 5607927381
trainer_0_log.txt
ADDED
@@ -0,0 +1,336 @@
> Training Environment:
| > Backend: Torch
| > Mixed precision: False
| > Precision: float32
| > Current device: 0
| > Num. of GPUs: 1
| > Num. of CPUs: 64
| > Num. of Torch Threads: 1
| > Torch seed: 1
| > Torch CUDNN: True
| > Torch CUDNN deterministic: False
| > Torch CUDNN benchmark: False
| > Torch TF32 MatMul: False
> Start Tensorboard: tensorboard --logdir=/workspace/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-23-2024_12+18PM-0000000

> Model has 518442047 parameters

> EPOCH: 0/1000
--> /workspace/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-23-2024_12+18PM-0000000

> TRAINING (2024-04-23 12:18:40)

--> TIME: 2024-04-23 12:18:42 -- STEP: 0/1695 -- GLOBAL_STEP: 0
| > loss_text_ce: 0.042592838406562805 (0.042592838406562805)
| > loss_mel_ce: 3.744250535964966 (3.744250535964966)
| > loss: 0.04508147016167641 (0.04508147016167641)
| > current_lr: 4e-06
| > step_time: 0.3181 (0.3181343078613281)
| > loader_time: 1.1535 (1.153491735458374)

--> TIME: 2024-04-23 12:18:50 -- STEP: 50/1695 -- GLOBAL_STEP: 50
| > loss_text_ce: 0.043245986104011536 (0.045777649357914924)
| > loss_mel_ce: 4.0826735496521 (3.678379626274109)
| > loss: 0.04911808669567108 (0.044335206523537646)
| > current_lr: 4e-06
| > step_time: 0.1173 (0.10748531341552735)
| > loader_time: 0.0038 (0.012436685562133789)

--> TIME: 2024-04-23 12:19:00 -- STEP: 100/1695 -- GLOBAL_STEP: 100
| > loss_text_ce: 0.04654935747385025 (0.04617325332015751)
| > loss_mel_ce: 3.7310783863067627 (3.6352836871147156)
| > loss: 0.044971760362386703 (0.04382686924189331)
| > current_lr: 4e-06
| > step_time: 0.1229 (0.1165578818321228)
| > loader_time: 0.0044 (0.010995228290557862)

--> TIME: 2024-04-23 12:19:10 -- STEP: 150/1695 -- GLOBAL_STEP: 150
| > loss_text_ce: 0.04864665865898132 (0.04633487790822981)
| > loss_mel_ce: 3.695878267288208 (3.5984654172261554)
| > loss: 0.04457768052816391 (0.04339048052827519)
| > current_lr: 4e-06
| > step_time: 0.0968 (0.12075453917185465)
| > loader_time: 0.0068 (0.009986537297566734)

--> TIME: 2024-04-23 12:19:21 -- STEP: 200/1695 -- GLOBAL_STEP: 200
| > loss_text_ce: 0.04507960379123688 (0.04615468136966227)
| > loss_mel_ce: 3.4362077713012695 (3.5497735607624055)
| > loss: 0.041443899273872375 (0.042808670215308674)
| > current_lr: 4e-06
| > step_time: 0.1431 (0.12541004419326782)
| > loader_time: 0.004 (0.009364948272705077)

--> TIME: 2024-04-23 12:19:33 -- STEP: 250/1695 -- GLOBAL_STEP: 250
| > loss_text_ce: 0.044978540390729904 (0.04600780452787875)
| > loss_mel_ce: 3.3835601806640625 (3.5098479528427124)
| > loss: 0.040815938264131546 (0.04233161683380605)
| > current_lr: 4e-06
| > step_time: 0.1506 (0.12965419387817378)
| > loader_time: 0.0043 (0.008812045097351074)

--> TIME: 2024-04-23 12:19:45 -- STEP: 300/1695 -- GLOBAL_STEP: 300
| > loss_text_ce: 0.04761254042387009 (0.046099709086120134)
| > loss_mel_ce: 3.859790325164795 (3.4856272101402284)
| > loss: 0.04651670157909393 (0.04204436879605055)
| > current_lr: 4e-06
| > step_time: 0.107 (0.13299476464589427)
| > loader_time: 0.0045 (0.008340648015340164)

--> TIME: 2024-04-23 12:19:57 -- STEP: 350/1695 -- GLOBAL_STEP: 350
| > loss_text_ce: 0.041058849543333054 (0.04608927173273904)
| > loss_mel_ce: 3.2493679523468018 (3.4584123958860125)
| > loss: 0.039171747863292694 (0.04172025864677771)
| > current_lr: 4e-06
| > step_time: 0.1586 (0.1357990046909877)
| > loader_time: 0.0092 (0.007999198096139085)

--> TIME: 2024-04-23 12:20:09 -- STEP: 400/1695 -- GLOBAL_STEP: 400
| > loss_text_ce: 0.0439525842666626 (0.04606584513559937)
| > loss_mel_ce: 3.5535271167755127 (3.4283770048618316)
| > loss: 0.04282714053988457 (0.04136241558939219)
| > current_lr: 4e-06
| > step_time: 0.1314 (0.13862687826156628)
| > loader_time: 0.0039 (0.007809545397758481)

> CHECKPOINT : /workspace/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-23-2024_12+18PM-0000000/checkpoint_400.pth

--> TIME: 2024-04-23 12:20:24 -- STEP: 450/1695 -- GLOBAL_STEP: 450
| > loss_text_ce: 0.05098263919353485 (0.04611284121870995)
| > loss_mel_ce: 2.9446003437042236 (3.401099606090122)
| > loss: 0.03566170483827591 (0.04103824413898918)
| > current_lr: 4e-06
| > step_time: 0.1581 (0.1393417421976725)
| > loader_time: 0.0041 (0.0075671084721883105)

--> TIME: 2024-04-23 12:20:36 -- STEP: 500/1695 -- GLOBAL_STEP: 500
| > loss_text_ce: 0.03936528041958809 (0.04605886636674404)
| > loss_mel_ce: 3.534381628036499 (3.3785955691337586)
| > loss: 0.04254460707306862 (0.04076969639584422)
| > current_lr: 4e-06
| > step_time: 0.1362 (0.1412919845581054)
| > loader_time: 0.0044 (0.007305326461791989)

--> TIME: 2024-04-23 12:20:49 -- STEP: 550/1695 -- GLOBAL_STEP: 550
| > loss_text_ce: 0.043622393161058426 (0.04607608911666003)
| > loss_mel_ce: 3.36867618560791 (3.351200197826734)
| > loss: 0.04062260314822197 (0.04044376604597676)
| > current_lr: 4e-06
| > step_time: 0.1491 (0.1432263898849487)
| > loader_time: 0.0044 (0.007147459983825681)

--> TIME: 2024-04-23 12:21:01 -- STEP: 600/1695 -- GLOBAL_STEP: 600
| > loss_text_ce: 0.04180557280778885 (0.04603437863911191)
| > loss_mel_ce: 3.1069161891937256 (3.328243460655214)
| > loss: 0.0374847836792469 (0.04016997500322759)
| > current_lr: 4e-06
| > step_time: 0.1583 (0.14447109142939243)
| > loader_time: 0.0047 (0.006965583960215248)

--> TIME: 2024-04-23 12:21:14 -- STEP: 650/1695 -- GLOBAL_STEP: 650
| > loss_text_ce: 0.04896671324968338 (0.04602042846381666)
| > loss_mel_ce: 3.0476784706115723 (3.3038424359835123)
| > loss: 0.03686482459306717 (0.03987932055042337)
| > current_lr: 4e-06
| > step_time: 0.1219 (0.14626641933734613)
| > loader_time: 0.0047 (0.006803958232586197)

--> TIME: 2024-04-23 12:21:27 -- STEP: 700/1695 -- GLOBAL_STEP: 700
| > loss_text_ce: 0.04512707144021988 (0.046030817106366195)
| > loss_mel_ce: 3.066598892211914 (3.2816116438593195)
| > loss: 0.037044357508420944 (0.03961479195526668)
| > current_lr: 4e-06
| > step_time: 0.1502 (0.14775025640215206)
| > loader_time: 0.0044 (0.006717268739427837)

--> TIME: 2024-04-23 12:21:40 -- STEP: 750/1695 -- GLOBAL_STEP: 750
| > loss_text_ce: 0.04244884476065636 (0.04599520656466488)
| > loss_mel_ce: 2.8379921913146973 (3.264411670366924)
| > loss: 0.034290965646505356 (0.03940960643688838)
| > current_lr: 4e-06
| > step_time: 0.218 (0.14881795597076428)
| > loader_time: 0.0049 (0.006605740865071612)

--> TIME: 2024-04-23 12:21:53 -- STEP: 800/1695 -- GLOBAL_STEP: 800
| > loss_text_ce: 0.04257930815219879 (0.04597263523377482)
| > loss_mel_ce: 2.8074073791503906 (3.2470336309075365)
| > loss: 0.033928416669368744 (0.03920245631132278)
| > current_lr: 4e-06
| > step_time: 0.151 (0.14975822657346743)
| > loader_time: 0.0045 (0.006505406498908994)

> CHECKPOINT : /workspace/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-23-2024_12+18PM-0000000/checkpoint_800.pth

--> TIME: 2024-04-23 12:22:08 -- STEP: 850/1695 -- GLOBAL_STEP: 850
| > loss_text_ce: 0.046279508620500565 (0.04595626743400801)
| > loss_mel_ce: 2.9114205837249756 (3.232562539998224)
| > loss: 0.03521071374416351 (0.0390299865691101)
| > current_lr: 4e-06
| > step_time: 0.1686 (0.14989260813769187)
| > loader_time: 0.0041 (0.006382525107439824)

--> TIME: 2024-04-23 12:22:21 -- STEP: 900/1695 -- GLOBAL_STEP: 900
| > loss_text_ce: 0.04815426096320152 (0.045925861448049575)
| > loss_mel_ce: 2.881121873855591 (3.21540697336197)
| > loss: 0.03487233817577362 (0.03882539166758459)
| > current_lr: 4e-06
| > step_time: 0.1067 (0.15040262672636256)
| > loader_time: 0.0163 (0.006299734380510116)

--> TIME: 2024-04-23 12:22:33 -- STEP: 950/1695 -- GLOBAL_STEP: 950
| > loss_text_ce: 0.046194590628147125 (0.045895876723684795)
| > loss_mel_ce: 2.452665328979492 (3.2002050801327364)
| > loss: 0.029748331755399704 (0.03864405976706431)
| > current_lr: 4e-06
| > step_time: 0.1483 (0.1510076773794075)
| > loader_time: 0.0041 (0.006192966511375024)

--> TIME: 2024-04-23 12:22:46 -- STEP: 1000/1695 -- GLOBAL_STEP: 1000
| > loss_text_ce: 0.04607674479484558 (0.04585176565870645)
| > loss_mel_ce: 2.9387059211730957 (3.187430265903474)
| > loss: 0.035533126443624496 (0.03849145351536573)
| > current_lr: 4e-06
| > step_time: 0.1648 (0.15175995016098034)
| > loader_time: 0.0044 (0.006122385978698729)

--> TIME: 2024-04-23 12:22:59 -- STEP: 1050/1695 -- GLOBAL_STEP: 1050
| > loss_text_ce: 0.0466134138405323 (0.045852795899623947)
| > loss_mel_ce: 2.9738194942474365 (3.172598667598907)
| > loss: 0.03595753759145737 (0.03831489915826492)
| > current_lr: 4e-06
| > step_time: 0.1396 (0.15251214708600735)
| > loader_time: 0.0049 (0.00607424667903355)

--> TIME: 2024-04-23 12:23:12 -- STEP: 1100/1695 -- GLOBAL_STEP: 1100
| > loss_text_ce: 0.04659873992204666 (0.04585135899822824)
| > loss_mel_ce: 2.4221293926239014 (3.1576039728251373)
| > loss: 0.029389619827270508 (0.038136373775249206)
| > current_lr: 4e-06
| > step_time: 0.186 (0.15299982179294946)
| > loader_time: 0.0048 (0.006017334894700482)

--> TIME: 2024-04-23 12:23:25 -- STEP: 1150/1695 -- GLOBAL_STEP: 1150
| > loss_text_ce: 0.043769825249910355 (0.045824583464342636)
| > loss_mel_ce: 2.859921455383301 (3.1463320172351352)
| > loss: 0.034567754715681076 (0.03800186507079915)
| > current_lr: 4e-06
| > step_time: 0.1919 (0.15354946157206664)
| > loader_time: 0.0045 (0.005964664376300311)

--> TIME: 2024-04-23 12:23:38 -- STEP: 1200/1695 -- GLOBAL_STEP: 1200
| > loss_text_ce: 0.04848972707986832 (0.0457837945688516)
| > loss_mel_ce: 2.9194998741149902 (3.13719070851803)
| > loss: 0.035333212465047836 (0.037892554394590376)
| > current_lr: 4e-06
| > step_time: 0.2642 (0.15420349061489103)
| > loader_time: 0.0046 (0.005914180874824522)

> CHECKPOINT : /workspace/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-23-2024_12+18PM-0000000/checkpoint_1200.pth

--> TIME: 2024-04-23 12:23:54 -- STEP: 1250/1695 -- GLOBAL_STEP: 1250
| > loss_text_ce: 0.044037092477083206 (0.04573154278397562)
| > loss_mel_ce: 2.6508209705352783 (3.125970713424685)
| > loss: 0.03208164498209953 (0.03775836098492144)
| > current_lr: 4e-06
| > step_time: 0.1472 (0.15428158359527583)
| > loader_time: 0.0044 (0.0058847253799438485)

--> TIME: 2024-04-23 12:24:07 -- STEP: 1300/1695 -- GLOBAL_STEP: 1300
| > loss_text_ce: 0.04510482773184776 (0.04571826306959757)
| > loss_mel_ce: 3.4077906608581543 (3.1158635647480315)
| > loss: 0.04110589995980263 (0.03763787967797653)
| > current_lr: 4e-06
| > step_time: 0.1634 (0.1545918438984798)
| > loader_time: 0.0047 (0.00584103455910316)

--> TIME: 2024-04-23 12:24:19 -- STEP: 1350/1695 -- GLOBAL_STEP: 1350
| > loss_text_ce: 0.0476665161550045 (0.0456638012136574)
| > loss_mel_ce: 2.8584489822387695 (3.1040570794211506)
| > loss: 0.03459661453962326 (0.03749667791994631)
| > current_lr: 4e-06
| > step_time: 0.1331 (0.15490423820636887)
| > loader_time: 0.0044 (0.0058183479309082044)

--> TIME: 2024-04-23 12:24:32 -- STEP: 1400/1695 -- GLOBAL_STEP: 1400
| > loss_text_ce: 0.04452496021986008 (0.04561551003051656)
| > loss_mel_ce: 3.234622001647949 (3.0916232017108385)
| > loss: 0.03903746232390404 (0.03734808066327656)
| > current_lr: 4e-06
| > step_time: 0.1215 (0.15542454413005272)
| > loader_time: 0.0045 (0.005781678301947453)

--> TIME: 2024-04-23 12:24:45 -- STEP: 1450/1695 -- GLOBAL_STEP: 1450
| > loss_text_ce: 0.042180027812719345 (0.04556434659608479)
| > loss_mel_ce: 2.699432134628296 (3.080473143150068)
| > loss: 0.03263824060559273 (0.037214732784135576)
| > current_lr: 4e-06
| > step_time: 0.167 (0.15565427286871544)
| > loader_time: 0.0044 (0.005737697502662392)

--> TIME: 2024-04-23 12:24:57 -- STEP: 1500/1695 -- GLOBAL_STEP: 1500
| > loss_text_ce: 0.04820888489484787 (0.04552951066195965)
| > loss_mel_ce: 2.6011390686035156 (3.0704109377861033)
| > loss: 0.031539857387542725 (0.037094529901941585)
| > current_lr: 4e-06
| > step_time: 0.1594 (0.15576313861211125)
| > loader_time: 0.0044 (0.005703491051991777)

--> TIME: 2024-04-23 12:25:11 -- STEP: 1550/1695 -- GLOBAL_STEP: 1550
| > loss_text_ce: 0.045843496918678284 (0.04549794487655163)
| > loss_mel_ce: 2.6503143310546875 (3.059678205059422)
| > loss: 0.032097119837999344 (0.036966383488428164)
| > current_lr: 4e-06
| > step_time: 0.1592 (0.15621070800289008)
| > loader_time: 0.0046 (0.005665828643306605)

--> TIME: 2024-04-23 12:25:24 -- STEP: 1600/1695 -- GLOBAL_STEP: 1600
| > loss_text_ce: 0.04320811480283737 (0.045465721643995496)
| > loss_mel_ce: 2.5281929969787598 (3.049819415509702)
| > loss: 0.0306119192391634 (0.03684863331844095)
| > current_lr: 4e-06
| > step_time: 0.1849 (0.15655839025974236)
| > loader_time: 0.0047 (0.005632958114147183)

> CHECKPOINT : /workspace/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-23-2024_12+18PM-0000000/checkpoint_1600.pth

--> TIME: 2024-04-23 12:25:39 -- STEP: 1650/1695 -- GLOBAL_STEP: 1650
| > loss_text_ce: 0.04909869655966759 (0.04544659794957349)
| > loss_mel_ce: 2.6178195476531982 (3.039313706195717)
| > loss: 0.03174902871251106 (0.036723337676940526)
| > current_lr: 4e-06
| > step_time: 0.1669 (0.1564438345938015)
| > loader_time: 0.0042 (0.005605537674643773)

! Run is kept in /workspace/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-23-2024_12+18PM-0000000
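For a quick look at training progress without opening TensorBoard, the running averages printed every 50 steps can be scraped straight out of trainer_0_log.txt. A minimal sketch (not part of the upload; it only assumes the log format shown above):

# Parse trainer_0_log.txt and print the running-average mel CE loss per logged step.
import re

step_re = re.compile(r"GLOBAL_STEP: (\d+)")
mel_re = re.compile(r"loss_mel_ce: [\d.e-]+ \(([\d.e-]+)\)")

steps, avgs = [], []
with open("trainer_0_log.txt", encoding="utf-8") as f:
    current_step = None
    for line in f:
        if (m := step_re.search(line)):
            current_step = int(m.group(1))
        elif (m := mel_re.search(line)) and current_step is not None:
            steps.append(current_step)
            avgs.append(float(m.group(1)))
            current_step = None  # one loss_mel_ce line per step block

for s, a in zip(steps, avgs):
    print(f"step {s:5d}  avg loss_mel_ce {a:.3f}")

Over the logged portion of the first epoch, the running average of loss_mel_ce falls from about 3.74 at step 0 to about 3.04 by step 1650.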
traing_xtts_mr.py
ADDED
@@ -0,0 +1,177 @@
import os

from trainer import Trainer, TrainerArgs

from TTS.config.shared_configs import BaseDatasetConfig
from TTS.tts.datasets import load_tts_samples
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
from TTS.utils.manage import ModelManager

# Logging parameters
RUN_NAME = "GPT_XTTS_v2.0_LJSpeech_FT"
PROJECT_NAME = "XTTS_trainer"
DASHBOARD_LOGGER = "tensorboard"
LOGGER_URI = None

# Set here the path where the checkpoints will be saved. Default: ./run/training/
OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run", "training")

# Training parameters
OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-gpu training set it to False
START_WITH_EVAL = False  # if True it will start with evaluation
BATCH_SIZE = 3  # set here the batch size
GRAD_ACUMM_STEPS = 84  # set here the grad accumulation steps
# Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE, but then set GRAD_ACUMM_STEPS accordingly.

# Define here the dataset that you want to use for the fine-tuning.
config_dataset = BaseDatasetConfig(
    formatter="ljspeech",
    dataset_name="ljspeech",
    path="./audios/",
    meta_file_train="metadata_expanded.csv",
    language="hi",
)

# Add here the configs of the datasets
DATASETS_CONFIG_LIST = [config_dataset]

# Define the path where the XTTS v2.0.1 files will be downloaded
CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)


# DVAE files
DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"

# Set the path to the downloaded files
DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))

# download DVAE files if needed
if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
    print(" > Downloading DVAE files!")
    ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)


# Download XTTS v2.0 checkpoint if needed
TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"

# XTTS transfer learning parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune.
TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file

# download XTTS v2.0 files if needed
if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
    print(" > Downloading XTTS v2.0 files!")
    ModelManager._download_model_files(
        [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
    )


# Training test sentence generation
SPEAKER_REFERENCE = [
    "./audios/wavs/01_0000_01_1_est1.wav"  # speaker reference to be used in training test sentences
]
LANGUAGE = config_dataset.language


def main():
    # init args and config
    model_args = GPTArgs(
        max_conditioning_length=132300,  # 6 secs
        min_conditioning_length=66150,  # 3 secs
        debug_loading_failures=False,
        max_wav_length=255995,  # ~11.6 seconds
        max_text_length=200,
        mel_norm_file=MEL_NORM_FILE,
        dvae_checkpoint=DVAE_CHECKPOINT,
        xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
        tokenizer_file=TOKENIZER_FILE,
        gpt_num_audio_tokens=1026,
        gpt_start_audio_token=1024,
        gpt_stop_audio_token=1025,
        gpt_use_masking_gt_prompt_approach=True,
        gpt_use_perceiver_resampler=True,
    )
    # define audio config
    audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
    # training parameters config
    config = GPTTrainerConfig(
        output_path=OUT_PATH,
        model_args=model_args,
        run_name=RUN_NAME,
        project_name=PROJECT_NAME,
        run_description="""
            GPT XTTS training
            """,
        dashboard_logger=DASHBOARD_LOGGER,
        logger_uri=LOGGER_URI,
        audio=audio_config,
        batch_size=BATCH_SIZE,
        batch_group_size=48,
        eval_batch_size=BATCH_SIZE,
        num_loader_workers=8,
        eval_split_max_size=256,
        print_step=50,
        plot_step=100,
        log_model_step=1000,
        save_step=400,
        save_n_checkpoints=1,
        save_checkpoints=True,
        # run_eval=False,
        # target_loss="loss",
        print_eval=False,
        # Optimizer values like tortoise; pytorch implementation with modifications to not apply WD to non-weight parameters.
        optimizer="AdamW",
        optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
        optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
        lr=4e-06,  # learning rate
        lr_scheduler="MultiStepLR",
        # it was adjusted accordingly for the new step scheme
        lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
        test_sentences=[
            {
                "text": "या मोहिमेचे अधिकृत धोरण पोलंडला रशियाच्या धोक्यापासून वाचवणे हे होते.",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
            {
                "text": "अशी नेपोलियनला आशा होती परंतु रशियन सैन्याने",
                "speaker_wav": SPEAKER_REFERENCE,
                "language": LANGUAGE,
            },
        ],
    )

    # init the model from config
    model = GPTTrainer.init_from_config(config)

    # load training samples
    train_samples, eval_samples = load_tts_samples(
        DATASETS_CONFIG_LIST,
        eval_split=False,
        # eval_split_max_size=config.eval_split_max_size,
        # eval_split_size=config.eval_split_size,
    )

    # init the trainer and 🚀
    trainer = Trainer(
        TrainerArgs(
            restore_path=None,  # the XTTS checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it using Trainer's restore_path parameter
            skip_train_epoch=False,
            start_with_eval=START_WITH_EVAL,
            grad_accum_steps=GRAD_ACUMM_STEPS,
        ),
        config,
        output_path=OUT_PATH,
        model=model,
        train_samples=train_samples,
        # eval_samples=eval_samples,
    )
    trainer.fit()


if __name__ == "__main__":
    main()
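With BATCH_SIZE = 3 and GRAD_ACUMM_STEPS = 84, the effective batch size is 3 × 84 = 252, which meets the minimum of 252 recommended in the script's own comment. The dataset block points the stock "ljspeech" formatter at ./audios/ with metadata_expanded.csv; below is a small sanity-check sketch for that layout, assuming the usual LJSpeech convention of pipe-separated file_id|text|normalized text rows with audio under wavs/ (this helper is not part of the upload).

# Dataset layout sanity check (assumption: standard LJSpeech-style metadata; not part of the upload).
import os

DATASET_PATH = "./audios/"
META_FILE = os.path.join(DATASET_PATH, "metadata_expanded.csv")

with open(META_FILE, encoding="utf-8") as f:
    for line_no, line in enumerate(f, start=1):
        cols = line.rstrip("\n").split("|")
        if len(cols) < 3:
            print(f"line {line_no}: expected 3 pipe-separated columns, got {len(cols)}")
            continue
        # the ljspeech formatter is assumed to resolve audio as <path>/wavs/<file_id>.wav
        wav_path = os.path.join(DATASET_PATH, "wavs", cols[0] + ".wav")
        if not os.path.isfile(wav_path):
            print(f"line {line_no}: missing audio file {wav_path}")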
vocab.json
ADDED
The diff for this file is too large to render.