Patil committed on
Commit 7726997 · verified · 1 Parent(s): d42f030

Upload 6 files

config.json ADDED
@@ -0,0 +1,205 @@
+ {
+     "output_path": "/workspace/run/training",
+     "logger_uri": null,
+     "run_name": "GPT_XTTS_v2.0_LJSpeech_FT",
+     "project_name": "XTTS_trainer",
+     "run_description": "\n GPT XTTS training\n ",
+     "print_step": 50,
+     "plot_step": 100,
+     "model_param_stats": false,
+     "wandb_entity": null,
+     "dashboard_logger": "tensorboard",
+     "save_on_interrupt": true,
+     "log_model_step": 1000,
+     "save_step": 400,
+     "save_n_checkpoints": 1,
+     "save_checkpoints": true,
+     "save_all_best": false,
+     "save_best_after": 0,
+     "target_loss": null,
+     "print_eval": false,
+     "test_delay_epochs": 0,
+     "run_eval": true,
+     "run_eval_steps": null,
+     "distributed_backend": "nccl",
+     "distributed_url": "tcp://localhost:54321",
+     "mixed_precision": false,
+     "precision": "fp16",
+     "epochs": 1000,
+     "batch_size": 3,
+     "eval_batch_size": 3,
+     "grad_clip": 0.0,
+     "scheduler_after_epoch": true,
+     "lr": 4e-06,
+     "optimizer": "AdamW",
+     "optimizer_params": {
+         "betas": [
+             0.9,
+             0.96
+         ],
+         "eps": 1e-08,
+         "weight_decay": 0.01
+     },
+     "lr_scheduler": "MultiStepLR",
+     "lr_scheduler_params": {
+         "milestones": [
+             900000,
+             2700000,
+             5400000
+         ],
+         "gamma": 0.5,
+         "last_epoch": -1
+     },
+     "use_grad_scaler": false,
+     "allow_tf32": false,
+     "cudnn_enable": true,
+     "cudnn_deterministic": false,
+     "cudnn_benchmark": false,
+     "training_seed": 1,
+     "model": "xtts",
+     "num_loader_workers": 8,
+     "num_eval_loader_workers": 0,
+     "use_noise_augment": false,
+     "audio": {
+         "sample_rate": 22050,
+         "output_sample_rate": 24000,
+         "dvae_sample_rate": 22050
+     },
+     "use_phonemes": false,
+     "phonemizer": null,
+     "phoneme_language": null,
+     "compute_input_seq_cache": false,
+     "text_cleaner": null,
+     "enable_eos_bos_chars": false,
+     "test_sentences_file": "",
+     "phoneme_cache_path": null,
+     "characters": null,
+     "add_blank": false,
+     "batch_group_size": 48,
+     "loss_masking": null,
+     "min_audio_len": 1,
+     "max_audio_len": Infinity,
+     "min_text_len": 1,
+     "max_text_len": Infinity,
+     "compute_f0": false,
+     "compute_energy": false,
+     "compute_linear_spec": false,
+     "precompute_num_workers": 0,
+     "start_by_longest": false,
+     "shuffle": false,
+     "drop_last": false,
+     "datasets": [
+         {
+             "formatter": "",
+             "dataset_name": "",
+             "path": "",
+             "meta_file_train": "",
+             "ignored_speakers": null,
+             "language": "",
+             "phonemizer": "",
+             "meta_file_val": "",
+             "meta_file_attn_mask": ""
+         }
+     ],
+     "test_sentences": [
+         {
+             "text": "\u092f\u093e \u092e\u094b\u0939\u093f\u092e\u0947\u091a\u0947 \u0905\u0927\u093f\u0915\u0943\u0924 \u0927\u094b\u0930\u0923 \u092a\u094b\u0932\u0902\u0921\u0932\u093e \u0930\u0936\u093f\u092f\u093e\u091a\u094d\u092f\u093e \u0927\u094b\u0915\u094d\u092f\u093e\u092a\u093e\u0938\u0942\u0928 \u0935\u093e\u091a\u0935\u0923\u0947 \u0939\u0947 \u0939\u094b\u0924\u0947.",
+             "speaker_wav": [
+                 "./audios/wavs/01_0000_01_1_est1.wav"
+             ],
+             "language": "hi"
+         },
+         {
+             "text": "\u0905\u0936\u0940 \u0928\u0947\u092a\u094b\u0932\u093f\u092f\u0928\u0932\u093e \u0906\u0936\u093e \u0939\u094b\u0924\u0940 \u092a\u0930\u0902\u0924\u0941 \u0930\u0936\u093f\u092f\u0928 \u0938\u0948\u0928\u094d\u092f\u093e\u0928\u0947",
+             "speaker_wav": [
+                 "./audios/wavs/01_0000_01_1_est1.wav"
+             ],
+             "language": "hi"
+         }
+     ],
+     "eval_split_max_size": 256,
+     "eval_split_size": 0.01,
+     "use_speaker_weighted_sampler": false,
+     "speaker_weighted_sampler_alpha": 1.0,
+     "use_language_weighted_sampler": false,
+     "language_weighted_sampler_alpha": 1.0,
+     "use_length_weighted_sampler": false,
+     "length_weighted_sampler_alpha": 1.0,
+     "model_args": {
+         "gpt_batch_size": 1,
+         "enable_redaction": false,
+         "kv_cache": true,
+         "gpt_checkpoint": "",
+         "clvp_checkpoint": null,
+         "decoder_checkpoint": null,
+         "num_chars": 255,
+         "tokenizer_file": "/workspace/run/training/XTTS_v2.0_original_model_files/vocab.json",
+         "gpt_max_audio_tokens": 605,
+         "gpt_max_text_tokens": 402,
+         "gpt_max_prompt_tokens": 70,
+         "gpt_layers": 30,
+         "gpt_n_model_channels": 1024,
+         "gpt_n_heads": 16,
+         "gpt_number_text_tokens": 6681,
+         "gpt_start_text_token": 261,
+         "gpt_stop_text_token": 0,
+         "gpt_num_audio_tokens": 1026,
+         "gpt_start_audio_token": 1024,
+         "gpt_stop_audio_token": 1025,
+         "gpt_code_stride_len": 1024,
+         "gpt_use_masking_gt_prompt_approach": true,
+         "gpt_use_perceiver_resampler": true,
+         "input_sample_rate": 22050,
+         "output_sample_rate": 24000,
+         "output_hop_length": 256,
+         "decoder_input_dim": 1024,
+         "d_vector_dim": 512,
+         "cond_d_vector_in_each_upsampling_layer": true,
+         "duration_const": 102400,
+         "min_conditioning_length": 66150,
+         "max_conditioning_length": 132300,
+         "gpt_loss_text_ce_weight": 0.01,
+         "gpt_loss_mel_ce_weight": 1.0,
+         "debug_loading_failures": false,
+         "max_wav_length": 255995,
+         "max_text_length": 200,
+         "mel_norm_file": "/workspace/run/training/XTTS_v2.0_original_model_files/mel_stats.pth",
+         "dvae_checkpoint": "/workspace/run/training/XTTS_v2.0_original_model_files/dvae.pth",
+         "xtts_checkpoint": "/workspace/run/training/XTTS_v2.0_original_model_files/model.pth",
+         "vocoder": ""
+     },
+     "model_dir": null,
+     "languages": [
+         "en",
+         "es",
+         "fr",
+         "de",
+         "it",
+         "pt",
+         "pl",
+         "tr",
+         "ru",
+         "nl",
+         "cs",
+         "ar",
+         "zh-cn",
+         "hu",
+         "ko",
+         "ja",
+         "hi"
+     ],
+     "temperature": 0.85,
+     "length_penalty": 1.0,
+     "repetition_penalty": 2.0,
+     "top_k": 50,
+     "top_p": 0.85,
+     "num_gpt_outputs": 1,
+     "gpt_cond_len": 12,
+     "gpt_cond_chunk_len": 4,
+     "max_ref_len": 10,
+     "sound_norm_refs": false,
+     "optimizer_wd_only_on_weights": true,
+     "weighted_loss_attrs": {},
+     "weighted_loss_multipliers": {},
+     "github_branch": "inside_docker"
+ }
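
For reference, the uploaded config.json can be reloaded and inspected with the same Coqui TTS classes that traing_xtts_mr.py imports. The sketch below is not part of this commit; it assumes a working Coqui TTS installation, and "config.json" is a placeholder for wherever the file is downloaded.

# Minimal sketch: load the training config back into a GPTTrainerConfig (Coqpit JSON I/O).
from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTTrainerConfig

config = GPTTrainerConfig()
config.load_json("config.json")  # placeholder path to the file uploaded in this commit

print(config.run_name)                             # GPT_XTTS_v2.0_LJSpeech_FT
print(config.lr, config.optimizer, config.epochs)  # 4e-06 AdamW 1000
print(config.languages)                            # language tags, including "hi"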
events.out.tfevents.1713874719.81c96a3a92b5.2617.0 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:883713381f9e5bfef98f894d881b258fcbc180034a7cdc6f439644eafb5f610c
+ size 20650
model.pth ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0cede63ef409ebe9762e3c3b7e0743ccbd1ec0736de90f7cb8052003db141599
+ size 5607927381
trainer_0_log.txt ADDED
@@ -0,0 +1,336 @@
+ > Training Environment:
+ | > Backend: Torch
+ | > Mixed precision: False
+ | > Precision: float32
+ | > Current device: 0
+ | > Num. of GPUs: 1
+ | > Num. of CPUs: 64
+ | > Num. of Torch Threads: 1
+ | > Torch seed: 1
+ | > Torch CUDNN: True
+ | > Torch CUDNN deterministic: False
+ | > Torch CUDNN benchmark: False
+ | > Torch TF32 MatMul: False
+ > Start Tensorboard: tensorboard --logdir=/workspace/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-23-2024_12+18PM-0000000
+
+ > Model has 518442047 parameters
+
+  > EPOCH: 0/1000
+ --> /workspace/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-23-2024_12+18PM-0000000
+
+  > TRAINING (2024-04-23 12:18:40)
+
+  --> TIME: 2024-04-23 12:18:42 -- STEP: 0/1695 -- GLOBAL_STEP: 0
+ | > loss_text_ce: 0.042592838406562805 (0.042592838406562805)
+ | > loss_mel_ce: 3.744250535964966 (3.744250535964966)
+ | > loss: 0.04508147016167641 (0.04508147016167641)
+ | > current_lr: 4e-06
+ | > step_time: 0.3181 (0.3181343078613281)
+ | > loader_time: 1.1535 (1.153491735458374)
+
+
+  --> TIME: 2024-04-23 12:18:50 -- STEP: 50/1695 -- GLOBAL_STEP: 50
+ | > loss_text_ce: 0.043245986104011536 (0.045777649357914924)
+ | > loss_mel_ce: 4.0826735496521 (3.678379626274109)
+ | > loss: 0.04911808669567108 (0.044335206523537646)
+ | > current_lr: 4e-06
+ | > step_time: 0.1173 (0.10748531341552735)
+ | > loader_time: 0.0038 (0.012436685562133789)
+
+
+  --> TIME: 2024-04-23 12:19:00 -- STEP: 100/1695 -- GLOBAL_STEP: 100
+ | > loss_text_ce: 0.04654935747385025 (0.04617325332015751)
+ | > loss_mel_ce: 3.7310783863067627 (3.6352836871147156)
+ | > loss: 0.044971760362386703 (0.04382686924189331)
+ | > current_lr: 4e-06
+ | > step_time: 0.1229 (0.1165578818321228)
+ | > loader_time: 0.0044 (0.010995228290557862)
+
+
+  --> TIME: 2024-04-23 12:19:10 -- STEP: 150/1695 -- GLOBAL_STEP: 150
+ | > loss_text_ce: 0.04864665865898132 (0.04633487790822981)
+ | > loss_mel_ce: 3.695878267288208 (3.5984654172261554)
+ | > loss: 0.04457768052816391 (0.04339048052827519)
+ | > current_lr: 4e-06
+ | > step_time: 0.0968 (0.12075453917185465)
+ | > loader_time: 0.0068 (0.009986537297566734)
+
+
+  --> TIME: 2024-04-23 12:19:21 -- STEP: 200/1695 -- GLOBAL_STEP: 200
+ | > loss_text_ce: 0.04507960379123688 (0.04615468136966227)
+ | > loss_mel_ce: 3.4362077713012695 (3.5497735607624055)
+ | > loss: 0.041443899273872375 (0.042808670215308674)
+ | > current_lr: 4e-06
+ | > step_time: 0.1431 (0.12541004419326782)
+ | > loader_time: 0.004 (0.009364948272705077)
+
+
+  --> TIME: 2024-04-23 12:19:33 -- STEP: 250/1695 -- GLOBAL_STEP: 250
+ | > loss_text_ce: 0.044978540390729904 (0.04600780452787875)
+ | > loss_mel_ce: 3.3835601806640625 (3.5098479528427124)
+ | > loss: 0.040815938264131546 (0.04233161683380605)
+ | > current_lr: 4e-06
+ | > step_time: 0.1506 (0.12965419387817378)
+ | > loader_time: 0.0043 (0.008812045097351074)
+
+
+  --> TIME: 2024-04-23 12:19:45 -- STEP: 300/1695 -- GLOBAL_STEP: 300
+ | > loss_text_ce: 0.04761254042387009 (0.046099709086120134)
+ | > loss_mel_ce: 3.859790325164795 (3.4856272101402284)
+ | > loss: 0.04651670157909393 (0.04204436879605055)
+ | > current_lr: 4e-06
+ | > step_time: 0.107 (0.13299476464589427)
+ | > loader_time: 0.0045 (0.008340648015340164)
+
+
+  --> TIME: 2024-04-23 12:19:57 -- STEP: 350/1695 -- GLOBAL_STEP: 350
+ | > loss_text_ce: 0.041058849543333054 (0.04608927173273904)
+ | > loss_mel_ce: 3.2493679523468018 (3.4584123958860125)
+ | > loss: 0.039171747863292694 (0.04172025864677771)
+ | > current_lr: 4e-06
+ | > step_time: 0.1586 (0.1357990046909877)
+ | > loader_time: 0.0092 (0.007999198096139085)
+
+
+  --> TIME: 2024-04-23 12:20:09 -- STEP: 400/1695 -- GLOBAL_STEP: 400
+ | > loss_text_ce: 0.0439525842666626 (0.04606584513559937)
+ | > loss_mel_ce: 3.5535271167755127 (3.4283770048618316)
+ | > loss: 0.04282714053988457 (0.04136241558939219)
+ | > current_lr: 4e-06
+ | > step_time: 0.1314 (0.13862687826156628)
+ | > loader_time: 0.0039 (0.007809545397758481)
+
+
+ > CHECKPOINT : /workspace/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-23-2024_12+18PM-0000000/checkpoint_400.pth
+
+  --> TIME: 2024-04-23 12:20:24 -- STEP: 450/1695 -- GLOBAL_STEP: 450
+ | > loss_text_ce: 0.05098263919353485 (0.04611284121870995)
+ | > loss_mel_ce: 2.9446003437042236 (3.401099606090122)
+ | > loss: 0.03566170483827591 (0.04103824413898918)
+ | > current_lr: 4e-06
+ | > step_time: 0.1581 (0.1393417421976725)
+ | > loader_time: 0.0041 (0.0075671084721883105)
+
+
+  --> TIME: 2024-04-23 12:20:36 -- STEP: 500/1695 -- GLOBAL_STEP: 500
+ | > loss_text_ce: 0.03936528041958809 (0.04605886636674404)
+ | > loss_mel_ce: 3.534381628036499 (3.3785955691337586)
+ | > loss: 0.04254460707306862 (0.04076969639584422)
+ | > current_lr: 4e-06
+ | > step_time: 0.1362 (0.1412919845581054)
+ | > loader_time: 0.0044 (0.007305326461791989)
+
+
+  --> TIME: 2024-04-23 12:20:49 -- STEP: 550/1695 -- GLOBAL_STEP: 550
+ | > loss_text_ce: 0.043622393161058426 (0.04607608911666003)
+ | > loss_mel_ce: 3.36867618560791 (3.351200197826734)
+ | > loss: 0.04062260314822197 (0.04044376604597676)
+ | > current_lr: 4e-06
+ | > step_time: 0.1491 (0.1432263898849487)
+ | > loader_time: 0.0044 (0.007147459983825681)
+
+
+  --> TIME: 2024-04-23 12:21:01 -- STEP: 600/1695 -- GLOBAL_STEP: 600
+ | > loss_text_ce: 0.04180557280778885 (0.04603437863911191)
+ | > loss_mel_ce: 3.1069161891937256 (3.328243460655214)
+ | > loss: 0.0374847836792469 (0.04016997500322759)
+ | > current_lr: 4e-06
+ | > step_time: 0.1583 (0.14447109142939243)
+ | > loader_time: 0.0047 (0.006965583960215248)
+
+
+  --> TIME: 2024-04-23 12:21:14 -- STEP: 650/1695 -- GLOBAL_STEP: 650
+ | > loss_text_ce: 0.04896671324968338 (0.04602042846381666)
+ | > loss_mel_ce: 3.0476784706115723 (3.3038424359835123)
+ | > loss: 0.03686482459306717 (0.03987932055042337)
+ | > current_lr: 4e-06
+ | > step_time: 0.1219 (0.14626641933734613)
+ | > loader_time: 0.0047 (0.006803958232586197)
+
+
+  --> TIME: 2024-04-23 12:21:27 -- STEP: 700/1695 -- GLOBAL_STEP: 700
+ | > loss_text_ce: 0.04512707144021988 (0.046030817106366195)
+ | > loss_mel_ce: 3.066598892211914 (3.2816116438593195)
+ | > loss: 0.037044357508420944 (0.03961479195526668)
+ | > current_lr: 4e-06
+ | > step_time: 0.1502 (0.14775025640215206)
+ | > loader_time: 0.0044 (0.006717268739427837)
+
+
+  --> TIME: 2024-04-23 12:21:40 -- STEP: 750/1695 -- GLOBAL_STEP: 750
+ | > loss_text_ce: 0.04244884476065636 (0.04599520656466488)
+ | > loss_mel_ce: 2.8379921913146973 (3.264411670366924)
+ | > loss: 0.034290965646505356 (0.03940960643688838)
+ | > current_lr: 4e-06
+ | > step_time: 0.218 (0.14881795597076428)
+ | > loader_time: 0.0049 (0.006605740865071612)
+
+
+  --> TIME: 2024-04-23 12:21:53 -- STEP: 800/1695 -- GLOBAL_STEP: 800
+ | > loss_text_ce: 0.04257930815219879 (0.04597263523377482)
+ | > loss_mel_ce: 2.8074073791503906 (3.2470336309075365)
+ | > loss: 0.033928416669368744 (0.03920245631132278)
+ | > current_lr: 4e-06
+ | > step_time: 0.151 (0.14975822657346743)
+ | > loader_time: 0.0045 (0.006505406498908994)
+
+
+ > CHECKPOINT : /workspace/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-23-2024_12+18PM-0000000/checkpoint_800.pth
+
+  --> TIME: 2024-04-23 12:22:08 -- STEP: 850/1695 -- GLOBAL_STEP: 850
+ | > loss_text_ce: 0.046279508620500565 (0.04595626743400801)
+ | > loss_mel_ce: 2.9114205837249756 (3.232562539998224)
+ | > loss: 0.03521071374416351 (0.0390299865691101)
+ | > current_lr: 4e-06
+ | > step_time: 0.1686 (0.14989260813769187)
+ | > loader_time: 0.0041 (0.006382525107439824)
+
+
+  --> TIME: 2024-04-23 12:22:21 -- STEP: 900/1695 -- GLOBAL_STEP: 900
+ | > loss_text_ce: 0.04815426096320152 (0.045925861448049575)
+ | > loss_mel_ce: 2.881121873855591 (3.21540697336197)
+ | > loss: 0.03487233817577362 (0.03882539166758459)
+ | > current_lr: 4e-06
+ | > step_time: 0.1067 (0.15040262672636256)
+ | > loader_time: 0.0163 (0.006299734380510116)
+
+
+  --> TIME: 2024-04-23 12:22:33 -- STEP: 950/1695 -- GLOBAL_STEP: 950
+ | > loss_text_ce: 0.046194590628147125 (0.045895876723684795)
+ | > loss_mel_ce: 2.452665328979492 (3.2002050801327364)
+ | > loss: 0.029748331755399704 (0.03864405976706431)
+ | > current_lr: 4e-06
+ | > step_time: 0.1483 (0.1510076773794075)
+ | > loader_time: 0.0041 (0.006192966511375024)
+
+
+  --> TIME: 2024-04-23 12:22:46 -- STEP: 1000/1695 -- GLOBAL_STEP: 1000
+ | > loss_text_ce: 0.04607674479484558 (0.04585176565870645)
+ | > loss_mel_ce: 2.9387059211730957 (3.187430265903474)
+ | > loss: 0.035533126443624496 (0.03849145351536573)
+ | > current_lr: 4e-06
+ | > step_time: 0.1648 (0.15175995016098034)
+ | > loader_time: 0.0044 (0.006122385978698729)
+
+
+  --> TIME: 2024-04-23 12:22:59 -- STEP: 1050/1695 -- GLOBAL_STEP: 1050
+ | > loss_text_ce: 0.0466134138405323 (0.045852795899623947)
+ | > loss_mel_ce: 2.9738194942474365 (3.172598667598907)
+ | > loss: 0.03595753759145737 (0.03831489915826492)
+ | > current_lr: 4e-06
+ | > step_time: 0.1396 (0.15251214708600735)
+ | > loader_time: 0.0049 (0.00607424667903355)
+
+
+  --> TIME: 2024-04-23 12:23:12 -- STEP: 1100/1695 -- GLOBAL_STEP: 1100
+ | > loss_text_ce: 0.04659873992204666 (0.04585135899822824)
+ | > loss_mel_ce: 2.4221293926239014 (3.1576039728251373)
+ | > loss: 0.029389619827270508 (0.038136373775249206)
+ | > current_lr: 4e-06
+ | > step_time: 0.186 (0.15299982179294946)
+ | > loader_time: 0.0048 (0.006017334894700482)
+
+
+  --> TIME: 2024-04-23 12:23:25 -- STEP: 1150/1695 -- GLOBAL_STEP: 1150
+ | > loss_text_ce: 0.043769825249910355 (0.045824583464342636)
+ | > loss_mel_ce: 2.859921455383301 (3.1463320172351352)
+ | > loss: 0.034567754715681076 (0.03800186507079915)
+ | > current_lr: 4e-06
+ | > step_time: 0.1919 (0.15354946157206664)
+ | > loader_time: 0.0045 (0.005964664376300311)
+
+
+  --> TIME: 2024-04-23 12:23:38 -- STEP: 1200/1695 -- GLOBAL_STEP: 1200
+ | > loss_text_ce: 0.04848972707986832 (0.0457837945688516)
+ | > loss_mel_ce: 2.9194998741149902 (3.13719070851803)
+ | > loss: 0.035333212465047836 (0.037892554394590376)
+ | > current_lr: 4e-06
+ | > step_time: 0.2642 (0.15420349061489103)
+ | > loader_time: 0.0046 (0.005914180874824522)
+
+
+ > CHECKPOINT : /workspace/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-23-2024_12+18PM-0000000/checkpoint_1200.pth
+
+  --> TIME: 2024-04-23 12:23:54 -- STEP: 1250/1695 -- GLOBAL_STEP: 1250
+ | > loss_text_ce: 0.044037092477083206 (0.04573154278397562)
+ | > loss_mel_ce: 2.6508209705352783 (3.125970713424685)
+ | > loss: 0.03208164498209953 (0.03775836098492144)
+ | > current_lr: 4e-06
+ | > step_time: 0.1472 (0.15428158359527583)
+ | > loader_time: 0.0044 (0.0058847253799438485)
+
+
+  --> TIME: 2024-04-23 12:24:07 -- STEP: 1300/1695 -- GLOBAL_STEP: 1300
+ | > loss_text_ce: 0.04510482773184776 (0.04571826306959757)
+ | > loss_mel_ce: 3.4077906608581543 (3.1158635647480315)
+ | > loss: 0.04110589995980263 (0.03763787967797653)
+ | > current_lr: 4e-06
+ | > step_time: 0.1634 (0.1545918438984798)
+ | > loader_time: 0.0047 (0.00584103455910316)
+
+
+  --> TIME: 2024-04-23 12:24:19 -- STEP: 1350/1695 -- GLOBAL_STEP: 1350
+ | > loss_text_ce: 0.0476665161550045 (0.0456638012136574)
+ | > loss_mel_ce: 2.8584489822387695 (3.1040570794211506)
+ | > loss: 0.03459661453962326 (0.03749667791994631)
+ | > current_lr: 4e-06
+ | > step_time: 0.1331 (0.15490423820636887)
+ | > loader_time: 0.0044 (0.0058183479309082044)
+
+
+  --> TIME: 2024-04-23 12:24:32 -- STEP: 1400/1695 -- GLOBAL_STEP: 1400
+ | > loss_text_ce: 0.04452496021986008 (0.04561551003051656)
+ | > loss_mel_ce: 3.234622001647949 (3.0916232017108385)
+ | > loss: 0.03903746232390404 (0.03734808066327656)
+ | > current_lr: 4e-06
+ | > step_time: 0.1215 (0.15542454413005272)
+ | > loader_time: 0.0045 (0.005781678301947453)
+
+
+  --> TIME: 2024-04-23 12:24:45 -- STEP: 1450/1695 -- GLOBAL_STEP: 1450
+ | > loss_text_ce: 0.042180027812719345 (0.04556434659608479)
+ | > loss_mel_ce: 2.699432134628296 (3.080473143150068)
+ | > loss: 0.03263824060559273 (0.037214732784135576)
+ | > current_lr: 4e-06
+ | > step_time: 0.167 (0.15565427286871544)
+ | > loader_time: 0.0044 (0.005737697502662392)
+
+
+  --> TIME: 2024-04-23 12:24:57 -- STEP: 1500/1695 -- GLOBAL_STEP: 1500
+ | > loss_text_ce: 0.04820888489484787 (0.04552951066195965)
+ | > loss_mel_ce: 2.6011390686035156 (3.0704109377861033)
+ | > loss: 0.031539857387542725 (0.037094529901941585)
+ | > current_lr: 4e-06
+ | > step_time: 0.1594 (0.15576313861211125)
+ | > loader_time: 0.0044 (0.005703491051991777)
+
+
+  --> TIME: 2024-04-23 12:25:11 -- STEP: 1550/1695 -- GLOBAL_STEP: 1550
+ | > loss_text_ce: 0.045843496918678284 (0.04549794487655163)
+ | > loss_mel_ce: 2.6503143310546875 (3.059678205059422)
+ | > loss: 0.032097119837999344 (0.036966383488428164)
+ | > current_lr: 4e-06
+ | > step_time: 0.1592 (0.15621070800289008)
+ | > loader_time: 0.0046 (0.005665828643306605)
+
+
+  --> TIME: 2024-04-23 12:25:24 -- STEP: 1600/1695 -- GLOBAL_STEP: 1600
+ | > loss_text_ce: 0.04320811480283737 (0.045465721643995496)
+ | > loss_mel_ce: 2.5281929969787598 (3.049819415509702)
+ | > loss: 0.0306119192391634 (0.03684863331844095)
+ | > current_lr: 4e-06
+ | > step_time: 0.1849 (0.15655839025974236)
+ | > loader_time: 0.0047 (0.005632958114147183)
+
+
+ > CHECKPOINT : /workspace/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-23-2024_12+18PM-0000000/checkpoint_1600.pth
+
+  --> TIME: 2024-04-23 12:25:39 -- STEP: 1650/1695 -- GLOBAL_STEP: 1650
+ | > loss_text_ce: 0.04909869655966759 (0.04544659794957349)
+ | > loss_mel_ce: 2.6178195476531982 (3.039313706195717)
+ | > loss: 0.03174902871251106 (0.036723337676940526)
+ | > current_lr: 4e-06
+ | > step_time: 0.1669 (0.1564438345938015)
+ | > loader_time: 0.0042 (0.005605537674643773)
+
+ ! Run is kept in /workspace/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-23-2024_12+18PM-0000000
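
The log shows the run stopping during epoch 0 around step 1650, with checkpoint_1600.pth as the last checkpoint saved in the kept run directory. If one wanted to continue from that point, the Trainer's generic restore_path argument (the same TrainerArgs field used in traing_xtts_mr.py) is the usual hook; the sketch below is an assumption about how that would look, not something included in this commit.

# Hypothetical resume sketch: rebuild TrainerArgs as in main() of traing_xtts_mr.py,
# but point restore_path at the checkpoint reported in the log above.
from trainer import TrainerArgs

RESUME_CHECKPOINT = "/workspace/run/training/GPT_XTTS_v2.0_LJSpeech_FT-April-23-2024_12+18PM-0000000/checkpoint_1600.pth"

trainer_args = TrainerArgs(
    restore_path=RESUME_CHECKPOINT,  # assumption: the Trainer restores model/optimizer state from this file
    skip_train_epoch=False,
    start_with_eval=False,
    grad_accum_steps=84,
)
# trainer_args would then replace the TrainerArgs(...) built inside main().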
traing_xtts_mr.py ADDED
@@ -0,0 +1,177 @@
+ import os
+
+ from trainer import Trainer, TrainerArgs
+
+ from TTS.config.shared_configs import BaseDatasetConfig
+ from TTS.tts.datasets import load_tts_samples
+ from TTS.tts.layers.xtts.trainer.gpt_trainer import GPTArgs, GPTTrainer, GPTTrainerConfig, XttsAudioConfig
+ from TTS.utils.manage import ModelManager
+
+ # Logging parameters
+ RUN_NAME = "GPT_XTTS_v2.0_LJSpeech_FT"
+ PROJECT_NAME = "XTTS_trainer"
+ DASHBOARD_LOGGER = "tensorboard"
+ LOGGER_URI = None
+
+ # Set here the path where the checkpoints will be saved. Default: ./run/training/
+ OUT_PATH = os.path.join(os.path.dirname(os.path.abspath(__file__)), "run", "training")
+
+ # Training parameters
+ OPTIMIZER_WD_ONLY_ON_WEIGHTS = True  # for multi-GPU training set it to False
+ START_WITH_EVAL = False  # if True, training starts with an evaluation pass
+ BATCH_SIZE = 3  # set here the batch size
+ GRAD_ACUMM_STEPS = 84  # set here the gradient accumulation steps
+ # Note: we recommend that BATCH_SIZE * GRAD_ACUMM_STEPS be at least 252 for more efficient training. You can increase/decrease BATCH_SIZE, but then adjust GRAD_ACUMM_STEPS accordingly.
+
+ # Define here the dataset that you want to use for the fine-tuning.
+ config_dataset = BaseDatasetConfig(
+     formatter="ljspeech",
+     dataset_name="ljspeech",
+     path="./audios/",
+     meta_file_train="metadata_expanded.csv",
+     language="hi",
+ )
+
+ # Add here the configs of the datasets
+ DATASETS_CONFIG_LIST = [config_dataset]
+
+ # Define the path where the XTTS v2.0.1 files will be downloaded
+ CHECKPOINTS_OUT_PATH = os.path.join(OUT_PATH, "XTTS_v2.0_original_model_files/")
+ os.makedirs(CHECKPOINTS_OUT_PATH, exist_ok=True)
+
+
+ # DVAE files
+ DVAE_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/dvae.pth"
+ MEL_NORM_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/mel_stats.pth"
+
+ # Set the paths to the downloaded files
+ DVAE_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(DVAE_CHECKPOINT_LINK))
+ MEL_NORM_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(MEL_NORM_LINK))
+
+ # download the DVAE files if needed
+ if not os.path.isfile(DVAE_CHECKPOINT) or not os.path.isfile(MEL_NORM_FILE):
+     print(" > Downloading DVAE files!")
+     ModelManager._download_model_files([MEL_NORM_LINK, DVAE_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True)
+
+
+ # Download the XTTS v2.0 checkpoint if needed
+ TOKENIZER_FILE_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/vocab.json"
+ XTTS_CHECKPOINT_LINK = "https://coqui.gateway.scarf.sh/hf-coqui/XTTS-v2/main/model.pth"
+
+ # XTTS transfer-learning parameters: you need to provide the paths of the XTTS model checkpoint that you want to fine-tune.
+ TOKENIZER_FILE = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(TOKENIZER_FILE_LINK))  # vocab.json file
+ XTTS_CHECKPOINT = os.path.join(CHECKPOINTS_OUT_PATH, os.path.basename(XTTS_CHECKPOINT_LINK))  # model.pth file
+
+ # download the XTTS v2.0 files if needed
+ if not os.path.isfile(TOKENIZER_FILE) or not os.path.isfile(XTTS_CHECKPOINT):
+     print(" > Downloading XTTS v2.0 files!")
+     ModelManager._download_model_files(
+         [TOKENIZER_FILE_LINK, XTTS_CHECKPOINT_LINK], CHECKPOINTS_OUT_PATH, progress_bar=True
+     )
+
+
+ # Training test-sentence generation
+ SPEAKER_REFERENCE = [
+     "./audios/wavs/01_0000_01_1_est1.wav"  # speaker reference to be used in training test sentences
+ ]
+ LANGUAGE = config_dataset.language
+
+
+ def main():
+     # init args and config
+     model_args = GPTArgs(
+         max_conditioning_length=132300,  # 6 secs
+         min_conditioning_length=66150,  # 3 secs
+         debug_loading_failures=False,
+         max_wav_length=255995,  # ~11.6 seconds
+         max_text_length=200,
+         mel_norm_file=MEL_NORM_FILE,
+         dvae_checkpoint=DVAE_CHECKPOINT,
+         xtts_checkpoint=XTTS_CHECKPOINT,  # checkpoint path of the model that you want to fine-tune
+         tokenizer_file=TOKENIZER_FILE,
+         gpt_num_audio_tokens=1026,
+         gpt_start_audio_token=1024,
+         gpt_stop_audio_token=1025,
+         gpt_use_masking_gt_prompt_approach=True,
+         gpt_use_perceiver_resampler=True,
+     )
+     # define audio config
+     audio_config = XttsAudioConfig(sample_rate=22050, dvae_sample_rate=22050, output_sample_rate=24000)
+     # training parameters config
+     config = GPTTrainerConfig(
+         output_path=OUT_PATH,
+         model_args=model_args,
+         run_name=RUN_NAME,
+         project_name=PROJECT_NAME,
+         run_description="""
+         GPT XTTS training
+         """,
+         dashboard_logger=DASHBOARD_LOGGER,
+         logger_uri=LOGGER_URI,
+         audio=audio_config,
+         batch_size=BATCH_SIZE,
+         batch_group_size=48,
+         eval_batch_size=BATCH_SIZE,
+         num_loader_workers=8,
+         eval_split_max_size=256,
+         print_step=50,
+         plot_step=100,
+         log_model_step=1000,
+         save_step=400,
+         save_n_checkpoints=1,
+         save_checkpoints=True,
+         # run_eval=False,
+         # target_loss="loss",
+         print_eval=False,
+         # Optimizer values like Tortoise; PyTorch implementation with modifications so that WD is not applied to non-weight parameters.
+         optimizer="AdamW",
+         optimizer_wd_only_on_weights=OPTIMIZER_WD_ONLY_ON_WEIGHTS,
+         optimizer_params={"betas": [0.9, 0.96], "eps": 1e-8, "weight_decay": 1e-2},
+         lr=4e-06,  # learning rate
+         lr_scheduler="MultiStepLR",
+         # milestones adjusted accordingly for the new step scheme
+         lr_scheduler_params={"milestones": [50000 * 18, 150000 * 18, 300000 * 18], "gamma": 0.5, "last_epoch": -1},
+         test_sentences=[
+             {
+                 "text": "या मोहिमेचे अधिकृत धोरण पोलंडला रशियाच्या धोक्यापासून वाचवणे हे होते.",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+             {
+                 "text": "अशी नेपोलियनला आशा होती परंतु रशियन सैन्याने",
+                 "speaker_wav": SPEAKER_REFERENCE,
+                 "language": LANGUAGE,
+             },
+         ],
+     )
+
+     # init the model from config
+     model = GPTTrainer.init_from_config(config)
+
+     # load training samples
+     train_samples, eval_samples = load_tts_samples(
+         DATASETS_CONFIG_LIST,
+         eval_split=False,
+         # eval_split_max_size=config.eval_split_max_size,
+         # eval_split_size=config.eval_split_size,
+     )
+
+     # init the trainer and 🚀
+     trainer = Trainer(
+         TrainerArgs(
+             restore_path=None,  # the XTTS checkpoint is restored via the xtts_checkpoint key, so there is no need to restore it using the Trainer's restore_path parameter
+             skip_train_epoch=False,
+             start_with_eval=START_WITH_EVAL,
+             grad_accum_steps=GRAD_ACUMM_STEPS,
+         ),
+         config,
+         output_path=OUT_PATH,
+         model=model,
+         train_samples=train_samples,
+         # eval_samples=eval_samples,
+     )
+     trainer.fit()
+
+
+ if __name__ == "__main__":
+     main()
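
For completeness, a minimal inference sketch follows. It is not part of this commit: it assumes the fine-tuned checkpoint is used together with the uploaded config.json and vocab.json, and it follows the XTTS inference API described in the Coqui TTS documentation (XttsConfig, Xtts.load_checkpoint, get_conditioning_latents, inference). All paths are placeholders.

import torch
import torchaudio

from TTS.tts.configs.xtts_config import XttsConfig
from TTS.tts.models.xtts import Xtts

CONFIG_PATH = "config.json"      # placeholder: the config uploaded in this commit
CHECKPOINT_PATH = "model.pth"    # placeholder: the fine-tuned XTTS weights
VOCAB_PATH = "vocab.json"        # placeholder: the tokenizer uploaded in this commit
SPEAKER_WAV = "./audios/wavs/01_0000_01_1_est1.wav"  # same reference clip as in training

# Load config and weights, then move the model to GPU if one is available.
config = XttsConfig()
config.load_json(CONFIG_PATH)
model = Xtts.init_from_config(config)
model.load_checkpoint(config, checkpoint_path=CHECKPOINT_PATH, vocab_path=VOCAB_PATH, use_deepspeed=False)
if torch.cuda.is_available():
    model.cuda()

# Build speaker conditioning from the reference clip, then synthesize with the
# same "hi" language tag used during fine-tuning.
gpt_cond_latent, speaker_embedding = model.get_conditioning_latents(audio_path=[SPEAKER_WAV])
out = model.inference(
    "अशी नेपोलियनला आशा होती परंतु रशियन सैन्याने",
    "hi",
    gpt_cond_latent,
    speaker_embedding,
    temperature=0.7,
)
# The generated waveform is returned at 24 kHz.
torchaudio.save("xtts_finetuned_sample.wav", torch.tensor(out["wav"]).unsqueeze(0), 24000)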
vocab.json ADDED
The diff for this file is too large to render. See raw diff