danhtran2mind committed
Commit 3f9cba0 · verified · 1 Parent(s): c87ce87

Upload 244 files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +24 -0
  2. apps/gradio_app.py +149 -0
  3. apps/gradio_app/__init__.py +0 -0
  4. apps/gradio_app/asr_utils.py +16 -0
  5. apps/gradio_app/assets/examples/f5_tts/1/infer_audio.wav +3 -0
  6. apps/gradio_app/assets/examples/f5_tts/1/infer_text.txt +1 -0
  7. apps/gradio_app/assets/examples/f5_tts/1/refer_audio.mp3 +0 -0
  8. apps/gradio_app/assets/examples/f5_tts/1/refer_text.txt +1 -0
  9. apps/gradio_app/assets/examples/f5_tts/2/infer_audio.mp3 +0 -0
  10. apps/gradio_app/assets/examples/f5_tts/2/infer_text.txt +1 -0
  11. apps/gradio_app/assets/examples/f5_tts/2/refer_audio.mp3 +0 -0
  12. apps/gradio_app/assets/examples/f5_tts/2/refer_text.txt +1 -0
  13. apps/gradio_app/assets/examples/f5_tts/3/infer_audio.mp3 +0 -0
  14. apps/gradio_app/assets/examples/f5_tts/3/infer_text.txt +1 -0
  15. apps/gradio_app/assets/examples/f5_tts/3/refer_audio.mp3 +3 -0
  16. apps/gradio_app/assets/examples/f5_tts/3/refer_text.txt +1 -0
  17. apps/gradio_app/assets/examples/f5_tts/4/infer_audio.mp3 +0 -0
  18. apps/gradio_app/assets/examples/f5_tts/4/infer_text.txt +1 -0
  19. apps/gradio_app/assets/examples/f5_tts/4/refer_audio.mp3 +3 -0
  20. apps/gradio_app/assets/examples/f5_tts/4/refer_text.txt +1 -0
  21. apps/gradio_app/components.py +90 -0
  22. apps/gradio_app/setup_scripts.py +61 -0
  23. apps/gradio_app/static/scripts.js +0 -0
  24. apps/gradio_app/static/styles.css +100 -0
  25. apps/old-gradio_app.py +140 -0
  26. assets/.gitkeep +0 -0
  27. assets/examples/f5_tts/1/infer_audio.wav +3 -0
  28. assets/examples/f5_tts/1/infer_text.txt +1 -0
  29. assets/examples/f5_tts/1/refer_audio.mp3 +0 -0
  30. assets/examples/f5_tts/1/refer_text.txt +1 -0
  31. assets/examples/f5_tts/2/infer_audio.mp3 +0 -0
  32. assets/examples/f5_tts/2/infer_text.txt +1 -0
  33. assets/examples/f5_tts/2/refer_audio.mp3 +0 -0
  34. assets/examples/f5_tts/2/refer_text.txt +1 -0
  35. assets/examples/f5_tts/3/infer_audio.mp3 +0 -0
  36. assets/examples/f5_tts/3/infer_text.txt +1 -0
  37. assets/examples/f5_tts/3/refer_audio.mp3 +3 -0
  38. assets/examples/f5_tts/3/refer_text.txt +1 -0
  39. assets/examples/f5_tts/4/infer_audio.mp3 +0 -0
  40. assets/examples/f5_tts/4/infer_text.txt +1 -0
  41. assets/examples/f5_tts/4/refer_audio.mp3 +3 -0
  42. assets/examples/f5_tts/4/refer_text.txt +1 -0
  43. ckpts/.gitkeep +0 -0
  44. configs/.gitkeep +0 -0
  45. configs/vi-fine-tuned-f5-tts.yaml +52 -0
  46. data/.gitkeep +0 -0
  47. docs/.gitkeep +0 -0
  48. docs/inference/inference_doc.md +38 -0
  49. docs/training/training_doc.md +0 -0
  50. notebooks/1-vi-fine-tuned-t5-tts.ipynb +952 -0
.gitattributes CHANGED
@@ -33,3 +33,27 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ apps/gradio_app/assets/examples/f5_tts/1/infer_audio.wav filter=lfs diff=lfs merge=lfs -text
+ apps/gradio_app/assets/examples/f5_tts/3/refer_audio.mp3 filter=lfs diff=lfs merge=lfs -text
+ apps/gradio_app/assets/examples/f5_tts/4/refer_audio.mp3 filter=lfs diff=lfs merge=lfs -text
+ assets/examples/f5_tts/1/infer_audio.wav filter=lfs diff=lfs merge=lfs -text
+ assets/examples/f5_tts/3/refer_audio.mp3 filter=lfs diff=lfs merge=lfs -text
+ assets/examples/f5_tts/4/refer_audio.mp3 filter=lfs diff=lfs merge=lfs -text
+ src/f5_tts/src/f5_tts/infer/examples/basic/basic_ref_en.wav filter=lfs diff=lfs merge=lfs -text
+ src/f5_tts/src/f5_tts/infer/examples/basic/basic_ref_zh.wav filter=lfs diff=lfs merge=lfs -text
+ src/f5_tts/src/f5_tts/infer/examples/multi/country.flac filter=lfs diff=lfs merge=lfs -text
+ src/f5_tts/src/f5_tts/infer/examples/multi/main.flac filter=lfs diff=lfs merge=lfs -text
+ src/f5_tts/src/f5_tts/infer/examples/multi/town.flac filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/dance_24k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/hifitts_44k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/jensen_24k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/libritts_24k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/megalovania_24k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/musdbhq_44k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/musiccaps1_44k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/musiccaps2_44k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/queen_24k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/filelists/LibriTTS/train-full.txt filter=lfs diff=lfs merge=lfs -text
+ tests/test_data/1/infer_audio.wav filter=lfs diff=lfs merge=lfs -text
+ tests/test_data/3/refer_audio.mp3 filter=lfs diff=lfs merge=lfs -text
+ tests/test_data/4/refer_audio.mp3 filter=lfs diff=lfs merge=lfs -text
apps/gradio_app.py ADDED
@@ -0,0 +1,149 @@
+ import gradio as gr
+ from gradio_app.components import (
+     get_files_in_ckpts, handle_file_upload,
+     run_tts_inference,
+     run_setup_script
+ )
+ from gradio_app.asr_utils import transcribe_audio
+ from pathlib import Path
+
+ def create_gradio_app():
+     """Create Gradio interface for F5-TTS inference with Whisper ASR."""
+     # Run setup script to ensure dependencies are installed
+     run_setup_script()
+
+     # Function to update reference text based on audio file and Whisper checkbox
+     def update_ref_text(audio_file_path, use_whisper):
+         if use_whisper and audio_file_path:
+             return transcribe_audio(audio_file_path)
+         return gr.update()
+
+     def toggle_model_inputs(use_upload):
+         return (
+             gr.update(visible=not use_upload),
+             gr.update(visible=not use_upload),
+             gr.update(visible=not use_upload),
+             gr.update(visible=use_upload),
+             gr.update(visible=use_upload),
+             gr.update(visible=use_upload)
+         )
+
+     def load_example(ref_audio_path, ref_text, inf_text):
+         """Load example inputs and retrieve corresponding infer_audio for output."""
+         # Find the matching example folder to get infer_audio
+         example_dirs = [
+             Path("apps/gradio_app/assets/examples/f5_tts/1"),
+             Path("apps/gradio_app/assets/examples/f5_tts/2"),
+             Path("apps/gradio_app/assets/examples/f5_tts/3"),
+             Path("apps/gradio_app/assets/examples/f5_tts/4")
+         ]
+         inf_audio_path = None
+         for dir_path in example_dirs:
+             if dir_path.exists():
+                 ref_audio = next((f for f in dir_path.glob("refer_audio.*") if f.suffix in [".mp3", ".wav"]), None)
+                 if ref_audio and str(ref_audio) == ref_audio_path:
+                     inf_audio = next((f for f in dir_path.glob("infer_audio.*") if f.suffix in [".mp3", ".wav"]), None)
+                     inf_audio_path = str(inf_audio) if inf_audio else None
+                     break
+
+         return ref_audio_path, ref_text, inf_text, inf_audio_path
+
+     # Prepare examples for gr.Examples (exclude infer_audio from table)
+     example_dirs = [
+         Path("apps/gradio_app/assets/examples/f5_tts/1"),
+         Path("apps/gradio_app/assets/examples/f5_tts/2"),
+         Path("apps/gradio_app/assets/examples/f5_tts/3"),
+         Path("apps/gradio_app/assets/examples/f5_tts/4")
+     ]
+     examples = []
+     for dir_path in example_dirs:
+         if not dir_path.exists():
+             continue
+         # Read text files
+         ref_text = (dir_path / "refer_text.txt").read_text(encoding="utf-8") if (dir_path / "refer_text.txt").exists() else ""
+         inf_text = (dir_path / "infer_text.txt").read_text(encoding="utf-8") if (dir_path / "infer_text.txt").exists() else ""
+         # Find audio files (mp3 or wav)
+         ref_audio = next((f for f in dir_path.glob("refer_audio.*") if f.suffix in [".mp3", ".wav"]), None)
+         examples.append([
+             str(ref_audio) if ref_audio else None,
+             ref_text,
+             inf_text
+         ])
+
+     CSS = open("apps/gradio_app/static/styles.css", "r").read()
+     with gr.Blocks(css=CSS) as demo:
+         gr.Markdown("# F5-TTS Audio Generation")
+         gr.Markdown("Generate high-quality audio with a fine-tuned F5-TTS model. Upload reference audio, use Whisper ASR for transcription, enter text, adjust speed, and select or upload model files.")
+
+         with gr.Row():
+             with gr.Column():
+                 ref_audio = gr.Audio(label="Reference Audio", type="filepath")
+                 with gr.Group():
+                     use_whisper = gr.Checkbox(label="Use Whisper ASR for Transcription", value=False)
+                     ref_text = gr.Textbox(
+                         label="Reference Text",
+                         placeholder="e.g., Sau nhà Ngô, lần lượt các triều Đinh...",
+                         lines=1
+                     )
+                     gen_text = gr.Textbox(
+                         label="Generated Text",
+                         placeholder="e.g., Nhà Tiền Lê, Lý và Trần đã chống trả...",
+                         lines=1
+                     )
+                 generate_btn = gr.Button("Generate Audio")
+             with gr.Column():
+                 output_audio = gr.Audio(label="Generated Audio")
+                 output_text = gr.Textbox(label="Status", interactive=False)
+                 with gr.Group():
+                     speed = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Speed")
+                     model_cfg = gr.Dropdown(
+                         choices=get_files_in_ckpts([".yaml"]),
+                         label="Model Config (*.yaml)",
+                         value=get_files_in_ckpts([".yaml"])[0],
+                         visible=True
+                     )
+                     ckpt_file = gr.Dropdown(
+                         choices=get_files_in_ckpts([".pt", ".safetensors"], include_subdirs=True),
+                         label="Checkpoint File (*.pt or *.safetensors)",
+                         value=get_files_in_ckpts([".pt", ".safetensors"], include_subdirs=True)[0],
+                         visible=True
+                     )
+                     vocab_file = gr.Dropdown(
+                         choices=get_files_in_ckpts([".txt", ".safetensors"]),
+                         label="Vocab File (*.txt or *.safetensors)",
+                         value=get_files_in_ckpts([".txt", ".safetensors"])[0],
+                         visible=True
+                     )
+                     use_upload = gr.Checkbox(label="Upload Custom Model Files", value=False)
+                     model_cfg_upload = gr.File(label="Model Config (*.yaml)", file_types=[".yaml"], visible=False)
+                     ckpt_file_upload = gr.File(label="Checkpoint File (*.pt or *.safetensors)", file_types=[".pt", ".safetensors"], visible=False)
+                     vocab_file_upload = gr.File(label="Vocab File (*.txt or *.safetensors)", file_types=[".txt", ".safetensors"], visible=False)
+
+         # Add Examples component after both columns
+         gr.Examples(
+             examples=examples,
+             inputs=[ref_audio, ref_text, gen_text],
+             outputs=[ref_audio, ref_text, gen_text, output_audio],  # Keep output_audio to display infer_audio
+             fn=load_example,
+             label="Example Inputs",
+             examples_per_page=4,
+             cache_examples=False
+         )
+
+         ref_audio.change(fn=update_ref_text, inputs=[ref_audio, use_whisper], outputs=ref_text)
+         use_whisper.change(fn=update_ref_text, inputs=[ref_audio, use_whisper], outputs=ref_text)
+         use_upload.change(
+             fn=toggle_model_inputs,
+             inputs=[use_upload],
+             outputs=[model_cfg, ckpt_file, vocab_file, model_cfg_upload, ckpt_file_upload, vocab_file_upload]
+         )
+         generate_btn.click(
+             fn=run_tts_inference,
+             inputs=[ref_audio, ref_text, gen_text, speed, use_upload, model_cfg, ckpt_file, vocab_file],
+             outputs=[output_audio, output_text]
+         )
+     return demo
+
+ if __name__ == "__main__":
+     demo = create_gradio_app()
+     demo.launch(share=True)
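
A minimal launch sketch for this entry point, assuming the repository root as the working directory so the relative asset, config, and checkpoint paths above resolve; the subprocess wrapper is illustrative, not part of the commit:

```python
# Minimal launch sketch (assumption: run from the repository root).
import subprocess

# Equivalent to `python apps/gradio_app.py` from a shell; the script
# itself calls demo.launch(share=True) to request a public Gradio link.
subprocess.run(["python", "apps/gradio_app.py"], check=True)
```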
apps/gradio_app/__init__.py ADDED
File without changes
apps/gradio_app/asr_utils.py ADDED
@@ -0,0 +1,16 @@
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+ import librosa
+
+ def transcribe_audio(audio_file_path):
+     """Transcribe audio using PhoWhisper-tiny model."""
+     try:
+         processor = WhisperProcessor.from_pretrained("vinai/PhoWhisper-tiny")
+         model = WhisperForConditionalGeneration.from_pretrained("vinai/PhoWhisper-tiny")
+         audio, sr = librosa.load(audio_file_path, sr=16000)
+         input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
+         forced_decoder_ids = processor.get_decoder_prompt_ids(language="vi", task="transcribe")
+         predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
+         transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+         return transcription[0] if transcription else ""
+     except Exception as e:
+         return f"Transcription error: {str(e)}"
apps/gradio_app/assets/examples/f5_tts/1/infer_audio.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2924700ad369afabb4489eceec9c5e1e9c0fae90a3409f480678aba7a79a7378
+ size 127020
apps/gradio_app/assets/examples/f5_tts/1/infer_text.txt ADDED
@@ -0,0 +1 @@
+ chào mọi người, mọi người khỏe không?
apps/gradio_app/assets/examples/f5_tts/1/refer_audio.mp3 ADDED
Binary file (35.3 kB)
apps/gradio_app/assets/examples/f5_tts/1/refer_text.txt ADDED
@@ -0,0 +1 @@
+ bạn và tôi đều như nhau nhé, rồi chúng ta đi đâu nè
apps/gradio_app/assets/examples/f5_tts/2/infer_audio.mp3 ADDED
Binary file (12.7 kB)
apps/gradio_app/assets/examples/f5_tts/2/infer_text.txt ADDED
@@ -0,0 +1 @@
+ Tôi rất khỏe,cảm ơn mọi người đã quan tâm.
apps/gradio_app/assets/examples/f5_tts/2/refer_audio.mp3 ADDED
Binary file (61.9 kB)
apps/gradio_app/assets/examples/f5_tts/2/refer_text.txt ADDED
@@ -0,0 +1 @@
+ Chúng thường sống hòa bình với các loài động vật khác, kể cả những loài săn mồi.
apps/gradio_app/assets/examples/f5_tts/3/infer_audio.mp3 ADDED
Binary file (51.8 kB)
apps/gradio_app/assets/examples/f5_tts/3/infer_text.txt ADDED
@@ -0,0 +1 @@
+ Nhà Tiền Lê, Lý và Trần đã chống trả các cuộc tấn công của nhà Tống và nhà Mông – Nguyên, đều thắng lợi và bảo vệ được Đại Việt.
apps/gradio_app/assets/examples/f5_tts/3/refer_audio.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd15755a7704fd99247dfae618a4f8e9d9655af735def78e6fdec5467faca641
+ size 183110
apps/gradio_app/assets/examples/f5_tts/3/refer_text.txt ADDED
@@ -0,0 +1 @@
+ Sau nhà Ngô, lần lượt các triều Đinh, Tiền Lê, Lý và Trần tổ chức chính quyền tương tự các triều đại Trung Hoa, lấy Phật giáo làm tôn giáo chính của quốc gia và cho truyền bá cả Nho giáo và Đạo giáo.
apps/gradio_app/assets/examples/f5_tts/4/infer_audio.mp3 ADDED
Binary file (52.7 kB)
apps/gradio_app/assets/examples/f5_tts/4/infer_text.txt ADDED
@@ -0,0 +1 @@
+ Người dân Đông Á cổ đại đã uống trà trong nhiều thế kỷ, thậm chí có thể là hàng thiên niên kỷ , trước khi sử dụng nó như một thức uống.
apps/gradio_app/assets/examples/f5_tts/4/refer_audio.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ea81c8700f5ff2e6497c9beaa942b5ed107e03ae468472d78a4c8c80e3b63af
+ size 138388
apps/gradio_app/assets/examples/f5_tts/4/refer_text.txt ADDED
@@ -0,0 +1 @@
+ Cấu trúc sừng và mào là phổ biến ở tất cả các nhóm khủng long, và vài nhóm thậm chí còn phát triển các biến đổi bộ xương như giáp mô hoặc gai.
apps/gradio_app/components.py ADDED
@@ -0,0 +1,90 @@
+ import os
+ import subprocess
+ import uuid
+ from pathlib import Path
+ import shutil
+
+ def run_setup_script():
+     setup_script = os.path.join(os.path.dirname(__file__), "setup_scripts.py")
+     try:
+         result = subprocess.run(["python", setup_script], capture_output=True, text=True, check=True)
+         return result.stdout
+     except subprocess.CalledProcessError as e:
+         return f"Setup script failed: {e.stderr}"
+
+
+ def get_files_in_ckpts(extensions, include_subdirs=False):
+     """List files in ckpts directory with specified extensions, optionally including subdirectories."""
+     ckpts_dir = Path("ckpts")
+     if not ckpts_dir.exists():
+         return ["No files found"]
+     files = []
+     for ext in extensions:
+         if include_subdirs:
+             files.extend([str(f) for f in ckpts_dir.glob(f"**/*{ext}")])
+         else:
+             files.extend([str(f) for f in ckpts_dir.glob(f"*{ext}")])
+     return files if files else ["No files found"]
+
+ def handle_file_upload(file_obj, allowed_extensions):
+     """Copy uploaded file to a permanent location and validate extension."""
+     if not file_obj:
+         return None, "No file uploaded."
+     try:
+         file_ext = os.path.splitext(file_obj.name)[1].lower()
+         if file_ext not in allowed_extensions:
+             return None, f"Invalid file extension. Allowed: {', '.join(allowed_extensions)}"
+         upload_dir = Path("uploads")
+         upload_dir.mkdir(exist_ok=True)
+         file_name = f"upload_{str(uuid.uuid4())[:8]}{file_ext}"
+         dest_path = upload_dir / file_name
+         shutil.copyfile(file_obj.name, dest_path)
+         return str(dest_path), None
+     except Exception as e:
+         return None, f"File upload error: {str(e)}"
+
+ def run_tts_inference(ref_audio, ref_text, gen_text, speed, use_upload, model_cfg, ckpt_file, vocab_file):
+     """Run F5-TTS inference with selected or uploaded model files."""
+     if use_upload:
+         model_cfg_path, model_cfg_error = handle_file_upload(model_cfg, [".yaml"])
+         ckpt_file_path, ckpt_file_error = handle_file_upload(ckpt_file, [".pt", ".safetensors"])
+         vocab_file_path, vocab_file_error = handle_file_upload(vocab_file, [".txt", ".safetensors"])
+         if model_cfg_error or ckpt_file_error or vocab_file_error:
+             return None, model_cfg_error or ckpt_file_error or vocab_file_error
+         if not (model_cfg_path and ckpt_file_path and vocab_file_path):
+             return None, "Please upload all model files (model_cfg, ckpt_file, vocab_file)."
+         config = {"model_cfg": model_cfg_path, "ckpt_file": ckpt_file_path, "vocab_file": vocab_file_path}
+     else:
+         if any(f == "No files found" for f in [model_cfg, ckpt_file, vocab_file]):
+             return None, "No valid model files found in ckpts. Upload custom files or add files to ckpts."
+         config = {"model_cfg": model_cfg, "ckpt_file": ckpt_file, "vocab_file": vocab_file}
+
+     if not ref_audio:
+         return None, "Reference audio is required."
+
+     output_dir = "apps/gradio_app/temp_data"
+     os.makedirs(output_dir, exist_ok=True)
+     output_file = f"infer_audio_{str(uuid.uuid4())[:8]}.mp3"
+     output_path = os.path.join(output_dir, output_file)
+
+     try:
+         command = [
+             "python", "src/f5_tts/infer/infer_cli.py",
+             "--model_cfg", config["model_cfg"],
+             "--ckpt_file", config["ckpt_file"],
+             "--vocab_file", config["vocab_file"],
+             "--ref_audio", ref_audio,
+             "--ref_text", ref_text,
+             "--gen_text", gen_text,
+             "--speed", str(speed),
+             "--output_dir", output_dir,
+             "--output_file", output_file
+         ]
+         result = subprocess.run(command, capture_output=True, text=True)
+         if result.returncode != 0:
+             return None, f"Inference error: {result.stderr}"
+         if not os.path.exists(output_path):
+             return None, f"Output audio not found at {output_path}"
+         return output_path, "Audio generated successfully!"
+     except Exception as e:
+         return None, f"Inference error: {str(e)}"
apps/gradio_app/setup_scripts.py ADDED
@@ -0,0 +1,61 @@
+ import subprocess
+ import sys
+ import os
+
+ def run_script(script_path, args=None):
+     """
+     Run a Python script using subprocess with optional arguments and handle errors.
+     Returns True if successful, False otherwise.
+     """
+     try:
+         command = [sys.executable, script_path]
+         if args:
+             command.extend(args)
+         result = subprocess.run(
+             command,
+             check=True,
+             text=True,
+             capture_output=True
+         )
+         print(f"Successfully executed {script_path}")
+         print(result.stdout)
+         return True
+     except subprocess.CalledProcessError as e:
+         print(f"Error executing {script_path}:")
+         print(e.stderr)
+         return False
+     except FileNotFoundError:
+         print(f"Script not found: {script_path}")
+         return False
+
+ def main():
+     """
+     Main function to execute setup_third_party.py and download_ckpts.py in sequence.
+     """
+     scripts_dir = "scripts"
+     scripts = [
+         {
+             "path": os.path.join(scripts_dir, "setup_third_party.py"),
+             "args": None
+         },
+         {
+             "path": os.path.join(scripts_dir, "download_ckpts.py"),
+             "args": [
+                 "--repo_id", "danhtran2mind/Vi-F5-TTS",
+                 "--local_dir", "./ckpts",
+                 "--pruning_model"
+             ]
+         }
+     ]
+
+     for script in scripts:
+         script_path = script["path"]
+         args = script["args"]
+         print(f"Start running {script_path} {' '.join(args) if args else ''}\n")
+         if not run_script(script_path, args):
+             print(f"Stopping execution due to error in {script_path}")
+             sys.exit(1)
+         print(f"Completed {script_path}\n")
+
+ if __name__ == "__main__":
+     main()
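
The `run_script` helper can also be reused for a single step; a sketch, assuming the repository root as the working directory. Note that `scripts/download_ckpts.py` is referenced by `main()` but falls outside this truncated 50-file view, so its flags are taken as given:

```python
# Sketch: run only the checkpoint download step via run_script.
import sys
sys.path.insert(0, "apps")

from gradio_app.setup_scripts import run_script

ok = run_script("scripts/download_ckpts.py", [
    "--repo_id", "danhtran2mind/Vi-F5-TTS",
    "--local_dir", "./ckpts",
    "--pruning_model",
])
if not ok:
    raise SystemExit("checkpoint download failed")
```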
apps/gradio_app/static/scripts.js ADDED
File without changes
apps/gradio_app/static/styles.css ADDED
@@ -0,0 +1,100 @@
+ /* General body styling */
+ .gradio-container {
+     background: linear-gradient(180deg, #f9fafb, #f1efef);
+     font-family: 'Quicksand', ui-sans-serif, sans-serif;
+     color: #6b46c1; /* Purple-800 for text (neutral hue) */
+     font-size: 16px; /* Medium text size */
+     font-weight: 400;
+ }
+
+ /* Dark mode background */
+ @media (prefers-color-scheme: dark) {
+     .gradio-container {
+         background: linear-gradient(180deg, #1f2937, #111827);
+         color: #d6bcfa; /* Lighter purple for dark mode */
+     }
+ }
+
+ /* Block styling (containers for components) */
+ .block {
+     border: 1px solid #e9d8fd; /* Purple-200 for borders */
+     border-radius: 8px; /* Medium radius */
+     box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05); /* Small shadow */
+     padding: 16px; /* Medium spacing */
+     background: #f1efef;
+ }
+
+ /* Input fields */
+ input[type="text"], textarea {
+     background: #faf5ff; /* Purple-50 for input background */
+     border: 1px solid #e9d8fd; /* Purple-200 for borders */
+     border-radius: 8px;
+     padding: 8px;
+     font-family: 'Quicksand', ui-sans-serif, sans-serif;
+     font-size: 16px;
+     color: #6b46c1;
+     box-shadow: none;
+ }
+ input[type="text"]:focus, textarea:focus {
+     outline: none;
+     box-shadow: 0 1px 2px rgba(0, 0, 0, 0.1); /* Small shadow on focus */
+     border-color: #48bb78; /* Green-400 for focus */
+ }
+
+ /* Primary button */
+ button.primary {
+     background: #48bb78; /* Green-400 */
+     color: #f1efef;
+     border: none;
+     border-radius: 8px;
+     padding: 8px 16px;
+     font-family: 'Quicksand', ui-sans-serif, sans-serif;
+     font-size: 16px;
+     font-weight: 500;
+     box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05);
+     cursor: pointer;
+ }
+ button.primary:hover {
+     background: #ed8936; /* Orange-400 for hover */
+     box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); /* Medium shadow on hover */
+ }
+
+ /* Secondary button */
+ button.secondary {
+     color: #48bb78; /* Green-400 for text */
+     border: 1px solid #48bb78; /* Green-400 for border */
+     border-radius: 8px;
+     padding: 8px 16px;
+     font-family: 'Quicksand', ui-sans-serif, sans-serif;
+     font-size: 16px;
+     font-weight: 500;
+     box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05);
+     cursor: pointer;
+ }
+ button.secondary:hover {
+     background: #ed8936; /* Orange-400 for hover */
+     color: #48bb78;
+     box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+ }
+
+ /* Slider styling */
+ input[type="range"] {
+     accent-color: #ed8936; /* Orange-400 for slider */
+ }
+ @media (prefers-color-scheme: dark) {
+     input[type="range"] {
+         accent-color: #f6ad55; /* Orange-600 for dark mode */
+     }
+ }
+
+ /* Markdown headers */
+ h2 {
+     font-weight: 500;
+     color: #6b46c1; /* Purple-800 */
+     margin-bottom: 16px;
+ }
+
+ /* Code or monospace elements */
+ code, pre {
+     font-family: 'IBM Plex Mono', ui-monospace, monospace;
+ }
apps/old-gradio_app.py ADDED
@@ -0,0 +1,140 @@
+ import gradio as gr
+ import os
+ import subprocess
+ import tempfile
+ from pathlib import Path
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+ import librosa
+
+ def transcribe_audio(audio_file_path):
+     """Transcribe audio using PhoWhisper-tiny model."""
+     try:
+         processor = WhisperProcessor.from_pretrained("vinai/PhoWhisper-tiny")
+         model = WhisperForConditionalGeneration.from_pretrained("vinai/PhoWhisper-tiny")
+         audio, sr = librosa.load(audio_file_path, sr=16000)
+         input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
+         forced_decoder_ids = processor.get_decoder_prompt_ids(language="vi", task="transcribe")
+         predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
+         transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+         return transcription[0] if transcription else ""
+     except Exception as e:
+         return f"Error during transcription: {str(e)}"
+
+ def run_tts_inference(ref_audio, ref_text, gen_text, speed, model_option):
+     """
+     Run the F5-TTS inference script with provided inputs and return the output audio path.
+     """
+     model_configs = {
+         "Vietnamese Fine-Tuned": {
+             "model_cfg": "ckpts/vi-fine-tuned-f5-tts.yaml",
+             "ckpt_file": "ckpts/Vi_F5_TTS_ckpts/pruning_model.pt",
+             "vocab_file": "ckpts/vocab.txt"
+         },
+     }
+
+     if model_option not in model_configs:
+         return None, f"Invalid model option: {model_option}"
+
+     config = model_configs[model_option]
+
+     output_dir = "apps/gradio_app/temp_data"
+     os.makedirs(output_dir, exist_ok=True)
+     output_file = "infer_audio.mp3"
+     output_path = os.path.join(output_dir, output_file)
+
+     if ref_audio:
+         temp_audio = ref_audio
+     else:
+         return None, "Reference audio is required"
+
+     # with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as temp_ref_text:
+     #     temp_ref_text.write(ref_text or "")
+     #     temp_ref_text_path = temp_ref_text.name
+
+     # with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as temp_gen_text:
+     #     temp_gen_text.write(gen_text or "")
+     #     temp_gen_text_path = temp_gen_text.name
+
+     try:
+         command = [
+             "python", "src/f5_tts/infer/infer_cli.py",
+             "--model_cfg", config["model_cfg"],
+             "--ckpt_file", config["ckpt_file"],
+             "--vocab_file", config["vocab_file"],
+             "--ref_audio", temp_audio,
+             "--ref_text", ref_text,
+             "--gen_text", gen_text,
+             "--speed", str(speed),
+             "--output_dir", output_dir,
+             "--output_file", output_file
+         ]
+
+         result = subprocess.run(command, capture_output=True, text=True)
+
+         if result.returncode != 0:
+             return None, f"Error running inference: {result.stderr}"
+
+         if not os.path.exists(output_path):
+             return None, f"Output audio file not found at {output_path}"
+
+         return output_path, "Audio generated successfully!"
+
+     except Exception as e:
+         return None, f"Error during inference: {str(e)}"
+
+
+ def create_gradio_app():
+     """
+     Create and return a Gradio interface for the F5-TTS inference with optional Whisper ASR.
+     """
+     def update_ref_text(audio_file_path, use_whisper):
+         """Conditionally transcribe audio based on Whisper checkbox."""
+         if use_whisper and audio_file_path:
+             return transcribe_audio(audio_file_path)
+         return gr.update()  # Keep current text if Whisper is disabled or no audio
+
+     with gr.Blocks() as demo:
+         gr.Markdown("# F5-TTS Audio Generation App")
+         gr.Markdown("Generate audio using a fine-tuned F5-TTS model. Upload a reference audio, enable Whisper ASR for auto-transcription or manually enter reference text, provide generated text, and adjust the speed.")
+
+         with gr.Row():
+             with gr.Column():
+                 ref_audio = gr.Audio(label="Reference Audio", type="filepath")
+                 use_whisper = gr.Checkbox(label="Use Whisper ASR for Reference Text", value=False)
+                 ref_text = gr.Textbox(label="Reference Text", placeholder="e.g., Sau nhà Ngô, lần lượt các triều Đinh...")
+                 gen_text = gr.Textbox(label="Generated Text", placeholder="e.g., Nhà Tiền Lê, Lý và Trần đã chống trả...")
+                 speed = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Speed")
+                 model_option = gr.Dropdown(
+                     choices=["Vietnamese Fine-Tuned"],
+                     label="Model Option",
+                     value="Vietnamese Fine-Tuned"
+                 )
+                 generate_btn = gr.Button("Generate Audio")
+
+             with gr.Column():
+                 output_audio = gr.Audio(label="Generated Audio")
+                 output_text = gr.Textbox(label="Status")
+
+         # Update reference text when audio is uploaded or Whisper checkbox changes
+         ref_audio.change(
+             fn=update_ref_text,
+             inputs=[ref_audio, use_whisper],
+             outputs=ref_text
+         )
+         use_whisper.change(
+             fn=update_ref_text,
+             inputs=[ref_audio, use_whisper],
+             outputs=ref_text
+         )
+
+         generate_btn.click(
+             fn=run_tts_inference,
+             inputs=[ref_audio, ref_text, gen_text, speed, model_option],
+             outputs=[output_audio, output_text]
+         )
+
+     return demo
+
+ if __name__ == "__main__":
+     demo = create_gradio_app()
+     demo.launch(share=True)
assets/.gitkeep ADDED
File without changes
assets/examples/f5_tts/1/infer_audio.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2924700ad369afabb4489eceec9c5e1e9c0fae90a3409f480678aba7a79a7378
+ size 127020
assets/examples/f5_tts/1/infer_text.txt ADDED
@@ -0,0 +1 @@
+ chào mọi người, mọi người khỏe không?
assets/examples/f5_tts/1/refer_audio.mp3 ADDED
Binary file (35.3 kB)
assets/examples/f5_tts/1/refer_text.txt ADDED
@@ -0,0 +1 @@
+ bạn và tôi đều như nhau nhé, rồi chúng ta đi đâu nè
assets/examples/f5_tts/2/infer_audio.mp3 ADDED
Binary file (12.7 kB)
assets/examples/f5_tts/2/infer_text.txt ADDED
@@ -0,0 +1 @@
+ Tôi rất khỏe,cảm ơn mọi người đã quan tâm.
assets/examples/f5_tts/2/refer_audio.mp3 ADDED
Binary file (61.9 kB)
assets/examples/f5_tts/2/refer_text.txt ADDED
@@ -0,0 +1 @@
+ Chúng thường sống hòa bình với các loài động vật khác, kể cả những loài săn mồi.
assets/examples/f5_tts/3/infer_audio.mp3 ADDED
Binary file (51.8 kB)
assets/examples/f5_tts/3/infer_text.txt ADDED
@@ -0,0 +1 @@
+ Nhà Tiền Lê, Lý và Trần đã chống trả các cuộc tấn công của nhà Tống và nhà Mông – Nguyên, đều thắng lợi và bảo vệ được Đại Việt.
assets/examples/f5_tts/3/refer_audio.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd15755a7704fd99247dfae618a4f8e9d9655af735def78e6fdec5467faca641
+ size 183110
assets/examples/f5_tts/3/refer_text.txt ADDED
@@ -0,0 +1 @@
+ Sau nhà Ngô, lần lượt các triều Đinh, Tiền Lê, Lý và Trần tổ chức chính quyền tương tự các triều đại Trung Hoa, lấy Phật giáo làm tôn giáo chính của quốc gia và cho truyền bá cả Nho giáo và Đạo giáo.
assets/examples/f5_tts/4/infer_audio.mp3 ADDED
Binary file (52.7 kB)
assets/examples/f5_tts/4/infer_text.txt ADDED
@@ -0,0 +1 @@
+ Người dân Đông Á cổ đại đã uống trà trong nhiều thế kỷ, thậm chí có thể là hàng thiên niên kỷ , trước khi sử dụng nó như một thức uống.
assets/examples/f5_tts/4/refer_audio.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ea81c8700f5ff2e6497c9beaa942b5ed107e03ae468472d78a4c8c80e3b63af
+ size 138388
assets/examples/f5_tts/4/refer_text.txt ADDED
@@ -0,0 +1 @@
+ Cấu trúc sừng và mào là phổ biến ở tất cả các nhóm khủng long, và vài nhóm thậm chí còn phát triển các biến đổi bộ xương như giáp mô hoặc gai.
ckpts/.gitkeep ADDED
File without changes
configs/.gitkeep ADDED
File without changes
configs/vi-fine-tuned-f5-tts.yaml ADDED
@@ -0,0 +1,52 @@
+ hydra:
+   run:
+     dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+ datasets:
+   name: vin100h-preprocessed-v2 # dataset name
+   batch_size_per_gpu: 3200 # 1 GPU, 1 * 3200 = 3200
+   batch_size_type: frame # frame | sample
+   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+   num_workers: 4
+
+ optim:
+   epochs: 80
+   learning_rate: 1e-5
+   num_warmup_updates: 2761 # warmup updates
+   grad_accumulation_steps: 2 # note: updates = steps / grad_accumulation_steps
+   max_grad_norm: 1.0 # gradient clipping
+   bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
+
+ model:
+   name: vi_fine_tuned_t5_tts # model name
+   tokenizer: pinyin # tokenizer type
+   tokenizer_path: null # if 'custom' tokenizer, define the path want to use (should be vocab.txt)
+   backbone: DiT
+   arch:
+     dim: 1024
+     depth: 22
+     heads: 16
+     ff_mult: 2
+     text_dim: 512
+     text_mask_padding: False
+     conv_layers: 4
+     pe_attn_head: 1
+     checkpoint_activations: False # recompute activations and save memory for extra compute
+   mel_spec:
+     target_sample_rate: 24000
+     n_mel_channels: 100
+     hop_length: 256
+     win_length: 1024
+     n_fft: 1024
+     mel_spec_type: vocos # vocos | bigvgan
+   vocoder:
+     is_local: False # use local offline ckpt or not
+     local_path: null # local vocoder path
+
+ ckpts:
+   logger: null # wandb | tensorboard | null
+   log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples
+   save_per_updates: 4000 # save checkpoint per updates
+   keep_last_n_checkpoints: 1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
+   last_per_updates: 4000 # save last checkpoint per updates
+   save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
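
A quick way to sanity-check this config after editing; a sketch assuming PyYAML is installed. The `${...}` fields are Hydra interpolations and remain literal strings when loaded outside Hydra:

```python
# Sketch: load and inspect the training config (assumes PyYAML).
import yaml

with open("configs/vi-fine-tuned-f5-tts.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["optim"]["epochs"])         # 80
print(cfg["model"]["arch"]["depth"])  # 22
print(cfg["ckpts"]["save_dir"])       # Hydra ${...} left unresolved
```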
data/.gitkeep ADDED
File without changes
docs/.gitkeep ADDED
File without changes
docs/inference/inference_doc.md ADDED
@@ -0,0 +1,38 @@
+ # Inference Arguments
+
+ The following table describes the command-line arguments available for the `infer-cli.py` script, which is used for text-to-speech (TTS) inference with advanced batch processing capabilities. These arguments allow users to override settings defined in the configuration file (`basic.toml` by default).
+
+ | Argument | Description | Type | Default Value | Notes |
+ |----------|-------------|------|---------------|-------|
+ | `-c`, `--config` | Path to the configuration file. | `str` | `f5_tts/infer/examples/basic/basic.toml` | Specifies the TOML configuration file to use. |
+ | `-m`, `--model` | Model name to use for inference. | `str` | `F5TTS_v1_Base` (from config) | Options: `F5TTS_v1_Base`, `F5TTS_Base`, `E2TTS_Base`, etc. |
+ | `-mc`, `--model_cfg` | Path to the model's YAML configuration file. | `str` | `configs/<model>.yaml` (from config) | Defines model-specific settings. |
+ | `-p`, `--ckpt_file` | Path to the model checkpoint file (.pt). | `str` | (from config) | Leave blank to use default checkpoint. |
+ | `-v`, `--vocab_file` | Path to the vocabulary file (.txt). | `str` | (from config) | Leave blank to use default vocabulary. |
+ | `-r`, `--ref_audio` | Path to the reference audio file. | `str` | `infer/examples/basic/basic_ref_en.wav` (from config) | Used as a reference for voice synthesis. |
+ | `-s`, `--ref_text` | Transcript or subtitle for the reference audio. | `str` | `Some call me nature, others call me mother nature.` (from config) | Text corresponding to the reference audio. |
+ | `-t`, `--gen_text` | Text to synthesize into speech. | `str` | `Here we generate something just for test.` (from config) | Ignored if `--gen_file` is provided. |
+ | `-f`, `--gen_file` | Path to a file containing text to synthesize. | `str` | (from config) | Overrides `--gen_text` if specified. |
+ | `-o`, `--output_dir` | Path to the output directory. | `str` | `tests` (from config) | Directory where generated audio files are saved. |
+ | `-w`, `--output_file` | Name of the output audio file. | `str` | `infer_cli_<timestamp>.wav` (from config) | Timestamp format: `%Y%m%d_%H%M%S`. |
+ | `--save_chunk` | Save individual audio chunks during inference. | `bool` | `False` (from config) | If enabled, saves chunks to `<output_dir>/<output_file>_chunks/`. |
+ | `--no_legacy_text` | Disable lossy ASCII transliteration for Unicode text in file names. | `bool` | `False` (from config) | If disabled, uses Unicode in file names; warns if used with `--save_chunk`. |
+ | `--remove_silence` | Remove long silences from the generated audio. | `bool` | `False` (from config) | Applies silence removal post-processing. |
+ | `--load_vocoder_from_local` | Load vocoder from a local directory. | `bool` | `False` (from config) | Uses `../checkpoints/vocos-mel-24khz` or similar if enabled. |
+ | `--vocoder_name` | Name of the vocoder to use. | `str` | (from config, defaults to `mel_spec_type`) | Options: `vocos`, `bigvgan`. |
+ | `--target_rms` | Target loudness normalization value for output speech. | `float` | (from config, defaults to `target_rms`) | Adjusts audio loudness. |
+ | `--cross_fade_duration` | Duration of cross-fade between audio segments (seconds). | `float` | (from config, defaults to `cross_fade_duration`) | Smooths transitions between segments. |
+ | `--nfe_step` | Number of function evaluation (denoising) steps. | `int` | (from config, defaults to `nfe_step`) | Controls inference quality. |
+ | `--cfg_strength` | Classifier-free guidance strength. | `float` | (from config, defaults to `cfg_strength`) | Influences generation quality. |
+ | `--sway_sampling_coef` | Sway sampling coefficient. | `float` | (from config, defaults to `sway_sampling_coef`) | Affects sampling behavior. |
+ | `--speed` | Speed of the generated audio. | `float` | (from config, defaults to `speed`) | Adjusts playback speed. |
+ | `--fix_duration` | Fixed total duration for reference and generated audio (seconds). | `float` | (from config, defaults to `fix_duration`) | Enforces a specific duration. |
+ | `--device` | Device to run inference on. | `str` | (from config, defaults to `device`) | E.g., `cpu`, `cuda`. |
+
+ ## Notes
+ - Arguments without default values in the script (e.g., `--model`, `--ref_audio`) inherit defaults from the configuration file.
+ - The `--no_legacy_text` flag is implemented as `store_false`, so enabling it sets `use_legacy_text` to `False`.
+ - If `--gen_file` is provided, it overrides `--gen_text`.
+ - The script supports multiple voices defined in the config file under the `voices` key, with a fallback to a `main` voice.
+ - The output audio is saved as a WAV file, and optional chunked audio segments can be saved if `--save_chunk` is enabled.
+ - The script uses `cached_path` for downloading model checkpoints from Hugging Face if no local checkpoint is specified.
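
For concreteness, a single-voice invocation sketch mirroring the subprocess call in `apps/gradio_app/components.py`; the checkpoint and vocab paths are the ones hard-coded in `apps/old-gradio_app.py`, so they assume `ckpts/` has been populated by the setup scripts:

```python
# Invocation sketch for infer_cli.py using the arguments documented above
# (paths assume checkpoints downloaded by apps/gradio_app/setup_scripts.py).
import subprocess

subprocess.run([
    "python", "src/f5_tts/infer/infer_cli.py",
    "--model_cfg", "configs/vi-fine-tuned-f5-tts.yaml",
    "--ckpt_file", "ckpts/Vi_F5_TTS_ckpts/pruning_model.pt",
    "--vocab_file", "ckpts/vocab.txt",
    "--ref_audio", "assets/examples/f5_tts/1/refer_audio.mp3",
    "--ref_text", "bạn và tôi đều như nhau nhé, rồi chúng ta đi đâu nè",
    "--gen_text", "chào mọi người, mọi người khỏe không?",
    "--speed", "1.0",
    "--output_dir", "tests",
    "--output_file", "demo.wav",
], check=True)
```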
docs/training/training_doc.md ADDED
File without changes
notebooks/1-vi-fine-tuned-t5-tts.ipynb ADDED
@@ -0,0 +1,952 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
8
+ "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
9
+ "execution": {
10
+ "iopub.execute_input": "2025-06-15T14:21:25.974502Z",
11
+ "iopub.status.busy": "2025-06-15T14:21:25.974227Z",
12
+ "iopub.status.idle": "2025-06-15T14:21:31.475226Z",
13
+ "shell.execute_reply": "2025-06-15T14:21:31.474663Z",
14
+ "shell.execute_reply.started": "2025-06-15T14:21:25.974478Z"
15
+ },
16
+ "trusted": true
17
+ },
18
+ "outputs": [],
19
+ "source": [
20
+ "import os\n",
21
+ "os.system(\"pip install -q wget\")"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "metadata": {
28
+ "execution": {
29
+ "iopub.execute_input": "2025-06-15T14:21:31.476734Z",
30
+ "iopub.status.busy": "2025-06-15T14:21:31.476449Z",
31
+ "iopub.status.idle": "2025-06-15T14:21:37.092039Z",
32
+ "shell.execute_reply": "2025-06-15T14:21:37.091491Z",
33
+ "shell.execute_reply.started": "2025-06-15T14:21:31.476715Z"
34
+ },
35
+ "trusted": true
36
+ },
37
+ "outputs": [],
38
+ "source": [
39
+ "import wget\n",
40
+ "import tarfile\n",
41
+ "import torchaudio\n",
42
+ "import pandas as pd\n",
43
+ "from huggingface_hub import snapshot_download, login\n",
44
+ "login(\"<your_huggingface_token>\")"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": null,
50
+ "metadata": {
51
+ "execution": {
52
+ "iopub.execute_input": "2025-06-15T14:21:37.092984Z",
53
+ "iopub.status.busy": "2025-06-15T14:21:37.092705Z",
54
+ "iopub.status.idle": "2025-06-15T14:21:37.096562Z",
55
+ "shell.execute_reply": "2025-06-15T14:21:37.096039Z",
56
+ "shell.execute_reply.started": "2025-06-15T14:21:37.092967Z"
57
+ },
58
+ "trusted": true
59
+ },
60
+ "outputs": [],
61
+ "source": [
62
+ "os.chdir(\"/content\")"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": null,
68
+ "metadata": {
69
+ "execution": {
70
+ "iopub.execute_input": "2025-06-15T13:59:06.772020Z",
71
+ "iopub.status.busy": "2025-06-15T13:59:06.771694Z",
72
+ "iopub.status.idle": "2025-06-15T14:00:28.043176Z",
73
+ "shell.execute_reply": "2025-06-15T14:00:28.041603Z",
74
+ "shell.execute_reply.started": "2025-06-15T13:59:06.771995Z"
75
+ },
76
+ "trusted": true
77
+ },
78
+ "outputs": [],
79
+ "source": [
80
+ "from huggingface_hub import HfApi\n",
81
+ "from huggingface_hub import snapshot_download\n",
82
+ "import os\n",
83
+ "api = HfApi()\n",
84
+ "!git lfs install --force\n",
85
+ "\n",
86
+ "# Define the dataset name and local directory\n",
87
+ "\n",
88
+ "repo_id = \"heboya8/t5-tts-temp-model\"\n",
89
+ "save_path = \".\"\n",
90
+ "\n",
91
+ "# Create the directory if it doesn't exist\n",
92
+ "os.makedirs(save_path, exist_ok=True)\n",
93
+ "\n",
94
+ "# Download the dataset\n",
95
+ "snapshot_download(repo_id=repo_id, repo_type=\"model\", local_dir=save_path)"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "metadata": {
102
+ "execution": {
103
+ "iopub.execute_input": "2025-06-15T14:21:37.389642Z",
104
+ "iopub.status.busy": "2025-06-15T14:21:37.389399Z",
105
+ "iopub.status.idle": "2025-06-15T14:24:47.468892Z",
106
+ "shell.execute_reply": "2025-06-15T14:24:47.468139Z",
107
+ "shell.execute_reply.started": "2025-06-15T14:21:37.389623Z"
108
+ },
109
+ "trusted": true
110
+ },
111
+ "outputs": [],
112
+ "source": [
113
+ "# Step 1: Set Up the Environment\n",
114
+ "os.system(\"pip install -e . >/dev/null 2>&1\")\n",
115
+ "os.system(\"pip install torch==2.4.0+cu124 torchaudio==2.4.0+cu124 torchvision==0.19.0+cu124 --extra-index-url https://download.pytorch.org/whl/cu124 >/dev/null 2>&1\")\n",
116
+ "os.system(\"pip install accelerate==0.33.0 tensorboard >/dev/null 2>&1\")\n",
117
+ "if not os.path.exists(\"F5-TTS\"):\n",
118
+ " os.system(\"git clone https://github.com/SWivid/F5-TTS.git\")\n",
119
+ "os.chdir(\"F5-TTS\")\n",
120
+ "os.system(\"pip install -e . >/dev/null 2>&1\")\n"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "metadata": {
127
+ "execution": {
128
+ "iopub.execute_input": "2025-06-15T14:24:47.470454Z",
129
+ "iopub.status.busy": "2025-06-15T14:24:47.470177Z",
130
+ "iopub.status.idle": "2025-06-15T14:24:47.473922Z",
131
+ "shell.execute_reply": "2025-06-15T14:24:47.473261Z",
132
+ "shell.execute_reply.started": "2025-06-15T14:24:47.470429Z"
133
+ },
134
+ "trusted": true
135
+ },
136
+ "outputs": [],
137
+ "source": [
138
+ "os.chdir(\"/content/F5-TTS\")\n",
139
+ " # os.chdir(\"F5-TTS-Vietnamese\")"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": null,
145
+ "metadata": {
146
+ "execution": {
147
+ "iopub.execute_input": "2025-06-15T06:47:34.909957Z",
148
+ "iopub.status.busy": "2025-06-15T06:47:34.909372Z",
149
+ "iopub.status.idle": "2025-06-15T06:47:35.040348Z",
150
+ "shell.execute_reply": "2025-06-15T06:47:35.039424Z",
151
+ "shell.execute_reply.started": "2025-06-15T06:47:34.909927Z"
152
+ },
153
+ "trusted": true
154
+ },
155
+ "outputs": [],
156
+ "source": [
157
+ "!pwd"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": null,
163
+ "metadata": {
164
+ "execution": {
165
+ "iopub.execute_input": "2025-06-15T14:24:47.475053Z",
166
+ "iopub.status.busy": "2025-06-15T14:24:47.474827Z",
167
+ "iopub.status.idle": "2025-06-15T14:24:47.644337Z",
168
+ "shell.execute_reply": "2025-06-15T14:24:47.643562Z",
169
+ "shell.execute_reply.started": "2025-06-15T14:24:47.475031Z"
170
+ },
171
+ "trusted": true
172
+ },
173
+ "outputs": [],
174
+ "source": [
175
+ "!mkdir ./ckpts/vin100h-preprocessed-v2\n",
176
+ "# !cp /kaggle/input/vi-fine-tuned-t5-tts/69/model_last.pt \\\n",
177
+ "# ./ckpts/vin100h-preprocessed-v2\n",
178
+ "# !cp -r /content/73/* ./ckpts/vin100h-preprocessed-v2"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": null,
184
+ "metadata": {
185
+ "execution": {
186
+ "iopub.execute_input": "2025-06-15T14:24:47.646473Z",
187
+ "iopub.status.busy": "2025-06-15T14:24:47.646278Z",
188
+ "iopub.status.idle": "2025-06-15T14:25:20.275283Z",
189
+ "shell.execute_reply": "2025-06-15T14:25:20.274453Z",
190
+ "shell.execute_reply.started": "2025-06-15T14:24:47.646454Z"
191
+ },
192
+ "trusted": true
193
+ },
194
+ "outputs": [],
195
+ "source": [
196
+ "# !cp -r /kaggle/input/vi-fine-tuned-t5-tts/7/* ./ckpts\n",
197
+ "!cp -r /kaggle/input/vi-fine-tuned-t5-tts/75/model_last.pt \\\n",
198
+ " ./ckpts/vin100h-preprocessed-v2"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": null,
204
+ "metadata": {
205
+ "execution": {
206
+ "iopub.execute_input": "2025-06-15T14:25:20.276407Z",
207
+ "iopub.status.busy": "2025-06-15T14:25:20.276159Z",
208
+ "iopub.status.idle": "2025-06-15T14:25:20.413414Z",
209
+ "shell.execute_reply": "2025-06-15T14:25:20.412180Z",
210
+ "shell.execute_reply.started": "2025-06-15T14:25:20.276382Z"
211
+ },
212
+ "trusted": true
213
+ },
214
+ "outputs": [],
215
+ "source": [
216
+ "!ls -a ./ckpts/vin100h-preprocessed-v2"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": null,
222
+ "metadata": {
223
+ "execution": {
224
+ "iopub.execute_input": "2025-05-10T15:59:08.329794Z",
225
+ "iopub.status.busy": "2025-05-10T15:59:08.329442Z",
226
+ "iopub.status.idle": "2025-05-10T15:59:09.362207Z",
227
+ "shell.execute_reply": "2025-05-10T15:59:09.361253Z",
228
+ "shell.execute_reply.started": "2025-05-10T15:59:08.329757Z"
229
+ },
230
+ "trusted": true
231
+ },
232
+ "outputs": [],
233
+ "source": [
234
+ "import json\n",
235
+ "import os\n",
236
+ "from pathlib import Path\n",
237
+ "import shutil\n",
238
+ "import torchaudio\n",
239
+ "from datasets import load_dataset\n",
240
+ "from datasets.arrow_writer import ArrowWriter\n",
241
+ "from tqdm import tqdm\n",
242
+ "import soundfile as sf\n",
243
+ "import csv\n",
244
+ "\n",
245
+ "def save_dataset_to_local_disk(output_dir=\"./data/vin100h-preprocessed-v2\",\n",
246
+ " base_model=\"htdung167/vin100h-preprocessed-v2\",\n",
247
+ " audio_header='audio',\n",
248
+ " text_header='transcription'):\n",
249
+ " \n",
250
+ " wavs_dir = os.path.join(output_dir, \"wavs\")\n",
251
+ " metadata_path = os.path.join(output_dir, \"metadata.csv\")\n",
252
+ " os.makedirs(wavs_dir, exist_ok=True)\n",
253
+ "\n",
254
+ " ds = load_dataset(base_model)['train']\n",
255
+ " metadata = []\n",
256
+ "\n",
257
+ " for idx, sample in tqdm(enumerate(ds), total=len(ds),\n",
258
+ " desc=\"Saving samples to directory\"):\n",
259
+ " audio_array = sample[audio_header]['array']\n",
260
+ " sampling_rate = sample[audio_header]['sampling_rate']\n",
261
+ " filename = f\"audio_{idx:06d}.wav\"\n",
262
+ " sf.write(os.path.join(wavs_dir, filename), audio_array, sampling_rate)\n",
263
+ " # metadata.append([f\"wavs/{filename}\", sample['preprocessed_sentence_v2']])\n",
264
+ " metadata.append([f\"wavs/{filename}\", sample[text_header]])\n",
265
+ " # metadata.append([f\"{filename}\", sample['transcription']])\n",
266
+ " \n",
267
+ " with open(metadata_path, 'w', newline='', encoding='utf-8') as f:\n",
268
+ " csv.writer(f, delimiter='|').writerows(metadata)\n",
269
+ "\n",
270
+ " print(f\"Dataset saved to {output_dir}\")\n"
271
+ ]
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "execution_count": null,
276
+ "metadata": {
277
+ "execution": {
278
+ "iopub.execute_input": "2025-05-10T15:59:10.399030Z",
279
+ "iopub.status.busy": "2025-05-10T15:59:10.397916Z",
280
+ "iopub.status.idle": "2025-05-10T16:10:46.269067Z",
281
+ "shell.execute_reply": "2025-05-10T16:10:46.267298Z",
282
+ "shell.execute_reply.started": "2025-05-10T15:59:10.398995Z"
283
+ },
284
+ "trusted": true
285
+ },
286
+ "outputs": [],
287
+ "source": [
288
+ "output_dir = \"./data/vin100h-preprocessed-v2\"\n",
289
+ "tokenizer_type = \"pinyin\"\n",
290
+ "\n",
291
+ "save_dataset_to_local_disk(output_dir=output_dir,\n",
292
+ " base_model=\"htdung167/vin100h-preprocessed-v2\",\n",
293
+ " text_header=\"preprocessed_sentence_v2\"\n",
294
+ " )"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": null,
300
+ "metadata": {
301
+ "_kg_hide-output": true,
302
+ "execution": {
303
+ "iopub.execute_input": "2025-05-10T16:10:46.273403Z",
304
+ "iopub.status.busy": "2025-05-10T16:10:46.272176Z",
305
+ "iopub.status.idle": "2025-05-10T17:15:19.405258Z",
306
+ "shell.execute_reply": "2025-05-10T17:15:19.402002Z",
307
+ "shell.execute_reply.started": "2025-05-10T16:10:46.273366Z"
308
+ },
309
+ "trusted": true
310
+ },
311
+ "outputs": [],
312
+ "source": [
313
+ "!python ./src/f5_tts/train/datasets/prepare_csv_wavs.py \\\n",
314
+ " \"./data/vin100h-preprocessed-v2\" \\\n",
315
+ " \"./data/vin100h-preprocessed-v2_pinyin\" \\\n",
316
+ " --workers 4 # Sets the number of parallel processes for preprocessing."
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": null,
322
+ "metadata": {
323
+ "execution": {
324
+ "iopub.execute_input": "2025-06-15T14:25:20.414900Z",
325
+ "iopub.status.busy": "2025-06-15T14:25:20.414621Z",
326
+ "iopub.status.idle": "2025-06-15T14:25:21.649820Z",
327
+ "shell.execute_reply": "2025-06-15T14:25:21.648942Z",
328
+ "shell.execute_reply.started": "2025-06-15T14:25:20.414873Z"
329
+ },
330
+ "trusted": true
331
+ },
332
+ "outputs": [],
333
+ "source": [
334
+ "%%writefile ./src/f5_tts/configs/vi-fine-tuned-t5-tts.yaml\n",
335
+ "hydra:\n",
336
+ " run:\n",
337
+ " dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}\n",
338
+ "\n",
339
+ "datasets:\n",
340
+ " name: vin100h-preprocessed-v2 # dataset name\n",
341
+ " batch_size_per_gpu: 3200 # 1 GPUs, 1 * 3200 = 3200\n",
342
+ " batch_size_type: frame # frame | sample\n",
343
+ " max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models\n",
344
+ " num_workers: 4\n",
345
+ "\n",
346
+ "optim:\n",
347
+ " epochs: 10\n",
348
+ " learning_rate: 1e-5\n",
349
+ " num_warmup_updates: 2761 # warmup updates\n",
350
+ " grad_accumulation_steps: 2 # note: updates = steps / grad_accumulation_steps\n",
351
+ " max_grad_norm: 1.0 # gradient clipping\n",
352
+ " bnb_optimizer: False # use bnb 8bit AdamW optimizer or not\n",
353
+ "\n",
354
+ "model:\n",
355
+ " name: vi_fine_tuned_t5_tts # model name\n",
356
+ " tokenizer: pinyin # tokenizer type\n",
357
+ " tokenizer_path: null # if 'custom' tokenizer, define the path want to use (should be vocab.txt)\n",
358
+ " backbone: DiT\n",
359
+ " arch:\n",
360
+ " dim: 1024\n",
361
+ " depth: 22\n",
362
+ " heads: 16\n",
363
+ " ff_mult: 2\n",
364
+ " text_dim: 512\n",
365
+ " text_mask_padding: False\n",
366
+ " conv_layers: 4\n",
367
+ " pe_attn_head: 1\n",
368
+ " checkpoint_activations: False # recompute activations and save memory for extra compute\n",
369
+ " mel_spec:\n",
370
+ " target_sample_rate: 24000\n",
371
+ " n_mel_channels: 100\n",
372
+ " hop_length: 256\n",
373
+ " win_length: 1024\n",
374
+ " n_fft: 1024\n",
375
+ " mel_spec_type: vocos # vocos | bigvgan\n",
376
+ " vocoder:\n",
377
+ " is_local: False # use local offline ckpt or not\n",
378
+ " local_path: null # local vocoder path\n",
379
+ "\n",
380
+ "ckpts:\n",
381
+ " logger: null # wandb | tensorboard | null\n",
382
+ " log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples\n",
383
+ " save_per_updates: 4000 # save checkpoint per updates\n",
384
+ " keep_last_n_checkpoints: 1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints\n",
385
+ " last_per_updates: 4000 # save last checkpoint per updates\n",
386
+ " save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}"
387
+ ]
388
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-15T14:25:21.651011Z",
+ "iopub.status.busy": "2025-06-15T14:25:21.650749Z",
+ "iopub.status.idle": "2025-06-15T14:25:22.958480Z",
+ "shell.execute_reply": "2025-06-15T14:25:22.957781Z",
+ "shell.execute_reply.started": "2025-06-15T14:25:21.650992Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!echo hello"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-15T14:25:22.959726Z",
+ "iopub.status.busy": "2025-06-15T14:25:22.959476Z",
+ "iopub.status.idle": "2025-06-15T14:25:38.131765Z",
+ "shell.execute_reply": "2025-06-15T14:25:38.130931Z",
+ "shell.execute_reply.started": "2025-06-15T14:25:22.959692Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!accelerate config default"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-15T14:28:31.671797Z",
+ "iopub.status.busy": "2025-06-15T14:28:31.671483Z",
+ "iopub.status.idle": "2025-06-15T14:28:31.803519Z",
+ "shell.execute_reply": "2025-06-15T14:28:31.802848Z",
+ "shell.execute_reply.started": "2025-06-15T14:28:31.671770Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!echo go"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-15T14:28:31.804624Z",
+ "iopub.status.busy": "2025-06-15T14:28:31.804419Z",
+ "iopub.status.idle": "2025-06-15T17:59:02.693078Z",
+ "shell.execute_reply": "2025-06-15T17:59:02.692025Z",
+ "shell.execute_reply.started": "2025-06-15T14:28:31.804591Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "# ************\n",
459
+ "!accelerate launch ./src/f5_tts/train/finetune_cli.py \\\n",
460
+ " --exp_name F5TTS_Base \\\n",
461
+ " --dataset_name vin100h-preprocessed-v2 \\\n",
462
+ " --finetune \\\n",
463
+ " --tokenizer pinyin \\\n",
464
+ " --learning_rate 1e-05 \\\n",
465
+ " --batch_size_type frame \\\n",
466
+ " --batch_size_per_gpu 3200 \\\n",
467
+ " --max_samples 64 \\\n",
468
+ " --grad_accumulation_steps 2 \\\n",
469
+ " --max_grad_norm 1 \\\n",
470
+ " --epochs 76 \\\n",
471
+ " --num_warmup_updates 2761 \\\n",
472
+ " --save_per_updates 4000 \\\n",
473
+ " --keep_last_n_checkpoints 1 \\\n",
474
+ " --last_per_updates 4000 \\\n",
475
+ " --log_samples \\\n",
476
+ " --pretrain ./ckpts/vin100h-preprocessed-v2/model_last.pt\n"
477
+ ]
478
+ },
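+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Worked example (added sketch): what --batch_size_per_gpu 3200 means with frame-based batching.\n",
+ "# With hop_length = 256 and target_sample_rate = 24000 from the config above,\n",
+ "# one mel frame covers 256 / 24000 seconds of audio.\n",
+ "frames, hop_length, sample_rate = 3200, 256, 24000\n",
+ "seconds_per_batch = frames * hop_length / sample_rate\n",
+ "print(f\"{seconds_per_batch:.1f} s of audio per GPU per step\")\n",
+ "print(f\"{2 * seconds_per_batch:.1f} s per optimizer update (grad_accumulation_steps = 2)\")"
+ ]
+ },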
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-15T18:05:50.705629Z",
+ "iopub.status.busy": "2025-06-15T18:05:50.704903Z",
+ "iopub.status.idle": "2025-06-15T18:05:50.891227Z",
+ "shell.execute_reply": "2025-06-15T18:05:50.890434Z",
+ "shell.execute_reply.started": "2025-06-15T18:05:50.705578Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!echo abc"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Copy and save"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-14T10:18:46.384990Z",
+ "iopub.status.busy": "2025-06-14T10:18:46.384685Z",
+ "iopub.status.idle": "2025-06-14T10:18:46.518166Z",
+ "shell.execute_reply": "2025-06-14T10:18:46.517174Z",
+ "shell.execute_reply.started": "2025-06-14T10:18:46.384965Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!rm -rf /kaggle/working/.cache"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-07T16:58:20.250613Z",
+ "iopub.status.busy": "2025-06-07T16:58:20.250305Z",
+ "iopub.status.idle": "2025-06-07T16:58:20.446725Z",
+ "shell.execute_reply": "2025-06-07T16:58:20.445927Z",
+ "shell.execute_reply.started": "2025-06-07T16:58:20.250588Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!ls -a ckpts/vin100h-preprocessed-v2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-15T18:06:00.980687Z",
+ "iopub.status.busy": "2025-06-15T18:06:00.979884Z",
+ "iopub.status.idle": "2025-06-15T18:06:07.418545Z",
+ "shell.execute_reply": "2025-06-15T18:06:07.417240Z",
+ "shell.execute_reply.started": "2025-06-15T18:06:00.980649Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "# *******************Importance\n",
556
+ "model_dir = \"/kaggle/working/76\"\n",
557
+ "os.makedirs(model_dir, exist_ok=True)\n",
558
+ "!cp -r ./ckpts/vin100h-preprocessed-v2/model_last.pt $model_dir"
559
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2025-06-14T10:34:21.049620Z",
+ "iopub.status.idle": "2025-06-14T10:34:21.049856Z",
+ "shell.execute_reply": "2025-06-14T10:34:21.049753Z",
+ "shell.execute_reply.started": "2025-06-14T10:34:21.049740Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "# To temporary Model hub\n",
576
+ "from huggingface_hub import HfApi\n",
577
+ "from huggingface_hub import snapshot_download\n",
578
+ "# Initialize API\n",
579
+ "api = HfApi()\n",
580
+ "\n",
581
+ "# Upload the folder to the repository root\n",
582
+ "api.upload_large_folder(\n",
583
+ " folder_path=\"/kaggle/working\", # Local folder path\n",
584
+ " repo_id=\"heboya8/t5-tts-temp-model\",\n",
585
+ " repo_type=\"model\"\n",
586
+ ")"
587
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prune Checkpoint"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-05-11T14:11:57.837831Z",
+ "iopub.status.busy": "2025-05-11T14:11:57.837476Z",
+ "iopub.status.idle": "2025-05-11T14:11:57.844498Z",
+ "shell.execute_reply": "2025-05-11T14:11:57.843701Z",
+ "shell.execute_reply.started": "2025-05-11T14:11:57.837803Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "import torch\n",
612
+ "\n",
613
+ "def prune_checkpoint(checkpoint_path: str, new_checkpoint_path: str, save_ema: bool, safetensors: bool) -> str:\n",
614
+ " try:\n",
615
+ " checkpoint = torch.load(checkpoint_path, weights_only=True)\n",
616
+ " print(\"Original Checkpoint Keys:\", checkpoint.keys())\n",
617
+ "\n",
618
+ " to_retain = \"ema_model_state_dict\" if save_ema else \"model_state_dict\"\n",
619
+ " try:\n",
620
+ " model_state_dict_to_retain = checkpoint[to_retain]\n",
621
+ " except KeyError:\n",
622
+ " return f\"{to_retain} not found in the checkpoint.\"\n",
623
+ "\n",
624
+ " if safetensors:\n",
625
+ " new_checkpoint_path = new_checkpoint_path.replace(\".pt\", \".safetensors\")\n",
626
+ " save_file(model_state_dict_to_retain, new_checkpoint_path)\n",
627
+ " else:\n",
628
+ " new_checkpoint_path = new_checkpoint_path.replace(\".safetensors\", \".pt\")\n",
629
+ " new_checkpoint = {\"ema_model_state_dict\": model_state_dict_to_retain}\n",
630
+ " torch.save(new_checkpoint, new_checkpoint_path)\n",
631
+ "\n",
632
+ " return f\"New checkpoint saved at: {new_checkpoint_path}\"\n",
633
+ "\n",
634
+ " except Exception as e:\n",
635
+ " return f\"An error occurred: {e}\""
636
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-05-11T14:22:24.624318Z",
+ "iopub.status.busy": "2025-05-11T14:22:24.623974Z",
+ "iopub.status.idle": "2025-05-11T14:22:30.316195Z",
+ "shell.execute_reply": "2025-05-11T14:22:30.315529Z",
+ "shell.execute_reply.started": "2025-05-11T14:22:24.624292Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "# Prune a checkpoint after training resize model\n",
654
+ "result = prune_checkpoint(\n",
655
+ " checkpoint_path=\"/kaggle/working/F5-TTS/ckpts/vin100h-preprocessed-v2/model_last.pt\",\n",
656
+ " new_checkpoint_path=\"/root/.cache/abc.pt\",\n",
657
+ " save_ema=False,\n",
658
+ " safetensors=False\n",
659
+ ")\n",
660
+ "print(result)"
661
+ ]
+ },
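+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Verification (added sketch): confirm the pruned checkpoint loads and report its size.\n",
+ "# The path matches the prune call above; adjust it if you saved elsewhere.\n",
+ "import os\n",
+ "import torch\n",
+ "\n",
+ "pruned_path = \"/root/.cache/abc.pt\"\n",
+ "pruned = torch.load(pruned_path, weights_only=True)\n",
+ "print(\"Keys:\", list(pruned.keys()))\n",
+ "print(f\"Size on disk: {os.path.getsize(pruned_path) / 1e6:.1f} MB\")"
+ ]
+ },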
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Inference"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-05-20T17:08:02.683953Z",
+ "iopub.status.busy": "2025-05-20T17:08:02.683595Z",
+ "iopub.status.idle": "2025-05-20T17:08:02.753448Z",
+ "shell.execute_reply": "2025-05-20T17:08:02.752714Z",
+ "shell.execute_reply.started": "2025-05-20T17:08:02.683922Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "from IPython.display import Audio\n",
+ "\n",
+ "# Path to your audio file\n",
+ "audio_path = './data/vin100h-preprocessed-v2/wavs/audio_000010.wav'\n",
+ "\n",
+ "# Display and play the audio\n",
+ "Audio(audio_path)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-14T10:24:03.249295Z",
+ "iopub.status.busy": "2025-06-14T10:24:03.248968Z",
+ "iopub.status.idle": "2025-06-14T10:24:41.393133Z",
+ "shell.execute_reply": "2025-06-14T10:24:41.391987Z",
+ "shell.execute_reply.started": "2025-06-14T10:24:03.249273Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!python ./src/f5_tts/infer/infer_cli.py \\\n",
+ " --model \"vin100h-preprocessed-v2\" \\\n",
+ " --model_cfg \"./src/f5_tts/configs/F5TTS_Base.yaml\" \\\n",
+ " --ckpt_file \"./ckpts/vin100h-preprocessed-v2/model_last.pt\" \\\n",
+ " --vocab_file \"./data/vin100h-preprocessed-v2_pinyin/vocab.txt\" \\\n",
+ " --ref_audio \"./data/vin100h-preprocessed-v2/wavs/audio_000010.wav\" \\\n",
+ " --ref_text \"Về giá cả so với giá bán ngoài các siêu thị thì dâu trái ở đây rẻ hơn khá nhiều. Giả sử như bó rau ở siêu thị bán khoảng 2 đô la một bó thì ở đây chỉ có một đô la một bó. Có khi mua 50 bó được tặng thêm một bó nữa.\" \\\n",
+ " --gen_text \"Về giá cả so với giá bán ngoài các siêu thị\" \\\n",
+ " --output_dir \"/kaggle/working/\"\n",
+ " # --output_file \"/content/abc.wav\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-14T10:24:41.395230Z",
+ "iopub.status.busy": "2025-06-14T10:24:41.394917Z",
+ "iopub.status.idle": "2025-06-14T10:24:41.404325Z",
+ "shell.execute_reply": "2025-06-14T10:24:41.403321Z",
+ "shell.execute_reply.started": "2025-06-14T10:24:41.395199Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "from IPython.display import Audio\n",
+ "\n",
+ "# Path to your audio file\n",
+ "audio_path = '/kaggle/working/infer_cli_basic.wav'\n",
+ "\n",
+ "# Display and play the audio\n",
+ "Audio(audio_path)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Download"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-15T14:25:38.133173Z",
+ "iopub.status.busy": "2025-06-15T14:25:38.132898Z",
+ "iopub.status.idle": "2025-06-15T14:26:12.006111Z",
+ "shell.execute_reply": "2025-06-15T14:26:12.005444Z",
+ "shell.execute_reply.started": "2025-06-15T14:25:38.133137Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "from huggingface_hub import HfApi\n",
768
+ "from huggingface_hub import snapshot_download\n",
769
+ "import os\n",
770
+ "api = HfApi()\n",
771
+ "!git lfs install --force\n",
772
+ "\n",
773
+ "# Define the dataset name and local directory\n",
774
+ "repo_id = \"heboya8/f5-tts-dataset\"\n",
775
+ "save_path = \"/root/.cache\"\n",
776
+ "\n",
777
+ "# Create the directory if it doesn't exist\n",
778
+ "os.makedirs(save_path, exist_ok=True)\n",
779
+ "\n",
780
+ "# Download the dataset\n",
781
+ "snapshot_download(repo_id=repo_id, repo_type=\"dataset\", local_dir=save_path)"
782
+ ]
+ },
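+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sanity check (added sketch): confirm the archive landed before unzipping.\n",
+ "# The next cell assumes data_compress.zip sits at the root of the downloaded snapshot.\n",
+ "!ls -lh /root/.cache/data_compress.zip"
+ ]
+ },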
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-15T14:26:12.009357Z",
+ "iopub.status.busy": "2025-06-15T14:26:12.009122Z",
+ "iopub.status.idle": "2025-06-15T14:28:31.670192Z",
+ "shell.execute_reply": "2025-06-15T14:28:31.669158Z",
+ "shell.execute_reply.started": "2025-06-15T14:26:12.009338Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!unzip -q -o /root/.cache/data_compress.zip -d \".\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Upload"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-05-10T20:06:26.721683Z",
+ "iopub.status.busy": "2025-05-10T20:06:26.720825Z",
+ "iopub.status.idle": "2025-05-10T20:11:36.850624Z",
+ "shell.execute_reply": "2025-05-10T20:11:36.849599Z",
+ "shell.execute_reply.started": "2025-05-10T20:06:26.721632Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "from huggingface_hub import HfApi\n",
825
+ "from huggingface_hub import snapshot_download\n",
826
+ "# Initialize API\n",
827
+ "api = HfApi()\n",
828
+ "\n",
829
+ "# Upload the folder to the repository root\n",
830
+ "api.upload_large_folder(\n",
831
+ " folder_path=\"/root/.cache/dataset\", # Local folder path\n",
832
+ " repo_id=\"heboya8/f5-tts-dataset\",\n",
833
+ " repo_type=\"dataset\",\n",
834
+ " # multi_commits=True, # Enable resumable uploads\n",
835
+ " # multi_commits_verbose=True # Show progress\n",
836
+ ")"
837
+ ]
838
+ },
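+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Verification (added sketch): list a few files in the dataset repo to confirm the upload landed.\n",
+ "# Reuses the HfApi client created in the cell above.\n",
+ "for f in sorted(api.list_repo_files(\"heboya8/f5-tts-dataset\", repo_type=\"dataset\"))[:10]:\n",
+ "    print(f)"
+ ]
+ },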
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Download Dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-05-10T20:16:38.191744Z",
+ "iopub.status.busy": "2025-05-10T20:16:38.191338Z",
+ "iopub.status.idle": "2025-05-10T20:16:56.134770Z",
+ "shell.execute_reply": "2025-05-10T20:16:56.133810Z",
+ "shell.execute_reply.started": "2025-05-10T20:16:38.191712Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-05-10T20:19:28.100798Z",
+ "iopub.status.busy": "2025-05-10T20:19:28.099915Z",
+ "iopub.status.idle": "2025-05-10T20:19:28.249902Z",
+ "shell.execute_reply": "2025-05-10T20:19:28.248723Z",
+ "shell.execute_reply.started": "2025-05-10T20:19:28.100762Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!mkdir dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-05-10T20:20:05.322822Z",
+ "iopub.status.busy": "2025-05-10T20:20:05.322019Z",
+ "iopub.status.idle": "2025-05-10T20:20:05.567705Z",
+ "shell.execute_reply": "2025-05-10T20:20:05.566624Z",
+ "shell.execute_reply.started": "2025-05-10T20:20:05.322785Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!rm -rf d /root/.cache/dataset"
896
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-05-10T20:20:07.132689Z",
+ "iopub.status.busy": "2025-05-10T20:20:07.132287Z",
+ "iopub.status.idle": "2025-05-10T20:22:58.875583Z",
+ "shell.execute_reply": "2025-05-10T20:22:58.874368Z",
+ "shell.execute_reply.started": "2025-05-10T20:20:07.132656Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!unzip -q /kaggle/working/F5-TTS/~/.cache/data_compress.zip -d /root/.cache/dataset"
+ ]
+ }
+ ],
+ "metadata": {
+ "kaggle": {
+ "accelerator": "none",
+ "dataSources": [
+ {
+ "sourceId": 245622735,
+ "sourceType": "kernelVersion"
+ }
+ ],
+ "dockerImageVersionId": 31012,
+ "isGpuEnabled": false,
+ "isInternetEnabled": true,
+ "language": "python",
+ "sourceType": "notebook"
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }