danhtran2mind committed
Commit 3f9cba0 · verified · 1 Parent(s): c87ce87

Upload 244 files

This view is limited to 50 files because it contains too many changes.
Files changed (50)
  1. .gitattributes +24 -0
  2. apps/gradio_app.py +149 -0
  3. apps/gradio_app/__init__.py +0 -0
  4. apps/gradio_app/asr_utils.py +16 -0
  5. apps/gradio_app/assets/examples/f5_tts/1/infer_audio.wav +3 -0
  6. apps/gradio_app/assets/examples/f5_tts/1/infer_text.txt +1 -0
  7. apps/gradio_app/assets/examples/f5_tts/1/refer_audio.mp3 +0 -0
  8. apps/gradio_app/assets/examples/f5_tts/1/refer_text.txt +1 -0
  9. apps/gradio_app/assets/examples/f5_tts/2/infer_audio.mp3 +0 -0
  10. apps/gradio_app/assets/examples/f5_tts/2/infer_text.txt +1 -0
  11. apps/gradio_app/assets/examples/f5_tts/2/refer_audio.mp3 +0 -0
  12. apps/gradio_app/assets/examples/f5_tts/2/refer_text.txt +1 -0
  13. apps/gradio_app/assets/examples/f5_tts/3/infer_audio.mp3 +0 -0
  14. apps/gradio_app/assets/examples/f5_tts/3/infer_text.txt +1 -0
  15. apps/gradio_app/assets/examples/f5_tts/3/refer_audio.mp3 +3 -0
  16. apps/gradio_app/assets/examples/f5_tts/3/refer_text.txt +1 -0
  17. apps/gradio_app/assets/examples/f5_tts/4/infer_audio.mp3 +0 -0
  18. apps/gradio_app/assets/examples/f5_tts/4/infer_text.txt +1 -0
  19. apps/gradio_app/assets/examples/f5_tts/4/refer_audio.mp3 +3 -0
  20. apps/gradio_app/assets/examples/f5_tts/4/refer_text.txt +1 -0
  21. apps/gradio_app/components.py +90 -0
  22. apps/gradio_app/setup_scripts.py +61 -0
  23. apps/gradio_app/static/scripts.js +0 -0
  24. apps/gradio_app/static/styles.css +100 -0
  25. apps/old-gradio_app.py +140 -0
  26. assets/.gitkeep +0 -0
  27. assets/examples/f5_tts/1/infer_audio.wav +3 -0
  28. assets/examples/f5_tts/1/infer_text.txt +1 -0
  29. assets/examples/f5_tts/1/refer_audio.mp3 +0 -0
  30. assets/examples/f5_tts/1/refer_text.txt +1 -0
  31. assets/examples/f5_tts/2/infer_audio.mp3 +0 -0
  32. assets/examples/f5_tts/2/infer_text.txt +1 -0
  33. assets/examples/f5_tts/2/refer_audio.mp3 +0 -0
  34. assets/examples/f5_tts/2/refer_text.txt +1 -0
  35. assets/examples/f5_tts/3/infer_audio.mp3 +0 -0
  36. assets/examples/f5_tts/3/infer_text.txt +1 -0
  37. assets/examples/f5_tts/3/refer_audio.mp3 +3 -0
  38. assets/examples/f5_tts/3/refer_text.txt +1 -0
  39. assets/examples/f5_tts/4/infer_audio.mp3 +0 -0
  40. assets/examples/f5_tts/4/infer_text.txt +1 -0
  41. assets/examples/f5_tts/4/refer_audio.mp3 +3 -0
  42. assets/examples/f5_tts/4/refer_text.txt +1 -0
  43. ckpts/.gitkeep +0 -0
  44. configs/.gitkeep +0 -0
  45. configs/vi-fine-tuned-f5-tts.yaml +52 -0
  46. data/.gitkeep +0 -0
  47. docs/.gitkeep +0 -0
  48. docs/inference/inference_doc.md +38 -0
  49. docs/training/training_doc.md +0 -0
  50. notebooks/1-vi-fine-tuned-t5-tts.ipynb +952 -0
.gitattributes CHANGED
@@ -33,3 +33,27 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ apps/gradio_app/assets/examples/f5_tts/1/infer_audio.wav filter=lfs diff=lfs merge=lfs -text
+ apps/gradio_app/assets/examples/f5_tts/3/refer_audio.mp3 filter=lfs diff=lfs merge=lfs -text
+ apps/gradio_app/assets/examples/f5_tts/4/refer_audio.mp3 filter=lfs diff=lfs merge=lfs -text
+ assets/examples/f5_tts/1/infer_audio.wav filter=lfs diff=lfs merge=lfs -text
+ assets/examples/f5_tts/3/refer_audio.mp3 filter=lfs diff=lfs merge=lfs -text
+ assets/examples/f5_tts/4/refer_audio.mp3 filter=lfs diff=lfs merge=lfs -text
+ src/f5_tts/src/f5_tts/infer/examples/basic/basic_ref_en.wav filter=lfs diff=lfs merge=lfs -text
+ src/f5_tts/src/f5_tts/infer/examples/basic/basic_ref_zh.wav filter=lfs diff=lfs merge=lfs -text
+ src/f5_tts/src/f5_tts/infer/examples/multi/country.flac filter=lfs diff=lfs merge=lfs -text
+ src/f5_tts/src/f5_tts/infer/examples/multi/main.flac filter=lfs diff=lfs merge=lfs -text
+ src/f5_tts/src/f5_tts/infer/examples/multi/town.flac filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/dance_24k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/hifitts_44k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/jensen_24k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/libritts_24k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/megalovania_24k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/musdbhq_44k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/musiccaps1_44k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/musiccaps2_44k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/demo/examples/queen_24k.wav filter=lfs diff=lfs merge=lfs -text
+ src/third_party/BigVGAN/filelists/LibriTTS/train-full.txt filter=lfs diff=lfs merge=lfs -text
+ tests/test_data/1/infer_audio.wav filter=lfs diff=lfs merge=lfs -text
+ tests/test_data/3/refer_audio.mp3 filter=lfs diff=lfs merge=lfs -text
+ tests/test_data/4/refer_audio.mp3 filter=lfs diff=lfs merge=lfs -text
apps/gradio_app.py ADDED
@@ -0,0 +1,149 @@
+ import gradio as gr
+ from gradio_app.components import (
+     get_files_in_ckpts, handle_file_upload,
+     run_tts_inference,
+     run_setup_script
+ )
+ from gradio_app.asr_utils import transcribe_audio
+ from pathlib import Path
+
+ def create_gradio_app():
+     """Create Gradio interface for F5-TTS inference with Whisper ASR."""
+     # Run setup script to ensure dependencies are installed
+     run_setup_script()
+
+     # Function to update reference text based on audio file and Whisper checkbox
+     def update_ref_text(audio_file_path, use_whisper):
+         if use_whisper and audio_file_path:
+             return transcribe_audio(audio_file_path)
+         return gr.update()
+
+     def toggle_model_inputs(use_upload):
+         return (
+             gr.update(visible=not use_upload),
+             gr.update(visible=not use_upload),
+             gr.update(visible=not use_upload),
+             gr.update(visible=use_upload),
+             gr.update(visible=use_upload),
+             gr.update(visible=use_upload)
+         )
+
+     def load_example(ref_audio_path, ref_text, inf_text):
+         """Load example inputs and retrieve corresponding infer_audio for output."""
+         # Find the matching example folder to get infer_audio
+         example_dirs = [
+             Path("apps/gradio_app/assets/examples/f5_tts/1"),
+             Path("apps/gradio_app/assets/examples/f5_tts/2"),
+             Path("apps/gradio_app/assets/examples/f5_tts/3"),
+             Path("apps/gradio_app/assets/examples/f5_tts/4")
+         ]
+         inf_audio_path = None
+         for dir_path in example_dirs:
+             if dir_path.exists():
+                 ref_audio = next((f for f in dir_path.glob("refer_audio.*") if f.suffix in [".mp3", ".wav"]), None)
+                 if ref_audio and str(ref_audio) == ref_audio_path:
+                     inf_audio = next((f for f in dir_path.glob("infer_audio.*") if f.suffix in [".mp3", ".wav"]), None)
+                     inf_audio_path = str(inf_audio) if inf_audio else None
+                     break
+
+         return ref_audio_path, ref_text, inf_text, inf_audio_path
+
+     # Prepare examples for gr.Examples (exclude infer_audio from table)
+     example_dirs = [
+         Path("apps/gradio_app/assets/examples/f5_tts/1"),
+         Path("apps/gradio_app/assets/examples/f5_tts/2"),
+         Path("apps/gradio_app/assets/examples/f5_tts/3"),
+         Path("apps/gradio_app/assets/examples/f5_tts/4")
+     ]
+     examples = []
+     for dir_path in example_dirs:
+         if not dir_path.exists():
+             continue
+         # Read text files
+         ref_text = (dir_path / "refer_text.txt").read_text(encoding="utf-8") if (dir_path / "refer_text.txt").exists() else ""
+         inf_text = (dir_path / "infer_text.txt").read_text(encoding="utf-8") if (dir_path / "infer_text.txt").exists() else ""
+         # Find audio files (mp3 or wav)
+         ref_audio = next((f for f in dir_path.glob("refer_audio.*") if f.suffix in [".mp3", ".wav"]), None)
+         examples.append([
+             str(ref_audio) if ref_audio else None,
+             ref_text,
+             inf_text
+         ])
+
+     CSS = open("apps/gradio_app/static/styles.css", "r").read()
+     with gr.Blocks(css=CSS) as demo:
+         gr.Markdown("# F5-TTS Audio Generation")
+         gr.Markdown("Generate high-quality audio with a fine-tuned F5-TTS model. Upload reference audio, use Whisper ASR for transcription, enter text, adjust speed, and select or upload model files.")
+
+         with gr.Row():
+             with gr.Column():
+                 ref_audio = gr.Audio(label="Reference Audio", type="filepath")
+                 with gr.Group():
+                     use_whisper = gr.Checkbox(label="Use Whisper ASR for Transcription", value=False)
+                     ref_text = gr.Textbox(
+                         label="Reference Text",
+                         placeholder="e.g., Sau nhà Ngô, lần lượt các triều Đinh...",
+                         lines=1
+                     )
+                     gen_text = gr.Textbox(
+                         label="Generated Text",
+                         placeholder="e.g., Nhà Tiền Lê, Lý và Trần đã chống trả...",
+                         lines=1
+                     )
+                 generate_btn = gr.Button("Generate Audio")
+             with gr.Column():
+                 output_audio = gr.Audio(label="Generated Audio")
+                 output_text = gr.Textbox(label="Status", interactive=False)
+                 with gr.Group():
+                     speed = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Speed")
+                     model_cfg = gr.Dropdown(
+                         choices=get_files_in_ckpts([".yaml"]),
+                         label="Model Config (*.yaml)",
+                         value=get_files_in_ckpts([".yaml"])[0],
+                         visible=True
+                     )
+                     ckpt_file = gr.Dropdown(
+                         choices=get_files_in_ckpts([".pt", ".safetensors"], include_subdirs=True),
+                         label="Checkpoint File (*.pt or *.safetensors)",
+                         value=get_files_in_ckpts([".pt", ".safetensors"], include_subdirs=True)[0],
+                         visible=True
+                     )
+                     vocab_file = gr.Dropdown(
+                         choices=get_files_in_ckpts([".txt", ".safetensors"]),
+                         label="Vocab File (*.txt or *.safetensors)",
+                         value=get_files_in_ckpts([".txt", ".safetensors"])[0],
+                         visible=True
+                     )
+                     use_upload = gr.Checkbox(label="Upload Custom Model Files", value=False)
+                     model_cfg_upload = gr.File(label="Model Config (*.yaml)", file_types=[".yaml"], visible=False)
+                     ckpt_file_upload = gr.File(label="Checkpoint File (*.pt or *.safetensors)", file_types=[".pt", ".safetensors"], visible=False)
+                     vocab_file_upload = gr.File(label="Vocab File (*.txt or *.safetensors)", file_types=[".txt", ".safetensors"], visible=False)
+
+         # Add Examples component after both columns
+         gr.Examples(
+             examples=examples,
+             inputs=[ref_audio, ref_text, gen_text],
+             outputs=[ref_audio, ref_text, gen_text, output_audio],  # Keep output_audio to display infer_audio
+             fn=load_example,
+             label="Example Inputs",
+             examples_per_page=4,
+             cache_examples=False
+         )
+
+         ref_audio.change(fn=update_ref_text, inputs=[ref_audio, use_whisper], outputs=ref_text)
+         use_whisper.change(fn=update_ref_text, inputs=[ref_audio, use_whisper], outputs=ref_text)
+         use_upload.change(
+             fn=toggle_model_inputs,
+             inputs=[use_upload],
+             outputs=[model_cfg, ckpt_file, vocab_file, model_cfg_upload, ckpt_file_upload, vocab_file_upload]
+         )
+         generate_btn.click(
+             fn=run_tts_inference,
+             inputs=[ref_audio, ref_text, gen_text, speed, use_upload, model_cfg, ckpt_file, vocab_file],
+             outputs=[output_audio, output_text]
+         )
+     return demo
+
+ if __name__ == "__main__":
+     demo = create_gradio_app()
+     demo.launch(share=True)
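
A minimal launch sketch for this entry point, assuming the repository root as the working directory so the relative asset, config, and checkpoint paths above resolve; the subprocess wrapper is illustrative, not part of the commit:

```python
# Minimal launch sketch (assumption: run from the repository root).
import subprocess

# Equivalent to `python apps/gradio_app.py` from a shell; the script
# itself calls demo.launch(share=True) to request a public Gradio link.
subprocess.run(["python", "apps/gradio_app.py"], check=True)
```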
apps/gradio_app/__init__.py ADDED
File without changes
apps/gradio_app/asr_utils.py ADDED
@@ -0,0 +1,16 @@
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+ import librosa
+
+ def transcribe_audio(audio_file_path):
+     """Transcribe audio using PhoWhisper-tiny model."""
+     try:
+         processor = WhisperProcessor.from_pretrained("vinai/PhoWhisper-tiny")
+         model = WhisperForConditionalGeneration.from_pretrained("vinai/PhoWhisper-tiny")
+         audio, sr = librosa.load(audio_file_path, sr=16000)
+         input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
+         forced_decoder_ids = processor.get_decoder_prompt_ids(language="vi", task="transcribe")
+         predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
+         transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+         return transcription[0] if transcription else ""
+     except Exception as e:
+         return f"Transcription error: {str(e)}"
apps/gradio_app/assets/examples/f5_tts/1/infer_audio.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2924700ad369afabb4489eceec9c5e1e9c0fae90a3409f480678aba7a79a7378
+ size 127020
apps/gradio_app/assets/examples/f5_tts/1/infer_text.txt ADDED
@@ -0,0 +1 @@
+ chào mọi người, mọi người khỏe không?
apps/gradio_app/assets/examples/f5_tts/1/refer_audio.mp3 ADDED
Binary file (35.3 kB)
apps/gradio_app/assets/examples/f5_tts/1/refer_text.txt ADDED
@@ -0,0 +1 @@
+ bạn và tôi đều như nhau nhé, rồi chúng ta đi đâu nè
apps/gradio_app/assets/examples/f5_tts/2/infer_audio.mp3 ADDED
Binary file (12.7 kB)
apps/gradio_app/assets/examples/f5_tts/2/infer_text.txt ADDED
@@ -0,0 +1 @@
+ Tôi rất khỏe,cảm ơn mọi người đã quan tâm.
apps/gradio_app/assets/examples/f5_tts/2/refer_audio.mp3 ADDED
Binary file (61.9 kB)
apps/gradio_app/assets/examples/f5_tts/2/refer_text.txt ADDED
@@ -0,0 +1 @@
+ Chúng thường sống hòa bình với các loài động vật khác, kể cả những loài săn mồi.
apps/gradio_app/assets/examples/f5_tts/3/infer_audio.mp3 ADDED
Binary file (51.8 kB)
apps/gradio_app/assets/examples/f5_tts/3/infer_text.txt ADDED
@@ -0,0 +1 @@
+ Nhà Tiền Lê, Lý và Trần đã chống trả các cuộc tấn công của nhà Tống và nhà Mông – Nguyên, đều thắng lợi và bảo vệ được Đại Việt.
apps/gradio_app/assets/examples/f5_tts/3/refer_audio.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd15755a7704fd99247dfae618a4f8e9d9655af735def78e6fdec5467faca641
+ size 183110
apps/gradio_app/assets/examples/f5_tts/3/refer_text.txt ADDED
@@ -0,0 +1 @@
+ Sau nhà Ngô, lần lượt các triều Đinh, Tiền Lê, Lý và Trần tổ chức chính quyền tương tự các triều đại Trung Hoa, lấy Phật giáo làm tôn giáo chính của quốc gia và cho truyền bá cả Nho giáo và Đạo giáo.
apps/gradio_app/assets/examples/f5_tts/4/infer_audio.mp3 ADDED
Binary file (52.7 kB)
apps/gradio_app/assets/examples/f5_tts/4/infer_text.txt ADDED
@@ -0,0 +1 @@
+ Người dân Đông Á cổ đại đã uống trà trong nhiều thế kỷ, thậm chí có thể là hàng thiên niên kỷ , trước khi sử dụng nó như một thức uống.
apps/gradio_app/assets/examples/f5_tts/4/refer_audio.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ea81c8700f5ff2e6497c9beaa942b5ed107e03ae468472d78a4c8c80e3b63af
+ size 138388
apps/gradio_app/assets/examples/f5_tts/4/refer_text.txt ADDED
@@ -0,0 +1 @@
+ Cấu trúc sừng và mào là phổ biến ở tất cả các nhóm khủng long, và vài nhóm thậm chí còn phát triển các biến đổi bộ xương như giáp mô hoặc gai.
apps/gradio_app/components.py ADDED
@@ -0,0 +1,90 @@
+ import os
+ import subprocess
+ import uuid
+ from pathlib import Path
+ import shutil
+
+ def run_setup_script():
+     setup_script = os.path.join(os.path.dirname(__file__), "setup_scripts.py")
+     try:
+         result = subprocess.run(["python", setup_script], capture_output=True, text=True, check=True)
+         return result.stdout
+     except subprocess.CalledProcessError as e:
+         return f"Setup script failed: {e.stderr}"
+
+
+ def get_files_in_ckpts(extensions, include_subdirs=False):
+     """List files in ckpts directory with specified extensions, optionally including subdirectories."""
+     ckpts_dir = Path("ckpts")
+     if not ckpts_dir.exists():
+         return ["No files found"]
+     files = []
+     for ext in extensions:
+         if include_subdirs:
+             files.extend([str(f) for f in ckpts_dir.glob(f"**/*{ext}")])
+         else:
+             files.extend([str(f) for f in ckpts_dir.glob(f"*{ext}")])
+     return files if files else ["No files found"]
+
+ def handle_file_upload(file_obj, allowed_extensions):
+     """Copy uploaded file to a permanent location and validate extension."""
+     if not file_obj:
+         return None, "No file uploaded."
+     try:
+         file_ext = os.path.splitext(file_obj.name)[1].lower()
+         if file_ext not in allowed_extensions:
+             return None, f"Invalid file extension. Allowed: {', '.join(allowed_extensions)}"
+         upload_dir = Path("uploads")
+         upload_dir.mkdir(exist_ok=True)
+         file_name = f"upload_{str(uuid.uuid4())[:8]}{file_ext}"
+         dest_path = upload_dir / file_name
+         shutil.copyfile(file_obj.name, dest_path)
+         return str(dest_path), None
+     except Exception as e:
+         return None, f"File upload error: {str(e)}"
+
+ def run_tts_inference(ref_audio, ref_text, gen_text, speed, use_upload, model_cfg, ckpt_file, vocab_file):
+     """Run F5-TTS inference with selected or uploaded model files."""
+     if use_upload:
+         model_cfg_path, model_cfg_error = handle_file_upload(model_cfg, [".yaml"])
+         ckpt_file_path, ckpt_file_error = handle_file_upload(ckpt_file, [".pt", ".safetensors"])
+         vocab_file_path, vocab_file_error = handle_file_upload(vocab_file, [".txt", ".safetensors"])
+         if model_cfg_error or ckpt_file_error or vocab_file_error:
+             return None, model_cfg_error or ckpt_file_error or vocab_file_error
+         if not (model_cfg_path and ckpt_file_path and vocab_file_path):
+             return None, "Please upload all model files (model_cfg, ckpt_file, vocab_file)."
+         config = {"model_cfg": model_cfg_path, "ckpt_file": ckpt_file_path, "vocab_file": vocab_file_path}
+     else:
+         if any(f == "No files found" for f in [model_cfg, ckpt_file, vocab_file]):
+             return None, "No valid model files found in ckpts. Upload custom files or add files to ckpts."
+         config = {"model_cfg": model_cfg, "ckpt_file": ckpt_file, "vocab_file": vocab_file}
+
+     if not ref_audio:
+         return None, "Reference audio is required."
+
+     output_dir = "apps/gradio_app/temp_data"
+     os.makedirs(output_dir, exist_ok=True)
+     output_file = f"infer_audio_{str(uuid.uuid4())[:8]}.mp3"
+     output_path = os.path.join(output_dir, output_file)
+
+     try:
+         command = [
+             "python", "src/f5_tts/infer/infer_cli.py",
+             "--model_cfg", config["model_cfg"],
+             "--ckpt_file", config["ckpt_file"],
+             "--vocab_file", config["vocab_file"],
+             "--ref_audio", ref_audio,
+             "--ref_text", ref_text,
+             "--gen_text", gen_text,
+             "--speed", str(speed),
+             "--output_dir", output_dir,
+             "--output_file", output_file
+         ]
+         result = subprocess.run(command, capture_output=True, text=True)
+         if result.returncode != 0:
+             return None, f"Inference error: {result.stderr}"
+         if not os.path.exists(output_path):
+             return None, f"Output audio not found at {output_path}"
+         return output_path, "Audio generated successfully!"
+     except Exception as e:
+         return None, f"Inference error: {str(e)}"
apps/gradio_app/setup_scripts.py ADDED
@@ -0,0 +1,61 @@
+ import subprocess
+ import sys
+ import os
+
+ def run_script(script_path, args=None):
+     """
+     Run a Python script using subprocess with optional arguments and handle errors.
+     Returns True if successful, False otherwise.
+     """
+     try:
+         command = [sys.executable, script_path]
+         if args:
+             command.extend(args)
+         result = subprocess.run(
+             command,
+             check=True,
+             text=True,
+             capture_output=True
+         )
+         print(f"Successfully executed {script_path}")
+         print(result.stdout)
+         return True
+     except subprocess.CalledProcessError as e:
+         print(f"Error executing {script_path}:")
+         print(e.stderr)
+         return False
+     except FileNotFoundError:
+         print(f"Script not found: {script_path}")
+         return False
+
+ def main():
+     """
+     Main function to execute setup_third_party.py and download_ckpts.py in sequence.
+     """
+     scripts_dir = "scripts"
+     scripts = [
+         {
+             "path": os.path.join(scripts_dir, "setup_third_party.py"),
+             "args": None
+         },
+         {
+             "path": os.path.join(scripts_dir, "download_ckpts.py"),
+             "args": [
+                 "--repo_id", "danhtran2mind/Vi-F5-TTS",
+                 "--local_dir", "./ckpts",
+                 "--pruning_model"
+             ]
+         }
+     ]
+
+     for script in scripts:
+         script_path = script["path"]
+         args = script["args"]
+         print(f"Start running {script_path} {' '.join(args) if args else ''}\n")
+         if not run_script(script_path, args):
+             print(f"Stopping execution due to error in {script_path}")
+             sys.exit(1)
+         print(f"Completed {script_path}\n")
+
+ if __name__ == "__main__":
+     main()
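
The `run_script` helper can also be reused for a single step; a sketch, assuming the repository root as the working directory. Note that `scripts/download_ckpts.py` is referenced by `main()` but falls outside this truncated 50-file view, so its flags are taken as given:

```python
# Sketch: run only the checkpoint download step via run_script.
import sys
sys.path.insert(0, "apps")

from gradio_app.setup_scripts import run_script

ok = run_script("scripts/download_ckpts.py", [
    "--repo_id", "danhtran2mind/Vi-F5-TTS",
    "--local_dir", "./ckpts",
    "--pruning_model",
])
if not ok:
    raise SystemExit("checkpoint download failed")
```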
apps/gradio_app/static/scripts.js ADDED
File without changes
apps/gradio_app/static/styles.css ADDED
@@ -0,0 +1,100 @@
+ /* General body styling */
+ .gradio-container {
+     background: linear-gradient(180deg, #f9fafb, #f1efef);
+     font-family: 'Quicksand', ui-sans-serif, sans-serif;
+     color: #6b46c1; /* Purple-800 for text (neutral hue) */
+     font-size: 16px; /* Medium text size */
+     font-weight: 400;
+ }
+
+ /* Dark mode background */
+ @media (prefers-color-scheme: dark) {
+     .gradio-container {
+         background: linear-gradient(180deg, #1f2937, #111827);
+         color: #d6bcfa; /* Lighter purple for dark mode */
+     }
+ }
+
+ /* Block styling (containers for components) */
+ .block {
+     border: 1px solid #e9d8fd; /* Purple-200 for borders */
+     border-radius: 8px; /* Medium radius */
+     box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05); /* Small shadow */
+     padding: 16px; /* Medium spacing */
+     background: #f1efef;
+ }
+
+ /* Input fields */
+ input[type="text"], textarea {
+     background: #faf5ff; /* Purple-50 for input background */
+     border: 1px solid #e9d8fd; /* Purple-200 for borders */
+     border-radius: 8px;
+     padding: 8px;
+     font-family: 'Quicksand', ui-sans-serif, sans-serif;
+     font-size: 16px;
+     color: #6b46c1;
+     box-shadow: none;
+ }
+ input[type="text"]:focus, textarea:focus {
+     outline: none;
+     box-shadow: 0 1px 2px rgba(0, 0, 0, 0.1); /* Small shadow on focus */
+     border-color: #48bb78; /* Green-400 for focus */
+ }
+
+ /* Primary button */
+ button.primary {
+     background: #48bb78; /* Green-400 */
+     color: #f1efef;
+     border: none;
+     border-radius: 8px;
+     padding: 8px 16px;
+     font-family: 'Quicksand', ui-sans-serif, sans-serif;
+     font-size: 16px;
+     font-weight: 500;
+     box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05);
+     cursor: pointer;
+ }
+ button.primary:hover {
+     background: #ed8936; /* Orange-400 for hover */
+     box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1); /* Medium shadow on hover */
+ }
+
+ /* Secondary button */
+ button.secondary {
+     color: #48bb78; /* Green-400 for text */
+     border: 1px solid #48bb78; /* Green-400 for border */
+     border-radius: 8px;
+     padding: 8px 16px;
+     font-family: 'Quicksand', ui-sans-serif, sans-serif;
+     font-size: 16px;
+     font-weight: 500;
+     box-shadow: 0 1px 2px rgba(0, 0, 0, 0.05);
+     cursor: pointer;
+ }
+ button.secondary:hover {
+     background: #ed8936; /* Orange-400 for hover */
+     color: #48bb78;
+     box-shadow: 0 2px 4px rgba(0, 0, 0, 0.1);
+ }
+
+ /* Slider styling */
+ input[type="range"] {
+     accent-color: #ed8936; /* Orange-400 for slider */
+ }
+ @media (prefers-color-scheme: dark) {
+     input[type="range"] {
+         accent-color: #f6ad55; /* Orange-600 for dark mode */
+     }
+ }
+
+ /* Markdown headers */
+ h2 {
+     font-weight: 500;
+     color: #6b46c1; /* Purple-800 */
+     margin-bottom: 16px;
+ }
+
+ /* Code or monospace elements */
+ code, pre {
+     font-family: 'IBM Plex Mono', ui-monospace, monospace;
+ }
apps/old-gradio_app.py ADDED
@@ -0,0 +1,140 @@
+ import gradio as gr
+ import os
+ import subprocess
+ import tempfile
+ from pathlib import Path
+ from transformers import WhisperProcessor, WhisperForConditionalGeneration
+ import librosa
+
+ def transcribe_audio(audio_file_path):
+     """Transcribe audio using PhoWhisper-tiny model."""
+     try:
+         processor = WhisperProcessor.from_pretrained("vinai/PhoWhisper-tiny")
+         model = WhisperForConditionalGeneration.from_pretrained("vinai/PhoWhisper-tiny")
+         audio, sr = librosa.load(audio_file_path, sr=16000)
+         input_features = processor(audio, sampling_rate=16000, return_tensors="pt").input_features
+         forced_decoder_ids = processor.get_decoder_prompt_ids(language="vi", task="transcribe")
+         predicted_ids = model.generate(input_features, forced_decoder_ids=forced_decoder_ids)
+         transcription = processor.batch_decode(predicted_ids, skip_special_tokens=True)
+         return transcription[0] if transcription else ""
+     except Exception as e:
+         return f"Error during transcription: {str(e)}"
+
+ def run_tts_inference(ref_audio, ref_text, gen_text, speed, model_option):
+     """
+     Run the F5-TTS inference script with provided inputs and return the output audio path.
+     """
+     model_configs = {
+         "Vietnamese Fine-Tuned": {
+             "model_cfg": "ckpts/vi-fine-tuned-f5-tts.yaml",
+             "ckpt_file": "ckpts/Vi_F5_TTS_ckpts/pruning_model.pt",
+             "vocab_file": "ckpts/vocab.txt"
+         },
+     }
+
+     if model_option not in model_configs:
+         return None, f"Invalid model option: {model_option}"
+
+     config = model_configs[model_option]
+
+     output_dir = "apps/gradio_app/temp_data"
+     os.makedirs(output_dir, exist_ok=True)
+     output_file = "infer_audio.mp3"
+     output_path = os.path.join(output_dir, output_file)
+
+     if ref_audio:
+         temp_audio = ref_audio
+     else:
+         return None, "Reference audio is required"
+
+     # with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as temp_ref_text:
+     #     temp_ref_text.write(ref_text or "")
+     #     temp_ref_text_path = temp_ref_text.name
+
+     # with tempfile.NamedTemporaryFile(mode='w', suffix='.txt', delete=False) as temp_gen_text:
+     #     temp_gen_text.write(gen_text or "")
+     #     temp_gen_text_path = temp_gen_text.name
+
+     try:
+         command = [
+             "python", "src/f5_tts/infer/infer_cli.py",
+             "--model_cfg", config["model_cfg"],
+             "--ckpt_file", config["ckpt_file"],
+             "--vocab_file", config["vocab_file"],
+             "--ref_audio", temp_audio,
+             "--ref_text", ref_text,
+             "--gen_text", gen_text,
+             "--speed", str(speed),
+             "--output_dir", output_dir,
+             "--output_file", output_file
+         ]
+
+         result = subprocess.run(command, capture_output=True, text=True)
+
+         if result.returncode != 0:
+             return None, f"Error running inference: {result.stderr}"
+
+         if not os.path.exists(output_path):
+             return None, f"Output audio file not found at {output_path}"
+
+         return output_path, "Audio generated successfully!"
+
+     except Exception as e:
+         return None, f"Error during inference: {str(e)}"
+
+
+ def create_gradio_app():
+     """
+     Create and return a Gradio interface for the F5-TTS inference with optional Whisper ASR.
+     """
+     def update_ref_text(audio_file_path, use_whisper):
+         """Conditionally transcribe audio based on Whisper checkbox."""
+         if use_whisper and audio_file_path:
+             return transcribe_audio(audio_file_path)
+         return gr.update()  # Keep current text if Whisper is disabled or no audio
+
+     with gr.Blocks() as demo:
+         gr.Markdown("# F5-TTS Audio Generation App")
+         gr.Markdown("Generate audio using a fine-tuned F5-TTS model. Upload a reference audio, enable Whisper ASR for auto-transcription or manually enter reference text, provide generated text, and adjust the speed.")
+
+         with gr.Row():
+             with gr.Column():
+                 ref_audio = gr.Audio(label="Reference Audio", type="filepath")
+                 use_whisper = gr.Checkbox(label="Use Whisper ASR for Reference Text", value=False)
+                 ref_text = gr.Textbox(label="Reference Text", placeholder="e.g., Sau nhà Ngô, lần lượt các triều Đinh...")
+                 gen_text = gr.Textbox(label="Generated Text", placeholder="e.g., Nhà Tiền Lê, Lý và Trần đã chống trả...")
+                 speed = gr.Slider(0.5, 2.0, 1.0, step=0.1, label="Speed")
+                 model_option = gr.Dropdown(
+                     choices=["Vietnamese Fine-Tuned"],
+                     label="Model Option",
+                     value="Vietnamese Fine-Tuned"
+                 )
+                 generate_btn = gr.Button("Generate Audio")
+
+             with gr.Column():
+                 output_audio = gr.Audio(label="Generated Audio")
+                 output_text = gr.Textbox(label="Status")
+
+         # Update reference text when audio is uploaded or Whisper checkbox changes
+         ref_audio.change(
+             fn=update_ref_text,
+             inputs=[ref_audio, use_whisper],
+             outputs=ref_text
+         )
+         use_whisper.change(
+             fn=update_ref_text,
+             inputs=[ref_audio, use_whisper],
+             outputs=ref_text
+         )
+
+         generate_btn.click(
+             fn=run_tts_inference,
+             inputs=[ref_audio, ref_text, gen_text, speed, model_option],
+             outputs=[output_audio, output_text]
+         )
+
+     return demo
+
+ if __name__ == "__main__":
+     demo = create_gradio_app()
+     demo.launch(share=True)
assets/.gitkeep ADDED
File without changes
assets/examples/f5_tts/1/infer_audio.wav ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2924700ad369afabb4489eceec9c5e1e9c0fae90a3409f480678aba7a79a7378
+ size 127020
assets/examples/f5_tts/1/infer_text.txt ADDED
@@ -0,0 +1 @@
+ chào mọi người, mọi người khỏe không?
assets/examples/f5_tts/1/refer_audio.mp3 ADDED
Binary file (35.3 kB)
assets/examples/f5_tts/1/refer_text.txt ADDED
@@ -0,0 +1 @@
+ bạn và tôi đều như nhau nhé, rồi chúng ta đi đâu nè
assets/examples/f5_tts/2/infer_audio.mp3 ADDED
Binary file (12.7 kB)
assets/examples/f5_tts/2/infer_text.txt ADDED
@@ -0,0 +1 @@
+ Tôi rất khỏe,cảm ơn mọi người đã quan tâm.
assets/examples/f5_tts/2/refer_audio.mp3 ADDED
Binary file (61.9 kB)
assets/examples/f5_tts/2/refer_text.txt ADDED
@@ -0,0 +1 @@
+ Chúng thường sống hòa bình với các loài động vật khác, kể cả những loài săn mồi.
assets/examples/f5_tts/3/infer_audio.mp3 ADDED
Binary file (51.8 kB)
assets/examples/f5_tts/3/infer_text.txt ADDED
@@ -0,0 +1 @@
+ Nhà Tiền Lê, Lý và Trần đã chống trả các cuộc tấn công của nhà Tống và nhà Mông – Nguyên, đều thắng lợi và bảo vệ được Đại Việt.
assets/examples/f5_tts/3/refer_audio.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cd15755a7704fd99247dfae618a4f8e9d9655af735def78e6fdec5467faca641
+ size 183110
assets/examples/f5_tts/3/refer_text.txt ADDED
@@ -0,0 +1 @@
+ Sau nhà Ngô, lần lượt các triều Đinh, Tiền Lê, Lý và Trần tổ chức chính quyền tương tự các triều đại Trung Hoa, lấy Phật giáo làm tôn giáo chính của quốc gia và cho truyền bá cả Nho giáo và Đạo giáo.
assets/examples/f5_tts/4/infer_audio.mp3 ADDED
Binary file (52.7 kB)
assets/examples/f5_tts/4/infer_text.txt ADDED
@@ -0,0 +1 @@
+ Người dân Đông Á cổ đại đã uống trà trong nhiều thế kỷ, thậm chí có thể là hàng thiên niên kỷ , trước khi sử dụng nó như một thức uống.
assets/examples/f5_tts/4/refer_audio.mp3 ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:9ea81c8700f5ff2e6497c9beaa942b5ed107e03ae468472d78a4c8c80e3b63af
+ size 138388
assets/examples/f5_tts/4/refer_text.txt ADDED
@@ -0,0 +1 @@
+ Cấu trúc sừng và mào là phổ biến ở tất cả các nhóm khủng long, và vài nhóm thậm chí còn phát triển các biến đổi bộ xương như giáp mô hoặc gai.
ckpts/.gitkeep ADDED
File without changes
configs/.gitkeep ADDED
File without changes
configs/vi-fine-tuned-f5-tts.yaml ADDED
@@ -0,0 +1,52 @@
+ hydra:
+   run:
+     dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}
+
+ datasets:
+   name: vin100h-preprocessed-v2 # dataset name
+   batch_size_per_gpu: 3200 # 1 GPU, 1 * 3200 = 3200
+   batch_size_type: frame # frame | sample
+   max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models
+   num_workers: 4
+
+ optim:
+   epochs: 80
+   learning_rate: 1e-5
+   num_warmup_updates: 2761 # warmup updates
+   grad_accumulation_steps: 2 # note: updates = steps / grad_accumulation_steps
+   max_grad_norm: 1.0 # gradient clipping
+   bnb_optimizer: False # use bnb 8bit AdamW optimizer or not
+
+ model:
+   name: vi_fine_tuned_t5_tts # model name
+   tokenizer: pinyin # tokenizer type
+   tokenizer_path: null # if 'custom' tokenizer, define the path want to use (should be vocab.txt)
+   backbone: DiT
+   arch:
+     dim: 1024
+     depth: 22
+     heads: 16
+     ff_mult: 2
+     text_dim: 512
+     text_mask_padding: False
+     conv_layers: 4
+     pe_attn_head: 1
+     checkpoint_activations: False # recompute activations and save memory for extra compute
+   mel_spec:
+     target_sample_rate: 24000
+     n_mel_channels: 100
+     hop_length: 256
+     win_length: 1024
+     n_fft: 1024
+     mel_spec_type: vocos # vocos | bigvgan
+   vocoder:
+     is_local: False # use local offline ckpt or not
+     local_path: null # local vocoder path
+
+ ckpts:
+   logger: null # wandb | tensorboard | null
+   log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples
+   save_per_updates: 4000 # save checkpoint per updates
+   keep_last_n_checkpoints: 1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints
+   last_per_updates: 4000 # save last checkpoint per updates
+   save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}
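
A quick way to sanity-check this config after editing; a sketch assuming PyYAML is installed. The `${...}` fields are Hydra interpolations and remain literal strings when loaded outside Hydra:

```python
# Sketch: load and inspect the training config (assumes PyYAML).
import yaml

with open("configs/vi-fine-tuned-f5-tts.yaml") as f:
    cfg = yaml.safe_load(f)

print(cfg["optim"]["epochs"])         # 80
print(cfg["model"]["arch"]["depth"])  # 22
print(cfg["ckpts"]["save_dir"])       # Hydra ${...} left unresolved
```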
data/.gitkeep ADDED
File without changes
docs/.gitkeep ADDED
File without changes
docs/inference/inference_doc.md ADDED
@@ -0,0 +1,38 @@
+ # Inference Arguments
+
+ The following table describes the command-line arguments available for the `infer-cli.py` script, which is used for text-to-speech (TTS) inference with advanced batch processing capabilities. These arguments allow users to override settings defined in the configuration file (`basic.toml` by default).
+
+ | Argument | Description | Type | Default Value | Notes |
+ |----------|-------------|------|---------------|-------|
+ | `-c`, `--config` | Path to the configuration file. | `str` | `f5_tts/infer/examples/basic/basic.toml` | Specifies the TOML configuration file to use. |
+ | `-m`, `--model` | Model name to use for inference. | `str` | `F5TTS_v1_Base` (from config) | Options: `F5TTS_v1_Base`, `F5TTS_Base`, `E2TTS_Base`, etc. |
+ | `-mc`, `--model_cfg` | Path to the model's YAML configuration file. | `str` | `configs/<model>.yaml` (from config) | Defines model-specific settings. |
+ | `-p`, `--ckpt_file` | Path to the model checkpoint file (.pt). | `str` | (from config) | Leave blank to use default checkpoint. |
+ | `-v`, `--vocab_file` | Path to the vocabulary file (.txt). | `str` | (from config) | Leave blank to use default vocabulary. |
+ | `-r`, `--ref_audio` | Path to the reference audio file. | `str` | `infer/examples/basic/basic_ref_en.wav` (from config) | Used as a reference for voice synthesis. |
+ | `-s`, `--ref_text` | Transcript or subtitle for the reference audio. | `str` | `Some call me nature, others call me mother nature.` (from config) | Text corresponding to the reference audio. |
+ | `-t`, `--gen_text` | Text to synthesize into speech. | `str` | `Here we generate something just for test.` (from config) | Ignored if `--gen_file` is provided. |
+ | `-f`, `--gen_file` | Path to a file containing text to synthesize. | `str` | (from config) | Overrides `--gen_text` if specified. |
+ | `-o`, `--output_dir` | Path to the output directory. | `str` | `tests` (from config) | Directory where generated audio files are saved. |
+ | `-w`, `--output_file` | Name of the output audio file. | `str` | `infer_cli_<timestamp>.wav` (from config) | Timestamp format: `%Y%m%d_%H%M%S`. |
+ | `--save_chunk` | Save individual audio chunks during inference. | `bool` | `False` (from config) | If enabled, saves chunks to `<output_dir>/<output_file>_chunks/`. |
+ | `--no_legacy_text` | Disable lossy ASCII transliteration for Unicode text in file names. | `bool` | `False` (from config) | If disabled, uses Unicode in file names; warns if used with `--save_chunk`. |
+ | `--remove_silence` | Remove long silences from the generated audio. | `bool` | `False` (from config) | Applies silence removal post-processing. |
+ | `--load_vocoder_from_local` | Load vocoder from a local directory. | `bool` | `False` (from config) | Uses `../checkpoints/vocos-mel-24khz` or similar if enabled. |
+ | `--vocoder_name` | Name of the vocoder to use. | `str` | (from config, defaults to `mel_spec_type`) | Options: `vocos`, `bigvgan`. |
+ | `--target_rms` | Target loudness normalization value for output speech. | `float` | (from config, defaults to `target_rms`) | Adjusts audio loudness. |
+ | `--cross_fade_duration` | Duration of cross-fade between audio segments (seconds). | `float` | (from config, defaults to `cross_fade_duration`) | Smooths transitions between segments. |
+ | `--nfe_step` | Number of function evaluation (denoising) steps. | `int` | (from config, defaults to `nfe_step`) | Controls inference quality. |
+ | `--cfg_strength` | Classifier-free guidance strength. | `float` | (from config, defaults to `cfg_strength`) | Influences generation quality. |
+ | `--sway_sampling_coef` | Sway sampling coefficient. | `float` | (from config, defaults to `sway_sampling_coef`) | Affects sampling behavior. |
+ | `--speed` | Speed of the generated audio. | `float` | (from config, defaults to `speed`) | Adjusts playback speed. |
+ | `--fix_duration` | Fixed total duration for reference and generated audio (seconds). | `float` | (from config, defaults to `fix_duration`) | Enforces a specific duration. |
+ | `--device` | Device to run inference on. | `str` | (from config, defaults to `device`) | E.g., `cpu`, `cuda`. |
+
+ ## Notes
+ - Arguments without default values in the script (e.g., `--model`, `--ref_audio`) inherit defaults from the configuration file.
+ - The `--no_legacy_text` flag is implemented as `store_false`, so enabling it sets `use_legacy_text` to `False`.
+ - If `--gen_file` is provided, it overrides `--gen_text`.
+ - The script supports multiple voices defined in the config file under the `voices` key, with a fallback to a `main` voice.
+ - The output audio is saved as a WAV file, and optional chunked audio segments can be saved if `--save_chunk` is enabled.
+ - The script uses `cached_path` for downloading model checkpoints from Hugging Face if no local checkpoint is specified.
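
For concreteness, a single-voice invocation sketch mirroring the subprocess call in `apps/gradio_app/components.py`; the checkpoint and vocab paths are the ones hard-coded in `apps/old-gradio_app.py`, so they assume `ckpts/` has been populated by the setup scripts:

```python
# Invocation sketch for infer_cli.py using the arguments documented above
# (paths assume checkpoints downloaded by apps/gradio_app/setup_scripts.py).
import subprocess

subprocess.run([
    "python", "src/f5_tts/infer/infer_cli.py",
    "--model_cfg", "configs/vi-fine-tuned-f5-tts.yaml",
    "--ckpt_file", "ckpts/Vi_F5_TTS_ckpts/pruning_model.pt",
    "--vocab_file", "ckpts/vocab.txt",
    "--ref_audio", "assets/examples/f5_tts/1/refer_audio.mp3",
    "--ref_text", "bạn và tôi đều như nhau nhé, rồi chúng ta đi đâu nè",
    "--gen_text", "chào mọi người, mọi người khỏe không?",
    "--speed", "1.0",
    "--output_dir", "tests",
    "--output_file", "demo.wav",
], check=True)
```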
docs/training/training_doc.md ADDED
File without changes
notebooks/1-vi-fine-tuned-t5-tts.ipynb ADDED
@@ -0,0 +1,952 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {
7
+ "_cell_guid": "b1076dfc-b9ad-4769-8c92-a6c4dae69d19",
8
+ "_uuid": "8f2839f25d086af736a60e9eeb907d3b93b6e0e5",
9
+ "execution": {
10
+ "iopub.execute_input": "2025-06-15T14:21:25.974502Z",
11
+ "iopub.status.busy": "2025-06-15T14:21:25.974227Z",
12
+ "iopub.status.idle": "2025-06-15T14:21:31.475226Z",
13
+ "shell.execute_reply": "2025-06-15T14:21:31.474663Z",
14
+ "shell.execute_reply.started": "2025-06-15T14:21:25.974478Z"
15
+ },
16
+ "trusted": true
17
+ },
18
+ "outputs": [],
19
+ "source": [
20
+ "import os\n",
21
+ "os.system(\"pip install -q wget\")"
22
+ ]
23
+ },
24
+ {
25
+ "cell_type": "code",
26
+ "execution_count": null,
27
+ "metadata": {
28
+ "execution": {
29
+ "iopub.execute_input": "2025-06-15T14:21:31.476734Z",
30
+ "iopub.status.busy": "2025-06-15T14:21:31.476449Z",
31
+ "iopub.status.idle": "2025-06-15T14:21:37.092039Z",
32
+ "shell.execute_reply": "2025-06-15T14:21:37.091491Z",
33
+ "shell.execute_reply.started": "2025-06-15T14:21:31.476715Z"
34
+ },
35
+ "trusted": true
36
+ },
37
+ "outputs": [],
38
+ "source": [
39
+ "import wget\n",
40
+ "import tarfile\n",
41
+ "import torchaudio\n",
42
+ "import pandas as pd\n",
43
+ "from huggingface_hub import snapshot_download, login\n",
44
+ "login(\"<your_huggingface_token>\")"
45
+ ]
46
+ },
47
+ {
48
+ "cell_type": "code",
49
+ "execution_count": null,
50
+ "metadata": {
51
+ "execution": {
52
+ "iopub.execute_input": "2025-06-15T14:21:37.092984Z",
53
+ "iopub.status.busy": "2025-06-15T14:21:37.092705Z",
54
+ "iopub.status.idle": "2025-06-15T14:21:37.096562Z",
55
+ "shell.execute_reply": "2025-06-15T14:21:37.096039Z",
56
+ "shell.execute_reply.started": "2025-06-15T14:21:37.092967Z"
57
+ },
58
+ "trusted": true
59
+ },
60
+ "outputs": [],
61
+ "source": [
62
+ "os.chdir(\"/content\")"
63
+ ]
64
+ },
65
+ {
66
+ "cell_type": "code",
67
+ "execution_count": null,
68
+ "metadata": {
69
+ "execution": {
70
+ "iopub.execute_input": "2025-06-15T13:59:06.772020Z",
71
+ "iopub.status.busy": "2025-06-15T13:59:06.771694Z",
72
+ "iopub.status.idle": "2025-06-15T14:00:28.043176Z",
73
+ "shell.execute_reply": "2025-06-15T14:00:28.041603Z",
74
+ "shell.execute_reply.started": "2025-06-15T13:59:06.771995Z"
75
+ },
76
+ "trusted": true
77
+ },
78
+ "outputs": [],
79
+ "source": [
80
+ "from huggingface_hub import HfApi\n",
81
+ "from huggingface_hub import snapshot_download\n",
82
+ "import os\n",
83
+ "api = HfApi()\n",
84
+ "!git lfs install --force\n",
85
+ "\n",
86
+ "# Define the dataset name and local directory\n",
87
+ "\n",
88
+ "repo_id = \"heboya8/t5-tts-temp-model\"\n",
89
+ "save_path = \".\"\n",
90
+ "\n",
91
+ "# Create the directory if it doesn't exist\n",
92
+ "os.makedirs(save_path, exist_ok=True)\n",
93
+ "\n",
94
+ "# Download the dataset\n",
95
+ "snapshot_download(repo_id=repo_id, repo_type=\"model\", local_dir=save_path)"
96
+ ]
97
+ },
98
+ {
99
+ "cell_type": "code",
100
+ "execution_count": null,
101
+ "metadata": {
102
+ "execution": {
103
+ "iopub.execute_input": "2025-06-15T14:21:37.389642Z",
104
+ "iopub.status.busy": "2025-06-15T14:21:37.389399Z",
105
+ "iopub.status.idle": "2025-06-15T14:24:47.468892Z",
106
+ "shell.execute_reply": "2025-06-15T14:24:47.468139Z",
107
+ "shell.execute_reply.started": "2025-06-15T14:21:37.389623Z"
108
+ },
109
+ "trusted": true
110
+ },
111
+ "outputs": [],
112
+ "source": [
113
+ "# Step 1: Set Up the Environment\n",
114
+ "os.system(\"pip install -e . >/dev/null 2>&1\")\n",
115
+ "os.system(\"pip install torch==2.4.0+cu124 torchaudio==2.4.0+cu124 torchvision==0.19.0+cu124 --extra-index-url https://download.pytorch.org/whl/cu124 >/dev/null 2>&1\")\n",
116
+ "os.system(\"pip install accelerate==0.33.0 tensorboard >/dev/null 2>&1\")\n",
117
+ "if not os.path.exists(\"F5-TTS\"):\n",
118
+ " os.system(\"git clone https://github.com/SWivid/F5-TTS.git\")\n",
119
+ "os.chdir(\"F5-TTS\")\n",
120
+ "os.system(\"pip install -e . >/dev/null 2>&1\")\n"
121
+ ]
122
+ },
123
+ {
124
+ "cell_type": "code",
125
+ "execution_count": null,
126
+ "metadata": {
127
+ "execution": {
128
+ "iopub.execute_input": "2025-06-15T14:24:47.470454Z",
129
+ "iopub.status.busy": "2025-06-15T14:24:47.470177Z",
130
+ "iopub.status.idle": "2025-06-15T14:24:47.473922Z",
131
+ "shell.execute_reply": "2025-06-15T14:24:47.473261Z",
132
+ "shell.execute_reply.started": "2025-06-15T14:24:47.470429Z"
133
+ },
134
+ "trusted": true
135
+ },
136
+ "outputs": [],
137
+ "source": [
138
+ "os.chdir(\"/content/F5-TTS\")\n",
139
+ " # os.chdir(\"F5-TTS-Vietnamese\")"
140
+ ]
141
+ },
142
+ {
143
+ "cell_type": "code",
144
+ "execution_count": null,
145
+ "metadata": {
146
+ "execution": {
147
+ "iopub.execute_input": "2025-06-15T06:47:34.909957Z",
148
+ "iopub.status.busy": "2025-06-15T06:47:34.909372Z",
149
+ "iopub.status.idle": "2025-06-15T06:47:35.040348Z",
150
+ "shell.execute_reply": "2025-06-15T06:47:35.039424Z",
151
+ "shell.execute_reply.started": "2025-06-15T06:47:34.909927Z"
152
+ },
153
+ "trusted": true
154
+ },
155
+ "outputs": [],
156
+ "source": [
157
+ "!pwd"
158
+ ]
159
+ },
160
+ {
161
+ "cell_type": "code",
162
+ "execution_count": null,
163
+ "metadata": {
164
+ "execution": {
165
+ "iopub.execute_input": "2025-06-15T14:24:47.475053Z",
166
+ "iopub.status.busy": "2025-06-15T14:24:47.474827Z",
167
+ "iopub.status.idle": "2025-06-15T14:24:47.644337Z",
168
+ "shell.execute_reply": "2025-06-15T14:24:47.643562Z",
169
+ "shell.execute_reply.started": "2025-06-15T14:24:47.475031Z"
170
+ },
171
+ "trusted": true
172
+ },
173
+ "outputs": [],
174
+ "source": [
175
+ "!mkdir ./ckpts/vin100h-preprocessed-v2\n",
176
+ "# !cp /kaggle/input/vi-fine-tuned-t5-tts/69/model_last.pt \\\n",
177
+ "# ./ckpts/vin100h-preprocessed-v2\n",
178
+ "# !cp -r /content/73/* ./ckpts/vin100h-preprocessed-v2"
179
+ ]
180
+ },
181
+ {
182
+ "cell_type": "code",
183
+ "execution_count": null,
184
+ "metadata": {
185
+ "execution": {
186
+ "iopub.execute_input": "2025-06-15T14:24:47.646473Z",
187
+ "iopub.status.busy": "2025-06-15T14:24:47.646278Z",
188
+ "iopub.status.idle": "2025-06-15T14:25:20.275283Z",
189
+ "shell.execute_reply": "2025-06-15T14:25:20.274453Z",
190
+ "shell.execute_reply.started": "2025-06-15T14:24:47.646454Z"
191
+ },
192
+ "trusted": true
193
+ },
194
+ "outputs": [],
195
+ "source": [
196
+ "# !cp -r /kaggle/input/vi-fine-tuned-t5-tts/7/* ./ckpts\n",
197
+ "!cp -r /kaggle/input/vi-fine-tuned-t5-tts/75/model_last.pt \\\n",
198
+ " ./ckpts/vin100h-preprocessed-v2"
199
+ ]
200
+ },
201
+ {
202
+ "cell_type": "code",
203
+ "execution_count": null,
204
+ "metadata": {
205
+ "execution": {
206
+ "iopub.execute_input": "2025-06-15T14:25:20.276407Z",
207
+ "iopub.status.busy": "2025-06-15T14:25:20.276159Z",
208
+ "iopub.status.idle": "2025-06-15T14:25:20.413414Z",
209
+ "shell.execute_reply": "2025-06-15T14:25:20.412180Z",
210
+ "shell.execute_reply.started": "2025-06-15T14:25:20.276382Z"
211
+ },
212
+ "trusted": true
213
+ },
214
+ "outputs": [],
215
+ "source": [
216
+ "!ls -a ./ckpts/vin100h-preprocessed-v2"
217
+ ]
218
+ },
219
+ {
220
+ "cell_type": "code",
221
+ "execution_count": null,
222
+ "metadata": {
223
+ "execution": {
224
+ "iopub.execute_input": "2025-05-10T15:59:08.329794Z",
225
+ "iopub.status.busy": "2025-05-10T15:59:08.329442Z",
226
+ "iopub.status.idle": "2025-05-10T15:59:09.362207Z",
227
+ "shell.execute_reply": "2025-05-10T15:59:09.361253Z",
228
+ "shell.execute_reply.started": "2025-05-10T15:59:08.329757Z"
229
+ },
230
+ "trusted": true
231
+ },
232
+ "outputs": [],
233
+ "source": [
234
+ "import json\n",
235
+ "import os\n",
236
+ "from pathlib import Path\n",
237
+ "import shutil\n",
238
+ "import torchaudio\n",
239
+ "from datasets import load_dataset\n",
240
+ "from datasets.arrow_writer import ArrowWriter\n",
241
+ "from tqdm import tqdm\n",
242
+ "import soundfile as sf\n",
243
+ "import csv\n",
244
+ "\n",
245
+ "def save_dataset_to_local_disk(output_dir=\"./data/vin100h-preprocessed-v2\",\n",
246
+ " base_model=\"htdung167/vin100h-preprocessed-v2\",\n",
247
+ " audio_header='audio',\n",
248
+ " text_header='transcription'):\n",
249
+ " \n",
250
+ " wavs_dir = os.path.join(output_dir, \"wavs\")\n",
251
+ " metadata_path = os.path.join(output_dir, \"metadata.csv\")\n",
252
+ " os.makedirs(wavs_dir, exist_ok=True)\n",
253
+ "\n",
254
+ " ds = load_dataset(base_model)['train']\n",
255
+ " metadata = []\n",
256
+ "\n",
257
+ " for idx, sample in tqdm(enumerate(ds), total=len(ds),\n",
258
+ " desc=\"Saving samples to directory\"):\n",
259
+ " audio_array = sample[audio_header]['array']\n",
260
+ " sampling_rate = sample[audio_header]['sampling_rate']\n",
261
+ " filename = f\"audio_{idx:06d}.wav\"\n",
262
+ " sf.write(os.path.join(wavs_dir, filename), audio_array, sampling_rate)\n",
263
+ " # metadata.append([f\"wavs/{filename}\", sample['preprocessed_sentence_v2']])\n",
264
+ " metadata.append([f\"wavs/{filename}\", sample[text_header]])\n",
265
+ " # metadata.append([f\"{filename}\", sample['transcription']])\n",
266
+ " \n",
267
+ " with open(metadata_path, 'w', newline='', encoding='utf-8') as f:\n",
268
+ " csv.writer(f, delimiter='|').writerows(metadata)\n",
269
+ "\n",
270
+ " print(f\"Dataset saved to {output_dir}\")\n"
271
+ ]
272
+ },
273
+ {
274
+ "cell_type": "code",
275
+ "execution_count": null,
276
+ "metadata": {
277
+ "execution": {
278
+ "iopub.execute_input": "2025-05-10T15:59:10.399030Z",
279
+ "iopub.status.busy": "2025-05-10T15:59:10.397916Z",
280
+ "iopub.status.idle": "2025-05-10T16:10:46.269067Z",
281
+ "shell.execute_reply": "2025-05-10T16:10:46.267298Z",
282
+ "shell.execute_reply.started": "2025-05-10T15:59:10.398995Z"
283
+ },
284
+ "trusted": true
285
+ },
286
+ "outputs": [],
287
+ "source": [
288
+ "output_dir = \"./data/vin100h-preprocessed-v2\"\n",
289
+ "tokenizer_type = \"pinyin\"\n",
290
+ "\n",
291
+ "save_dataset_to_local_disk(output_dir=output_dir,\n",
292
+ " base_model=\"htdung167/vin100h-preprocessed-v2\",\n",
293
+ " text_header=\"preprocessed_sentence_v2\"\n",
294
+ " )"
295
+ ]
296
+ },
297
+ {
298
+ "cell_type": "code",
299
+ "execution_count": null,
300
+ "metadata": {
301
+ "_kg_hide-output": true,
302
+ "execution": {
303
+ "iopub.execute_input": "2025-05-10T16:10:46.273403Z",
304
+ "iopub.status.busy": "2025-05-10T16:10:46.272176Z",
305
+ "iopub.status.idle": "2025-05-10T17:15:19.405258Z",
306
+ "shell.execute_reply": "2025-05-10T17:15:19.402002Z",
307
+ "shell.execute_reply.started": "2025-05-10T16:10:46.273366Z"
308
+ },
309
+ "trusted": true
310
+ },
311
+ "outputs": [],
312
+ "source": [
313
+ "!python ./src/f5_tts/train/datasets/prepare_csv_wavs.py \\\n",
314
+ " \"./data/vin100h-preprocessed-v2\" \\\n",
315
+ " \"./data/vin100h-preprocessed-v2_pinyin\" \\\n",
316
+ " --workers 4 # Sets the number of parallel processes for preprocessing."
317
+ ]
318
+ },
319
+ {
320
+ "cell_type": "code",
321
+ "execution_count": null,
322
+ "metadata": {
323
+ "execution": {
324
+ "iopub.execute_input": "2025-06-15T14:25:20.414900Z",
325
+ "iopub.status.busy": "2025-06-15T14:25:20.414621Z",
326
+ "iopub.status.idle": "2025-06-15T14:25:21.649820Z",
327
+ "shell.execute_reply": "2025-06-15T14:25:21.648942Z",
328
+ "shell.execute_reply.started": "2025-06-15T14:25:20.414873Z"
329
+ },
330
+ "trusted": true
331
+ },
332
+ "outputs": [],
333
+ "source": [
334
+ "%%writefile ./src/f5_tts/configs/vi-fine-tuned-t5-tts.yaml\n",
335
+ "hydra:\n",
336
+ " run:\n",
337
+ " dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}/${now:%Y-%m-%d}/${now:%H-%M-%S}\n",
338
+ "\n",
339
+ "datasets:\n",
340
+ " name: vin100h-preprocessed-v2 # dataset name\n",
341
+ " batch_size_per_gpu: 3200 # 1 GPUs, 1 * 3200 = 3200\n",
342
+ " batch_size_type: frame # frame | sample\n",
343
+ " max_samples: 64 # max sequences per batch if use frame-wise batch_size. we set 32 for small models, 64 for base models\n",
344
+ " num_workers: 4\n",
345
+ "\n",
346
+ "optim:\n",
347
+ " epochs: 10\n",
348
+ " learning_rate: 1e-5\n",
349
+ " num_warmup_updates: 2761 # warmup updates\n",
350
+ " grad_accumulation_steps: 2 # note: updates = steps / grad_accumulation_steps\n",
351
+ " max_grad_norm: 1.0 # gradient clipping\n",
352
+ " bnb_optimizer: False # use bnb 8bit AdamW optimizer or not\n",
353
+ "\n",
354
+ "model:\n",
355
+ " name: vi_fine_tuned_t5_tts # model name\n",
356
+ " tokenizer: pinyin # tokenizer type\n",
357
+ " tokenizer_path: null # if 'custom' tokenizer, define the path want to use (should be vocab.txt)\n",
358
+ " backbone: DiT\n",
359
+ " arch:\n",
360
+ " dim: 1024\n",
361
+ " depth: 22\n",
362
+ " heads: 16\n",
363
+ " ff_mult: 2\n",
364
+ " text_dim: 512\n",
365
+ " text_mask_padding: False\n",
366
+ " conv_layers: 4\n",
367
+ " pe_attn_head: 1\n",
368
+ " checkpoint_activations: False # recompute activations and save memory for extra compute\n",
369
+ " mel_spec:\n",
370
+ " target_sample_rate: 24000\n",
371
+ " n_mel_channels: 100\n",
372
+ " hop_length: 256\n",
373
+ " win_length: 1024\n",
374
+ " n_fft: 1024\n",
375
+ " mel_spec_type: vocos # vocos | bigvgan\n",
376
+ " vocoder:\n",
377
+ " is_local: False # use local offline ckpt or not\n",
378
+ " local_path: null # local vocoder path\n",
379
+ "\n",
380
+ "ckpts:\n",
381
+ " logger: null # wandb | tensorboard | null\n",
382
+ " log_samples: True # infer random sample per save checkpoint. wip, normal to fail with extra long samples\n",
383
+ " save_per_updates: 4000 # save checkpoint per updates\n",
384
+ " keep_last_n_checkpoints: 1 # -1 to keep all, 0 to not save intermediate, > 0 to keep last N checkpoints\n",
385
+ " last_per_updates: 4000 # save last checkpoint per updates\n",
386
+ " save_dir: ckpts/${model.name}_${model.mel_spec.mel_spec_type}_${model.tokenizer}_${datasets.name}"
387
+ ]
388
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-15T14:25:21.651011Z",
+ "iopub.status.busy": "2025-06-15T14:25:21.650749Z",
+ "iopub.status.idle": "2025-06-15T14:25:22.958480Z",
+ "shell.execute_reply": "2025-06-15T14:25:22.957781Z",
+ "shell.execute_reply.started": "2025-06-15T14:25:21.650992Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!echo hello"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-15T14:25:22.959726Z",
+ "iopub.status.busy": "2025-06-15T14:25:22.959476Z",
+ "iopub.status.idle": "2025-06-15T14:25:38.131765Z",
+ "shell.execute_reply": "2025-06-15T14:25:38.130931Z",
+ "shell.execute_reply.started": "2025-06-15T14:25:22.959692Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!accelerate config default"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-15T14:28:31.671797Z",
+ "iopub.status.busy": "2025-06-15T14:28:31.671483Z",
+ "iopub.status.idle": "2025-06-15T14:28:31.803519Z",
+ "shell.execute_reply": "2025-06-15T14:28:31.802848Z",
+ "shell.execute_reply.started": "2025-06-15T14:28:31.671770Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!echo go"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-15T14:28:31.804624Z",
+ "iopub.status.busy": "2025-06-15T14:28:31.804419Z",
+ "iopub.status.idle": "2025-06-15T17:59:02.693078Z",
+ "shell.execute_reply": "2025-06-15T17:59:02.692025Z",
+ "shell.execute_reply.started": "2025-06-15T14:28:31.804591Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "# ************\n",
459
+ "!accelerate launch ./src/f5_tts/train/finetune_cli.py \\\n",
460
+ " --exp_name F5TTS_Base \\\n",
461
+ " --dataset_name vin100h-preprocessed-v2 \\\n",
462
+ " --finetune \\\n",
463
+ " --tokenizer pinyin \\\n",
464
+ " --learning_rate 1e-05 \\\n",
465
+ " --batch_size_type frame \\\n",
466
+ " --batch_size_per_gpu 3200 \\\n",
467
+ " --max_samples 64 \\\n",
468
+ " --grad_accumulation_steps 2 \\\n",
469
+ " --max_grad_norm 1 \\\n",
470
+ " --epochs 76 \\\n",
471
+ " --num_warmup_updates 2761 \\\n",
472
+ " --save_per_updates 4000 \\\n",
473
+ " --keep_last_n_checkpoints 1 \\\n",
474
+ " --last_per_updates 4000 \\\n",
475
+ " --log_samples \\\n",
476
+ " --pretrain ./ckpts/vin100h-preprocessed-v2/model_last.pt\n"
477
+ ]
478
+ },
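+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Worked example (added sketch): what --batch_size_per_gpu 3200 means with frame-based batching.\n",
+ "# With hop_length = 256 and target_sample_rate = 24000 from the config above,\n",
+ "# one mel frame covers 256 / 24000 seconds of audio.\n",
+ "frames, hop_length, sample_rate = 3200, 256, 24000\n",
+ "seconds_per_batch = frames * hop_length / sample_rate\n",
+ "print(f\"{seconds_per_batch:.1f} s of audio per GPU per step\")\n",
+ "print(f\"{2 * seconds_per_batch:.1f} s per optimizer update (grad_accumulation_steps = 2)\")"
+ ]
+ },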
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-15T18:05:50.705629Z",
+ "iopub.status.busy": "2025-06-15T18:05:50.704903Z",
+ "iopub.status.idle": "2025-06-15T18:05:50.891227Z",
+ "shell.execute_reply": "2025-06-15T18:05:50.890434Z",
+ "shell.execute_reply.started": "2025-06-15T18:05:50.705578Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!echo abc"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Copy and save"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-14T10:18:46.384990Z",
+ "iopub.status.busy": "2025-06-14T10:18:46.384685Z",
+ "iopub.status.idle": "2025-06-14T10:18:46.518166Z",
+ "shell.execute_reply": "2025-06-14T10:18:46.517174Z",
+ "shell.execute_reply.started": "2025-06-14T10:18:46.384965Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!rm -rf /kaggle/working/.cache"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-07T16:58:20.250613Z",
+ "iopub.status.busy": "2025-06-07T16:58:20.250305Z",
+ "iopub.status.idle": "2025-06-07T16:58:20.446725Z",
+ "shell.execute_reply": "2025-06-07T16:58:20.445927Z",
+ "shell.execute_reply.started": "2025-06-07T16:58:20.250588Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!ls -a ckpts/vin100h-preprocessed-v2"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-15T18:06:00.980687Z",
+ "iopub.status.busy": "2025-06-15T18:06:00.979884Z",
+ "iopub.status.idle": "2025-06-15T18:06:07.418545Z",
+ "shell.execute_reply": "2025-06-15T18:06:07.417240Z",
+ "shell.execute_reply.started": "2025-06-15T18:06:00.980649Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "# *******************Importance\n",
556
+ "model_dir = \"/kaggle/working/76\"\n",
557
+ "os.makedirs(model_dir, exist_ok=True)\n",
558
+ "!cp -r ./ckpts/vin100h-preprocessed-v2/model_last.pt $model_dir"
559
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.status.busy": "2025-06-14T10:34:21.049620Z",
+ "iopub.status.idle": "2025-06-14T10:34:21.049856Z",
+ "shell.execute_reply": "2025-06-14T10:34:21.049753Z",
+ "shell.execute_reply.started": "2025-06-14T10:34:21.049740Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "# To temporary Model hub\n",
576
+ "from huggingface_hub import HfApi\n",
577
+ "from huggingface_hub import snapshot_download\n",
578
+ "# Initialize API\n",
579
+ "api = HfApi()\n",
580
+ "\n",
581
+ "# Upload the folder to the repository root\n",
582
+ "api.upload_large_folder(\n",
583
+ " folder_path=\"/kaggle/working\", # Local folder path\n",
584
+ " repo_id=\"heboya8/t5-tts-temp-model\",\n",
585
+ " repo_type=\"model\"\n",
586
+ ")"
587
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Prune Checkpoint"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-05-11T14:11:57.837831Z",
+ "iopub.status.busy": "2025-05-11T14:11:57.837476Z",
+ "iopub.status.idle": "2025-05-11T14:11:57.844498Z",
+ "shell.execute_reply": "2025-05-11T14:11:57.843701Z",
+ "shell.execute_reply.started": "2025-05-11T14:11:57.837803Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "import torch\n",
612
+ "\n",
613
+ "def prune_checkpoint(checkpoint_path: str, new_checkpoint_path: str, save_ema: bool, safetensors: bool) -> str:\n",
614
+ " try:\n",
615
+ " checkpoint = torch.load(checkpoint_path, weights_only=True)\n",
616
+ " print(\"Original Checkpoint Keys:\", checkpoint.keys())\n",
617
+ "\n",
618
+ " to_retain = \"ema_model_state_dict\" if save_ema else \"model_state_dict\"\n",
619
+ " try:\n",
620
+ " model_state_dict_to_retain = checkpoint[to_retain]\n",
621
+ " except KeyError:\n",
622
+ " return f\"{to_retain} not found in the checkpoint.\"\n",
623
+ "\n",
624
+ " if safetensors:\n",
625
+ " new_checkpoint_path = new_checkpoint_path.replace(\".pt\", \".safetensors\")\n",
626
+ " save_file(model_state_dict_to_retain, new_checkpoint_path)\n",
627
+ " else:\n",
628
+ " new_checkpoint_path = new_checkpoint_path.replace(\".safetensors\", \".pt\")\n",
629
+ " new_checkpoint = {\"ema_model_state_dict\": model_state_dict_to_retain}\n",
630
+ " torch.save(new_checkpoint, new_checkpoint_path)\n",
631
+ "\n",
632
+ " return f\"New checkpoint saved at: {new_checkpoint_path}\"\n",
633
+ "\n",
634
+ " except Exception as e:\n",
635
+ " return f\"An error occurred: {e}\""
636
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-05-11T14:22:24.624318Z",
+ "iopub.status.busy": "2025-05-11T14:22:24.623974Z",
+ "iopub.status.idle": "2025-05-11T14:22:30.316195Z",
+ "shell.execute_reply": "2025-05-11T14:22:30.315529Z",
+ "shell.execute_reply.started": "2025-05-11T14:22:24.624292Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "# Prune a checkpoint after training resize model\n",
654
+ "result = prune_checkpoint(\n",
655
+ " checkpoint_path=\"/kaggle/working/F5-TTS/ckpts/vin100h-preprocessed-v2/model_last.pt\",\n",
656
+ " new_checkpoint_path=\"/root/.cache/abc.pt\",\n",
657
+ " save_ema=False,\n",
658
+ " safetensors=False\n",
659
+ ")\n",
660
+ "print(result)"
661
+ ]
+ },
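+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Verification (added sketch): confirm the pruned checkpoint loads and report its size.\n",
+ "# The path matches the prune call above; adjust it if you saved elsewhere.\n",
+ "import os\n",
+ "import torch\n",
+ "\n",
+ "pruned_path = \"/root/.cache/abc.pt\"\n",
+ "pruned = torch.load(pruned_path, weights_only=True)\n",
+ "print(\"Keys:\", list(pruned.keys()))\n",
+ "print(f\"Size on disk: {os.path.getsize(pruned_path) / 1e6:.1f} MB\")"
+ ]
+ },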
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "# Inference"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-05-20T17:08:02.683953Z",
+ "iopub.status.busy": "2025-05-20T17:08:02.683595Z",
+ "iopub.status.idle": "2025-05-20T17:08:02.753448Z",
+ "shell.execute_reply": "2025-05-20T17:08:02.752714Z",
+ "shell.execute_reply.started": "2025-05-20T17:08:02.683922Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "from IPython.display import Audio\n",
+ "\n",
+ "# Path to your audio file\n",
+ "audio_path = './data/vin100h-preprocessed-v2/wavs/audio_000010.wav'\n",
+ "\n",
+ "# Display and play the audio\n",
+ "Audio(audio_path)\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-14T10:24:03.249295Z",
+ "iopub.status.busy": "2025-06-14T10:24:03.248968Z",
+ "iopub.status.idle": "2025-06-14T10:24:41.393133Z",
+ "shell.execute_reply": "2025-06-14T10:24:41.391987Z",
+ "shell.execute_reply.started": "2025-06-14T10:24:03.249273Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!python ./src/f5_tts/infer/infer_cli.py \\\n",
+ " --model \"vin100h-preprocessed-v2\" \\\n",
+ " --model_cfg \"./src/f5_tts/configs/F5TTS_Base.yaml\" \\\n",
+ " --ckpt_file \"./ckpts/vin100h-preprocessed-v2/model_last.pt\" \\\n",
+ " --vocab_file \"./data/vin100h-preprocessed-v2_pinyin/vocab.txt\" \\\n",
+ " --ref_audio \"./data/vin100h-preprocessed-v2/wavs/audio_000010.wav\" \\\n",
+ " --ref_text \"Về giá cả so với giá bán ngoài các siêu thị thì dâu trái ở đây rẻ hơn khá nhiều. Giả sử như bó rau ở siêu thị bán khoảng 2 đô la một bó thì ở đây chỉ có một đô la một bó. Có khi mua 50 bó được tặng thêm một bó nữa.\" \\\n",
+ " --gen_text \"Về giá cả so với giá bán ngoài các siêu thị\" \\\n",
+ " --output_dir \"/kaggle/working/\"\n",
+ " # --output_file \"/content/abc.wav\""
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-14T10:24:41.395230Z",
+ "iopub.status.busy": "2025-06-14T10:24:41.394917Z",
+ "iopub.status.idle": "2025-06-14T10:24:41.404325Z",
+ "shell.execute_reply": "2025-06-14T10:24:41.403321Z",
+ "shell.execute_reply.started": "2025-06-14T10:24:41.395199Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "from IPython.display import Audio\n",
+ "\n",
+ "# Path to your audio file\n",
+ "audio_path = '/kaggle/working/infer_cli_basic.wav'\n",
+ "\n",
+ "# Display and play the audio\n",
+ "Audio(audio_path)\n"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Download"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-15T14:25:38.133173Z",
+ "iopub.status.busy": "2025-06-15T14:25:38.132898Z",
+ "iopub.status.idle": "2025-06-15T14:26:12.006111Z",
+ "shell.execute_reply": "2025-06-15T14:26:12.005444Z",
+ "shell.execute_reply.started": "2025-06-15T14:25:38.133137Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "from huggingface_hub import HfApi\n",
768
+ "from huggingface_hub import snapshot_download\n",
769
+ "import os\n",
770
+ "api = HfApi()\n",
771
+ "!git lfs install --force\n",
772
+ "\n",
773
+ "# Define the dataset name and local directory\n",
774
+ "repo_id = \"heboya8/f5-tts-dataset\"\n",
775
+ "save_path = \"/root/.cache\"\n",
776
+ "\n",
777
+ "# Create the directory if it doesn't exist\n",
778
+ "os.makedirs(save_path, exist_ok=True)\n",
779
+ "\n",
780
+ "# Download the dataset\n",
781
+ "snapshot_download(repo_id=repo_id, repo_type=\"dataset\", local_dir=save_path)"
782
+ ]
+ },
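+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Sanity check (added sketch): confirm the archive landed before unzipping.\n",
+ "# The next cell assumes data_compress.zip sits at the root of the downloaded snapshot.\n",
+ "!ls -lh /root/.cache/data_compress.zip"
+ ]
+ },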
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-06-15T14:26:12.009357Z",
+ "iopub.status.busy": "2025-06-15T14:26:12.009122Z",
+ "iopub.status.idle": "2025-06-15T14:28:31.670192Z",
+ "shell.execute_reply": "2025-06-15T14:28:31.669158Z",
+ "shell.execute_reply.started": "2025-06-15T14:26:12.009338Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!unzip -q -o /root/.cache/data_compress.zip -d \".\""
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Upload"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-05-10T20:06:26.721683Z",
+ "iopub.status.busy": "2025-05-10T20:06:26.720825Z",
+ "iopub.status.idle": "2025-05-10T20:11:36.850624Z",
+ "shell.execute_reply": "2025-05-10T20:11:36.849599Z",
+ "shell.execute_reply.started": "2025-05-10T20:06:26.721632Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "from huggingface_hub import HfApi\n",
825
+ "from huggingface_hub import snapshot_download\n",
826
+ "# Initialize API\n",
827
+ "api = HfApi()\n",
828
+ "\n",
829
+ "# Upload the folder to the repository root\n",
830
+ "api.upload_large_folder(\n",
831
+ " folder_path=\"/root/.cache/dataset\", # Local folder path\n",
832
+ " repo_id=\"heboya8/f5-tts-dataset\",\n",
833
+ " repo_type=\"dataset\",\n",
834
+ " # multi_commits=True, # Enable resumable uploads\n",
835
+ " # multi_commits_verbose=True # Show progress\n",
836
+ ")"
837
+ ]
838
+ },
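+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {},
+ "outputs": [],
+ "source": [
+ "# Verification (added sketch): list a few files in the dataset repo to confirm the upload landed.\n",
+ "# Reuses the HfApi client created in the cell above.\n",
+ "for f in sorted(api.list_repo_files(\"heboya8/f5-tts-dataset\", repo_type=\"dataset\"))[:10]:\n",
+ "    print(f)"
+ ]
+ },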
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "## Download Dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-05-10T20:16:38.191744Z",
+ "iopub.status.busy": "2025-05-10T20:16:38.191338Z",
+ "iopub.status.idle": "2025-05-10T20:16:56.134770Z",
+ "shell.execute_reply": "2025-05-10T20:16:56.133810Z",
+ "shell.execute_reply.started": "2025-05-10T20:16:38.191712Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": []
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-05-10T20:19:28.100798Z",
+ "iopub.status.busy": "2025-05-10T20:19:28.099915Z",
+ "iopub.status.idle": "2025-05-10T20:19:28.249902Z",
+ "shell.execute_reply": "2025-05-10T20:19:28.248723Z",
+ "shell.execute_reply.started": "2025-05-10T20:19:28.100762Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!mkdir dataset"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-05-10T20:20:05.322822Z",
+ "iopub.status.busy": "2025-05-10T20:20:05.322019Z",
+ "iopub.status.idle": "2025-05-10T20:20:05.567705Z",
+ "shell.execute_reply": "2025-05-10T20:20:05.566624Z",
+ "shell.execute_reply.started": "2025-05-10T20:20:05.322785Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!rm -rf d /root/.cache/dataset"
896
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": null,
+ "metadata": {
+ "execution": {
+ "iopub.execute_input": "2025-05-10T20:20:07.132689Z",
+ "iopub.status.busy": "2025-05-10T20:20:07.132287Z",
+ "iopub.status.idle": "2025-05-10T20:22:58.875583Z",
+ "shell.execute_reply": "2025-05-10T20:22:58.874368Z",
+ "shell.execute_reply.started": "2025-05-10T20:20:07.132656Z"
+ },
+ "trusted": true
+ },
+ "outputs": [],
+ "source": [
+ "!unzip -q /kaggle/working/F5-TTS/~/.cache/data_compress.zip -d /root/.cache/dataset"
+ ]
+ }
+ ],
+ "metadata": {
+ "kaggle": {
+ "accelerator": "none",
+ "dataSources": [
+ {
+ "sourceId": 245622735,
+ "sourceType": "kernelVersion"
+ }
+ ],
+ "dockerImageVersionId": 31012,
+ "isGpuEnabled": false,
+ "isInternetEnabled": true,
+ "language": "python",
+ "sourceType": "notebook"
+ },
+ "kernelspec": {
+ "display_name": "Python 3",
+ "language": "python",
+ "name": "python3"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 3
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython3",
+ "version": "3.11.11"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 4
+ }