Spaces:

danhtran2mind
/

Vi-F5-TTS

Running

File size: 8,757 Bytes

3f9cba0

# import json
# import os
# from pathlib import Path
# import shutil
# import torchaudio
# from datasets import load_dataset
# from datasets.arrow_writer import ArrowWriter
# from tqdm import tqdm
# import soundfile as sf
# import csv

# def save_dataset_to_local_disk(output_dir="./data/vin100h-preprocessed-v2",
#                                base_model="htdung167/vin100h-preprocessed-v2",
#                                audio_header='audio', text_header='transcription'):
  
#     wavs_dir = os.path.join(output_dir, "wavs")
#     metadata_path = os.path.join(output_dir, "metadata.csv")
#     os.makedirs(wavs_dir, exist_ok=True)

#     ds = load_dataset(base_model)['train']
#     metadata = []

#     for idx, sample in tqdm(enumerate(ds), total=len(ds),
#                             desc="Saving samples to directory"):
#         audio_array = sample[audio_header]['array']
#         sampling_rate = sample[audio_header]['sampling_rate']
#         filename = f"audio_{idx:06d}.wav"
#         sf.write(os.path.join(wavs_dir, filename), audio_array, sampling_rate)
#         metadata.append([f"wavs/{filename}", sample[text_header]])
        
#     with open(metadata_path, 'w', newline='', encoding='utf-8') as f:
#         csv.writer(f, delimiter='|').writerows(metadata)
#     print(f"Dataset saved to {output_dir}")


# # !python ./src/f5_tts/train/datasets/prepare_csv_wavs.py \
# #     "./data/vin100h-preprocessed-v2" \
# #     "./data/vin100h-preprocessed-v2_pinyin" \
# #     --workers 4 # Sets the number of parallel processes for preprocessing.

# # if __name__ == "__main__":
# # Define the output directory and tokenizer type
# output_dir = "./data/vin100h-preprocessed-v2"
# # tokenizer_type = "pinyin"

# save_dataset_to_local_disk(output_dir=output_dir,
#                         base_model="htdung167/vin100h-preprocessed-v2",
#                         text_header="preprocessed_sentence_v2"
#                         )
    


#     #############

# import subprocess
# import argparse

# def run_preprocess(input_dir, output_dir, workers):
#     command = [
#         "python", "./src/f5_tts/train/datasets/prepare_csv_wavs.py",
#         input_dir,
#         output_dir,
#         "--workers", str(workers)
#     ]
#     process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
#     stdout, stderr = process.communicate()
    
#     if process.returncode == 0:
#         print("Preprocessing completed successfully.")
#         print(stdout)
#     else:
#         print("Error during preprocessing:")
#         print(stderr)

# if __name__ == "__main__":
#     parser = argparse.ArgumentParser(description="Run preprocessing script for dataset.")
#     parser.add_argument("input_dir", type=str, help="Input directory for preprocessing")
#     parser.add_argument("output_dir", type=str, help="Output directory for processed data")
#     parser.add_argument("--workers", type=int, default=4, help="Number of parallel processes")
    
#     args = parser.parse_args()
#     run_preprocess(args.input_dir, args.output_dir, args.workers)

######################################3
# prepare_dataset.py

import json
import os
from pathlib import Path
import shutil
import torchaudio
from datasets import load_dataset
from datasets.arrow_writer import ArrowWriter
from tqdm import tqdm
import soundfile as sf
import csv
import subprocess
import argparse

def save_dataset_to_local_disk(output_dir, base_model, audio_header, text_header):
    """

    Saves a dataset to a local directory.



    Args:

    - output_dir (str): The directory to save the dataset to.

    - base_model (str): The base model to load the dataset from.

    - audio_header (str): The header for the audio data in the dataset.

    - text_header (str): The header for the text data in the dataset.

    """
    wavs_dir = os.path.join(output_dir, "wavs")
    metadata_path = os.path.join(output_dir, "metadata.csv")
    os.makedirs(wavs_dir, exist_ok=True)

    ds = load_dataset(base_model)['train']
    metadata = []

    for idx, sample in tqdm(enumerate(ds), total=len(ds),
                            desc="Saving samples to directory"):
        audio_array = sample[audio_header]['array']
        sampling_rate = sample[audio_header]['sampling_rate']
        filename = f"audio_{idx:06d}.wav"
        sf.write(os.path.join(wavs_dir, filename), audio_array, sampling_rate)
        metadata.append([f"wavs/{filename}", sample[text_header]])
        
    with open(metadata_path, 'w', newline='', encoding='utf-8') as f:
        csv.writer(f, delimiter='|').writerows(metadata)
    print(f"Dataset saved to {output_dir}")


# def run_preprocess(input_dir, output_dir, workers):
#     """
#     Runs the preprocessing script for the dataset.

#     Args:
#     - input_dir (str): The input directory for preprocessing.
#     - output_dir (str): The output directory for processed data.
#     - workers (int): The number of parallel processes.
#     """
#     command = [
#         "python", "./src/f5_tts/train/datasets/prepare_csv_wavs.py",
#         input_dir,
#         output_dir,
#         "--workers", str(workers)
#     ]
#     process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True)
#     stdout, stderr = process.communicate()
    
#     if process.returncode == 0:
#         print("Preprocessing completed successfully.")
#         print(stdout)
#     else:
#         print("Error during preprocessing:")
#         print(stderr)

def run_preprocess(input_dir, output_dir, workers):
    command = [
        "python", "./src/f5_tts/train/datasets/prepare_csv_wavs.py",
        input_dir,
        output_dir,
        "--workers", str(workers)
    ]
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.PIPE,
        text=True,
        bufsize=1,  # Line buffered
        universal_newlines=True
    )

    # Real-time output for stdout and stderr
    while True:
        stdout_line = process.stdout.readline()
        stderr_line = process.stderr.readline()
        
        if stdout_line:
            print(stdout_line, end='', flush=True)
        if stderr_line:
            print(stderr_line, end='', flush=True, file=sys.stderr)
            
        if process.poll() is not None:
            break

    # Capture any remaining output
    stdout, stderr = process.communicate()
    if stdout:
        print(stdout, end='', flush=True)
    if stderr:
        print(stderr, end='', flush=True, file=sys.stderr)

    if process.returncode == 0:
        print("\nPreprocessing completed successfully.")
    else:
        print("\nError during preprocessing.", file=sys.stderr)


if __name__ == "__main__":
    # Set up argument parsing
    parser = argparse.ArgumentParser(description="Prepare dataset for training.")
    subparsers = parser.add_subparsers(dest="command")

    # Subcommand to save dataset to local disk
    save_parser = subparsers.add_parser("save", help="Save dataset to local disk")
    save_parser.add_argument("--output_dir", type=str, default="./data/vin100h-preprocessed-v2", help="Output directory")
    save_parser.add_argument("--base_model", type=str, default="htdung167/vin100h-preprocessed-v2", help="Base model")
    save_parser.add_argument("--audio_header", type=str, default="audio", help="Audio header")
    save_parser.add_argument("--text_header", type=str, default="preprocessed_sentence_v2", help="Text header")

    # Subcommand to run preprocessing
    preprocess_parser = subparsers.add_parser("preprocess", help="Run preprocessing script")
    preprocess_parser.add_argument("--prepare_csv_input_dir", type=str,
                                    default="./data/vin100h-preprocessed-v2",
                                    help="Input directory for preprocessing")
    preprocess_parser.add_argument("--prepare_csv_output_dir", type=str, 
                                   default="./data/vin100h-preprocessed-v2_pinyin",
                                   help="Output directory for processed data")
    preprocess_parser.add_argument("--workers", type=int, default=4, help="Number of parallel processes")

    args = parser.parse_args()

    if args.command == "save":
        save_dataset_to_local_disk(args.output_dir, args.base_model, args.audio_header, args.text_header)
    elif args.command == "preprocess":
        run_preprocess(args.prepare_csv_input_dir, args.prepare_csv_output_dir, args.workers)
    else:
        parser.print_help()