# import json # import os # from pathlib import Path # import shutil # import torchaudio # from datasets import load_dataset # from datasets.arrow_writer import ArrowWriter # from tqdm import tqdm # import soundfile as sf # import csv # def save_dataset_to_local_disk(output_dir="./data/vin100h-preprocessed-v2", # base_model="htdung167/vin100h-preprocessed-v2", # audio_header='audio', text_header='transcription'): # wavs_dir = os.path.join(output_dir, "wavs") # metadata_path = os.path.join(output_dir, "metadata.csv") # os.makedirs(wavs_dir, exist_ok=True) # ds = load_dataset(base_model)['train'] # metadata = [] # for idx, sample in tqdm(enumerate(ds), total=len(ds), # desc="Saving samples to directory"): # audio_array = sample[audio_header]['array'] # sampling_rate = sample[audio_header]['sampling_rate'] # filename = f"audio_{idx:06d}.wav" # sf.write(os.path.join(wavs_dir, filename), audio_array, sampling_rate) # metadata.append([f"wavs/{filename}", sample[text_header]]) # with open(metadata_path, 'w', newline='', encoding='utf-8') as f: # csv.writer(f, delimiter='|').writerows(metadata) # print(f"Dataset saved to {output_dir}") # # !python ./src/f5_tts/train/datasets/prepare_csv_wavs.py \ # # "./data/vin100h-preprocessed-v2" \ # # "./data/vin100h-preprocessed-v2_pinyin" \ # # --workers 4 # Sets the number of parallel processes for preprocessing. # # if __name__ == "__main__": # # Define the output directory and tokenizer type # output_dir = "./data/vin100h-preprocessed-v2" # # tokenizer_type = "pinyin" # save_dataset_to_local_disk(output_dir=output_dir, # base_model="htdung167/vin100h-preprocessed-v2", # text_header="preprocessed_sentence_v2" # ) # ############# # import subprocess # import argparse # def run_preprocess(input_dir, output_dir, workers): # command = [ # "python", "./src/f5_tts/train/datasets/prepare_csv_wavs.py", # input_dir, # output_dir, # "--workers", str(workers) # ] # process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) # stdout, stderr = process.communicate() # if process.returncode == 0: # print("Preprocessing completed successfully.") # print(stdout) # else: # print("Error during preprocessing:") # print(stderr) # if __name__ == "__main__": # parser = argparse.ArgumentParser(description="Run preprocessing script for dataset.") # parser.add_argument("input_dir", type=str, help="Input directory for preprocessing") # parser.add_argument("output_dir", type=str, help="Output directory for processed data") # parser.add_argument("--workers", type=int, default=4, help="Number of parallel processes") # args = parser.parse_args() # run_preprocess(args.input_dir, args.output_dir, args.workers) ######################################3 # prepare_dataset.py import json import os from pathlib import Path import shutil import torchaudio from datasets import load_dataset from datasets.arrow_writer import ArrowWriter from tqdm import tqdm import soundfile as sf import csv import subprocess import argparse def save_dataset_to_local_disk(output_dir, base_model, audio_header, text_header): """ Saves a dataset to a local directory. Args: - output_dir (str): The directory to save the dataset to. - base_model (str): The base model to load the dataset from. - audio_header (str): The header for the audio data in the dataset. - text_header (str): The header for the text data in the dataset. """ wavs_dir = os.path.join(output_dir, "wavs") metadata_path = os.path.join(output_dir, "metadata.csv") os.makedirs(wavs_dir, exist_ok=True) ds = load_dataset(base_model)['train'] metadata = [] for idx, sample in tqdm(enumerate(ds), total=len(ds), desc="Saving samples to directory"): audio_array = sample[audio_header]['array'] sampling_rate = sample[audio_header]['sampling_rate'] filename = f"audio_{idx:06d}.wav" sf.write(os.path.join(wavs_dir, filename), audio_array, sampling_rate) metadata.append([f"wavs/{filename}", sample[text_header]]) with open(metadata_path, 'w', newline='', encoding='utf-8') as f: csv.writer(f, delimiter='|').writerows(metadata) print(f"Dataset saved to {output_dir}") # def run_preprocess(input_dir, output_dir, workers): # """ # Runs the preprocessing script for the dataset. # Args: # - input_dir (str): The input directory for preprocessing. # - output_dir (str): The output directory for processed data. # - workers (int): The number of parallel processes. # """ # command = [ # "python", "./src/f5_tts/train/datasets/prepare_csv_wavs.py", # input_dir, # output_dir, # "--workers", str(workers) # ] # process = subprocess.Popen(command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True) # stdout, stderr = process.communicate() # if process.returncode == 0: # print("Preprocessing completed successfully.") # print(stdout) # else: # print("Error during preprocessing:") # print(stderr) def run_preprocess(input_dir, output_dir, workers): command = [ "python", "./src/f5_tts/train/datasets/prepare_csv_wavs.py", input_dir, output_dir, "--workers", str(workers) ] process = subprocess.Popen( command, stdout=subprocess.PIPE, stderr=subprocess.PIPE, text=True, bufsize=1, # Line buffered universal_newlines=True ) # Real-time output for stdout and stderr while True: stdout_line = process.stdout.readline() stderr_line = process.stderr.readline() if stdout_line: print(stdout_line, end='', flush=True) if stderr_line: print(stderr_line, end='', flush=True, file=sys.stderr) if process.poll() is not None: break # Capture any remaining output stdout, stderr = process.communicate() if stdout: print(stdout, end='', flush=True) if stderr: print(stderr, end='', flush=True, file=sys.stderr) if process.returncode == 0: print("\nPreprocessing completed successfully.") else: print("\nError during preprocessing.", file=sys.stderr) if __name__ == "__main__": # Set up argument parsing parser = argparse.ArgumentParser(description="Prepare dataset for training.") subparsers = parser.add_subparsers(dest="command") # Subcommand to save dataset to local disk save_parser = subparsers.add_parser("save", help="Save dataset to local disk") save_parser.add_argument("--output_dir", type=str, default="./data/vin100h-preprocessed-v2", help="Output directory") save_parser.add_argument("--base_model", type=str, default="htdung167/vin100h-preprocessed-v2", help="Base model") save_parser.add_argument("--audio_header", type=str, default="audio", help="Audio header") save_parser.add_argument("--text_header", type=str, default="preprocessed_sentence_v2", help="Text header") # Subcommand to run preprocessing preprocess_parser = subparsers.add_parser("preprocess", help="Run preprocessing script") preprocess_parser.add_argument("--prepare_csv_input_dir", type=str, default="./data/vin100h-preprocessed-v2", help="Input directory for preprocessing") preprocess_parser.add_argument("--prepare_csv_output_dir", type=str, default="./data/vin100h-preprocessed-v2_pinyin", help="Output directory for processed data") preprocess_parser.add_argument("--workers", type=int, default=4, help="Number of parallel processes") args = parser.parse_args() if args.command == "save": save_dataset_to_local_disk(args.output_dir, args.base_model, args.audio_header, args.text_header) elif args.command == "preprocess": run_preprocess(args.prepare_csv_input_dir, args.prepare_csv_output_dir, args.workers) else: parser.print_help()