# Vi-F5-TTS/scripts/old-process_dataset.py
# prepare_dataset.py
import argparse
import csv
import os
import subprocess
import sys

import soundfile as sf
from datasets import load_dataset
from tqdm import tqdm
def save_dataset_to_local_disk(output_dir, base_model, audio_header, text_header):
    """
    Downloads a Hugging Face dataset and saves it locally as WAV files plus a
    pipe-delimited metadata.csv.
    Args:
    - output_dir (str): The directory to save the dataset to.
    - base_model (str): The Hugging Face dataset repository to load.
    - audio_header (str): The dataset column holding the audio data.
    - text_header (str): The dataset column holding the transcription text.
    """
wavs_dir = os.path.join(output_dir, "wavs")
metadata_path = os.path.join(output_dir, "metadata.csv")
os.makedirs(wavs_dir, exist_ok=True)
ds = load_dataset(base_model)['train']
metadata = []
for idx, sample in tqdm(enumerate(ds), total=len(ds),
desc="Saving samples to directory"):
audio_array = sample[audio_header]['array']
sampling_rate = sample[audio_header]['sampling_rate']
filename = f"audio_{idx:06d}.wav"
sf.write(os.path.join(wavs_dir, filename), audio_array, sampling_rate)
metadata.append([f"wavs/{filename}", sample[text_header]])
with open(metadata_path, 'w', newline='', encoding='utf-8') as f:
csv.writer(f, delimiter='|').writerows(metadata)
print(f"Dataset saved to {output_dir}")
def run_preprocess(input_dir, output_dir, workers):
    """
    Runs the prepare_csv_wavs.py preprocessing script as a subprocess and
    streams its output in real time.
    Args:
    - input_dir (str): The input directory for preprocessing.
    - output_dir (str): The output directory for processed data.
    - workers (int): The number of parallel processes.
    """
    command = [
        "python", "./src/f5_tts/train/datasets/prepare_csv_wavs.py",
        input_dir,
        output_dir,
        "--workers", str(workers)
    ]
    # Merge stderr into stdout so a single pipe can be streamed; alternating
    # readline() calls on two pipes can deadlock once one buffer fills.
    process = subprocess.Popen(
        command,
        stdout=subprocess.PIPE,
        stderr=subprocess.STDOUT,
        text=True,
        bufsize=1  # Line buffered
    )
    # Echo the subprocess output line by line as it is produced
    for line in process.stdout:
        print(line, end='', flush=True)
    process.wait()
    if process.returncode == 0:
        print("\nPreprocessing completed successfully.")
    else:
        print("\nError during preprocessing.", file=sys.stderr)
if __name__ == "__main__":
# Set up argument parsing
parser = argparse.ArgumentParser(description="Prepare dataset for training.")
subparsers = parser.add_subparsers(dest="command")
# Subcommand to save dataset to local disk
save_parser = subparsers.add_parser("save", help="Save dataset to local disk")
save_parser.add_argument("--output_dir", type=str, default="./data/vin100h-preprocessed-v2", help="Output directory")
save_parser.add_argument("--base_model", type=str, default="htdung167/vin100h-preprocessed-v2", help="Base model")
save_parser.add_argument("--audio_header", type=str, default="audio", help="Audio header")
save_parser.add_argument("--text_header", type=str, default="preprocessed_sentence_v2", help="Text header")
# Subcommand to run preprocessing
preprocess_parser = subparsers.add_parser("preprocess", help="Run preprocessing script")
preprocess_parser.add_argument("--prepare_csv_input_dir", type=str,
default="./data/vin100h-preprocessed-v2",
help="Input directory for preprocessing")
preprocess_parser.add_argument("--prepare_csv_output_dir", type=str,
default="./data/vin100h-preprocessed-v2_pinyin",
help="Output directory for processed data")
preprocess_parser.add_argument("--workers", type=int, default=4, help="Number of parallel processes")
args = parser.parse_args()
if args.command == "save":
save_dataset_to_local_disk(args.output_dir, args.base_model, args.audio_header, args.text_header)
elif args.command == "preprocess":
run_preprocess(args.prepare_csv_input_dir, args.prepare_csv_output_dir, args.workers)
else:
parser.print_help()
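# Example invocations (using the defaults defined above):
#   python scripts/old-process_dataset.py save
#   python scripts/old-process_dataset.py preprocess --workers 4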