Upload 17 files
- tools/convert_to_gif.sh +43 -0
- tools/generate_ebooks.py +85 -0
- tools/gpu_notebook_test.py +11 -0
- tools/gpu_test.py +32 -0
- tools/m4b_chapter_extractor.py +245 -0
- tools/normalize_wav_file.py +103 -0
- tools/normalize_wav_folder.py +106 -0
- tools/npz_to_wav.py +61 -0
- tools/trim_silences.py +39 -0
- tools/wav_to_npz.py +141 -0
- tools/workflow-testing/long_test.txt +1 -0
- tools/workflow-testing/test1.txt +1 -0
- tools/workflow-testing/test2.txt +1 -0
- tools/workflow-testing/test3.txt +1 -0
- tools/workflow-testing/test4.txt +1 -0
- tools/workflow-testing/test5.txt +1 -0
- tools/workflow-testing/urd-script_davanagari-test.txt +1 -0
tools/convert_to_gif.sh
ADDED
@@ -0,0 +1,43 @@
#!/bin/bash

# Help function
function show_help() {
    echo "Usage: $0 <input_video>"
    echo
    echo "This script converts a video file to a GIF with the following properties:"
    echo "  - Frame rate: 10 fps"
    echo "  - Resolution: 1728x1028 (with padding if needed)"
    echo
    echo "Example:"
    echo "  $0 input.mov"
    echo
    echo "The output file will be named 'output.gif'."
}

# Check if help is requested
if [[ "$1" == "-h" || "$1" == "--h" || "$1" == "-help" || "$1" == "--help" ]]; then
    show_help
    exit 0
fi

# Check if the input file is provided
if [ "$#" -ne 1 ]; then
    echo "Error: Missing input file."
    echo "Use --help for usage information."
    exit 1
fi

# Input video file
input_file="$1"
output_file="output.gif"

# Conversion parameters
fps=10
width=1728
height=1028

# Convert the video to GIF
ffmpeg -i "$input_file" -vf "fps=$fps,scale=${width}:${height}:force_original_aspect_ratio=decrease,pad=${width}:${height}:(ow-iw)/2:(oh-ih)/2" "$output_file"

echo "Conversion complete! Output file: $output_file"
tools/generate_ebooks.py
ADDED
@@ -0,0 +1,85 @@
import os
import sys
import subprocess

from iso639 import languages
from deep_translator import GoogleTranslator
from tqdm import tqdm

#parent_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), ".."))
#sys.path.append(parent_dir)

# Your language mapping dictionary from lang.py
from lib.lang import language_mapping

env = os.environ.copy()
os.environ['COQUI_TOS_AGREED'] = '1'
env["PYTHONIOENCODING"] = "utf-8"
env["LANG"] = "en_US.UTF-8"

# Base text to be translated
base_text = "This is a test from the result of text file to audiobook conversion."

# Output directory
output_dir = "../ebooks/tests"
if not os.path.exists(output_dir):
    os.makedirs(output_dir)

# Path to your base cover image (adjust the path accordingly)
base_cover_image = "../ebooks/tests/__cover.jpg"

# List to keep track of languages that failed
failed_languages = []

# Loop over languages with a progress bar
for lang_code, lang_info in tqdm(language_mapping.items(), desc="Processing languages"):
    try:
        lang_iso = lang_code
        language_array = languages.get(part3=lang_code)
        if language_array and language_array.part1:
            lang_iso = language_array.part1
        if lang_iso == "zh":
            lang_iso = "zh-CN"
        # Translate the text
        translated_text = GoogleTranslator(source='en', target=lang_iso).translate(base_text)
        print(f"\nTranslated text for {lang_info['name']} ({lang_iso}): {translated_text}")

        # Write the translated text to a txt file
        txt_filename = f"test_{lang_code}.txt"
        txt_filepath = os.path.join(output_dir, txt_filename)
        with open(txt_filepath, 'w', encoding='utf-8') as f:
            f.write(translated_text)

        # Prepare the ebook-convert command
        azw3_filename = f"test_{lang_code}.azw3"
        azw3_filepath = os.path.join(output_dir, azw3_filename)

        title = f"Ebook {lang_info['name']} Test"
        authors = "Dev Team"
        language = lang_iso

        command = [
            "ebook-convert",
            txt_filepath,
            azw3_filepath,
            "--cover", base_cover_image,
            "--title", title,
            "--authors", authors,
            "--language", language,
            "--input-encoding", "utf-8"
        ]

        result = subprocess.run(command, env=env, text=True, encoding="utf-8")
        print(f"Ebook generated for {lang_info['name']} at {azw3_filepath}\n")

    except Exception as e:
        print(f"Error: language {lang_code} not supported!")
        failed_languages.append(lang_code)
        continue

# After processing all languages, output the list of languages that failed
if failed_languages:
    print("\nThe following languages could not be processed:")
    for lang_code in failed_languages:
        lang_name = language_mapping[lang_code]['name']
        print(f"- {lang_name} ({lang_code})")
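Note on the loop above: the translation target is resolved by falling back from each ISO 639-3 code in language_mapping to its ISO 639-1 equivalent when one exists, with Chinese special-cased to zh-CN for Google Translate. The script assumes lib.lang.language_mapping maps ISO 639-3 codes to dicts that carry at least a 'name' key; the snippet below is only an illustrative, hypothetical sketch of that shape, not the real table from lib/lang.py:

    # Hypothetical sketch of the expected language_mapping shape (the real table lives in lib/lang.py)
    language_mapping = {
        "eng": {"name": "English"},
        "fra": {"name": "French"},
        "zho": {"name": "Chinese"},  # resolves to "zh", which the loop remaps to "zh-CN"
    }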
tools/gpu_notebook_test.py
ADDED
@@ -0,0 +1,11 @@
import torch

print("Torch version:", torch.__version__)
print("CUDA Available:", torch.cuda.is_available())

if torch.cuda.is_available():
    print("Number of GPUs:", torch.cuda.device_count())
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
else:
    print("No GPU detected.")
tools/gpu_test.py
ADDED
@@ -0,0 +1,32 @@
import torch

def gpu_test():
    print("Checking PyTorch GPU support...")

    # Check if CUDA is available
    if not torch.cuda.is_available():
        print("CUDA is NOT available. Running on CPU.")
        return

    # Get number of GPUs
    num_gpus = torch.cuda.device_count()
    print(f"CUDA is available. Number of GPUs: {num_gpus}")

    # List all GPUs
    for i in range(num_gpus):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")

    # Run a small test on the first GPU
    device = torch.device("cuda:0")
    print(f"Running a small test on {torch.cuda.get_device_name(0)}...")

    # Test computation
    try:
        x = torch.rand(1000, 1000, device=device)
        y = torch.mm(x, x)
        print("GPU computation successful!")
    except Exception as e:
        print(f"GPU computation failed: {e}")

if __name__ == "__main__":
    gpu_test()
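The matrix multiply above only confirms that a CUDA kernel runs; if a rough timing is also wanted, the usual pattern is to call torch.cuda.synchronize() around the measurement, since CUDA launches are asynchronous. A minimal sketch, separate from the script itself:

    import time
    import torch

    if torch.cuda.is_available():
        device = torch.device("cuda:0")
        x = torch.rand(1000, 1000, device=device)
        torch.cuda.synchronize()  # finish setup work before starting the clock
        start = time.perf_counter()
        y = torch.mm(x, x)
        torch.cuda.synchronize()  # wait for the kernel before reading the clock
        print(f"1000x1000 matmul took {time.perf_counter() - start:.4f}s")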
tools/m4b_chapter_extractor.py
ADDED
@@ -0,0 +1,245 @@
#!/usr/bin/env python3
"""
M4B Chapter Extractor

A command-line tool to extract chapters from M4B audiobook files
and save them as individual MP3 files.

Requirements:
- ffmpeg installed and accessible in PATH
- Python 3.6+

Usage:
    python m4b_chapter_extractor.py input.m4b -o output_folder
"""

import argparse
import os
import sys
import subprocess
import json
import re
from pathlib import Path
from typing import List, Dict, Optional


class M4BChapterExtractor:
    def __init__(self, input_file: str, output_dir: str, quality: str = "192k"):
        self.input_file = Path(input_file)
        self.output_dir = Path(output_dir)
        self.quality = quality

        # Validate input file
        if not self.input_file.exists():
            raise FileNotFoundError(f"Input file not found: {input_file}")

        if not self.input_file.suffix.lower() in ['.m4b', '.m4a']:
            raise ValueError("Input file must be an M4B or M4A file")

        # Create output directory
        self.output_dir.mkdir(parents=True, exist_ok=True)

    def check_ffmpeg(self) -> bool:
        """Check if FFmpeg is available in the system PATH."""
        try:
            result = subprocess.run(['ffmpeg', '-version'],
                                    capture_output=True, text=True)
            return result.returncode == 0
        except FileNotFoundError:
            return False

    def get_chapters(self) -> List[Dict]:
        """Extract chapter information from the M4B file."""
        cmd = [
            'ffprobe',
            '-v', 'quiet',
            '-print_format', 'json',
            '-show_chapters',
            str(self.input_file)
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True, check=True)
            data = json.loads(result.stdout)
            return data.get('chapters', [])
        except subprocess.CalledProcessError as e:
            print(f"Error getting chapters: {e}")
            return []
        except json.JSONDecodeError as e:
            print(f"Error parsing chapter data: {e}")
            return []

    def sanitize_filename(self, filename: str) -> str:
        """Sanitize filename by removing/replacing invalid characters."""
        # Remove or replace invalid characters
        filename = re.sub(r'[<>:"/\\|?*]', '', filename)
        filename = re.sub(r'\s+', ' ', filename).strip()

        # Ensure filename isn't too long (limit to 200 characters)
        if len(filename) > 200:
            filename = filename[:200].strip()

        return filename or "Chapter"

    def format_time(self, seconds: float) -> str:
        """Convert seconds to HH:MM:SS.mmm format."""
        hours = int(seconds // 3600)
        minutes = int((seconds % 3600) // 60)
        secs = seconds % 60
        return f"{hours:02d}:{minutes:02d}:{secs:06.3f}"

    def extract_chapter(self, chapter: Dict, chapter_num: int, total_chapters: int) -> bool:
        """Extract a single chapter to MP3 file."""
        # Get chapter title
        title = chapter.get('tags', {}).get('title', f"Chapter {chapter_num:02d}")
        title = self.sanitize_filename(title)

        # Create output filename
        output_filename = f"{chapter_num:02d} - {title}.mp3"
        output_path = self.output_dir / output_filename

        # Get start and end times
        start_time = float(chapter['start_time'])
        end_time = float(chapter['end_time'])
        duration = end_time - start_time

        print(f"Extracting [{chapter_num}/{total_chapters}]: {title}")
        print(f"  Duration: {self.format_time(duration)}")

        # FFmpeg command to extract chapter
        cmd = [
            'ffmpeg',
            '-i', str(self.input_file),
            '-ss', str(start_time),
            '-t', str(duration),
            '-acodec', 'libmp3lame',
            '-ab', self.quality,
            '-map_metadata', '0',
            '-id3v2_version', '3',
            '-metadata', f'title={title}',
            '-metadata', f'track={chapter_num}/{total_chapters}',
            '-y',  # Overwrite output file
            str(output_path)
        ]

        try:
            result = subprocess.run(cmd, capture_output=True, text=True)
            if result.returncode == 0:
                print(f"  ✓ Saved: {output_filename}")
                return True
            else:
                print(f"  ✗ Error extracting chapter: {result.stderr}")
                return False
        except Exception as e:
            print(f"  ✗ Exception during extraction: {e}")
            return False

    def extract_all_chapters(self) -> bool:
        """Extract all chapters from the M4B file."""
        print(f"Processing: {self.input_file.name}")
        print(f"Output directory: {self.output_dir}")

        # Check if FFmpeg is available
        if not self.check_ffmpeg():
            print("Error: FFmpeg not found. Please install FFmpeg and ensure it's in your PATH.")
            return False

        # Get chapters
        chapters = self.get_chapters()
        if not chapters:
            print("No chapters found in the M4B file.")
            return False

        print(f"Found {len(chapters)} chapters")
        print("-" * 50)

        # Extract each chapter
        success_count = 0
        for i, chapter in enumerate(chapters, 1):
            if self.extract_chapter(chapter, i, len(chapters)):
                success_count += 1
            print()

        # Summary
        print("-" * 50)
        print(f"Extraction complete: {success_count}/{len(chapters)} chapters extracted successfully")

        if success_count == len(chapters):
            print("All chapters extracted successfully!")
            return True
        else:
            print(f"Warning: {len(chapters) - success_count} chapters failed to extract")
            return False


def main():
    parser = argparse.ArgumentParser(
        description="Extract chapters from M4B audiobook files as individual MP3 files",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
  python m4b_chapter_extractor.py audiobook.m4b -o chapters/
  python m4b_chapter_extractor.py audiobook.m4b -o output/ -q 128k
  python m4b_chapter_extractor.py audiobook.m4b -o output/ --quality 256k

Requirements:
  - FFmpeg must be installed and accessible in PATH
  - Input file must be M4B or M4A format
        """
    )

    parser.add_argument(
        'input_file',
        help='Path to the input M4B audiobook file'
    )

    parser.add_argument(
        '-o', '--output',
        required=True,
        help='Output directory for extracted MP3 chapters'
    )

    parser.add_argument(
        '-q', '--quality',
        default='192k',
        help='MP3 audio quality/bitrate (default: 192k). Examples: 128k, 192k, 256k, 320k'
    )

    parser.add_argument(
        '-v', '--verbose',
        action='store_true',
        help='Enable verbose output'
    )

    args = parser.parse_args()

    try:
        # Create extractor instance
        extractor = M4BChapterExtractor(
            input_file=args.input_file,
            output_dir=args.output,
            quality=args.quality
        )

        # Extract chapters
        success = extractor.extract_all_chapters()

        # Exit with appropriate code
        sys.exit(0 if success else 1)

    except FileNotFoundError as e:
        print(f"Error: {e}")
        sys.exit(1)
    except ValueError as e:
        print(f"Error: {e}")
        sys.exit(1)
    except KeyboardInterrupt:
        print("\nOperation cancelled by user")
        sys.exit(1)
    except Exception as e:
        print(f"Unexpected error: {e}")
        sys.exit(1)


if __name__ == "__main__":
    main()
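get_chapters() parses the JSON that ffprobe emits for -show_chapters; the fields used above (start_time, end_time and tags.title) come from entries shaped roughly like this abridged example (exact fields can vary between ffprobe builds and input files):

    {
        "chapters": [
            {
                "id": 0,
                "time_base": "1/1000",
                "start": 0,
                "start_time": "0.000000",
                "end": 1834000,
                "end_time": "1834.000000",
                "tags": {"title": "Chapter 1"}
            }
        ]
    }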
tools/normalize_wav_file.py
ADDED
@@ -0,0 +1,103 @@
import os
import sys
import subprocess
import argparse
import shutil
import torch

def demucs_voice(wav_file, output_dir, models_dir):
    try:
        # Set TORCH_HOME for demucs
        torch.hub.set_dir(models_dir)
        os.environ['TORCH_HOME'] = models_dir
        demucs_app = shutil.which('demucs')
        if not demucs_app:
            demucs_app = os.path.join('..', 'python_env', 'Scripts', 'demucs')
        # Run demucs subprocess
        cmd = [
            demucs_app,
            "--verbose",
            "--two-stems=vocals",
            "--out", output_dir,
            wav_file
        ]

        print(f"🔄 Running: {' '.join(cmd)}")
        subprocess.run(cmd, check=True)

        # Output folder name is based on input filename
        base_name = os.path.splitext(os.path.basename(wav_file))[0]
        demucs_output_path = os.path.join(output_dir, "htdemucs", base_name, "vocals.wav")
        if os.path.exists(demucs_output_path):
            print(f"✅ Voice track saved to: {demucs_output_path}")
            return demucs_output_path
        else:
            raise FileNotFoundError(f"Expected output not found: {demucs_output_path}")

    except subprocess.CalledProcessError as e:
        raise RuntimeError(
            f"demucs failed with exit code {e.returncode}.\n"
            f"stdout: {getattr(e, 'output', 'N/A')}\n"
            f"stderr: {getattr(e, 'stderr', 'N/A')}"
        )
    except FileNotFoundError as e:
        raise RuntimeError(f"FileNotFoundError: {e}")
    except Exception as e:
        raise RuntimeError(f"Unexpected error: {e}")

def normalize_audio_file(input_file, output_file):
    models_dir = os.path.join('..', 'models', 'tts')
    output_dir = os.path.dirname(output_file)
    #demucs_file = demucs_voice(input_file, output_dir, models_dir)
    ffmpeg_cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', input_file]
    filter_complex = (
        'agate=threshold=-25dB:ratio=1.4:attack=10:release=250,'
        'afftdn=nf=-70,'
        'acompressor=threshold=-20dB:ratio=2:attack=80:release=200:makeup=1dB,'
        'loudnorm=I=-14:TP=-3:LRA=7:linear=true,'
        'equalizer=f=150:t=q:w=2:g=1,'
        'equalizer=f=250:t=q:w=2:g=-3,'
        'equalizer=f=3000:t=q:w=2:g=2,'
        'equalizer=f=5500:t=q:w=2:g=-4,'
        'equalizer=f=9000:t=q:w=2:g=-2,'
        'highpass=f=63[audio]'
    )
    ffmpeg_cmd += [
        '-filter_complex', filter_complex,
        '-map', '[audio]',
        '-ar', '24000',
        '-y', output_file
    ]
    try:
        process = subprocess.Popen(
            ffmpeg_cmd,
            env={},
            stdout=subprocess.PIPE,
            stderr=subprocess.STDOUT,
            text=True,
            universal_newlines=True,
            encoding='utf-8'
        )
        for line in process.stdout:
            print(line, end='')  # Print each line of ffmpeg output
        process.wait()
        if process.returncode != 0:
            print(f'normalize_audio_file(): ffmpeg exited with code {process.returncode}')
        elif not os.path.exists(output_file) or os.path.getsize(output_file) == 0:
            print(f'normalize_audio_file() error: {output_file} was not created or is empty.')
        else:
            print(f"File denoised and normalized!: {output_file}")
    except subprocess.CalledProcessError as e:
        print(f"Error processing file {input_file}: {e}")
    except Exception as e:
        print(f"Unexpected error: {e}")

if __name__ == "__main__":
    if len(sys.argv) != 3:
        print(f"Usage: python {os.path.basename(__file__)} <input_file> <output_file>")
        sys.exit(1)
    input_file = os.path.abspath(sys.argv[1])
    output_file = os.path.abspath(sys.argv[2])
    normalize_audio_file(input_file, output_file)
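The filter chain targets -14 LUFS integrated and -3 dBTP via loudnorm. To sanity-check a processed file, ffmpeg can re-measure it with loudnorm in analysis-only mode, which prints a JSON report on stderr; a minimal sketch, assuming ffmpeg is on PATH:

    import json
    import subprocess

    def measure_loudness(wav_path):
        # Analysis pass only: -f null discards the audio, the JSON report lands at the end of stderr
        cmd = ['ffmpeg', '-hide_banner', '-i', wav_path,
               '-af', 'loudnorm=I=-14:TP=-3:LRA=7:print_format=json', '-f', 'null', '-']
        stderr = subprocess.run(cmd, capture_output=True, text=True).stderr
        report = json.loads(stderr[stderr.rindex('{'):])
        print(f"integrated: {report['input_i']} LUFS, true peak: {report['input_tp']} dBTP")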
tools/normalize_wav_folder.py
ADDED
@@ -0,0 +1,106 @@
import os
import sys
import subprocess
import argparse
import torch
import shutil

def demucs_voice(wav_file, output_dir, models_dir):
    try:
        # Set TORCH_HOME for demucs
        torch.hub.set_dir(models_dir)
        os.environ['TORCH_HOME'] = models_dir
        # Run demucs subprocess
        cmd = [
            os.path.join('..', 'python_env', 'bin', 'demucs'),
            "--verbose",
            "--two-stems=vocals",
            "--out", output_dir,
            wav_file
        ]
        print(f"🔄 Running: {' '.join(cmd)}")
        subprocess.run(cmd, check=True)
        # Output folder name is based on input filename
        base_name = os.path.splitext(os.path.basename(wav_file))[0]
        demucs_output_path = os.path.join(output_dir, "htdemucs", base_name, "vocals.wav")
        if os.path.exists(demucs_output_path):
            print(f"✅ Voice track saved to: {demucs_output_path}")
            return demucs_output_path
        else:
            raise FileNotFoundError(f"Expected output not found: {demucs_output_path}")
    except subprocess.CalledProcessError as e:
        raise RuntimeError(
            f"demucs failed with exit code {e.returncode}.\n"
            f"stdout: {getattr(e, 'output', 'N/A')}\n"
            f"stderr: {getattr(e, 'stderr', 'N/A')}"
        )
    except FileNotFoundError as e:
        raise RuntimeError(f"FileNotFoundError: {e}")
    except Exception as e:
        raise RuntimeError(f"Unexpected error: {e}")

def normalize_audio_folder(folder_path):
    for root, dirs, files in os.walk(folder_path):
        for file in files:
            if file.lower().endswith('.wav'):
                input_file = os.path.join(root, file)
                models_dir = os.path.join('..', 'models', 'tts')
                #demucs_file = demucs_voice(input_file, folder_path, models_dir)
                process_file = os.path.join(root, 'temp_output.wav')  # Temporary file to avoid overwriting during processing
                ffmpeg_cmd = [shutil.which('ffmpeg'), '-hide_banner', '-nostats', '-i', input_file]
                filter_complex = (
                    'agate=threshold=-25dB:ratio=1.4:attack=10:release=250,'
                    'afftdn=nf=-70,'
                    'acompressor=threshold=-20dB:ratio=2:attack=80:release=200:makeup=1dB,'
                    'loudnorm=I=-14:TP=-3:LRA=7:linear=true,'
                    'equalizer=f=150:t=q:w=2:g=1,'
                    'equalizer=f=250:t=q:w=2:g=-3,'
                    'equalizer=f=3000:t=q:w=2:g=2,'
                    'equalizer=f=5500:t=q:w=2:g=-4,'
                    'equalizer=f=9000:t=q:w=2:g=-2,'
                    'highpass=f=63[audio]'
                )
                ffmpeg_cmd += [
                    '-filter_complex', filter_complex,
                    '-map', '[audio]',
                    '-ar', '24000',
                    '-y', process_file
                ]
                try:
                    process = subprocess.Popen(
                        ffmpeg_cmd,
                        env={},
                        stdout=subprocess.PIPE,
                        stderr=subprocess.STDOUT,
                        text=True,
                        universal_newlines=True,
                        encoding='utf-8'
                    )
                    for line in process.stdout:
                        print(line, end='')  # Print each line of ffmpeg output
                    process.wait()
                    if process.returncode != 0:
                        print(f'normalize_audio_folder(): ffmpeg exited with code {process.returncode}')
                        break
                    elif not os.path.exists(process_file) or os.path.getsize(process_file) == 0:
                        print(f'normalize_audio_folder() error: {process_file} was not created or is empty.')
                        break
                    else:
                        os.replace(process_file, input_file)
                        print(f"File processed and replaced: {input_file}")
                except subprocess.CalledProcessError as e:
                    print(f"Error processing file {input_file}: {e}")
                    break
                except Exception as e:
                    print(f"Unexpected error: {e}")
                    break

if __name__ == "__main__":
    if len(sys.argv) != 2:
        print(f"Usage: python {os.path.basename(__file__)} <folder_path>")
        sys.exit(1)
    folder_path = os.path.abspath(sys.argv[1])
    normalize_audio_folder(folder_path)
tools/npz_to_wav.py
ADDED
@@ -0,0 +1,61 @@
import os
import platform
import argparse

tmp_dir = os.path.abspath(os.path.join('..', 'tmp'))
models_dir = os.path.abspath(os.path.join('..', 'models'))
tts_dir = os.path.join(models_dir, 'tts')

os.environ['PYTHONUTF8'] = '1'
os.environ['PYTHONIOENCODING'] = 'utf-8'
os.environ['COQUI_TOS_AGREED'] = '1'
os.environ['CALIBRE_NO_NATIVE_FILEDIALOGS'] = '1'
os.environ['DO_NOT_TRACK'] = 'true'
os.environ['CALIBRE_TEMP_DIR'] = tmp_dir
os.environ['CALIBRE_CACHE_DIRECTORY'] = tmp_dir
os.environ['HUGGINGFACE_HUB_CACHE'] = tts_dir
os.environ['HF_HOME'] = tts_dir
os.environ['HF_DATASETS_CACHE'] = tts_dir
os.environ['BARK_CACHE_DIR'] = tts_dir
os.environ['TTS_CACHE'] = tts_dir
os.environ['TORCH_HOME'] = tts_dir
os.environ['TTS_HOME'] = models_dir
os.environ['XDG_CACHE_HOME'] = models_dir
os.environ['ARGOS_TRANSLATE_PACKAGE_PATH'] = os.path.join(models_dir, 'argostranslate')
os.environ['HF_TOKEN_PATH'] = os.path.join(os.path.expanduser('~'), '.huggingface_token')
os.environ['TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD'] = '1'
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
os.environ['SUNO_OFFLOAD_CPU'] = 'False'  # BARK option: False needs a GPU
os.environ['SUNO_USE_SMALL_MODELS'] = 'False'  # BARK option: False needs a GPU with VRAM > 4GB
if platform.system() == 'Windows':
    os.environ['ESPEAK_DATA_PATH'] = os.path.expandvars(r"%USERPROFILE%\scoop\apps\espeak-ng\current\eSpeak NG\espeak-ng-data")

import torch
import torchaudio
import numpy as np
from pathlib import Path
from bark import SAMPLE_RATE, preload_models
from bark.generation import codec_decode

def npz_to_wav(npz_path, output_path):
    preload_models()
    data = np.load(npz_path)
    fine_prompt = data["fine_prompt"]
    audio_array = codec_decode(fine_prompt)
    audio_tensor = torch.tensor(audio_array).unsqueeze(0)
    torchaudio.save(output_path, audio_tensor, SAMPLE_RATE)
    print(f"✅ Saved: {output_path}")

def process_all_npz_in_folder(folder_path):
    preload_models()
    for npz_file in Path(folder_path).rglob("*.npz"):
        output_path = npz_file.with_suffix(".wav")
        npz_to_wav(str(npz_file), str(output_path))

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Process all NPZ files in a folder.")
    parser.add_argument("--folder_path", type=str, required=True, help="Path to the folder containing NPZ files")
    args = parser.parse_args()
    folder_path = os.path.abspath(args.folder_path)
    process_all_npz_in_folder(folder_path)
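The .npz files consumed here are Bark history prompts; besides fine_prompt (the only array this script decodes) they normally also carry semantic_prompt and coarse_prompt arrays. A quick way to inspect one before converting:

    import numpy as np

    def inspect_npz(npz_path):
        # Bark speaker prompts typically expose semantic_prompt, coarse_prompt and fine_prompt
        data = np.load(npz_path)
        for key in data.files:
            print(f"{key}: shape={data[key].shape}, dtype={data[key].dtype}")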
tools/trim_silences.py
ADDED
@@ -0,0 +1,39 @@
import os
from pydub import AudioSegment
from pydub.silence import detect_nonsilent
import sys


def trim_silence(audio_path, silence_thresh=-70, min_silence_len=1000):
    audio = AudioSegment.from_file(audio_path)
    nonsilent_ranges = detect_nonsilent(audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh)

    if not nonsilent_ranges:
        return None  # No nonsilent segment found

    start_trim = nonsilent_ranges[0][0]
    end_trim = nonsilent_ranges[-1][1]
    trimmed_audio = audio[start_trim:end_trim]
    return trimmed_audio


def process_folder(folder_path):
    for root, _, files in os.walk(folder_path):
        for file in files:
            if file.endswith(".wav"):
                wav_path = os.path.join(root, file)
                print(f"Trimming: {wav_path}")
                trimmed = trim_silence(wav_path)
                if trimmed:
                    trimmed.export(wav_path, format="wav")
                else:
                    print(f"Warning: only silence found in {wav_path}")


if __name__ == "__main__":
    if len(sys.argv) != 2:
        print("Usage: python script.py <folder_path>")
        sys.exit(1)

    target_folder = sys.argv[1]
    process_folder(target_folder)
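One design note on trim_silence(): it cuts exactly at the first and last nonsilent millisecond, which can clip soft attacks or tail reverb. A common tweak, sketched below, keeps a small margin around the detected audio (the 100 ms default here is an arbitrary illustration, not something the script uses):

    from pydub import AudioSegment
    from pydub.silence import detect_nonsilent

    def trim_silence_with_margin(audio_path, silence_thresh=-70, min_silence_len=1000, keep_ms=100):
        audio = AudioSegment.from_file(audio_path)
        ranges = detect_nonsilent(audio, min_silence_len=min_silence_len, silence_thresh=silence_thresh)
        if not ranges:
            return None
        start = max(0, ranges[0][0] - keep_ms)          # keep a little lead-in
        end = min(len(audio), ranges[-1][1] + keep_ms)  # and a little tail
        return audio[start:end]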
tools/wav_to_npz.py
ADDED
@@ -0,0 +1,141 @@
# NOTE: to run this script you must move it to the root of ebook2audiobook

import os

# Model cache locations, assuming the script is run from the ebook2audiobook root
models_dir = os.path.abspath(os.path.join('.', 'models'))
tts_dir = os.path.join(models_dir, 'tts')

os.environ['PYTHONUTF8'] = '1'
os.environ['PYTHONIOENCODING'] = 'utf-8'
os.environ['COQUI_TOS_AGREED'] = '1'
os.environ['DO_NOT_TRACK'] = 'true'
os.environ['HUGGINGFACE_HUB_CACHE'] = tts_dir
os.environ['HF_HOME'] = tts_dir
os.environ['TRANSFORMERS_CACHE'] = tts_dir
os.environ['HF_DATASETS_CACHE'] = tts_dir
os.environ['BARK_CACHE_DIR'] = tts_dir
os.environ['TTS_CACHE'] = tts_dir
os.environ['TORCH_HOME'] = tts_dir
os.environ['TTS_HOME'] = models_dir
os.environ['XDG_CACHE_HOME'] = models_dir
os.environ['HF_TOKEN_PATH'] = os.path.join(os.path.expanduser('~'), '.huggingface_token')
os.environ['TORCH_FORCE_NO_WEIGHTS_ONLY_LOAD'] = '1'
os.environ['PYTORCH_ENABLE_MPS_FALLBACK'] = '1'
os.environ['SUNO_OFFLOAD_CPU'] = 'False'
os.environ['SUNO_USE_SMALL_MODELS'] = 'False'

import argparse
import hashlib
import numpy as np
import regex as re
import shutil
import soundfile as sf
import subprocess
import tempfile
import torch
import torchaudio
import threading
import uuid

from iso639 import languages
from huggingface_hub import hf_hub_download
from pathlib import Path
from scipy.io import wavfile as wav
from scipy.signal import find_peaks
from TTS.tts.configs.bark_config import BarkConfig
from TTS.tts.models.bark import Bark

from lib import *

import logging
logging.basicConfig(level=logging.DEBUG)


torch.hub.set_dir(models_dir)

loaded_tts = {}

def load_checkpoint(**kwargs):
    try:
        key = kwargs.get('key')
        tts_engine = kwargs.get('tts_engine')
        device = kwargs.get('device')
        checkpoint_dir = kwargs.get('checkpoint_dir')
        config = BarkConfig()
        config.CACHE_DIR = tts_dir
        config.USE_SMALLER_MODELS = os.environ.get('SUNO_USE_SMALL_MODELS', '').lower() == 'true'
        tts = Bark.init_from_config(config)
        tts.load_checkpoint(
            config,
            checkpoint_dir=checkpoint_dir,
            eval=True
        )
        if tts:
            if device == 'cuda':
                tts.cuda()
            else:
                tts.to(device)
            loaded_tts[key] = {"engine": tts, "config": config}
            msg = f'{tts_engine} Loaded!'
            print(msg)
            return tts
        else:
            error = 'TTS engine could not be created!'
            print(error)
    except Exception as e:
        print(f'load_checkpoint() error: {e}')
    return False

def wav_to_npz(bark_dir, wav_dir):
    try:
        tts_internal_key = f"{TTS_ENGINES['BARK']}-internal"
        hf_repo = models[TTS_ENGINES['BARK']]['internal']['repo']
        hf_sub = models[TTS_ENGINES['BARK']]['internal']['sub']
        text_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['BARK']]['internal']['files'][0]}", cache_dir=tts_dir)
        coarse_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['BARK']]['internal']['files'][1]}", cache_dir=tts_dir)
        fine_model_path = hf_hub_download(repo_id=hf_repo, filename=f"{hf_sub}{models[TTS_ENGINES['BARK']]['internal']['files'][2]}", cache_dir=tts_dir)
        checkpoint_dir = os.path.dirname(text_model_path)
        tts = load_checkpoint(tts_engine=TTS_ENGINES['BARK'], key=tts_internal_key, checkpoint_dir=checkpoint_dir, device='cpu')
        if tts:
            fine_tuned_params = {
                "text_temp": default_engine_settings[TTS_ENGINES['BARK']]['text_temp'],
                "waveform_temp": default_engine_settings[TTS_ENGINES['BARK']]['waveform_temp']
            }
            for root, dirs, files in os.walk(wav_dir):
                for file in files:
                    if file.lower().endswith('.wav'):
                        match = re.match(r"^([a-z]{2})_", file)
                        if match:
                            speaker = os.path.splitext(file)[0]
                            npz_file = f'{speaker}.npz'
                            iso1_lang = match.group(1)
                            lang_array = languages.get(part1=iso1_lang)
                            if lang_array:
                                iso3_lang = lang_array.part3
                                default_text_file = os.path.join(voices_dir, iso3_lang, 'default.txt')
                                default_text = Path(default_text_file).read_text(encoding="utf-8")
                                with torch.no_grad():
                                    torch.manual_seed(67878789)
                                    audio_data = tts.synthesize(
                                        default_text,
                                        loaded_tts[tts_internal_key]['config'],
                                        speaker_id=speaker,
                                        voice_dirs=bark_dir,
                                        silent=True,
                                        **fine_tuned_params
                                    )
                                del audio_data
                                msg = f"Saved NPZ file: {npz_file}"
                                print(msg)
        else:
            print('tts bark not loaded')
    except Exception as e:
        print(f'wav_to_npz() error: {e}')

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description="Convert WAV files to Bark NPZ format.")
    parser.add_argument("--bark_dir", type=str, required=True, help="Path to the Bark asset directory")
    parser.add_argument("--wav_dir", type=str, required=True, help="Path to the output WAV directory")
    args = parser.parse_args()
    bark_dir = os.path.abspath(args.bark_dir)
    wav_dir = os.path.abspath(args.wav_dir)
    wav_to_npz(bark_dir, wav_dir)
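The conversion above is indirect: synthesizing with an unseen speaker_id and voice_dirs pointed at bark_dir makes the Coqui Bark wrapper build and cache a history prompt for that speaker, so the .npz appears under the voice directory as a side effect and the generated audio itself is discarded. A small follow-up check, assuming a flat <bark_dir>/<speaker>.npz layout (adjust if your Bark/Coqui version nests the files differently):

    import os

    def verify_npz_outputs(bark_dir, speakers):
        # Hypothetical layout: <bark_dir>/<speaker>.npz written by the Bark voice cache
        missing = [s for s in speakers if not os.path.exists(os.path.join(bark_dir, f"{s}.npz"))]
        if missing:
            print(f"Missing voice prompts: {', '.join(missing)}")
        else:
            print("All expected .npz voice prompts were created.")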
tools/workflow-testing/long_test.txt
ADDED
@@ -0,0 +1 @@
Alright here you go just a long stream of words flowing endlessly without any stops or breaks just moving forward like a river that never ends carrying thoughts and ideas without pause or hesitation like a mind racing through the night trying to grasp onto something solid but finding only more thoughts and more words and more movement like a dream that never quite settles into focus always shifting always changing always just beyond reach like running through a field with the wind in your hair feeling free and untethered by the rules of language or structure or anything at all just existing in the purest form of expression where nothing has to make perfect sense and yet it all still feels like it does in some strange and beautiful way like the way music can make you feel something without ever needing words at all just rhythm and motion and the way it carries you forward into something bigger than yourself into something vast and infinite and endless.
tools/workflow-testing/test1.txt
ADDED
@@ -0,0 +1 @@
This is test file number 1.
tools/workflow-testing/test2.txt
ADDED
@@ -0,0 +1 @@
This is test file number 2.
tools/workflow-testing/test3.txt
ADDED
@@ -0,0 +1 @@
This is test file number 3.
tools/workflow-testing/test4.txt
ADDED
@@ -0,0 +1 @@
This is test file number 4.
tools/workflow-testing/test5.txt
ADDED
@@ -0,0 +1 @@
This is test file number 5.
tools/workflow-testing/urd-script_davanagari-test.txt
ADDED
@@ -0,0 +1 @@
यह टेस्ट फ़ाइल नंबर १ है।