Update stablecuda12build1.py
Browse files
16-bit and addition in audio have different maths for each float; fixed up errors.
- stablecuda12build1.py +50 -38
stablecuda12build1.py
CHANGED
|
@@ -19,6 +19,7 @@ from pathlib import Path
|
|
| 19 |
import mmap
|
| 20 |
import subprocess
|
| 21 |
import re
|
|
|
|
| 22 |
|
| 23 |
# Suppress warnings for cleaner output
|
| 24 |
warnings.filterwarnings("ignore")
|
|
@@ -149,13 +150,16 @@ def balance_stereo(audio_segment, noise_threshold=-60, sample_rate=16000):
|
|
| 149 |
stereo_samples = stereo_samples * mask
|
| 150 |
left_nonzero = stereo_samples[:, 0][stereo_samples[:, 0] != 0]
|
| 151 |
right_nonzero = stereo_samples[:, 1][stereo_samples[:, 1] != 0]
|
| 152 |
-
left_rms = np.sqrt(np.mean(left_nonzero**2)) if len(left_nonzero) > 0
|
| 153 |
right_rms = np.sqrt(np.mean(right_nonzero**2)) if len(right_nonzero) > 0 else 0
|
| 154 |
if left_rms > 0 and right_rms > 0:
|
| 155 |
avg_rms = (left_rms + right_rms) / 2
|
| 156 |
stereo_samples[:, 0] = stereo_samples[:, 0] * (avg_rms / left_rms)
|
| 157 |
stereo_samples[:, 1] = stereo_samples[:, 1] * (avg_rms / right_rms)
|
| 158 |
balanced_samples = stereo_samples.flatten().astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
|
|
|
|
|
|
|
|
|
|
| 159 |
balanced_segment = AudioSegment(
|
| 160 |
balanced_samples.tobytes(),
|
| 161 |
frame_rate=sample_rate,
|
|
@@ -206,6 +210,9 @@ def hard_limit(audio_segment, limit_db=-3.0, sample_rate=16000):
|
|
| 206 |
limit = 10 ** (limit_db / 20.0) * (2**23 if audio_segment.sample_width == 3 else 32767)
|
| 207 |
samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
|
| 208 |
samples = np.clip(samples, -limit, limit).astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
|
|
|
|
|
|
|
|
|
|
| 209 |
limited_segment = AudioSegment(
|
| 210 |
samples.tobytes(),
|
| 211 |
frame_rate=sample_rate,
|
|
@@ -248,12 +255,15 @@ def apply_fade(segment, fade_in_duration=500, fade_out_duration=500):
|
|
| 248 |
# Genre prompt functions
|
| 249 |
def set_red_hot_chili_peppers_prompt(bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style):
|
| 250 |
try:
|
| 251 |
-
rhythm = f" with {rhythmic_steps}" if rhythmic_steps != "none" else ("
|
| 252 |
-
drum = f", {drum_beat} drums" if drum_beat != "none" else ""
|
| 253 |
synth = f", {synthesizer} accents" if synthesizer != "none" else ""
|
| 254 |
-
bass = f", {bass_style}" if bass_style != "none" else ",
|
| 255 |
-
guitar = f", {guitar_style} guitar riffs" if guitar_style != "none" else ", syncopated guitar riffs"
|
| 256 |
-
prompt =
|
|
|
|
|
|
|
|
|
|
| 257 |
logger.debug(f"Generated RHCP prompt: {prompt}")
|
| 258 |
return prompt
|
| 259 |
except Exception as e:
|
|
@@ -468,7 +478,8 @@ PRESETS = {
|
|
| 468 |
"rock": {"cfg_scale": 2.0, "top_k": 110, "top_p": 0.9, "temperature": 0.9},
|
| 469 |
"techno": {"cfg_scale": 1.5, "top_k": 130, "top_p": 0.85, "temperature": 0.7},
|
| 470 |
"grunge": {"cfg_scale": 1.8, "top_k": 120, "top_p": 0.9, "temperature": 0.85},
|
| 471 |
-
"indie": {"cfg_scale": 1.9, "top_k": 115, "top_p": 0.9, "temperature": 0.8}
|
|
|
|
| 472 |
}
|
| 473 |
|
| 474 |
# Function to get the latest log file
|
|
@@ -523,7 +534,7 @@ def set_bit_depth_24():
|
|
| 523 |
return "24"
|
| 524 |
|
| 525 |
# Optimized generation function
|
| 526 |
-
def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p: float, temperature: float, total_duration: int, bpm: int, drum_beat: str, synthesizer: str, rhythmic_steps: str, bass_style: str, guitar_style: str, target_volume: float, preset: str, max_steps: str, vram_status: str, bitrate: str, output_sample_rate: str, bit_depth: str):
|
| 527 |
global musicgen_model
|
| 528 |
if not instrumental_prompt.strip():
|
| 529 |
logger.warning("Empty instrumental prompt provided")
|
|
@@ -550,6 +561,10 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
|
|
| 550 |
except ValueError:
|
| 551 |
logger.error(f"Invalid bit_depth value: {bit_depth}")
|
| 552 |
return None, "❌ Invalid bit depth; must be 16 or 24", vram_status
|
|
|
|
|
|
|
|
|
|
|
|
|
| 553 |
max_duration = min(max_steps_int / 50, 30) # Convert steps to seconds, cap at 30s
|
| 554 |
total_duration = min(max(total_duration, 30), 120) # Clamp between 30s and 120s
|
| 555 |
processing_sample_rate = 16000 # Fixed for processing
|
|
@@ -570,8 +585,6 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
|
|
| 570 |
logger.error("Insufficient disk space")
|
| 571 |
return None, "⚠️ Insufficient disk space. Free up at least 1 GB.", vram_status
|
| 572 |
|
| 573 |
-
# Set random seed for this generation run
|
| 574 |
-
seed = random.randint(0, 10000)
|
| 575 |
logger.info(f"Generating audio for {total_duration}s with seed={seed}, max_steps={max_steps_int}, output_sample_rate={output_sample_rate_int} Hz, bit_depth={bit_depth_int}-bit")
|
| 576 |
base_prompt = instrumental_prompt
|
| 577 |
clean_memory()
|
|
@@ -731,10 +744,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
|
|
| 731 |
logger.debug(f"Applying crossfade between chunks {i} and {i+1}")
|
| 732 |
prev_overlap = final_segment[-overlap_ms:]
|
| 733 |
curr_overlap = current_segment[:overlap_ms]
|
| 734 |
-
#
|
| 735 |
-
prev_overlap = ensure_stereo(prev_overlap, processing_sample_rate, sample_width)
|
| 736 |
-
curr_overlap = ensure_stereo(curr_overlap, processing_sample_rate, sample_width)
|
| 737 |
-
# Calculate samples using torchaudio for precision
|
| 738 |
prev_audio, _ = torchaudio.load(io.BytesIO(prev_overlap.raw_data))
|
| 739 |
curr_audio, _ = torchaudio.load(io.BytesIO(curr_overlap.raw_data))
|
| 740 |
num_samples = min(prev_audio.shape[1], curr_audio.shape[1])
|
|
@@ -744,27 +754,21 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
|
|
| 744 |
logger.warning(f"Skipping crossfade for chunk {i+1} due to insufficient samples")
|
| 745 |
final_segment += current_segment
|
| 746 |
continue
|
| 747 |
-
blended_samples =
|
| 748 |
-
prev_samples = prev_audio[:, :num_samples]
|
| 749 |
-
curr_samples = curr_audio[:, :num_samples]
|
| 750 |
-
hann_window =
|
| 751 |
-
fade_out = hann_window
|
| 752 |
fade_in = hann_window
|
| 753 |
-
blended_samples = (prev_samples * fade_out
|
| 754 |
-
#
|
| 755 |
-
blended_samples = blended_samples.
|
| 756 |
-
|
| 757 |
-
|
| 758 |
-
|
| 759 |
-
|
| 760 |
-
|
| 761 |
-
|
| 762 |
-
blended_segment = AudioSegment(
|
| 763 |
-
byte_data,
|
| 764 |
-
frame_rate=processing_sample_rate,
|
| 765 |
-
sample_width=sample_width,
|
| 766 |
-
channels=2
|
| 767 |
-
)
|
| 768 |
blended_segment = rms_normalize(blended_segment, target_rms_db=target_volume, peak_limit_db=-3.0, sample_rate=processing_sample_rate)
|
| 769 |
final_segment = final_segment[:-overlap_ms] + blended_segment + current_segment[overlap_ms:]
|
| 770 |
else:
|
|
@@ -822,7 +826,7 @@ def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p
|
|
| 822 |
# Clear inputs function
|
| 823 |
def clear_inputs():
|
| 824 |
logger.info("Clearing input fields")
|
| 825 |
-
return "", 1.8, 120, 0.9, 0.8, 30, 120, "none", "none", "none", "none", "none", -23.0, "default", 1300, "96k", "32000", "16"
|
| 826 |
|
| 827 |
# Custom CSS
|
| 828 |
css = """
|
|
@@ -1024,7 +1028,7 @@ with gr.Blocks(css=css) as demo:
|
|
| 1024 |
)
|
| 1025 |
preset = gr.Dropdown(
|
| 1026 |
label="Preset Configuration 🎛️",
|
| 1027 |
-
choices=["default", "rock", "techno", "grunge", "indie"],
|
| 1028 |
value="default",
|
| 1029 |
info="Select a preset optimized for specific genres."
|
| 1030 |
)
|
|
@@ -1034,6 +1038,14 @@ with gr.Blocks(css=css) as demo:
|
|
| 1034 |
value=1300,
|
| 1035 |
info="Number of generation steps per chunk (1000=~20s, 1500=~30s)."
|
| 1036 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1037 |
bitrate_state = gr.State(value="96k") # Default bitrate
|
| 1038 |
sample_rate_state = gr.State(value="32000") # Default output sampling rate
|
| 1039 |
bit_depth_state = gr.State(value="16") # Default bit depth
|
|
@@ -1088,13 +1100,13 @@ with gr.Blocks(css=css) as demo:
|
|
| 1088 |
bit_depth_24_btn.click(set_bit_depth_24, inputs=None, outputs=bit_depth_state)
|
| 1089 |
gen_btn.click(
|
| 1090 |
generate_music,
|
| 1091 |
-
inputs=[instrumental_prompt, cfg_scale, top_k, top_p, temperature, total_duration, bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style, target_volume, preset, max_steps, vram_status, bitrate_state, sample_rate_state, bit_depth_state],
|
| 1092 |
outputs=[out_audio, status, vram_status]
|
| 1093 |
)
|
| 1094 |
clr_btn.click(
|
| 1095 |
clear_inputs,
|
| 1096 |
inputs=None,
|
| 1097 |
-
outputs=[instrumental_prompt, cfg_scale, top_k, top_p, temperature, total_duration, bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style, target_volume, preset, max_steps, bitrate_state, sample_rate_state, bit_depth_state]
|
| 1098 |
)
|
| 1099 |
log_btn.click(
|
| 1100 |
get_latest_log,
|
|
|
|
| 19 |
import mmap
|
| 20 |
import subprocess
|
| 21 |
import re
|
| 22 |
+
import io
|
| 23 |
|
| 24 |
# Suppress warnings for cleaner output
|
| 25 |
warnings.filterwarnings("ignore")
|
|
|
|
| 150 |
stereo_samples = stereo_samples * mask
|
| 151 |
left_nonzero = stereo_samples[:, 0][stereo_samples[:, 0] != 0]
|
| 152 |
right_nonzero = stereo_samples[:, 1][stereo_samples[:, 1] != 0]
|
| 153 |
+
left_rms = np.sqrt(np.mean(left_nonzero**2)) if len(left_nonzero) > 0 else 0
|
| 154 |
right_rms = np.sqrt(np.mean(right_nonzero**2)) if len(right_nonzero) > 0 else 0
|
| 155 |
if left_rms > 0 and right_rms > 0:
|
| 156 |
avg_rms = (left_rms + right_rms) / 2
|
| 157 |
stereo_samples[:, 0] = stereo_samples[:, 0] * (avg_rms / left_rms)
|
| 158 |
stereo_samples[:, 1] = stereo_samples[:, 1] * (avg_rms / right_rms)
|
| 159 |
balanced_samples = stereo_samples.flatten().astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
|
| 160 |
+
# Ensure sample length is even for stereo
|
| 161 |
+
if len(balanced_samples) % 2 != 0:
|
| 162 |
+
balanced_samples = balanced_samples[:-1]
|
| 163 |
balanced_segment = AudioSegment(
|
| 164 |
balanced_samples.tobytes(),
|
| 165 |
frame_rate=sample_rate,
|
|
|
|
| 210 |
limit = 10 ** (limit_db / 20.0) * (2**23 if audio_segment.sample_width == 3 else 32767)
|
| 211 |
samples = np.array(audio_segment.get_array_of_samples(), dtype=np.float32)
|
| 212 |
samples = np.clip(samples, -limit, limit).astype(np.int32 if audio_segment.sample_width == 3 else np.int16)
|
| 213 |
+
# Ensure sample length is even for stereo
|
| 214 |
+
if len(samples) % 2 != 0:
|
| 215 |
+
samples = samples[:-1]
|
| 216 |
limited_segment = AudioSegment(
|
| 217 |
samples.tobytes(),
|
| 218 |
frame_rate=sample_rate,
|
|
|
|
| 255 |
# Genre prompt functions
|
| 256 |
def set_red_hot_chili_peppers_prompt(bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style):
|
| 257 |
try:
|
| 258 |
+
rhythm = f" with {rhythmic_steps}" if rhythmic_steps != "none" else ("syncopated funk rhythms" if bpm > 120 else "groovy funk flow")
|
| 259 |
+
drum = f", {drum_beat} drums" if drum_beat != "none" else ", tight funk drums with punchy snares"
|
| 260 |
synth = f", {synthesizer} accents" if synthesizer != "none" else ""
|
| 261 |
+
bass = f", {bass_style}" if bass_style != "none" else ", prominent slap bass with funky grooves"
|
| 262 |
+
guitar = f", {guitar_style} guitar riffs" if guitar_style != "none" else ", syncopated funk guitar riffs with clean and distorted tones"
|
| 263 |
+
prompt = (
|
| 264 |
+
f"Instrumental funk rock{bass}{guitar}{drum}{synth}, Red Hot Chili Peppers-inspired vibe with high-energy slap bass, "
|
| 265 |
+
f"syncopated guitar riffs, dynamic breakdowns, and a raw, funky edge, {rhythm} at {bpm} BPM."
|
| 266 |
+
)
|
| 267 |
logger.debug(f"Generated RHCP prompt: {prompt}")
|
| 268 |
return prompt
|
| 269 |
except Exception as e:
|
|
|
|
| 478 |
"rock": {"cfg_scale": 2.0, "top_k": 110, "top_p": 0.9, "temperature": 0.9},
|
| 479 |
"techno": {"cfg_scale": 1.5, "top_k": 130, "top_p": 0.85, "temperature": 0.7},
|
| 480 |
"grunge": {"cfg_scale": 1.8, "top_k": 120, "top_p": 0.9, "temperature": 0.85},
|
| 481 |
+
"indie": {"cfg_scale": 1.9, "top_k": 115, "top_p": 0.9, "temperature": 0.8},
|
| 482 |
+
"funk_rock": {"cfg_scale": 2.2, "top_k": 150, "top_p": 0.95, "temperature": 1.0} # Enhanced for RHCP
|
| 483 |
}
|
| 484 |
|
| 485 |
# Function to get the latest log file
|
|
|
|
| 534 |
return "24"
|
| 535 |
|
| 536 |
# Optimized generation function
|
| 537 |
+
def generate_music(instrumental_prompt: str, cfg_scale: float, top_k: int, top_p: float, temperature: float, total_duration: int, bpm: int, drum_beat: str, synthesizer: str, rhythmic_steps: str, bass_style: str, guitar_style: str, target_volume: float, preset: str, max_steps: str, vram_status: str, bitrate: str, output_sample_rate: str, bit_depth: str, seed: int):
|
| 538 |
global musicgen_model
|
| 539 |
if not instrumental_prompt.strip():
|
| 540 |
logger.warning("Empty instrumental prompt provided")
|
|
|
|
| 561 |
except ValueError:
|
| 562 |
logger.error(f"Invalid bit_depth value: {bit_depth}")
|
| 563 |
return None, "❌ Invalid bit depth; must be 16 or 24", vram_status
|
| 564 |
+
# Validate seed
|
| 565 |
+
if not (0 <= seed <= 10000):
|
| 566 |
+
logger.error(f"Invalid seed value: {seed}. Must be between 0 and 10000.")
|
| 567 |
+
return None, "❌ Invalid seed value; must be between 0 and 10000", vram_status
|
| 568 |
max_duration = min(max_steps_int / 50, 30) # Convert steps to seconds, cap at 30s
|
| 569 |
total_duration = min(max(total_duration, 30), 120) # Clamp between 30s and 120s
|
| 570 |
processing_sample_rate = 16000 # Fixed for processing
|
|
|
|
| 585 |
logger.error("Insufficient disk space")
|
| 586 |
return None, "⚠️ Insufficient disk space. Free up at least 1 GB.", vram_status
|
| 587 |
|
|
|
|
|
|
|
| 588 |
logger.info(f"Generating audio for {total_duration}s with seed={seed}, max_steps={max_steps_int}, output_sample_rate={output_sample_rate_int} Hz, bit_depth={bit_depth_int}-bit")
|
| 589 |
base_prompt = instrumental_prompt
|
| 590 |
clean_memory()
|
|
|
|
| 744 |
logger.debug(f"Applying crossfade between chunks {i} and {i+1}")
|
| 745 |
prev_overlap = final_segment[-overlap_ms:]
|
| 746 |
curr_overlap = current_segment[:overlap_ms]
|
| 747 |
+
# Use torchaudio for precise crossfading
|
|
|
|
|
|
|
|
|
|
| 748 |
prev_audio, _ = torchaudio.load(io.BytesIO(prev_overlap.raw_data))
|
| 749 |
curr_audio, _ = torchaudio.load(io.BytesIO(curr_overlap.raw_data))
|
| 750 |
num_samples = min(prev_audio.shape[1], curr_audio.shape[1])
|
|
|
|
| 754 |
logger.warning(f"Skipping crossfade for chunk {i+1} due to insufficient samples")
|
| 755 |
final_segment += current_segment
|
| 756 |
continue
|
| 757 |
+
blended_samples = torch.zeros(2, num_samples, dtype=torch.float32)
|
| 758 |
+
prev_samples = prev_audio[:, :num_samples]
|
| 759 |
+
curr_samples = curr_audio[:, :num_samples]
|
| 760 |
+
hann_window = torch.hann_window(num_samples, periodic=False)
|
| 761 |
+
fade_out = hann_window.flip(0)
|
| 762 |
fade_in = hann_window
|
| 763 |
+
blended_samples = (prev_samples * fade_out + curr_samples * fade_in)
|
| 764 |
+
# Convert to appropriate dtype for bit depth
|
| 765 |
+
blended_samples = (blended_samples * (2**23 if sample_width == 3 else 32767)).to(torch.int32 if sample_width == 3 else torch.int16)
|
| 766 |
+
# Save to temporary WAV to create AudioSegment
|
| 767 |
+
temp_crossfade_path = f"temp_crossfade_{int(time.time()*1000)}.wav"
|
| 768 |
+
torchaudio.save(temp_crossfade_path, blended_samples, processing_sample_rate, bits_per_sample=bit_depth_int)
|
| 769 |
+
blended_segment = AudioSegment.from_wav(temp_crossfade_path)
|
| 770 |
+
os.remove(temp_crossfade_path)
|
| 771 |
+
blended_segment = ensure_stereo(blended_segment, processing_sample_rate, sample_width)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 772 |
blended_segment = rms_normalize(blended_segment, target_rms_db=target_volume, peak_limit_db=-3.0, sample_rate=processing_sample_rate)
|
| 773 |
final_segment = final_segment[:-overlap_ms] + blended_segment + current_segment[overlap_ms:]
|
| 774 |
else:
|
|
|
|
| 826 |
# Clear inputs function
|
| 827 |
def clear_inputs():
|
| 828 |
logger.info("Clearing input fields")
|
| 829 |
+
return "", 1.8, 120, 0.9, 0.8, 30, 120, "none", "none", "none", "none", "none", -23.0, "default", 1300, "96k", "32000", "16", 0
|
| 830 |
|
| 831 |
# Custom CSS
|
| 832 |
css = """
|
|
|
|
| 1028 |
)
|
| 1029 |
preset = gr.Dropdown(
|
| 1030 |
label="Preset Configuration 🎛️",
|
| 1031 |
+
choices=["default", "rock", "techno", "grunge", "indie", "funk_rock"],
|
| 1032 |
value="default",
|
| 1033 |
info="Select a preset optimized for specific genres."
|
| 1034 |
)
|
|
|
|
| 1038 |
value=1300,
|
| 1039 |
info="Number of generation steps per chunk (1000=~20s, 1500=~30s)."
|
| 1040 |
)
|
| 1041 |
+
seed = gr.Slider(
|
| 1042 |
+
label="Random Seed 🌱",
|
| 1043 |
+
minimum=0,
|
| 1044 |
+
maximum=10000,
|
| 1045 |
+
value=0,
|
| 1046 |
+
step=1,
|
| 1047 |
+
info="Set a seed for reproducibility (0-10000). Change for different variations."
|
| 1048 |
+
)
|
| 1049 |
bitrate_state = gr.State(value="96k") # Default bitrate
|
| 1050 |
sample_rate_state = gr.State(value="32000") # Default output sampling rate
|
| 1051 |
bit_depth_state = gr.State(value="16") # Default bit depth
|
|
|
|
| 1100 |
bit_depth_24_btn.click(set_bit_depth_24, inputs=None, outputs=bit_depth_state)
|
| 1101 |
gen_btn.click(
|
| 1102 |
generate_music,
|
| 1103 |
+
inputs=[instrumental_prompt, cfg_scale, top_k, top_p, temperature, total_duration, bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style, target_volume, preset, max_steps, vram_status, bitrate_state, sample_rate_state, bit_depth_state, seed],
|
| 1104 |
outputs=[out_audio, status, vram_status]
|
| 1105 |
)
|
| 1106 |
clr_btn.click(
|
| 1107 |
clear_inputs,
|
| 1108 |
inputs=None,
|
| 1109 |
+
outputs=[instrumental_prompt, cfg_scale, top_k, top_p, temperature, total_duration, bpm, drum_beat, synthesizer, rhythmic_steps, bass_style, guitar_style, target_volume, preset, max_steps, bitrate_state, sample_rate_state, bit_depth_state, seed]
|
| 1110 |
)
|
| 1111 |
log_btn.click(
|
| 1112 |
get_latest_log,
|