import librosa
import numpy as np
import torch
import gradio as gr
import mido
from io import BytesIO
# import pyrubberband as pyrb
import torchaudio.transforms as transforms
from model.DiffSynthSampler import DiffSynthSampler
from tools import adsr_envelope, adjust_audio_length
from webUI.natural_language_guided.track_maker import DiffSynth
from webUI.natural_language_guided.utils import encodeBatch2GradioOutput_STFT, phase_to_Gradio_image, \
spectrogram_to_Gradio_image
def time_stretch_audio(waveform, sample_rate, stretch_factor):
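    # Time-stretch `waveform` by `stretch_factor` using torchaudio's phase-vocoder
    # TimeStretch on the complex STFT. `sample_rate` is kept for interface
    # compatibility but is not needed by the STFT-domain stretch.
    # Returns the stretched waveform as a numpy array.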
    # Convert a numpy input to a torch.Tensor
    if isinstance(waveform, np.ndarray):
        waveform = torch.from_numpy(waveform)
    # Work in float32
    waveform = waveform.to(torch.float32)
    # STFT parameters
    n_fft = 2048  # STFT window size
    hop_length = n_fft // 4  # hop length is a quarter of the window size
    # Compute the short-time Fourier transform (STFT)
    stft = torch.stft(waveform, n_fft=n_fft, hop_length=hop_length, return_complex=True)
    # Build the TimeStretch transform; n_freq must equal n_fft // 2 + 1 (= 1025)
    time_stretch = transforms.TimeStretch(hop_length=hop_length, n_freq=n_fft // 2 + 1, fixed_rate=False)
print(stft.shape)
    # Apply the time stretch at the requested rate
    stretched_stft = time_stretch(stft, stretch_factor)
    # Convert the stretched STFT back to a time-domain waveform
    stretched_waveform = torch.istft(stretched_stft, n_fft=n_fft, hop_length=hop_length)
    # Return the result as a numpy array
    return stretched_waveform.detach().numpy()
def get_build_instrument_module(gradioWebUI, virtual_instruments_state):
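    # Builds the "TestInTrack" tab: select a saved virtual instrument, preview it at a
    # new duration (ADSR envelope, time stretch, or diffusion inpainting), and render an
    # uploaded MIDI file into a full track with the selected instruments.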
# Load configurations
uNet = gradioWebUI.uNet
freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
VAE_scale = gradioWebUI.VAE_scale
height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
timesteps = gradioWebUI.timesteps
VAE_quantizer = gradioWebUI.VAE_quantizer
VAE_decoder = gradioWebUI.VAE_decoder
CLAP = gradioWebUI.CLAP
CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
device = gradioWebUI.device
squared = gradioWebUI.squared
sample_rate = gradioWebUI.sample_rate
noise_strategy = gradioWebUI.noise_strategy
def select_sound(virtual_instrument_name, virtual_instruments_dict):
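        # Fetch a saved instrument by name and return its spectrogram image, phase image,
        # and audio signal for display.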
virtual_instruments = virtual_instruments_dict["virtual_instruments"]
virtual_instrument = virtual_instruments[virtual_instrument_name]
return {source_sound_spectrogram_image: virtual_instrument["spectrogram_gradio_image"],
source_sound_phase_image: virtual_instrument["phase_gradio_image"],
source_sound_audio: virtual_instrument["signal"]}
def make_track(inpaint_steps, midi, noising_strength, attack, before_release, instrument_names, virtual_instruments_dict):
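        # Render the uploaded MIDI file with the named instruments ("@"-separated)
        # through DiffSynth and return the resulting audio track.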
if noising_strength < 1:
print(f"Warning: making track with noising_strength = {noising_strength} < 1")
virtual_instruments = virtual_instruments_dict["virtual_instruments"]
sample_steps = int(inpaint_steps)
        instrument_names = instrument_names.split("@")
        # Parse the uploaded MIDI bytes once; the same file is rendered with every instrument.
        mid = mido.MidiFile(file=BytesIO(midi))
        instruments_configs = {}
        for virtual_instrument_name in instrument_names:
            virtual_instrument = virtual_instruments[virtual_instrument_name]
            latent_representation = torch.tensor(virtual_instrument["latent_representation"], dtype=torch.float32).to(device)
            sampler = virtual_instrument["sampler"]
            batchsize = 1
            latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)
            instruments_configs[virtual_instrument_name] = {
'sample_steps': sample_steps,
'sampler': sampler,
'noising_strength': noising_strength,
'latent_representation': latent_representation,
'attack': attack,
'before_release': before_release}
diffSynth = DiffSynth(instruments_configs, uNet, VAE_quantizer, VAE_decoder, CLAP, CLAP_tokenizer, device)
full_audio = diffSynth.get_music(mid, instrument_names)
return {track_audio: (sample_rate, full_audio)}
def test_duration_inpaint(virtual_instrument_name, inpaint_steps, duration, noising_strength, end_noise_level_ratio, attack, before_release, mask_flexivity, virtual_instruments_dict, use_dynamic_mask):
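        # Re-generate the instrument at a new duration by inpainting its latent spectrogram:
        # the attack and pre-release regions are frozen by the latent mask, the rest is
        # re-sampled by the diffusion model and then decoded through the VAE.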
width = int(time_resolution * ((duration + 1) / 4) / VAE_scale)
virtual_instruments = virtual_instruments_dict["virtual_instruments"]
virtual_instrument = virtual_instruments[virtual_instrument_name]
latent_representation = torch.tensor(virtual_instrument["latent_representation"], dtype=torch.float32).to(device)
sample_steps = int(inpaint_steps)
sampler = virtual_instrument["sampler"]
batchsize = 1
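        # Diffusion sampler with the schedule respaced to `sample_steps` evenly spaced timesteps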
mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
mySampler.respace(list(np.linspace(0, timesteps - 1, sample_steps, dtype=np.int32)))
latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)
        # In latent_mask, 1 marks regions to freeze (kept from the guide); 0 marks regions to inpaint
latent_mask = torch.zeros((batchsize, 1, height, width), dtype=torch.float32).to(device)
latent_mask[:, :, :, :int(time_resolution * (attack / 4) / VAE_scale)] = 1.0
latent_mask[:, :, :, -int(time_resolution * ((before_release+1) / 4) / VAE_scale):] = 1.0
        text2sound_embedding = CLAP.get_text_features(
            **CLAP_tokenizer([""], padding=True, return_tensors="pt"))[0].to(device)
condition = text2sound_embedding.repeat(1, 1)
        latent_representations, initial_noise = mySampler.inpaint_sample(
            model=uNet, shape=(batchsize, channels, height, width),
            noising_strength=noising_strength,
            guide_img=latent_representation, mask=latent_mask, return_tensor=True,
            condition=condition, sampler=sampler,
            use_dynamic_mask=use_dynamic_mask,
            end_noise_level_ratio=end_noise_level_ratio,
            mask_flexivity=mask_flexivity)
latent_representations = latent_representations[-1]
quantized_latent_representations, loss, (_, _, _) = VAE_quantizer(latent_representations)
        # TODO: remove the hard-coded frequency resolution (512)
        flipped_log_spectrums, flipped_phases, rec_signals, _, _, _ = encodeBatch2GradioOutput_STFT(
            VAE_decoder, quantized_latent_representations,
            resolution=(512, width * VAE_scale),
            original_STFT_batch=None)
return {test_duration_spectrogram_image: flipped_log_spectrums[0],
test_duration_phase_image: flipped_phases[0],
test_duration_audio: (sample_rate, rec_signals[0])}
def test_duration_envelope(virtual_instrument_name, duration, noising_strength, attack, before_release, release, virtual_instruments_dict):
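        # Preview a new duration by applying a release-only ADSR envelope to the stored
        # signal and recomputing its spectrogram and phase images.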
virtual_instruments = virtual_instruments_dict["virtual_instruments"]
virtual_instrument = virtual_instruments[virtual_instrument_name]
sample_rate, signal = virtual_instrument["signal"]
applied_signal = adsr_envelope(signal=signal, sample_rate=sample_rate, duration=duration,
attack_time=0.0, decay_time=0.0, sustain_level=1.0, release_time=release)
D = librosa.stft(applied_signal, n_fft=1024, hop_length=256, win_length=1024)[1:, :]
spc = np.abs(D)
phase = np.angle(D)
flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
flipped_phase = phase_to_Gradio_image(phase)
return {test_duration_spectrogram_image: flipped_log_spectrum,
test_duration_phase_image: flipped_phase,
test_duration_audio: (sample_rate, applied_signal)}
def test_duration_stretch(virtual_instrument_name, duration, noising_strength, attack, before_release, release, virtual_instruments_dict):
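        # Preview a new duration by time-stretching the stored signal with the STFT-based
        # phase vocoder above, then adjusting it to the target length.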
virtual_instruments = virtual_instruments_dict["virtual_instruments"]
virtual_instrument = virtual_instruments[virtual_instrument_name]
sample_rate, signal = virtual_instrument["signal"]
s = 3 / duration
# applied_signal = pyrb.time_stretch(signal, sample_rate, s)
applied_signal = time_stretch_audio(signal, sample_rate, s)
applied_signal = adjust_audio_length(applied_signal, int((duration+1) * sample_rate), sample_rate, sample_rate)
D = librosa.stft(applied_signal, n_fft=1024, hop_length=256, win_length=1024)[1:, :]
spc = np.abs(D)
phase = np.angle(D)
flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
flipped_phase = phase_to_Gradio_image(phase)
return {test_duration_spectrogram_image: flipped_log_spectrum,
test_duration_phase_image: flipped_phase,
test_duration_audio: (sample_rate, applied_signal)}
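    # --- Gradio UI layout for the "TestInTrack" tab ---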
with gr.Tab("TestInTrack"):
gr.Markdown("Make music with generated sounds!")
with gr.Row(variant="panel"):
with gr.Column(scale=3):
instrument_name_textbox = gr.Textbox(label="Instrument name", lines=1,
placeholder="Name of your instrument", scale=1)
select_instrument_button = gr.Button(variant="primary", value="Select", scale=1)
with gr.Column(scale=3):
inpaint_steps_slider = gr.Slider(minimum=5.0, maximum=999.0, value=20.0, step=1.0, label="inpaint_steps")
noising_strength_slider = gradioWebUI.get_noising_strength_slider(default_noising_strength=1.)
end_noise_level_ratio_slider = gr.Slider(minimum=0.0, maximum=1., value=0.0, step=0.01, label="end_noise_level_ratio")
attack_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01, label="attack in sec")
before_release_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01, label="before_release in sec")
release_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.01, label="release in sec")
mask_flexivity_slider = gr.Slider(minimum=0.01, maximum=1.00, value=1., step=0.01, label="mask_flexivity")
with gr.Column(scale=3):
use_dynamic_mask_checkbox = gr.Checkbox(label="Use dynamic mask", value=True)
test_duration_envelope_button = gr.Button(variant="primary", value="Apply envelope", scale=1)
test_duration_stretch_button = gr.Button(variant="primary", value="Apply stretch", scale=1)
test_duration_inpaint_button = gr.Button(variant="primary", value="Inpaint different duration", scale=1)
duration_slider = gradioWebUI.get_duration_slider()
with gr.Row(variant="panel"):
with gr.Column(scale=2):
with gr.Row(variant="panel"):
source_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
height=600, scale=1)
source_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
height=600, scale=1)
source_sound_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
with gr.Column(scale=3):
with gr.Row(variant="panel"):
                    test_duration_spectrogram_image = gr.Image(label="Test duration spectrogram", type="numpy",
                                                               height=600, scale=1)
                    test_duration_phase_image = gr.Image(label="Test duration phase", type="numpy",
                                                         height=600, scale=1)
test_duration_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
with gr.Row(variant="panel"):
with gr.Column(scale=1):
# track_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
# height=420, scale=1)
                midi_file = gr.File(label="Upload MIDI file", type="binary")
                instrument_names_textbox = gr.Textbox(label="Instrument names", lines=2,
                                                      placeholder="Names of the instruments used to play the MIDI, separated by '@'",
                                                      scale=1)
                track_audio = gr.Audio(type="numpy", label="Play track", interactive=False)
make_track_button = gr.Button(variant="primary", value="Make track", scale=1)
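    # Wire the buttons to their callbacks.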
select_instrument_button.click(select_sound,
inputs=[instrument_name_textbox, virtual_instruments_state],
outputs=[source_sound_spectrogram_image,
source_sound_phase_image,
source_sound_audio])
test_duration_envelope_button.click(test_duration_envelope,
inputs=[instrument_name_textbox, duration_slider,
noising_strength_slider,
attack_slider,
before_release_slider,
release_slider,
virtual_instruments_state,
],
outputs=[test_duration_spectrogram_image,
test_duration_phase_image,
test_duration_audio])
test_duration_stretch_button.click(test_duration_stretch,
inputs=[instrument_name_textbox, duration_slider,
noising_strength_slider,
attack_slider,
before_release_slider,
release_slider,
virtual_instruments_state,
],
outputs=[test_duration_spectrogram_image,
test_duration_phase_image,
test_duration_audio])
test_duration_inpaint_button.click(test_duration_inpaint,
inputs=[instrument_name_textbox,
inpaint_steps_slider,
duration_slider,
noising_strength_slider,
end_noise_level_ratio_slider,
attack_slider,
before_release_slider,
mask_flexivity_slider,
virtual_instruments_state,
use_dynamic_mask_checkbox],
outputs=[test_duration_spectrogram_image,
test_duration_phase_image,
test_duration_audio])
make_track_button.click(make_track,
inputs=[inpaint_steps_slider, midi_file,
noising_strength_slider,
attack_slider,
before_release_slider,
instrument_names_textbox,
virtual_instruments_state],
outputs=[track_audio])