import librosa
import numpy as np
import torch
import gradio as gr
import mido
from io import BytesIO
# import pyrubberband as pyrb
import torchaudio.transforms as transforms
from model.DiffSynthSampler import DiffSynthSampler
from tools import adsr_envelope, adjust_audio_length
from webUI.natural_language_guided.track_maker import DiffSynth
from webUI.natural_language_guided.utils import encodeBatch2GradioOutput_STFT, phase_to_Gradio_image, \
    spectrogram_to_Gradio_image


def time_stretch_audio(waveform, sample_rate, stretch_factor):
    """Time-stretch a waveform with a phase vocoder (torchaudio TimeStretch)."""
    # Convert numpy arrays to torch.Tensor
    if isinstance(waveform, np.ndarray):
        waveform = torch.from_numpy(waveform)
    # Ensure the waveform is float32
    waveform = waveform.to(torch.float32)

    # STFT parameters
    n_fft = 2048  # STFT window size
    hop_length = n_fft // 4  # hop length set to a quarter of n_fft

    # Compute the short-time Fourier transform (STFT)
    stft = torch.stft(waveform, n_fft=n_fft, hop_length=hop_length, return_complex=True)

    # Create the TimeStretch transform; the rate is supplied per call below,
    # so no fixed rate is configured here
    time_stretch = transforms.TimeStretch(hop_length=hop_length, n_freq=n_fft // 2 + 1, fixed_rate=None)

    # Apply time stretching (rate > 1 shortens the audio, rate < 1 lengthens it)
    stretched_stft = time_stretch(stft, stretch_factor)

    # Convert the stretched STFT back to a time-domain waveform
    stretched_waveform = torch.istft(stretched_stft, n_fft=n_fft, hop_length=hop_length)

    # Return the processed waveform as a numpy array
    return stretched_waveform.detach().numpy()
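
# Minimal usage sketch for time_stretch_audio (hypothetical values, kept as a
# comment so nothing runs at import time). A rate below 1.0 lengthens the
# audio, a rate above 1.0 shortens it:
#
#     sr = 16000
#     t = np.linspace(0, 1, sr, endpoint=False)
#     tone = np.sin(2 * np.pi * 440 * t).astype(np.float32)
#     doubled = time_stretch_audio(tone, sr, 0.5)  # roughly twice as long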


def get_build_instrument_module(gradioWebUI, virtual_instruments_state):
    # Load configurations
    uNet = gradioWebUI.uNet
    freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
    VAE_scale = gradioWebUI.VAE_scale
    height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
    timesteps = gradioWebUI.timesteps
    VAE_quantizer = gradioWebUI.VAE_quantizer
    VAE_decoder = gradioWebUI.VAE_decoder
    CLAP = gradioWebUI.CLAP
    CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
    device = gradioWebUI.device
    squared = gradioWebUI.squared
    sample_rate = gradioWebUI.sample_rate
    noise_strategy = gradioWebUI.noise_strategy

    def select_sound(virtual_instrument_name, virtual_instruments_dict):
        # Look up a previously built instrument and surface its cached outputs
        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        virtual_instrument = virtual_instruments[virtual_instrument_name]
        return {source_sound_spectrogram_image: virtual_instrument["spectrogram_gradio_image"],
                source_sound_phase_image: virtual_instrument["phase_gradio_image"],
                source_sound_audio: virtual_instrument["signal"]}

    def make_track(inpaint_steps, midi, noising_strength, attack, before_release, instrument_names,
                   virtual_instruments_dict):
        if noising_strength < 1:
            print(f"Warning: making track with noising_strength = {noising_strength} < 1")
        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        sample_steps = int(inpaint_steps)
        # instrument_names is an "@"-separated string of instrument registry keys
        instrument_names = instrument_names.split("@")

        # Parse the uploaded MIDI bytes once, rather than once per instrument
        mid = mido.MidiFile(file=BytesIO(midi))

        instruments_configs = {}
        for virtual_instrument_name in instrument_names:
            virtual_instrument = virtual_instruments[virtual_instrument_name]
            latent_representation = torch.tensor(virtual_instrument["latent_representation"],
                                                 dtype=torch.float32).to(device)
            sampler = virtual_instrument["sampler"]
            batchsize = 1
            latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)
            instruments_configs[virtual_instrument_name] = {
                'sample_steps': sample_steps,
                'sampler': sampler,
                'noising_strength': noising_strength,
                'latent_representation': latent_representation,
                'attack': attack,
                'before_release': before_release}

        diffSynth = DiffSynth(instruments_configs, uNet, VAE_quantizer, VAE_decoder, CLAP, CLAP_tokenizer, device)
        full_audio = diffSynth.get_music(mid, instrument_names)
        return {track_audio: (sample_rate, full_audio)}
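
    # Invocation sketch (hypothetical file name and instrument names, kept as a
    # comment): each "@"-separated name must already exist in
    # virtual_instruments_dict["virtual_instruments"], and `midi` is the raw
    # bytes of an uploaded MIDI file:
    #
    #     with open("song.mid", "rb") as f:
    #         midi_bytes = f.read()
    #     make_track(20, midi_bytes, 1.0, 0.5, 0.5, "piano@bass", state)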

    def test_duration_inpaint(virtual_instrument_name, inpaint_steps, duration, noising_strength,
                              end_noise_level_ratio, attack, before_release, mask_flexivity,
                              virtual_instruments_dict, use_dynamic_mask):
        # Latent width for the requested duration (deliberately shadows the module-level width)
        width = int(time_resolution * ((duration + 1) / 4) / VAE_scale)
        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        virtual_instrument = virtual_instruments[virtual_instrument_name]
        latent_representation = torch.tensor(virtual_instrument["latent_representation"],
                                             dtype=torch.float32).to(device)
        sample_steps = int(inpaint_steps)
        sampler = virtual_instrument["sampler"]
        batchsize = 1

        mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
        mySampler.respace(list(np.linspace(0, timesteps - 1, sample_steps, dtype=np.int32)))
        latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)

        # In the latent mask, 1 = freeze (keep the guide latent), 0 = regenerate.
        # The attack region and the (before_release + 1)-second tail are frozen;
        # the middle is inpainted to fill the new duration.
        latent_mask = torch.zeros((batchsize, 1, height, width), dtype=torch.float32).to(device)
        latent_mask[:, :, :, :int(time_resolution * (attack / 4) / VAE_scale)] = 1.0
        latent_mask[:, :, :, -int(time_resolution * ((before_release + 1) / 4) / VAE_scale):] = 1.0
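
        # Worked example with illustrative numbers (time_resolution = 256,
        # VAE_scale = 4, duration = 3.0, attack = 0.5, before_release = 0.5):
        #   width = 256 * (4 / 4) / 4 = 64 latent frames,
        #   the first 256 * (0.5 / 4) / 4 = 8 frames (attack) stay frozen,
        #   the last 256 * (1.5 / 4) / 4 = 24 frames (before_release + 1) stay frozen,
        #   and the 32 frames in between are re-synthesized by inpainting.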

        # Unconditional (empty-prompt) CLAP embedding as the condition
        text2sound_embedding = \
            CLAP.get_text_features(**CLAP_tokenizer([""], padding=True, return_tensors="pt"))[0].to(device)
        condition = text2sound_embedding.repeat(1, 1)

        latent_representations, initial_noise = \
            mySampler.inpaint_sample(model=uNet, shape=(batchsize, channels, height, width),
                                     noising_strength=noising_strength,
                                     guide_img=latent_representation, mask=latent_mask, return_tensor=True,
                                     condition=condition, sampler=sampler,
                                     use_dynamic_mask=use_dynamic_mask,
                                     end_noise_level_ratio=end_noise_level_ratio,
                                     mask_flexivity=mask_flexivity)

        latent_representations = latent_representations[-1]
        quantized_latent_representations, loss, (_, _, _) = VAE_quantizer(latent_representations)
        # TODO: remove hard-coded 512-bin frequency resolution
        flipped_log_spectrums, flipped_phases, rec_signals, _, _, _ = \
            encodeBatch2GradioOutput_STFT(VAE_decoder, quantized_latent_representations,
                                          resolution=(512, width * VAE_scale),
                                          original_STFT_batch=None)
        return {test_duration_spectrogram_image: flipped_log_spectrums[0],
                test_duration_phase_image: flipped_phases[0],
                test_duration_audio: (sample_rate, rec_signals[0])}

    def test_duration_envelope(virtual_instrument_name, duration, noising_strength, attack, before_release,
                               release, virtual_instruments_dict):
        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        virtual_instrument = virtual_instruments[virtual_instrument_name]
        sample_rate, signal = virtual_instrument["signal"]
        # Attack and decay are disabled here: sustain at full level, then a release fade
        applied_signal = adsr_envelope(signal=signal, sample_rate=sample_rate, duration=duration,
                                       attack_time=0.0, decay_time=0.0, sustain_level=1.0, release_time=release)
        # Drop the DC bin, leaving 512 frequency bins
        D = librosa.stft(applied_signal, n_fft=1024, hop_length=256, win_length=1024)[1:, :]
        spc = np.abs(D)
        phase = np.angle(D)
        flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
        flipped_phase = phase_to_Gradio_image(phase)
        return {test_duration_spectrogram_image: flipped_log_spectrum,
                test_duration_phase_image: flipped_phase,
                test_duration_audio: (sample_rate, applied_signal)}
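
    # Roughly equivalent hand-rolled gate, as a sketch (assumes tools.adsr_envelope
    # follows standard ADSR semantics; with attack_time=0, decay_time=0 and
    # sustain_level=1 it reduces to "hold at full level, then fade out"):
    #
    #     sustain = np.ones(int(duration * sample_rate), dtype=np.float32)
    #     fade = np.linspace(1.0, 0.0, int(release * sample_rate), dtype=np.float32)
    #     envelope = np.concatenate([sustain, fade])
    #     n = min(len(signal), len(envelope))
    #     gated = signal[:n] * envelope[:n]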

    def test_duration_stretch(virtual_instrument_name, duration, noising_strength, attack, before_release,
                              release, virtual_instruments_dict):
        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        virtual_instrument = virtual_instruments[virtual_instrument_name]
        sample_rate, signal = virtual_instrument["signal"]
        # Stretch rate: > 1 shortens, < 1 lengthens (the constant 3 presumably
        # matches the nominal length of the source sound in seconds)
        s = 3 / duration
        # applied_signal = pyrb.time_stretch(signal, sample_rate, s)
        applied_signal = time_stretch_audio(signal, sample_rate, s)
        applied_signal = adjust_audio_length(applied_signal, int((duration + 1) * sample_rate), sample_rate,
                                             sample_rate)
        D = librosa.stft(applied_signal, n_fft=1024, hop_length=256, win_length=1024)[1:, :]
        spc = np.abs(D)
        phase = np.angle(D)
        flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
        flipped_phase = phase_to_Gradio_image(phase)
        return {test_duration_spectrogram_image: flipped_log_spectrum,
                test_duration_phase_image: flipped_phase,
                test_duration_audio: (sample_rate, applied_signal)}
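
    # Illustrative duration/rate pairs (hypothetical numbers): duration = 1.5
    # gives s = 2.0 (twice as fast, half as long); duration = 6.0 gives s = 0.5
    # (half speed, twice as long). adjust_audio_length then pads or crops the
    # result to exactly (duration + 1) seconds.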

    with gr.Tab("TestInTrack"):
        gr.Markdown("Make music with generated sounds!")
        with gr.Row(variant="panel"):
            with gr.Column(scale=3):
                instrument_name_textbox = gr.Textbox(label="Instrument name", lines=1,
                                                     placeholder="Name of your instrument", scale=1)
                select_instrument_button = gr.Button(variant="primary", value="Select", scale=1)
            with gr.Column(scale=3):
                inpaint_steps_slider = gr.Slider(minimum=5.0, maximum=999.0, value=20.0, step=1.0,
                                                 label="inpaint_steps")
                noising_strength_slider = gradioWebUI.get_noising_strength_slider(default_noising_strength=1.)
                end_noise_level_ratio_slider = gr.Slider(minimum=0.0, maximum=1., value=0.0, step=0.01,
                                                         label="end_noise_level_ratio")
                attack_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01, label="attack in sec")
                before_release_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01,
                                                  label="before_release in sec")
                release_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.01, label="release in sec")
                mask_flexivity_slider = gr.Slider(minimum=0.01, maximum=1.00, value=1., step=0.01,
                                                  label="mask_flexivity")
            with gr.Column(scale=3):
                use_dynamic_mask_checkbox = gr.Checkbox(label="Use dynamic mask", value=True)
                test_duration_envelope_button = gr.Button(variant="primary", value="Apply envelope", scale=1)
                test_duration_stretch_button = gr.Button(variant="primary", value="Apply stretch", scale=1)
                test_duration_inpaint_button = gr.Button(variant="primary", value="Inpaint different duration",
                                                         scale=1)
                duration_slider = gradioWebUI.get_duration_slider()
        with gr.Row(variant="panel"):
            with gr.Column(scale=2):
                with gr.Row(variant="panel"):
                    source_sound_spectrogram_image = gr.Image(label="Source sound spectrogram", type="numpy",
                                                              height=600, scale=1)
                    source_sound_phase_image = gr.Image(label="Source sound phase", type="numpy",
                                                        height=600, scale=1)
                source_sound_audio = gr.Audio(type="numpy", label="Play source sound", interactive=False)
            with gr.Column(scale=3):
                with gr.Row(variant="panel"):
                    test_duration_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
                                                               height=600, scale=1)
                    test_duration_phase_image = gr.Image(label="New sound phase", type="numpy",
                                                         height=600, scale=1)
                test_duration_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
        with gr.Row(variant="panel"):
            with gr.Column(scale=1):
                # track_spectrogram_image = gr.Image(label="Track spectrogram", type="numpy",
                #                                    height=420, scale=1)
                midi_file = gr.File(label="Upload MIDI file", type="binary")
                instrument_names_textbox = gr.Textbox(label="Instrument names", lines=2,
                                                      placeholder="Names of the instruments used to play the MIDI, "
                                                                  "separated by '@'", scale=1)
                track_audio = gr.Audio(type="numpy", label="Play track", interactive=False)
                make_track_button = gr.Button(variant="primary", value="Make track", scale=1)

    select_instrument_button.click(select_sound,
                                   inputs=[instrument_name_textbox, virtual_instruments_state],
                                   outputs=[source_sound_spectrogram_image,
                                            source_sound_phase_image,
                                            source_sound_audio])
    test_duration_envelope_button.click(test_duration_envelope,
                                        inputs=[instrument_name_textbox, duration_slider,
                                                noising_strength_slider,
                                                attack_slider,
                                                before_release_slider,
                                                release_slider,
                                                virtual_instruments_state],
                                        outputs=[test_duration_spectrogram_image,
                                                 test_duration_phase_image,
                                                 test_duration_audio])
    test_duration_stretch_button.click(test_duration_stretch,
                                       inputs=[instrument_name_textbox, duration_slider,
                                               noising_strength_slider,
                                               attack_slider,
                                               before_release_slider,
                                               release_slider,
                                               virtual_instruments_state],
                                       outputs=[test_duration_spectrogram_image,
                                                test_duration_phase_image,
                                                test_duration_audio])
    test_duration_inpaint_button.click(test_duration_inpaint,
                                       inputs=[instrument_name_textbox,
                                               inpaint_steps_slider,
                                               duration_slider,
                                               noising_strength_slider,
                                               end_noise_level_ratio_slider,
                                               attack_slider,
                                               before_release_slider,
                                               mask_flexivity_slider,
                                               virtual_instruments_state,
                                               use_dynamic_mask_checkbox],
                                       outputs=[test_duration_spectrogram_image,
                                                test_duration_phase_image,
                                                test_duration_audio])
    make_track_button.click(make_track,
                            inputs=[inpaint_steps_slider, midi_file,
                                    noising_strength_slider,
                                    attack_slider,
                                    before_release_slider,
                                    instrument_names_textbox,
                                    virtual_instruments_state],
                            outputs=[track_audio])
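

# Presumed top-level wiring, as a sketch (the surrounding Blocks context and
# variable names are assumptions, not confirmed by this file; the state shape
# matches the virtual_instruments_dict["virtual_instruments"] lookups above):
#
#     with gr.Blocks() as demo:
#         virtual_instruments_state = gr.State({"virtual_instruments": {}})
#         get_build_instrument_module(gradioWebUI, virtual_instruments_state)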