import librosa
import numpy as np
import torch
import gradio as gr
import mido
from io import BytesIO
# import pyrubberband as pyrb
import torchaudio.transforms as transforms
from model.DiffSynthSampler import DiffSynthSampler
from tools import adsr_envelope, adjust_audio_length
from webUI.natural_language_guided.track_maker import DiffSynth
from webUI.natural_language_guided.utils import encodeBatch2GradioOutput_STFT, phase_to_Gradio_image, \
    spectrogram_to_Gradio_image


def time_stretch_audio(waveform, sample_rate, stretch_factor):
    """Time-stretch a waveform with torchaudio's phase-vocoder TimeStretch.

    sample_rate is unused here; it is kept for signature compatibility with the
    pyrubberband call this function replaces.
    """
    # Convert a numpy array to a torch.Tensor if necessary
    if isinstance(waveform, np.ndarray):
        waveform = torch.from_numpy(waveform)

    # Make sure the waveform is float32
    waveform = waveform.to(torch.float32)

    # STFT parameters
    n_fft = 2048  # STFT window size
    hop_length = n_fft // 4  # hop length set to a quarter of n_fft

    # Compute the short-time Fourier transform (STFT)
    stft = torch.stft(waveform, n_fft=n_fft, hop_length=hop_length, return_complex=True)

    # Build the TimeStretch transform (n_freq = n_fft // 2 + 1 = 1025)
    time_stretch = transforms.TimeStretch(hop_length=hop_length, n_freq=1025, fixed_rate=False)
    print(stft.shape)  # debug: inspect the STFT shape

    # Apply the time stretch
    stretched_stft = time_stretch(stft, stretch_factor)

    # Convert the stretched STFT back to a time-domain waveform
    stretched_waveform = torch.istft(stretched_stft, n_fft=n_fft, hop_length=hop_length)

    # Return the processed waveform as a numpy array
    return stretched_waveform.detach().numpy()
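

# Minimal usage sketch for time_stretch_audio (illustrative only; the file path and
# sample rate are assumptions, not values used elsewhere in this module). Note that
# torchaudio's TimeStretch treats the factor as a speed rate: a factor > 1 shortens
# the audio, a factor < 1 lengthens it.
#
#   y, sr = librosa.load("example.wav", sr=44100, mono=True)
#   shortened = time_stretch_audio(y, sr, stretch_factor=1.5)  # ~2/3 of the original length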


def get_build_instrument_module(gradioWebUI, virtual_instruments_state):
    # Load configurations
    uNet = gradioWebUI.uNet
    freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
    VAE_scale = gradioWebUI.VAE_scale
    height, width, channels = int(freq_resolution / VAE_scale), int(time_resolution / VAE_scale), gradioWebUI.channels
    timesteps = gradioWebUI.timesteps
    VAE_quantizer = gradioWebUI.VAE_quantizer
    VAE_decoder = gradioWebUI.VAE_decoder
    CLAP = gradioWebUI.CLAP
    CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
    device = gradioWebUI.device
    squared = gradioWebUI.squared
    sample_rate = gradioWebUI.sample_rate
    noise_strategy = gradioWebUI.noise_strategy

    def select_sound(virtual_instrument_name, virtual_instruments_dict):
        # Look up a stored instrument and show its spectrogram, phase and audio
        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        virtual_instrument = virtual_instruments[virtual_instrument_name]
        return {source_sound_spectrogram_image: virtual_instrument["spectrogram_gradio_image"],
                source_sound_phase_image: virtual_instrument["phase_gradio_image"],
                source_sound_audio: virtual_instrument["signal"]}

    def make_track(inpaint_steps, midi, noising_strength, attack, before_release, instrument_names,
                   virtual_instruments_dict):
        if noising_strength < 1:
            print(f"Warning: making track with noising_strength = {noising_strength} < 1")
        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        sample_steps = int(inpaint_steps)
        instrument_names = instrument_names.split("@")

        # Build one rendering config per instrument named in the "@"-separated list
        instruments_configs = {}
        for virtual_instrument_name in instrument_names:
            virtual_instrument = virtual_instruments[virtual_instrument_name]

            latent_representation = torch.tensor(virtual_instrument["latent_representation"],
                                                 dtype=torch.float32).to(device)
            sampler = virtual_instrument["sampler"]
            batchsize = 1
            latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)

            mid = mido.MidiFile(file=BytesIO(midi))

            instruments_configs[virtual_instrument_name] = {
                'sample_steps': sample_steps,
                'sampler': sampler,
                'noising_strength': noising_strength,
                'latent_representation': latent_representation,
                'attack': attack,
                'before_release': before_release}

        diffSynth = DiffSynth(instruments_configs, uNet, VAE_quantizer, VAE_decoder, CLAP, CLAP_tokenizer, device)
        full_audio = diffSynth.get_music(mid, instrument_names)

        return {track_audio: (sample_rate, full_audio)}

    def test_duration_inpaint(virtual_instrument_name, inpaint_steps, duration, noising_strength,
                              end_noise_level_ratio, attack, before_release, mask_flexivity,
                              virtual_instruments_dict, use_dynamic_mask):
        # Re-synthesise the instrument at a new duration by inpainting the middle of its latent
        width = int(time_resolution * ((duration + 1) / 4) / VAE_scale)
        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        virtual_instrument = virtual_instruments[virtual_instrument_name]
        latent_representation = torch.tensor(virtual_instrument["latent_representation"],
                                             dtype=torch.float32).to(device)
        sample_steps = int(inpaint_steps)
        sampler = virtual_instrument["sampler"]
        batchsize = 1

        mySampler = DiffSynthSampler(timesteps, height=height, channels=channels, noise_strategy=noise_strategy)
        mySampler.respace(list(np.linspace(0, timesteps - 1, sample_steps, dtype=np.int32)))

        latent_representation = latent_representation.repeat(batchsize, 1, 1, 1)

        # mask = 1, freeze
        latent_mask = torch.zeros((batchsize, 1, height, width), dtype=torch.float32).to(device)
        latent_mask[:, :, :, :int(time_resolution * (attack / 4) / VAE_scale)] = 1.0
        latent_mask[:, :, :, -int(time_resolution * ((before_release + 1) / 4) / VAE_scale):] = 1.0

        text2sound_embedding = \
            CLAP.get_text_features(**CLAP_tokenizer([""], padding=True, return_tensors="pt"))[0].to(device)
        condition = text2sound_embedding.repeat(1, 1)

        latent_representations, initial_noise = \
            mySampler.inpaint_sample(model=uNet, shape=(batchsize, channels, height, width),
                                     noising_strength=noising_strength,
                                     guide_img=latent_representation, mask=latent_mask, return_tensor=True,
                                     condition=condition, sampler=sampler,
                                     use_dynamic_mask=use_dynamic_mask,
                                     end_noise_level_ratio=end_noise_level_ratio,
                                     mask_flexivity=mask_flexivity)

        latent_representations = latent_representations[-1]

        quantized_latent_representations, loss, (_, _, _) = VAE_quantizer(latent_representations)
        # Todo: remove hard-coding
        flipped_log_spectrums, flipped_phases, rec_signals, _, _, _ = \
            encodeBatch2GradioOutput_STFT(VAE_decoder, quantized_latent_representations,
                                          resolution=(512, width * VAE_scale),
                                          original_STFT_batch=None)

        return {test_duration_spectrogram_image: flipped_log_spectrums[0],
                test_duration_phase_image: flipped_phases[0],
                test_duration_audio: (sample_rate, rec_signals[0])}
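
    # Rough sketch of the latent-mask geometry used by test_duration_inpaint above (a reading
    # of the code, not an exact spec): with mask = 1 meaning "freeze", the first `attack`
    # seconds and the last `before_release + 1` seconds of the latent time axis are copied
    # from the source instrument, and only the middle region is re-synthesised, so the note
    # can be lengthened or shortened without touching its onset or its tail.
    #
    #   frozen |<-- attack -->| ... free (inpainted) ... |<-- before_release + 1 -->| frozen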

    def test_duration_envelope(virtual_instrument_name, duration, noising_strength, attack, before_release, release,
                               virtual_instruments_dict):
        # Change the duration by applying a flat ADSR envelope with the requested release time
        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        virtual_instrument = virtual_instruments[virtual_instrument_name]
        sample_rate, signal = virtual_instrument["signal"]

        applied_signal = adsr_envelope(signal=signal, sample_rate=sample_rate, duration=duration,
                                       attack_time=0.0, decay_time=0.0, sustain_level=1.0, release_time=release)

        D = librosa.stft(applied_signal, n_fft=1024, hop_length=256, win_length=1024)[1:, :]
        spc = np.abs(D)
        phase = np.angle(D)

        flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
        flipped_phase = phase_to_Gradio_image(phase)

        return {test_duration_spectrogram_image: flipped_log_spectrum,
                test_duration_phase_image: flipped_phase,
                test_duration_audio: (sample_rate, applied_signal)}

    def test_duration_stretch(virtual_instrument_name, duration, noising_strength, attack, before_release, release,
                              virtual_instruments_dict):
        # Change the duration by phase-vocoder time stretching, then pad/trim to the target length
        virtual_instruments = virtual_instruments_dict["virtual_instruments"]
        virtual_instrument = virtual_instruments[virtual_instrument_name]
        sample_rate, signal = virtual_instrument["signal"]

        s = 3 / duration
        # applied_signal = pyrb.time_stretch(signal, sample_rate, s)
        applied_signal = time_stretch_audio(signal, sample_rate, s)
        applied_signal = adjust_audio_length(applied_signal, int((duration + 1) * sample_rate), sample_rate,
                                             sample_rate)

        D = librosa.stft(applied_signal, n_fft=1024, hop_length=256, win_length=1024)[1:, :]
        spc = np.abs(D)
        phase = np.angle(D)

        flipped_log_spectrum = spectrogram_to_Gradio_image(spc)
        flipped_phase = phase_to_Gradio_image(phase)

        return {test_duration_spectrogram_image: flipped_log_spectrum,
                test_duration_phase_image: flipped_phase,
                test_duration_audio: (sample_rate, applied_signal)}

    with gr.Tab("TestInTrack"):
        gr.Markdown("Make music with generated sounds!")
        with gr.Row(variant="panel"):
            with gr.Column(scale=3):
                instrument_name_textbox = gr.Textbox(label="Instrument name", lines=1,
                                                     placeholder="Name of your instrument", scale=1)
                select_instrument_button = gr.Button(variant="primary", value="Select", scale=1)

            with gr.Column(scale=3):
                inpaint_steps_slider = gr.Slider(minimum=5.0, maximum=999.0, value=20.0, step=1.0,
                                                 label="inpaint_steps")
                noising_strength_slider = gradioWebUI.get_noising_strength_slider(default_noising_strength=1.)
                end_noise_level_ratio_slider = gr.Slider(minimum=0.0, maximum=1., value=0.0, step=0.01,
                                                         label="end_noise_level_ratio")
                attack_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01, label="attack in sec")
                before_release_slider = gr.Slider(minimum=0.0, maximum=1.5, value=0.5, step=0.01,
                                                  label="before_release in sec")
                release_slider = gr.Slider(minimum=0.0, maximum=1.0, value=0.3, step=0.01, label="release in sec")
                mask_flexivity_slider = gr.Slider(minimum=0.01, maximum=1.00, value=1., step=0.01,
                                                  label="mask_flexivity")

            with gr.Column(scale=3):
                use_dynamic_mask_checkbox = gr.Checkbox(label="Use dynamic mask", value=True)
                test_duration_envelope_button = gr.Button(variant="primary", value="Apply envelope", scale=1)
                test_duration_stretch_button = gr.Button(variant="primary", value="Apply stretch", scale=1)
                test_duration_inpaint_button = gr.Button(variant="primary", value="Inpaint different duration",
                                                         scale=1)
                duration_slider = gradioWebUI.get_duration_slider()

        with gr.Row(variant="panel"):
            with gr.Column(scale=2):
                with gr.Row(variant="panel"):
                    source_sound_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
                                                              height=600, scale=1)
                    source_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
                                                        height=600, scale=1)
                source_sound_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)

            with gr.Column(scale=3):
                with gr.Row(variant="panel"):
                    test_duration_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
                                                               height=600, scale=1)
                    test_duration_phase_image = gr.Image(label="New sound phase", type="numpy",
                                                         height=600, scale=1)
                test_duration_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)

        with gr.Row(variant="panel"):
            with gr.Column(scale=1):
                # track_spectrogram_image = gr.Image(label="New sound spectrogram", type="numpy",
                #                                    height=420, scale=1)
                midi_file = gr.File(label="Upload midi file", type="binary")
                instrument_names_textbox = gr.Textbox(label="Instrument names", lines=2,
                                                      placeholder="Names of your instrument used to play the midi",
                                                      scale=1)
                track_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
                make_track_button = gr.Button(variant="primary", value="Make track", scale=1)
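
    # Note on the wiring below: Gradio lets an event handler return a dict keyed by output
    # components instead of a positional tuple, which is the pattern all handlers above use;
    # the `outputs` lists therefore just enumerate every component that may appear as a key.
    # The handlers resolve these component references via closure, which is why the UI is
    # built before the .click calls are attached.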

    select_instrument_button.click(select_sound,
                                   inputs=[instrument_name_textbox, virtual_instruments_state],
                                   outputs=[source_sound_spectrogram_image, source_sound_phase_image,
                                            source_sound_audio])

    test_duration_envelope_button.click(test_duration_envelope,
                                        inputs=[instrument_name_textbox, duration_slider, noising_strength_slider,
                                                attack_slider, before_release_slider, release_slider,
                                                virtual_instruments_state],
                                        outputs=[test_duration_spectrogram_image, test_duration_phase_image,
                                                 test_duration_audio])

    test_duration_stretch_button.click(test_duration_stretch,
                                       inputs=[instrument_name_textbox, duration_slider, noising_strength_slider,
                                               attack_slider, before_release_slider, release_slider,
                                               virtual_instruments_state],
                                       outputs=[test_duration_spectrogram_image, test_duration_phase_image,
                                                test_duration_audio])

    test_duration_inpaint_button.click(test_duration_inpaint,
                                       inputs=[instrument_name_textbox, inpaint_steps_slider, duration_slider,
                                               noising_strength_slider, end_noise_level_ratio_slider,
                                               attack_slider, before_release_slider, mask_flexivity_slider,
                                               virtual_instruments_state, use_dynamic_mask_checkbox],
                                       outputs=[test_duration_spectrogram_image, test_duration_phase_image,
                                                test_duration_audio])

    make_track_button.click(make_track,
                            inputs=[inpaint_steps_slider, midi_file, noising_strength_slider, attack_slider,
                                    before_release_slider, instrument_names_textbox, virtual_instruments_state],
                            outputs=[track_audio])
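

# Hypothetical mounting sketch (names and values are assumptions; the real entry point lives
# in the surrounding web UI code, which builds gradioWebUI and the shared state elsewhere):
#
#   with gr.Blocks() as demo:
#       virtual_instruments_state = gr.State(value={"virtual_instruments": {}})
#       get_build_instrument_module(gradioWebUI, virtual_instruments_state)
#   demo.launch()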