"""Helpers that turn STFT data and latent tensors into Gradio-displayable images and audio."""

import librosa
import numpy as np
import torch
from PIL import Image

from tools import np_power_to_db, decode_stft, depad_STFT

# Display range used when mapping log spectra to pixel intensities.
_MIN_DB = -80.0
# Constant value written into the blue channel of spectrogram images.
_BLUE_DB = -60.0
# Constant blue level (in [0, 1]) used for phase images.
_PHASE_BLUE_LEVEL = 0.2
# Nearest-neighbour enlargement factor applied to latent images.
_LATENT_UPSCALE = 8
# Parameters passed to librosa.istft when rendering audio.
_HOP_LENGTH = 256
_WIN_LENGTH = 1024


def spectrogram_to_Gradio_image(spc):
    """Render a spectrogram as an ``(F, T, 3)`` uint8 image.

    Args:
        spc: np.ndarray whose last two axes are (frequency, time). It is
            reshaped to exactly those two axes, so any leading axes must have
            size 1.

    Returns:
        np.ndarray of dtype uint8 and shape (frequency, time, 3). The red and
        green channels hold the vertically flipped log spectrum mapped from
        [-80, 0] onto [0, 255]; the blue channel is constant.
    """
    frequency_resolution, time_resolution = spc.shape[-2], spc.shape[-1]
    spc = np.reshape(spc, (frequency_resolution, time_resolution))

    # NOTE(review): np_power_to_db receives a magnitude here, not a power --
    # confirm against tools.np_power_to_db that this is intended.
    log_spectrum = np_power_to_db(np.abs(spc))
    flipped_log_spectrum = np.flipud(log_spectrum)

    colorful_spc = np.empty((frequency_resolution, time_resolution, 3))
    colorful_spc[:, :, 0] = flipped_log_spectrum
    colorful_spc[:, :, 1] = flipped_log_spectrum
    colorful_spc[:, :, 2] = _BLUE_DB

    # Rescale [-80, 0] to [0, 255]. Clip first: casting an out-of-range float
    # to uint8 wraps around (or is undefined) instead of saturating.
    rescaled = 255.0 * (colorful_spc - _MIN_DB) / -_MIN_DB
    return np.clip(rescaled, 0.0, 255.0).astype(np.uint8)


def phase_to_Gradio_image(phase):
    """Render a normalised phase map as an ``(F, T, 3)`` uint8 image.

    Args:
        phase: np.ndarray whose last two axes are (frequency, time), with
            values expected in [-1, 1] (e.g. an angle in radians divided by
            pi). Values outside that interval are saturated.

    Returns:
        np.ndarray of dtype uint8 and shape (frequency, time, 3). The red and
        green channels hold the vertically flipped phase mapped from [-1, 1]
        onto [0, 255]; the blue channel is constant.
    """
    frequency_resolution, time_resolution = phase.shape[-2], phase.shape[-1]
    phase = np.reshape(phase, (frequency_resolution, time_resolution))

    flipped_phase = (np.flipud(phase) + 1.0) / 2.0

    colorful_spc = np.zeros((frequency_resolution, time_resolution, 3))
    colorful_spc[:, :, 0] = flipped_phase
    colorful_spc[:, :, 1] = flipped_phase
    colorful_spc[:, :, 2] = _PHASE_BLUE_LEVEL

    # Clip before the uint8 cast so out-of-range input saturates rather than
    # wrapping around.
    return (255.0 * np.clip(colorful_spc, 0.0, 1.0)).astype(np.uint8)


def _normalize_channel(channel):
    """Min-max scale one 2-D channel to [0, 255]; a constant channel maps to 0."""
    min_val = channel.min()
    span = channel.max() - min_val
    if span == 0:
        # Avoid 0/0 -> NaN, which has no defined uint8 representation.
        return np.zeros_like(channel)
    return (channel - min_val) / span * 255


def latent_representation_to_Gradio_image(latent_representation):
    """Render a channel-first latent as an enlarged uint8 image.

    Each channel is min-max normalised independently, the array is moved to
    channel-last layout, enlarged 8x along both spatial axes by repetition,
    and flipped vertically. The input is never modified.

    Args:
        latent_representation: np.ndarray or torch.Tensor of shape
            (channels, height, width).

    Returns:
        np.ndarray of dtype uint8 and shape (8 * height, 8 * width, channels).
    """
    if not isinstance(latent_representation, np.ndarray):
        latent_representation = latent_representation.to("cpu").detach().numpy()

    # Build a fresh array instead of writing into the input: Tensor.numpy()
    # shares memory with the tensor, so in-place normalisation would silently
    # overwrite the caller's latent.
    image = np.stack([_normalize_channel(channel) for channel in latent_representation], axis=0)

    image_transposed = np.transpose(image, (1, 2, 0))
    enlarged_image = np.repeat(image_transposed, _LATENT_UPSCALE, axis=0)
    enlarged_image = np.repeat(enlarged_image, _LATENT_UPSCALE, axis=1)

    return np.flipud(enlarged_image).astype(np.uint8)


def _stft_to_gradio_outputs(STFT):
    """Decode one encoded STFT into (spectrogram image, phase image, audio signal)."""
    D_rec = depad_STFT(decode_stft(STFT))

    spectrogram_image = spectrogram_to_Gradio_image(np.abs(D_rec))
    # np.angle returns radians in [-pi, pi]; phase_to_Gradio_image expects
    # values in [-1, 1], so normalise by pi before rendering.
    phase_image = phase_to_Gradio_image(np.angle(D_rec) / np.pi)
    signal = librosa.istft(D_rec, hop_length=_HOP_LENGTH, win_length=_WIN_LENGTH)

    return spectrogram_image, phase_image, signal


def InputBatch2Encode_STFT(encoder, STFT_batch, resolution=(512, 256), quantizer=None, squared=True):
    """Encode a batch of STFTs and render each input as images and audio.

    Args:
        encoder: torch module. With a quantizer it must return the latent
            batch; without one it must return a 3-tuple whose last element is
            the latent batch.
        STFT_batch: torch.Tensor holding the batch of encoded STFTs.
        resolution: currently unused; kept for interface compatibility.
            TODO: remove resolution hard-coding.
        quantizer: optional callable applied to the latent batch; the first
            element of its 3-element return value is used.
        squared: currently unused; kept for interface compatibility.

    Returns:
        Tuple ``(spectrogram_images, phase_images, signals, latent_batch,
        quantized_latent_batch)``. The first three are lists with one entry
        per batch item; ``quantized_latent_batch`` is None when no quantizer
        is given.
    """
    device = next(encoder.parameters()).device

    if quantizer is not None:
        latent_representation_batch = encoder(STFT_batch.to(device))
        quantized_latent_representation_batch, _, _ = quantizer(latent_representation_batch)
    else:
        _, _, latent_representation_batch = encoder(STFT_batch.to(device))
        quantized_latent_representation_batch = None

    STFT_batch = STFT_batch.to("cpu").detach().numpy()

    origin_flipped_log_spectrums, origin_flipped_phases, origin_signals = [], [], []
    for STFT in STFT_batch:
        flipped_log_spectrum, flipped_phase, rec_signal = _stft_to_gradio_outputs(STFT)
        origin_flipped_log_spectrums.append(flipped_log_spectrum)
        origin_flipped_phases.append(flipped_phase)
        origin_signals.append(rec_signal)

    return origin_flipped_log_spectrums, origin_flipped_phases, origin_signals, \
        latent_representation_batch, quantized_latent_representation_batch


def encodeBatch2GradioOutput_STFT(decoder, latent_vector_batch, resolution=(512, 256), original_STFT_batch=None):
    """Decode a batch of latents and render each reconstruction as images and audio.

    Args:
        decoder: torch module mapping a latent batch to a batch of encoded STFTs.
        latent_vector_batch: torch.Tensor, or np.ndarray (moved to the
            decoder's device before decoding).
        resolution: currently unused; kept for interface compatibility.
            TODO: remove resolution hard-coding.
        original_STFT_batch: optional array or tensor indexed as
            ``[item, channel, :, :]``. When given, each reconstruction is
            rendered a second time with its channel 0 replaced by channel 0 of
            the matching original (presumably the amplitude channel, going by
            the ``*_with_original_amp`` names -- confirm against
            tools.decode_stft).

    Returns:
        Tuple of six lists: spectrogram images, phase images and signals of
        the plain reconstructions, followed by the same three for the
        channel-0-replaced variants (empty when ``original_STFT_batch`` is None).
    """
    if isinstance(latent_vector_batch, np.ndarray):
        device = next(decoder.parameters()).device
        latent_vector_batch = torch.from_numpy(latent_vector_batch).to(device)

    reconstruction_batch = decoder(latent_vector_batch).to("cpu").detach().numpy()

    # A CUDA or grad-tracking tensor cannot be assigned into a NumPy array, so
    # convert once up front.
    if isinstance(original_STFT_batch, torch.Tensor):
        original_STFT_batch = original_STFT_batch.detach().cpu().numpy()

    flipped_log_spectrums, flipped_phases, rec_signals = [], [], []
    flipped_log_spectrums_with_original_amp = []
    flipped_phases_with_original_amp = []
    rec_signals_with_original_amp = []

    for index, STFT in enumerate(reconstruction_batch):
        flipped_log_spectrum, flipped_phase, rec_signal = _stft_to_gradio_outputs(STFT)
        flipped_log_spectrums.append(flipped_log_spectrum)
        flipped_phases.append(flipped_phase)
        rec_signals.append(rec_signal)

        if original_STFT_batch is not None:
            # Work on a copy so the plain reconstruction stays untouched.
            hybrid_STFT = STFT.copy()
            hybrid_STFT[0, :, :] = original_STFT_batch[index, 0, :, :]

            flipped_log_spectrum, flipped_phase, rec_signal = _stft_to_gradio_outputs(hybrid_STFT)
            flipped_log_spectrums_with_original_amp.append(flipped_log_spectrum)
            flipped_phases_with_original_amp.append(flipped_phase)
            rec_signals_with_original_amp.append(rec_signal)

    return flipped_log_spectrums, flipped_phases, rec_signals, \
        flipped_log_spectrums_with_original_amp, flipped_phases_with_original_amp, rec_signals_with_original_amp


def add_instrument(source_dict, virtual_instruments_dict, virtual_instrument_name, sample_index):
    """Register one sample from ``source_dict`` as a named virtual instrument.

    Args:
        source_dict: mapping with the keys "latent_representations",
            "quantized_latent_representations", "sampler",
            "new_sound_rec_signals_gradio",
            "new_sound_spectrogram_gradio_images" and
            "new_sound_phase_gradio_images".
        virtual_instruments_dict: mapping with a "virtual_instruments" entry;
            it is updated in place.
        virtual_instrument_name: key under which the instrument is stored. An
            existing entry with the same name is overwritten.
        sample_index: index of the sample to take from the per-sample entries.

    Returns:
        The same ``virtual_instruments_dict`` object, after the update.
    """
    virtual_instruments = virtual_instruments_dict["virtual_instruments"]
    virtual_instrument = {
        "latent_representation": source_dict["latent_representations"][sample_index],
        "quantized_latent_representation": source_dict["quantized_latent_representations"][sample_index],
        "sampler": source_dict["sampler"],
        "signal": source_dict["new_sound_rec_signals_gradio"][sample_index],
        "spectrogram_gradio_image": source_dict["new_sound_spectrogram_gradio_images"][sample_index],
        "phase_gradio_image": source_dict["new_sound_phase_gradio_images"][sample_index],
    }
    virtual_instruments[virtual_instrument_name] = virtual_instrument
    virtual_instruments_dict["virtual_instruments"] = virtual_instruments
    return virtual_instruments_dict


def resize_image_to_aspect_ratio(image_data, aspect_ratio_width, aspect_ratio_height):
    """Stretch an image to a given aspect ratio, keeping NumPy arrays in and out.

    The image is only ever enlarged: one side keeps its original length and
    the other is stretched until width / height matches the target ratio.

    Args:
        image_data (numpy array): input image, (height, width, 3) or
            (height, width).
        aspect_ratio_width (int): width part of the target ratio.
        aspect_ratio_height (int): height part of the target ratio.

    Returns:
        numpy array: the resized image data.
    """
    # Only the two leading axes are needed, so grayscale (H, W) input works too.
    original_height, original_width = image_data.shape[:2]

    current_aspect_ratio = original_width / original_height
    target_aspect_ratio = aspect_ratio_width / aspect_ratio_height

    if current_aspect_ratio > target_aspect_ratio:
        # The image is relatively too wide: keep the width, stretch the height.
        new_width = original_width
        new_height = int(new_width / target_aspect_ratio)
    else:
        # The image is relatively too tall (or already matches): keep the
        # height, stretch the width.
        new_height = original_height
        new_width = int(new_height * target_aspect_ratio)

    image = Image.fromarray(image_data.astype('uint8'))

    # LANCZOS is the replacement for the removed ANTIALIAS filter.
    resized_image = image.resize((new_width, new_height), Image.Resampling.LANCZOS)

    return np.array(resized_image)


def average_np_arrays(arr_list):
    """Return the element-wise mean of a non-empty list of equally shaped arrays.

    Raises:
        ValueError: if ``arr_list`` is empty.
    """
    if not arr_list:
        raise ValueError("Input list cannot be empty")

    stacked_arrays = np.stack(arr_list, axis=0)
    return np.mean(stacked_arrays, axis=0)