Spaces:

WeixuanYuan
/

DiffuSynthV0.2

Running

App Files Files Community

DiffuSynthV0.2 / webUI /natural_language_guided_4 /super_resolution_with_text.py

WeixuanYuan

Upload 70 files

bd6e54b verified 2 months ago

raw

history blame contribute delete

28.5 kB

	import librosa
	import numpy as np
	import torch
	import gradio as gr
	from scipy.ndimage import zoom

	from model.DiffSynthSampler import DiffSynthSampler
	from tools import adjust_audio_length, rescale, safe_int, pad_STFT, encode_stft
	from webUI.natural_language_guided_STFT.utils import latent_representation_to_Gradio_image
	from webUI.natural_language_guided_STFT.utils import InputBatch2Encode_STFT, encodeBatch2GradioOutput_STFT


	def get_super_resolution_with_text_module(gradioWebUI, inpaintWithText_state):
	    """Build the "Super Resolution" tab and register its Gradio callbacks.

	    The function creates Gradio components with ``with gr.Tab(...)`` and wires
	    event handlers, so it is expected to be called while a ``gr.Blocks``
	    context is being built.

	    Args:
	        gradioWebUI: Project object that supplies the models (``uNet``,
	            ``VAE_encoder``, ``VAE_quantizer``, ``VAE_decoder``, ``CLAP``,
	            ``CLAP_tokenizer``), the configuration values read below, and the
	            ``get_*_slider`` / ``get_*_radio`` / ``get_seed_textbox`` widget factories.
	        inpaintWithText_state: Gradio component used as both input and output of
	            the callbacks; its value is treated as a dict that caches encoded
	            origin latents and the generated samples.

	    Returns:
	        None. All effects are the created components and registered events.
	    """
	    # Load configurations
	    uNet = gradioWebUI.uNet
	    freq_resolution, time_resolution = gradioWebUI.freq_resolution, gradioWebUI.time_resolution
	    VAE_scale = gradioWebUI.VAE_scale
	    height, channels = int(freq_resolution / VAE_scale), gradioWebUI.channels
	    timesteps = gradioWebUI.timesteps
	    VAE_encoder = gradioWebUI.VAE_encoder
	    VAE_quantizer = gradioWebUI.VAE_quantizer
	    VAE_decoder = gradioWebUI.VAE_decoder
	    CLAP = gradioWebUI.CLAP
	    CLAP_tokenizer = gradioWebUI.CLAP_tokenizer
	    device = gradioWebUI.device
	    squared = gradioWebUI.squared
	    sample_rate = gradioWebUI.sample_rate
	    noise_strategy = gradioWebUI.noise_strategy

	    def receive_upload_origin_audio(sound2sound_duration, sound2sound_origin_source, sound2sound_origin_upload,
	                                    sound2sound_origin_microphone, inpaintWithText_dict):
	        """Encode the selected origin audio and refresh the matching preview images.

	        Args:
	            sound2sound_duration: Value of the duration slider; determines the latent width.
	            sound2sound_origin_source: ``"upload"`` selects the uploaded audio; any
	                other value selects the microphone recording.
	            sound2sound_origin_upload: ``(sample_rate, samples)`` from the upload widget, or None.
	            sound2sound_origin_microphone: ``(sample_rate, samples)`` from the microphone widget, or None.
	            inpaintWithText_dict: State dict; the encoded latents and their image
	                renderings are stored in it under source-specific keys.

	        Returns:
	            A dict mapping output components to their new values. Only the
	            components of the active source receive data; the others get a
	            no-op ``gr.update()``.
	        """
	        from_upload = sound2sound_origin_source == "upload"
	        recording = sound2sound_origin_upload if from_upload else sound2sound_origin_microphone

	        # Gradio hands over None when the widget holds no audio (e.g. after it is cleared):
	        # there is nothing to encode, so leave every component untouched.
	        if recording is None:
	            return {inpaintWithText_state: inpaintWithText_dict}
	        origin_sr, origin_audio = recording
	        if np.size(origin_audio) == 0:
	            return {inpaintWithText_state: inpaintWithText_dict}

	        # Peak-normalise; skip the division for all-zero input to avoid NaNs.
	        peak = np.max(np.abs(origin_audio))
	        if peak > 0:
	            origin_audio = origin_audio / peak
	        else:
	            origin_audio = origin_audio.astype(np.float32)

	        latent_width = int(time_resolution * ((sound2sound_duration + 1) / 4) / VAE_scale)
	        audio_length = 256 * (VAE_scale * latent_width - 1)
	        origin_audio = adjust_audio_length(origin_audio, audio_length, origin_sr, sample_rate)

	        D = librosa.stft(origin_audio, n_fft=1024, hop_length=256, win_length=1024)
	        encoded_D = encode_stft(pad_STFT(D))

	        # Todo: justify batchsize to 1
	        origin_spectrogram_batch_tensor = torch.from_numpy(
	            np.repeat(encoded_D[np.newaxis, :, :, :], 1, axis=0)).float().to(device)

	        # Todo: remove hard-coding
	        (origin_flipped_log_spectrums, origin_flipped_phases, _origin_signals,
	         origin_latent_representations, quantized_origin_latent_representations) = InputBatch2Encode_STFT(
	            VAE_encoder, origin_spectrogram_batch_tensor, resolution=(512, latent_width * VAE_scale),
	            quantizer=VAE_quantizer, squared=squared)

	        latent_image = latent_representation_to_Gradio_image(origin_latent_representations[0])
	        quantized_latent_image = latent_representation_to_Gradio_image(
	            quantized_origin_latent_representations[0])

	        if from_upload:
	            inpaintWithText_dict["origin_upload_latent_representations"] = origin_latent_representations.tolist()
	            inpaintWithText_dict[
	                "sound2sound_origin_upload_latent_representation_image"] = latent_image.tolist()
	            inpaintWithText_dict[
	                "sound2sound_origin_upload_quantized_latent_representation_image"] = quantized_latent_image.tolist()
	            return {
	                sound2sound_origin_spectrogram_upload_image: origin_flipped_log_spectrums[0],
	                sound2sound_origin_phase_upload_image: origin_flipped_phases[0],
	                sound2sound_origin_spectrogram_microphone_image: gr.update(),
	                sound2sound_origin_phase_microphone_image: gr.update(),
	                sound2sound_origin_upload_latent_representation_image: latent_image,
	                sound2sound_origin_upload_quantized_latent_representation_image: quantized_latent_image,
	                sound2sound_origin_microphone_latent_representation_image: gr.update(),
	                sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(),
	                inpaintWithText_state: inpaintWithText_dict,
	            }

	        inpaintWithText_dict["origin_microphone_latent_representations"] = origin_latent_representations.tolist()
	        inpaintWithText_dict[
	            "sound2sound_origin_microphone_latent_representation_image"] = latent_image.tolist()
	        inpaintWithText_dict[
	            "sound2sound_origin_microphone_quantized_latent_representation_image"] = quantized_latent_image.tolist()
	        # Microphone input must refresh the microphone components; the upload
	        # components are hidden while this source is selected.
	        return {
	            sound2sound_origin_spectrogram_upload_image: gr.update(),
	            sound2sound_origin_phase_upload_image: gr.update(),
	            sound2sound_origin_spectrogram_microphone_image: origin_flipped_log_spectrums[0],
	            sound2sound_origin_phase_microphone_image: origin_flipped_phases[0],
	            sound2sound_origin_upload_latent_representation_image: gr.update(),
	            sound2sound_origin_upload_quantized_latent_representation_image: gr.update(),
	            sound2sound_origin_microphone_latent_representation_image: latent_image,
	            sound2sound_origin_microphone_quantized_latent_representation_image: quantized_latent_image,
	            inpaintWithText_state: inpaintWithText_dict,
	        }

	    def sound2sound_sample(sound2sound_origin_spectrogram_upload, sound2sound_origin_spectrogram_microphone,
	                           text2sound_prompts, text2sound_negative_prompts, sound2sound_batchsize,
	                           sound2sound_guidance_scale, sound2sound_sampler,
	                           sound2sound_sample_steps, sound2sound_origin_source,
	                           sound2sound_noising_strength, sound2sound_seed, sound2sound_inpaint_area,
	                           inpaintWithText_dict):
	        """Run text-guided inpainting on a double-height latent seeded with the origin sound.

	        The stored origin latent is copied into the first rows of a zero tensor
	        whose height is ``height * 2``; the sampler then fills the rest under
	        the positive/negative prompt guidance.

	        Args:
	            sound2sound_origin_spectrogram_upload: Unused; kept because the click event passes it.
	            sound2sound_origin_spectrogram_microphone: Unused; kept because the click event passes it.
	            text2sound_prompts: Positive prompt text.
	            text2sound_negative_prompts: Negative prompt text used as the unconditional condition.
	            sound2sound_batchsize: Number of samples to generate.
	            sound2sound_guidance_scale: Classifier-free guidance scale.
	            sound2sound_sampler: Sampler name forwarded to ``inpaint_sample``.
	            sound2sound_sample_steps: Requested number of sampling steps.
	            sound2sound_origin_source: ``"upload"`` or ``"microphone"``.
	            sound2sound_noising_strength: Noising strength; must be greater than 0.
	            sound2sound_seed: Seed text; falls back to 12345678 via ``safe_int``.
	            sound2sound_inpaint_area: ``"inpaint masked"`` inverts the latent mask.
	            inpaintWithText_dict: State dict holding the encoded origin latents;
	                the generated samples are stored in it.

	        Returns:
	            A dict mapping output components to the first generated sample, the
	            updated sample-index slider, the seed used and the state dict.

	        Raises:
	            NotImplementedError: If the source is neither "upload" nor "microphone".
	            gr.Error: If no origin sound has been encoded yet, or the noising
	                strength is not positive.
	        """
	        # input preprocessing
	        sound2sound_seed = safe_int(sound2sound_seed, 12345678)
	        sound2sound_batchsize = int(sound2sound_batchsize)
	        noising_strength = sound2sound_noising_strength
	        sound2sound_sample_steps = int(sound2sound_sample_steps)
	        # NOTE(review): int() truncates fractional guidance scales - confirm this is intended.
	        CFG = int(sound2sound_guidance_scale)

	        if sound2sound_origin_source not in ("upload", "microphone"):
	            print("Input source not in ['upload', 'microphone']!")
	            raise NotImplementedError()

	        # The step count is divided by the noising strength further down.
	        if not noising_strength > 0:
	            raise gr.Error("Noising strength must be greater than 0.")

	        stored_latents = inpaintWithText_dict.get(f"origin_{sound2sound_origin_source}_latent_representations")
	        if stored_latents is None:
	            raise gr.Error("Please upload or record an origin sound before generating.")

	        text2sound_embedding = CLAP.get_text_features(
	            **CLAP_tokenizer([text2sound_prompts], padding=True, return_tensors="pt"))[0].to(device)

	        origin_latent_representations = torch.tensor(stored_latents).repeat(
	            sound2sound_batchsize, 1, 1, 1).to(device)

	        # Sizes follow the stored latent and the sampler height instead of fixed
	        # literals; for the default setup these are 128 / 64 / 256 / 192.
	        origin_height, latent_width = origin_latent_representations.shape[-2:]
	        high_resolution_height = height * 2
	        high_resolution_latent_representations = torch.zeros(
	            (sound2sound_batchsize, channels, high_resolution_height, latent_width)).to(device)
	        high_resolution_latent_representations[:, :, :origin_height, :] = origin_latent_representations

	        latent_mask = np.ones((high_resolution_height, latent_width))
	        # Last quarter of the rows is zeroed (192 of 256 in the default setup).
	        latent_mask[high_resolution_height * 3 // 4:, :] = 0.0
	        print(f"latent_mask mean: {np.mean(latent_mask)}")

	        if sound2sound_inpaint_area == "inpaint masked":
	            latent_mask = 1 - latent_mask
	        latent_mask = torch.from_numpy(latent_mask).unsqueeze(0).unsqueeze(1).repeat(
	            sound2sound_batchsize, channels, 1, 1).float().to(device)
	        latent_mask = torch.flip(latent_mask, [2])

	        mySampler = DiffSynthSampler(timesteps, height=high_resolution_height, channels=channels,
	                                     noise_strategy=noise_strategy)
	        unconditional_condition = CLAP.get_text_features(
	            **CLAP_tokenizer([text2sound_negative_prompts], padding=True, return_tensors="pt"))[0]
	        mySampler.activate_classifier_free_guidance(CFG, unconditional_condition.to(device))

	        normalized_sample_steps = int(sound2sound_sample_steps / noising_strength)
	        mySampler.respace(list(np.linspace(0, timesteps - 1, normalized_sample_steps, dtype=np.int32)))

	        condition = text2sound_embedding.repeat(sound2sound_batchsize, 1)

	        new_sound_latent_representations, initial_noise = mySampler.inpaint_sample(
	            model=uNet, shape=(sound2sound_batchsize, channels, high_resolution_height, latent_width),
	            seed=sound2sound_seed,
	            noising_strength=noising_strength,
	            guide_img=high_resolution_latent_representations, mask=latent_mask, return_tensor=True,
	            condition=condition, sampler=sound2sound_sampler)

	        # Only the last entry of the returned sequence is decoded.
	        new_sound_latent_representations = new_sound_latent_representations[-1]

	        # Quantize new sound latent representations
	        quantized_new_sound_latent_representations, _loss, (_, _, _) = VAE_quantizer(
	            new_sound_latent_representations)
	        # Todo: remove hard-coding
	        new_sound_flipped_log_spectrums, new_sound_flipped_phases, new_sound_signals, _, _, _ = \
	            encodeBatch2GradioOutput_STFT(VAE_decoder,
	                                          quantized_new_sound_latent_representations,
	                                          resolution=(1024, latent_width * VAE_scale),
	                                          original_STFT_batch=None)

	        sample_indices = range(sound2sound_batchsize)
	        new_sound_latent_representation_gradio_images = [
	            latent_representation_to_Gradio_image(new_sound_latent_representations[i]) for i in sample_indices]
	        new_sound_quantized_latent_representation_gradio_images = [
	            latent_representation_to_Gradio_image(quantized_new_sound_latent_representations[i])
	            for i in sample_indices]
	        new_sound_spectrogram_gradio_images = [new_sound_flipped_log_spectrums[i] for i in sample_indices]
	        new_sound_phase_gradio_images = [new_sound_flipped_phases[i] for i in sample_indices]
	        new_sound_rec_signals_gradio = [(sample_rate, new_sound_signals[i]) for i in sample_indices]

	        # Cache every sample so the index slider can switch between them.
	        inpaintWithText_dict[
	            "new_sound_latent_representation_gradio_images"] = new_sound_latent_representation_gradio_images
	        inpaintWithText_dict[
	            "new_sound_quantized_latent_representation_gradio_images"] = new_sound_quantized_latent_representation_gradio_images
	        inpaintWithText_dict["new_sound_spectrogram_gradio_images"] = new_sound_spectrogram_gradio_images
	        inpaintWithText_dict["new_sound_phase_gradio_images"] = new_sound_phase_gradio_images
	        inpaintWithText_dict["new_sound_rec_signals_gradio"] = new_sound_rec_signals_gradio

	        return {
	            sound2sound_new_sound_latent_representation_image: new_sound_latent_representation_gradio_images[0],
	            sound2sound_new_sound_quantized_latent_representation_image:
	                new_sound_quantized_latent_representation_gradio_images[0],
	            sound2sound_new_sound_spectrogram_image: new_sound_flipped_log_spectrums[0],
	            sound2sound_new_sound_phase_image: new_sound_flipped_phases[0],
	            sound2sound_new_sound_audio: (sample_rate, new_sound_signals[0]),
	            sound2sound_sample_index_slider: gr.update(minimum=0, maximum=sound2sound_batchsize - 1, value=0,
	                                                       step=1.0,
	                                                       visible=True,
	                                                       label="Sample index",
	                                                       info="Swipe to view other samples"),
	            sound2sound_seed_textbox: sound2sound_seed,
	            inpaintWithText_state: inpaintWithText_dict,
	        }

	    def show_sound2sound_sample(sound2sound_sample_index, inpaintWithText_dict):
	        """Show the cached sample selected by the index slider.

	        Args:
	            sound2sound_sample_index: Slider value; converted to int.
	            inpaintWithText_dict: State dict filled by ``sound2sound_sample``.

	        Returns:
	            A dict mapping the five "new sound" components to the cached values
	            at that index.
	        """
	        sample_index = int(sound2sound_sample_index)
	        return {
	            sound2sound_new_sound_latent_representation_image:
	                inpaintWithText_dict["new_sound_latent_representation_gradio_images"][sample_index],
	            sound2sound_new_sound_quantized_latent_representation_image:
	                inpaintWithText_dict["new_sound_quantized_latent_representation_gradio_images"][sample_index],
	            sound2sound_new_sound_spectrogram_image:
	                inpaintWithText_dict["new_sound_spectrogram_gradio_images"][sample_index],
	            sound2sound_new_sound_phase_image:
	                inpaintWithText_dict["new_sound_phase_gradio_images"][sample_index],
	            sound2sound_new_sound_audio:
	                inpaintWithText_dict["new_sound_rec_signals_gradio"][sample_index],
	        }

	    def sound2sound_switch_origin_source(sound2sound_origin_source):
	        """Toggle visibility between the upload and microphone component sets.

	        Args:
	            sound2sound_origin_source: ``"upload"`` or ``"microphone"``.

	        Returns:
	            A dict of visibility updates, or None (after printing a message)
	            for any other value.
	        """
	        if sound2sound_origin_source not in ("upload", "microphone"):
	            print("Input source not in ['upload', 'microphone']!")
	            return None

	        show_upload = sound2sound_origin_source == "upload"
	        show_microphone = not show_upload
	        return {
	            sound2sound_origin_upload_audio: gr.update(visible=show_upload),
	            sound2sound_origin_microphone_audio: gr.update(visible=show_microphone),
	            sound2sound_origin_spectrogram_upload_image: gr.update(visible=show_upload),
	            sound2sound_origin_phase_upload_image: gr.update(visible=show_upload),
	            sound2sound_origin_spectrogram_microphone_image: gr.update(visible=show_microphone),
	            sound2sound_origin_phase_microphone_image: gr.update(visible=show_microphone),
	            sound2sound_origin_upload_latent_representation_image: gr.update(visible=show_upload),
	            sound2sound_origin_upload_quantized_latent_representation_image: gr.update(visible=show_upload),
	            sound2sound_origin_microphone_latent_representation_image: gr.update(visible=show_microphone),
	            sound2sound_origin_microphone_quantized_latent_representation_image: gr.update(visible=show_microphone),
	        }

	    # NOTE(review): the container nesting below was reconstructed from a
	    # whitespace-flattened copy of this file - confirm it matches the intended layout.
	    with gr.Tab("Super Resolution"):
	        gr.Markdown("Select the area to inpaint and use the prompt to guide the synthesis of a new sound!")
	        with gr.Row(variant="panel"):
	            with gr.Column(scale=3):
	                text2sound_prompts_textbox = gr.Textbox(label="Positive prompt", lines=2, value="organ")
	                text2sound_negative_prompts_textbox = gr.Textbox(label="Negative prompt", lines=2, value="")

	            with gr.Column(scale=1):
	                sound2sound_sample_button = gr.Button(variant="primary", value="Generate", scale=1)

	                sound2sound_sample_index_slider = gr.Slider(minimum=0, maximum=3, value=0, step=1.0,
	                                                            visible=False,
	                                                            label="Sample index",
	                                                            info="Swipe to view other samples")

	        with gr.Row(variant="panel"):
	            with gr.Column(scale=1):
	                with gr.Tab("Origin sound"):
	                    sound2sound_duration_slider = gradioWebUI.get_duration_slider()
	                    sound2sound_origin_source_radio = gr.Radio(choices=["upload", "microphone"],
	                                                               value="upload",
	                                                               label="Input source")

	                    sound2sound_origin_upload_audio = gr.Audio(type="numpy", label="Upload", source="upload",
	                                                               interactive=True, visible=True)
	                    sound2sound_origin_microphone_audio = gr.Audio(type="numpy", label="Record",
	                                                                   source="microphone",
	                                                                   interactive=True, visible=False)
	                    with gr.Row(variant="panel"):
	                        sound2sound_origin_spectrogram_upload_image = gr.Image(
	                            label="Original upload spectrogram",
	                            type="numpy", height=600,
	                            visible=True, tool="sketch")
	                        sound2sound_origin_phase_upload_image = gr.Image(
	                            label="Original upload phase",
	                            type="numpy", height=600,
	                            visible=True)
	                        sound2sound_origin_spectrogram_microphone_image = gr.Image(
	                            label="Original microphone spectrogram",
	                            type="numpy", height=600,
	                            visible=False, tool="sketch")
	                        sound2sound_origin_phase_microphone_image = gr.Image(
	                            label="Original microphone phase",
	                            type="numpy", height=600,
	                            visible=False)
	                    sound2sound_inpaint_area_radio = gr.Radio(choices=["inpaint masked", "inpaint not masked"],
	                                                              value="inpaint masked")

	                with gr.Tab("Sound2sound settings"):
	                    sound2sound_sample_steps_slider = gradioWebUI.get_sample_steps_slider()
	                    sound2sound_sampler_radio = gradioWebUI.get_sampler_radio()
	                    sound2sound_batchsize_slider = gradioWebUI.get_batchsize_slider()
	                    sound2sound_noising_strength_slider = gradioWebUI.get_noising_strength_slider(
	                        default_noising_strength=1.0)
	                    sound2sound_guidance_scale_slider = gradioWebUI.get_guidance_scale_slider()
	                    sound2sound_seed_textbox = gradioWebUI.get_seed_textbox()

	            with gr.Column(scale=1):
	                sound2sound_new_sound_audio = gr.Audio(type="numpy", label="Play new sound", interactive=False)
	                with gr.Row(variant="panel"):
	                    sound2sound_new_sound_spectrogram_image = gr.Image(label="New sound spectrogram",
	                                                                       type="numpy",
	                                                                       height=1200, scale=1)
	                    sound2sound_new_sound_phase_image = gr.Image(label="New sound phase", type="numpy",
	                                                                 height=1200, scale=1)

	        with gr.Row(variant="panel"):
	            sound2sound_origin_upload_latent_representation_image = gr.Image(
	                label="Original latent representation",
	                type="numpy", height=1200,
	                visible=True)
	            sound2sound_origin_upload_quantized_latent_representation_image = gr.Image(
	                label="Original quantized latent representation", type="numpy", height=1200, visible=True)

	            sound2sound_origin_microphone_latent_representation_image = gr.Image(
	                label="Original latent representation",
	                type="numpy", height=1200,
	                visible=False)
	            sound2sound_origin_microphone_quantized_latent_representation_image = gr.Image(
	                label="Original quantized latent representation", type="numpy", height=1200, visible=False)

	            sound2sound_new_sound_latent_representation_image = gr.Image(label="New latent representation",
	                                                                         type="numpy", height=1200)
	            sound2sound_new_sound_quantized_latent_representation_image = gr.Image(
	                label="New sound quantized latent representation", type="numpy", height=1200)

	    # Both audio widgets feed the same handler with the same inputs and outputs.
	    origin_audio_inputs = [sound2sound_duration_slider, sound2sound_origin_source_radio,
	                           sound2sound_origin_upload_audio, sound2sound_origin_microphone_audio,
	                           inpaintWithText_state]
	    origin_audio_outputs = [sound2sound_origin_spectrogram_upload_image,
	                            sound2sound_origin_phase_upload_image,
	                            sound2sound_origin_spectrogram_microphone_image,
	                            sound2sound_origin_phase_microphone_image,
	                            sound2sound_origin_upload_latent_representation_image,
	                            sound2sound_origin_upload_quantized_latent_representation_image,
	                            sound2sound_origin_microphone_latent_representation_image,
	                            sound2sound_origin_microphone_quantized_latent_representation_image,
	                            inpaintWithText_state]
	    sound2sound_origin_upload_audio.change(receive_upload_origin_audio,
	                                           inputs=origin_audio_inputs,
	                                           outputs=origin_audio_outputs)
	    sound2sound_origin_microphone_audio.change(receive_upload_origin_audio,
	                                               inputs=origin_audio_inputs,
	                                               outputs=origin_audio_outputs)

	    sound2sound_sample_button.click(sound2sound_sample,
	                                    inputs=[sound2sound_origin_spectrogram_upload_image,
	                                            sound2sound_origin_spectrogram_microphone_image,
	                                            text2sound_prompts_textbox,
	                                            text2sound_negative_prompts_textbox,
	                                            sound2sound_batchsize_slider,
	                                            sound2sound_guidance_scale_slider,
	                                            sound2sound_sampler_radio,
	                                            sound2sound_sample_steps_slider,
	                                            sound2sound_origin_source_radio,
	                                            sound2sound_noising_strength_slider,
	                                            sound2sound_seed_textbox,
	                                            sound2sound_inpaint_area_radio,
	                                            inpaintWithText_state],
	                                    outputs=[sound2sound_new_sound_latent_representation_image,
	                                             sound2sound_new_sound_quantized_latent_representation_image,
	                                             sound2sound_new_sound_spectrogram_image,
	                                             sound2sound_new_sound_phase_image,
	                                             sound2sound_new_sound_audio,
	                                             sound2sound_sample_index_slider,
	                                             sound2sound_seed_textbox,
	                                             inpaintWithText_state])

	    sound2sound_sample_index_slider.change(show_sound2sound_sample,
	                                           inputs=[sound2sound_sample_index_slider, inpaintWithText_state],
	                                           outputs=[sound2sound_new_sound_latent_representation_image,
	                                                    sound2sound_new_sound_quantized_latent_representation_image,
	                                                    sound2sound_new_sound_spectrogram_image,
	                                                    sound2sound_new_sound_phase_image,
	                                                    sound2sound_new_sound_audio])

	    sound2sound_origin_source_radio.change(sound2sound_switch_origin_source,
	                                           inputs=[sound2sound_origin_source_radio],
	                                           outputs=[sound2sound_origin_upload_audio,
	                                                    sound2sound_origin_microphone_audio,
	                                                    sound2sound_origin_spectrogram_upload_image,
	                                                    sound2sound_origin_phase_upload_image,
	                                                    sound2sound_origin_spectrogram_microphone_image,
	                                                    sound2sound_origin_phase_microphone_image,
	                                                    sound2sound_origin_upload_latent_representation_image,
	                                                    sound2sound_origin_upload_quantized_latent_representation_image,
	                                                    sound2sound_origin_microphone_latent_representation_image,
	                                                    sound2sound_origin_microphone_quantized_latent_representation_image])