import spaces
import gradio as gr
import torch
import os
import wave
import librosa
import numpy as np
from scipy.io.wavfile import write
from PIL import Image
import matplotlib.pyplot as plt
from huggingface_hub import snapshot_download
import soundfile as sf
from auffusion_pipeline import AuffusionPipeline

from diffusers import StableDiffusionImg2ImgPipeline, StableDiffusionInpaintPipeline

# Helpers from the Auffusion repository
from converter import load_wav, mel_spectrogram, normalize_spectrogram, denormalize_spectrogram, Generator, get_mel_spectrogram_from_audio
from utils import pad_spec, image_add_color, torch_to_pil, normalize, denormalize, prepare_mask_and_masked_image

def resample_audio(input_audio, original_sr, target_sr=16000):
    """
    Resample the audio to the target sample rate (16000 Hz by default).

    Args:
    - input_audio (numpy array): The raw audio data.
    - original_sr (int): The original sample rate of the input audio.
    - target_sr (int): The target sample rate (default is 16000 Hz).

    Returns:
    - numpy array: The resampled audio.
    """
    if original_sr != target_sr:
        # Resample the audio using librosa
        audio_resampled = librosa.resample(input_audio, orig_sr=original_sr, target_sr=target_sr)
        return audio_resampled
    else:
        # The sample rate is already 16000 Hz, no resampling is needed
        return input_audio
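
# Example (sketch): resampling a clip recorded at an arbitrary rate down to the
# 16 kHz expected by the mel-spectrogram pipeline and the vocoder.
# Assumes a local "clip.wav" file, which is not part of this Space.
#   y, sr = librosa.load("clip.wav", sr=None)
#   y_16k = resample_audio(y, sr, target_sr=16000)
#   sf.write("clip_16k.wav", y_16k, 16000)
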
def save_spectrogram_image(spectrogram, filename):
    """Save a spectrogram as an image."""
    plt.figure(figsize=(10, 4))
    plt.imshow(spectrogram.squeeze(), aspect='auto', origin='lower', cmap='magma')
    plt.axis('off')  # Hide axes for a cleaner image
    plt.savefig(filename, bbox_inches='tight', pad_inches=0)
    plt.close()
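
# Note: this helper is not wired into the UI below; it is kept for debugging.
# Example (sketch, assuming `spec` is a 2D mel-spectrogram array or tensor):
#   save_spectrogram_image(spec, "debug_spectrogram.png")
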
# On ZeroGPU Spaces, functions that touch CUDA need the @spaces.GPU decorator
# so a GPU is attached for the duration of the call.
@spaces.GPU
def infer(prompt, progress=gr.Progress(track_tqdm=True)):
    """
    Generate audio from a textual prompt using AuffusionPipeline.

    Args:
        prompt (str): Text description of the desired audio content.
        progress (gr.Progress, optional): Progress tracker for UI feedback.

    Returns:
        str: The file path to the generated WAV audio file.
    """
    pipeline = AuffusionPipeline.from_pretrained("auffusion/auffusion")
    output = pipeline(prompt=prompt)
    audio = output.audios[0]
    sf.write(f"{prompt}.wav", audio, samplerate=16000)
    return f"{prompt}.wav"
@spaces.GPU
def infer_img2img(prompt, audio_path, desired_strength, progress=gr.Progress(track_tqdm=True)):
    """
    Perform audio-to-audio transformation with image-to-image style generation.

    Args:
        prompt (str): Text prompt guiding the audio transformation.
        audio_path (str): File path to the input WAV audio reference.
        desired_strength (float): Strength of prompt influence in [0.0, 1.0].
        progress (gr.Progress, optional): Progress tracker for UI feedback.

    Returns:
        tuple:
            - str: File path of the generated output WAV audio.
            - str: File path of the input spectrogram image (PNG).
            - str: File path of the output spectrogram image (PNG).
    """
    # Load the reference audio with its original sampling rate, then resample to 16 kHz
    input_audio, original_sr = librosa.load(audio_path, sr=None)
    resampled_audio = resample_audio(input_audio, original_sr, target_sr=16000)

    # Save the resampled audio to a new file
    sf.write('resampled_audio.wav', resampled_audio, 16000)
    audio_path = 'resampled_audio.wav'

    pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
    dtype = torch.float16
    device = "cuda"

    if not os.path.isdir(pretrained_model_name_or_path):
        pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)

    vocoder = Generator.from_pretrained(pretrained_model_name_or_path, subfolder="vocoder")
    vocoder = vocoder.to(device=device, dtype=dtype)

    pipe = StableDiffusionImg2ImgPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype)
    pipe = pipe.to(device)

    width_start, width = 0, 160
    strength_list = [desired_strength]
    seed = 42
    # Load the audio and compute its mel spectrogram
    audio, sampling_rate = load_wav(audio_path)
    audio, spec = get_mel_spectrogram_from_audio(audio)

    # Normalize the spectrogram
    norm_spec = normalize_spectrogram(spec)
    # norm_spec = norm_spec[:, :, width_start:width_start+width]
    norm_spec = pad_spec(norm_spec, 1024)
    norm_spec = normalize(norm_spec)  # normalize to [-1, 1]; the pipeline does not normalize torch.Tensor inputs

    # raw_image = image_add_color(torch_to_pil(norm_spec[:, :, :width]))
    raw_image = image_add_color(torch_to_pil(norm_spec))

    # Generation for each strength value
    image_list = []
    audio_list = []
    generator = torch.Generator(device=device).manual_seed(seed)
    for strength in strength_list:
        with torch.autocast("cuda"):
            output_spec = pipe(
                prompt=prompt, image=norm_spec, num_inference_steps=100, generator=generator, output_type="pt", strength=strength, guidance_scale=7.5
            ).images[0]

        # Add the colorized output spectrogram to image_list
        # output_spec = output_spec[:, :, :width]
        output_spec_image = torch_to_pil(output_spec)
        color_output_spec_image = image_add_color(output_spec_image)
        image_list.append(color_output_spec_image)

        # Vocode the denormalized spectrogram and add it to audio_list
        denorm_spec = denormalize_spectrogram(output_spec)
        denorm_spec_audio = vocoder.inference(denorm_spec)
        audio_list.append(denorm_spec_audio)
    # Display: concatenate the images for the different strengths,
    # separated by a black interval
    concat_image_list = []
    for i in range(len(image_list)):
        if i == len(image_list) - 1:
            concat_image_list.append(np.array(image_list[i]))
        else:
            concat_image_list.append(np.concatenate([np.array(image_list[i]), np.zeros((256, 20, 3))], axis=1))
    concat_image = np.concatenate(concat_image_list, axis=1)
    concat_image = Image.fromarray(np.uint8(concat_image))

    # Concatenate the audio clips, separated by one second of silence
    concat_audio_list = [np.concatenate([audio, np.zeros((1, 16000))], axis=1) for audio in audio_list]
    concat_audio = np.concatenate(concat_audio_list, axis=1)

    print("audio_path:", audio_path)
    print("width_start:", width_start, "width:", width)
    print("text prompt:", prompt)
    print("strength_list:", strength_list)

    # Ensure correct shape: (1, N) -> (N,)
    concat_audio = concat_audio.flatten()
    # Normalize the audio to prevent clipping or excessive loudness (scale to [-1, 1])
    concat_audio = concat_audio / np.max(np.abs(concat_audio))

    # Save as WAV
    sf.write("output.wav", concat_audio, 16000)

    # Save the input spectrogram image
    input_spec_image_path = "input_spectrogram.png"
    raw_image.save(input_spec_image_path)

    # Save the concatenated output spectrogram image
    output_spec_image_path = "output_spectrogram.png"
    concat_image.save(output_spec_image_path)

    return "output.wav", input_spec_image_path, output_spec_image_path
@spaces.GPU
def infer_inp(prompt, audio_path, mask_start_point, mask_end_point, progress=gr.Progress(track_tqdm=True)):
    """
    Perform audio inpainting on a masked spectrogram region, guided by a prompt.

    Args:
        prompt (str): Text prompt describing the desired inpainted audio content.
        audio_path (str): File path to the input WAV audio reference.
        mask_start_point (int): Start index of the mask region in the spectrogram.
        mask_end_point (int): End index of the mask region in the spectrogram.
        progress (gr.Progress, optional): Progress tracker for UI feedback.

    Returns:
        tuple:
            - str: File path of the generated inpainted output WAV audio.
            - str: File path of the input spectrogram image (PNG).
            - PIL.Image.Image: The output spectrogram image with the inpainted region.
    """
    # Load the reference audio with its original sampling rate, then resample to 16 kHz
    input_audio, original_sr = librosa.load(audio_path, sr=None)
    resampled_audio = resample_audio(input_audio, original_sr, target_sr=16000)

    # Save the resampled audio to a new file
    sf.write('resampled_audio.wav', resampled_audio, 16000)
    audio_path = 'resampled_audio.wav'

    pretrained_model_name_or_path = "auffusion/auffusion-full-no-adapter"
    dtype = torch.float16
    device = "cuda"

    if not os.path.isdir(pretrained_model_name_or_path):
        pretrained_model_name_or_path = snapshot_download(pretrained_model_name_or_path)

    vocoder = Generator.from_pretrained(pretrained_model_name_or_path, subfolder="vocoder")
    vocoder = vocoder.to(device=device, dtype=dtype)

    pipe = StableDiffusionInpaintPipeline.from_pretrained(pretrained_model_name_or_path, torch_dtype=dtype)
    pipe = pipe.to(device)

    width_start, width = mask_start_point, mask_end_point - mask_start_point
    seed = 42
    # Load the audio and compute its normalized mel spectrogram
    audio, sampling_rate = load_wav(audio_path)
    audio, spec = get_mel_spectrogram_from_audio(audio)
    norm_spec = normalize_spectrogram(spec)
    norm_spec = pad_spec(norm_spec, 1024)
    norm_spec = normalize(norm_spec)  # normalize to [-1, 1]; the pipeline does not normalize torch.Tensor inputs
    raw_image = image_add_color(torch_to_pil(norm_spec))

    # Build the mask over the selected spectrogram region
    mask = torch.zeros_like(norm_spec)[:1, ...]
    mask[:, :, width_start:width_start + width] = 1
    mask_image = torch_to_pil(mask)

    mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask)
    masked_spec_image = torch_to_pil(masked_spec)

    # Colorize the masked spectrogram and paint the masked area black
    color_masked_spec_image = image_add_color(masked_spec_image)
    color_masked_spec_image = np.array(color_masked_spec_image)
    color_masked_spec_image[:, width_start:width_start + width, :] = 0
    color_masked_spec_image = Image.fromarray(color_masked_spec_image)
    # Generation
    generator = torch.Generator(device=device).manual_seed(seed)
    with torch.autocast("cuda"):
        output_spec = pipe(
            prompt=prompt, image=norm_spec, mask_image=mask, num_inference_steps=100, generator=generator, height=256, width=1024, output_type="pt"
        ).images[0]

    output_spec_image = torch_to_pil(output_spec)
    color_output_spec_image = image_add_color(output_spec_image)

    # Audio results: raw audio, masked raw audio, generated audio
    post_norm_spec = denormalize(norm_spec).to(device, dtype)
    raw_chunk_spec = denormalize_spectrogram(post_norm_spec)
    raw_chunk_audio = vocoder.inference(raw_chunk_spec)

    post_masked_spec = denormalize(masked_spec).to(device, dtype)
    denorm_masked_spec = denormalize_spectrogram(post_masked_spec)
    denorm_masked_spec_audio = vocoder.inference(denorm_masked_spec)

    denorm_spec = denormalize_spectrogram(output_spec)
    denorm_spec_audio = vocoder.inference(denorm_spec)
    # Ensure correct shape: (1, N) -> (N,)
    denorm_spec_audio = denorm_spec_audio.flatten()
    # Normalize the audio to prevent clipping or excessive loudness (scale to [-1, 1])
    denorm_spec_audio = denorm_spec_audio / np.max(np.abs(denorm_spec_audio))

    # Save as WAV
    sf.write("generated_output.wav", denorm_spec_audio, 16000)

    # Save the input spectrogram image
    input_spec_image_path = "input_spectrogram.png"
    raw_image.save(input_spec_image_path)

    # Save the output spectrogram image
    output_spec_image_path = "output_spectrogram.png"
    color_output_spec_image.save(output_spec_image_path)

    return "generated_output.wav", input_spec_image_path, color_output_spec_image
def load_input_spectrogram(audio_path):
    """Compute and save the colorized mel spectrogram of the input audio."""
    audio, sampling_rate = load_wav(audio_path)
    audio, spec = get_mel_spectrogram_from_audio(audio)
    norm_spec = normalize_spectrogram(spec)
    norm_spec = pad_spec(norm_spec, 1024)
    norm_spec = normalize(norm_spec)  # normalize to [-1, 1]; the pipeline does not normalize torch.Tensor inputs
    raw_image = image_add_color(torch_to_pil(norm_spec))

    # Save the input spectrogram image
    input_spec_image_path = "input_spectrogram.png"
    raw_image.save(input_spec_image_path)

    return input_spec_image_path
def preview_masked_area(audio_path, mask_start_point, mask_end_point):
    """Render the input spectrogram with the selected mask region painted black."""
    audio, sampling_rate = load_wav(audio_path)
    audio, spec = get_mel_spectrogram_from_audio(audio)
    norm_spec = normalize_spectrogram(spec)
    norm_spec = pad_spec(norm_spec, 1024)
    norm_spec = normalize(norm_spec)  # normalize to [-1, 1]; the pipeline does not normalize torch.Tensor inputs

    # Build the mask over the selected spectrogram region
    width_start, width = mask_start_point, mask_end_point - mask_start_point
    mask = torch.zeros_like(norm_spec)[:1, ...]
    mask[:, :, width_start:width_start + width] = 1
    mask_image = torch_to_pil(mask)

    mask, masked_spec = prepare_mask_and_masked_image(norm_spec, mask)
    masked_spec_image = torch_to_pil(masked_spec)

    # Colorize the masked spectrogram and paint the masked area black
    color_masked_spec_image = image_add_color(masked_spec_image)
    color_masked_spec_image = np.array(color_masked_spec_image)
    color_masked_spec_image[:, width_start:width_start + width, :] = 0
    color_masked_spec_image = Image.fromarray(color_masked_spec_image)

    # Save the masked spectrogram image
    masked_spec_image_path = "masked_spectrogram.png"
    color_masked_spec_image.save(masked_spec_image_path)

    return masked_spec_image_path
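
# Example usage (sketch): previewing which region of a local "input.wav" would be
# regenerated before actually running the inpainting pipeline:
#   preview_path = preview_masked_area("input.wav", 256, 768)
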
def load_inpaint_example(prompt_inp, audio_path):
    """Populate the inpainting tab previews when an example is selected."""
    in_spec_path = load_input_spectrogram(audio_path)
    masked_spec_path = preview_masked_area(audio_path, 256, 768)
    return in_spec_path, masked_spec_path
css=""" | |
div#col-container{ | |
margin: 0 auto; | |
max-width: 640px; | |
} | |
""" | |
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        gr.Markdown("# Auffusion")
        gr.Markdown("Auffusion generates realistic audio from textual prompts, including human sounds, animal sounds, natural and artificial sounds, and sound effects.")
        gr.HTML("""
        <div style="display:flex;column-gap:4px;">
            <a href="https://auffusion.github.io/">
                <img src='https://img.shields.io/badge/Project-Page-green'>
            </a>
            <a href="https://github.com/happylittlecat2333/Auffusion">
                <img src='https://img.shields.io/badge/GitHub-Repo-blue'>
            </a>
            <a href="https://arxiv.org/pdf/2401.01044">
                <img src='https://img.shields.io/badge/ArXiv-Paper-red'>
            </a>
            <a href="https://huggingface.co/spaces/fffiloni/auffusion?duplicate=true">
                <img src="https://huggingface.co/datasets/huggingface/badges/resolve/main/duplicate-this-space-sm.svg" alt="Duplicate this Space">
            </a>
        </div>
        """)
with gr.Tab("Text-to-Audio"): | |
prompt = gr.Textbox(label="Prompt") | |
submit_btn = gr.Button("Submit") | |
audio_out = gr.Audio(label="Audio Ressult") | |
gr.Examples( | |
examples = [ | |
"Rolling thunder with lightning strikes", | |
"Two gunshots followed by birds chirping", | |
"A train whistle blowing in the distance" | |
], | |
inputs = [prompt], | |
cache_examples=False | |
) | |
submit_btn.click( | |
fn = infer, | |
inputs = [prompt], | |
outputs = [audio_out], | |
show_api=True | |
) | |
with gr.Tab("Audio-to-Audio"): | |
prompt_img2img = gr.Textbox(label="Prompt") | |
audio_in_img2img = gr.Audio(label="Audio Reference", type="filepath", format="wav") | |
prompt_strength = gr.Slider(label="Prompt Strength", minimum=0.0, maximum=1.0, step=0.1, value=0.7) | |
submit_btn_img2img = gr.Button("Submit") | |
audio_out_img2img = gr.Audio(label="Audio Ressult") | |
with gr.Accordion("Compare Spectrograms", open=False): | |
with gr.Column(): | |
input_spectrogram = gr.Image(label="Input Spectrogram") | |
output_spectrogram = gr.Image(label="Output Spectrogram") | |
gr.Examples( | |
examples = [ | |
["Ambulance siren", "./notebooks/examples/img2img/GIOApFAWDOc_160.wav"], | |
["A cat is moewing", "./notebooks/examples/img2img/YniwgMbB6tpQ_01.wav"], | |
["A car racing", "./notebooks/examples/img2img/_GI7meqlYZk_30.wav"] | |
], | |
inputs = [prompt_img2img, audio_in_img2img], | |
cache_examples=False | |
) | |
submit_btn_img2img.click( | |
fn = infer_img2img, | |
inputs = [prompt_img2img, audio_in_img2img, prompt_strength], | |
outputs = [audio_out_img2img, input_spectrogram, output_spectrogram], | |
show_api=True | |
) | |
with gr.Tab("Audio InPainting"): | |
prompt_inp = gr.Textbox(label="Prompt") | |
audio_in_inp = gr.Audio(label="Audio Reference", type="filepath", format="wav") | |
audio_in_spec = gr.Image(label="Audio IN spectrogram") | |
mask_start_point = gr.Slider(label="Mask Start point", minimum=0, maximum=1024, step=1, value=256) | |
mask_end_point = gr.Slider(label="Mask End point", minimum=0, maximum=1024, step=1, value=768) | |
preview_mask_btn = gr.Button("Preview Mask") | |
masked_spec_preview = gr.Image(label="Spectrogram Mask Preview") | |
submit_btn_inp = gr.Button("Submit") | |
audio_out_inp = gr.Audio(label="Audio Ressult") | |
with gr.Accordion("Compare Spectrograms", open=False): | |
with gr.Column(): | |
input_spectrogram_inp = gr.Image(label="Input Spectrogram") | |
output_spectrogram_inp = gr.Image(label="Output Spectrogram") | |
            gr.Examples(
                examples=[
                    ["A siren ringing with a vehicle speeding closer", "./notebooks/examples/inpainting/IvfaKPDWC00_160.wav"],
                    ["A woman speaking", "./notebooks/examples/inpainting/9z8XIRyUq9Q_30.wav"],
                    ["An infant crying", "./notebooks/examples/inpainting/14ekd4nkpwc_28.wav"],
                    ["A dog barking and growling", "./notebooks/examples/inpainting/3ek-xLwr05Q_30.wav"]
                ],
                fn=load_inpaint_example,
                inputs=[prompt_inp, audio_in_inp],
                outputs=[audio_in_spec, masked_spec_preview],
                cache_examples=False
            )

            audio_in_inp.upload(
                fn=load_input_spectrogram,
                inputs=[audio_in_inp],
                outputs=[audio_in_spec],
                show_api=False
            )

            audio_in_inp.stop_recording(
                fn=load_input_spectrogram,
                inputs=[audio_in_inp],
                outputs=[audio_in_spec],
                show_api=False
            )

            preview_mask_btn.click(
                fn=preview_masked_area,
                inputs=[audio_in_inp, mask_start_point, mask_end_point],
                outputs=[masked_spec_preview],
                show_api=False
            )

            submit_btn_inp.click(
                fn=infer_inp,
                inputs=[prompt_inp, audio_in_inp, mask_start_point, mask_end_point],
                outputs=[audio_out_inp, input_spectrogram_inp, output_spectrogram_inp],
                show_api=False
            )
demo.queue().launch(ssr_mode=False, mcp_server=True, show_error=True)