# Image to SFX — compare sound-effect generation models from an image caption.
import json
import os
import random
import re

import numpy as np
import gradio as gr
from gradio_client import Client, handle_file
from moviepy.editor import VideoFileClip
from moviepy.audio.AudioClip import AudioClip
# Hugging Face token used to authenticate calls to private/gated Spaces.
hf_token = os.getenv("HF_TKN")

# Upper bound for randomly drawn generation seeds (largest signed 32-bit int).
MAX_SEED = np.iinfo(np.int32).max
def extract_audio(video_in, output_audio='audio.wav'):
    """Extract the audio track from a video file and save it as a WAV file.

    Args:
        video_in (str): Path to the input video file.
        output_audio (str): Path of the WAV file to write (default 'audio.wav',
            matching the previous hard-coded behavior).

    Returns:
        str: Path to the written WAV file.

    Raises:
        gr.Error: If the video has no audio track.
    """
    video_clip = VideoFileClip(video_in)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            # moviepy returns None for videos without an audio stream.
            raise gr.Error("The provided video has no audio track.")
        # 44100 Hz is the conventional CD-quality sample rate for .wav files.
        audio_clip.write_audiofile(output_audio, fps=44100)
    finally:
        # Release the ffmpeg reader resources held by moviepy (previously leaked).
        video_clip.close()
    print("Audio extraction complete.")
    return output_audio
def get_caption_from_kosmos(image_in):
    """Generate a detailed caption for an image via the Kosmos-2 Space API.

    Args:
        image_in (str): Path to the input image.

    Returns:
        str: Caption assembled by concatenating the tokens returned by the API.
    """
    kosmos2_client = Client("fffiloni/Kosmos-2-API", hf_token=hf_token)
    kosmos2_result = kosmos2_client.predict(
        image_input=handle_file(image_in),
        text_input="Detailed",
        api_name="/generate_predictions"
    )
    print(f"KOSMOS2 RETURNS: {kosmos2_result}")
    # The second element of the result holds the token list.
    data = kosmos2_result[1]
    # Join tokens starting from the second element; the first entry is skipped
    # (appears to be metadata rather than caption text — TODO confirm).
    sentence = ''.join(item['token'] for item in data[1:])
    return sentence
def get_caption(image_in):
    """Ask the moondream1 Space for a one-sentence description of an image.

    Args:
        image_in (str): Path to the input image.

    Returns:
        The caption returned by the Space's /predict endpoint.
    """
    moondream_client = Client("fffiloni/moondream1", hf_token=hf_token)
    caption = moondream_client.predict(
        image=handle_file(image_in),
        question="Describe precisely the image in one sentence.",
        api_name="/predict"
    )
    print(caption)
    return caption
def get_magnet(prompt):
    """Generate a sound effect from a text prompt via the MAGNet Space API.

    Args:
        prompt (str): Text description of the desired audio.

    Returns:
        The generated audio (second element of the Space's result tuple).

    Raises:
        gr.Error: If the MAGNet Space call fails for any reason.
    """
    amended_prompt = f"{prompt}"
    print(amended_prompt)
    try:
        client = Client("https://fffiloni-magnet.hf.space/")
        result = client.predict(
            "facebook/audio-magnet-medium",  # 'Model' Radio component
            "",                              # 'Model Path (custom models)' Textbox
            amended_prompt,                  # 'Input Text' Textbox
            3,                               # 'Temperature' Number
            0.9,                             # 'Top-p' Number
            10,                              # 'Max CFG coefficient' Number
            1,                               # 'Min CFG coefficient' Number
            20,                              # 'Decoding Steps (stage 1)' Number
            10,                              # 'Decoding Steps (stage 2)' Number
            10,                              # 'Decoding Steps (stage 3)' Number
            10,                              # 'Decoding Steps (stage 4)' Number
            "prod-stride1 (new!)",           # 'Span Scoring' Radio component
            api_name="/predict_full"
        )
        print(result)
        return result[1]
    except Exception as e:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
        # not swallowed; chain the cause for easier debugging.
        raise gr.Error("MAGNet space API is not ready, please try again in few minutes ") from e
def get_audioldm(prompt):
    """Generate audio from a text prompt via the AudioLDM-2 Space API.

    Args:
        prompt (str): Text description of the desired audio.

    Returns:
        The result returned by the Space's /text2audio endpoint.

    Raises:
        gr.Error: If the AudioLDM Space call fails for any reason.
    """
    try:
        client = Client("fffiloni/audioldm2-text2audio-text2music-API", hf_token=hf_token)
        # Fresh random seed per call so repeated requests vary.
        seed = random.randint(0, MAX_SEED)
        result = client.predict(
            text=prompt,
            negative_prompt="Low quality. Music.",
            duration=10,          # seconds, valid range 5-15
            guidance_scale=6.5,   # valid range 0-7
            random_seed=seed,
            n_candidates=3,       # waveforms to generate, valid range 1-5
            api_name="/text2audio"
        )
        print(result)
        return result
    except Exception as e:
        # Narrowed from a bare `except:`; chain the cause for debugging.
        raise gr.Error("AudioLDM space API is not ready, please try again in few minutes ") from e
def get_audiogen(prompt):
    """Generate audio from a text prompt via the AudioGen Space API.

    Args:
        prompt (str): Text description of the desired audio.

    Returns:
        The result returned by the Space's /infer endpoint.

    Raises:
        gr.Error: If the AudioGen Space call fails for any reason.
    """
    try:
        client = Client("https://fffiloni-audiogen.hf.space/")
        result = client.predict(
            prompt,
            10,  # duration in seconds
            api_name="/infer"
        )
        return result
    except Exception as e:
        # Narrowed from a bare `except:`; chain the cause for debugging.
        raise gr.Error("AudioGen space API is not ready, please try again in few minutes ") from e
def get_tango(prompt):
    """Generate audio from a text prompt via the Tango Space API.

    Args:
        prompt (str): Text description of the desired audio.

    Returns:
        The result returned by the Space's /predict endpoint.

    Raises:
        gr.Error: If the Tango Space call fails for any reason.
    """
    try:
        client = Client("fffiloni/tango", hf_token=hf_token)
        result = client.predict(
            prompt=prompt,
            steps=100,
            guidance=3,
            api_name="/predict"
        )
        print(result)
        return result
    except Exception as e:
        # Narrowed from a bare `except:`; chain the cause for debugging.
        raise gr.Error("Tango space API is not ready, please try again in few minutes ") from e
def get_tango2(prompt):
    """Generate audio from a text prompt via the Tango 2 Space API.

    Args:
        prompt (str): Text description of the desired audio.

    Returns:
        The result returned by the Space's /predict endpoint.

    Raises:
        gr.Error: If the Tango 2 Space call fails for any reason.
    """
    try:
        client = Client("declare-lab/tango2")
        result = client.predict(
            prompt=prompt,
            output_format="wav",
            steps=100,
            guidance=3,
            api_name="/predict"
        )
        print(result)
        return result
    except Exception as e:
        # Narrowed from a bare `except:`; chain the cause for debugging.
        raise gr.Error("Tango2 space API is not ready, please try again in few minutes ") from e
def get_stable_audio_open(prompt):
    """Generate audio from a text prompt via the Stable Audio Open Space API.

    Args:
        prompt (str): Text description of the desired audio.

    Returns:
        The result returned by the Space's /predict endpoint.

    Raises:
        gr.Error: If the Stable Audio Open Space call fails for any reason.
    """
    try:
        client = Client("fffiloni/Stable-Audio-Open-A10", hf_token=hf_token)
        result = client.predict(
            prompt=prompt,
            seconds_total=30,
            steps=100,
            cfg_scale=7,
            api_name="/predict"
        )
        print(result)
        return result
    except Exception as e:
        # Narrowed from a bare `except:`; chain the cause for debugging.
        raise gr.Error("Stable Audio Open space API is not ready, please try again in few minutes ") from e
def get_ezaudio(prompt):
    """Generate audio from a text prompt via the EzAudio Space API.

    Args:
        prompt (str): Text description of the desired audio.

    Returns:
        The result returned by the Space's /generate_audio endpoint.

    Raises:
        gr.Error: If the EzAudio Space call fails for any reason.
    """
    try:
        client = Client("OpenSound/EzAudio")
        result = client.predict(
            text=prompt,
            length=10,
            guidance_scale=5,
            guidance_rescale=0.75,
            ddim_steps=50,
            eta=1,
            random_seed=0,
            randomize_seed=True,
            api_name="/generate_audio"
        )
        print(result)
        return result
    except Exception as e:
        # Narrowed from a bare `except:`; chain the cause for debugging.
        raise gr.Error("EzAudio space API is not ready, please try again in few minutes ") from e
def infer(image_in, chosen_model):
    """
    Generate an audio clip (sound effect) from an input image using the selected generative model.
    This function first generates a caption from the provided image using a vision-language model.
    The caption is then used as a text prompt for the chosen audio generation model.
    Args:
        image_in (str): File path to the input image. The image will be processed to generate a descriptive caption.
        chosen_model (str): The name of the audio generation model to use. Supported options:
            "MAGNet", "AudioLDM-2", "AudioGen", "Tango", "Tango 2", "Stable Audio Open", "EzAudio".
    Returns:
        str | dict: The path or result object of the generated audio clip, depending on the model used.
    Raises:
        gr.Error: If `chosen_model` is not one of the supported options.
    """
    caption = get_caption_from_kosmos(image_in)
    # Dispatch table replaces the previous if/elif chain, which silently
    # returned None for an unrecognized model name.
    generators = {
        "MAGNet": get_magnet,
        "AudioLDM-2": get_audioldm,
        "AudioGen": get_audiogen,
        "Tango": get_tango,
        "Tango 2": get_tango2,
        "Stable Audio Open": get_stable_audio_open,
        "EzAudio": get_ezaudio,
    }
    try:
        generate = generators[chosen_model]
    except KeyError:
        raise gr.Error(f"Unknown model: {chosen_model}") from None
    return generate(caption)
# Page-level CSS: center the main layout column and cap its width at 800px.
css = """
#col-container{
margin: 0 auto;
max-width: 800px;
}
"""
# ---------------------------------------------------------------------------
# Gradio UI: image input + model picker, audio output, wired to infer().
# ---------------------------------------------------------------------------
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Title / description banner.
        gr.HTML("""
        <h2 style="text-align: center;">
        Image to SFX
        </h2>
        <p style="text-align: center;">
        Compare sound effects generation models from image caption.
        </p>
        """)
        with gr.Column():
            image_in = gr.Image(sources=["upload"], type="filepath", label="Image input")
            with gr.Row():
                # Commented-out entries are models currently disabled in the UI.
                chosen_model = gr.Dropdown(
                    label="Choose a model",
                    choices=[
                        #"MAGNet",
                        "AudioLDM-2",
                        #"AudioGen",
                        "Tango",
                        #"Tango 2",
                        "Stable Audio Open",
                        #"EzAudio"
                    ],
                    value="AudioLDM-2",
                )
                submit_btn = gr.Button("Submit")
        with gr.Column():
            audio_o = gr.Audio(label="Audio output")
        gr.Examples(
            examples=[["oiseau.png", "AudioLDM-2"]],
            inputs=[image_in, chosen_model],
        )
    submit_btn.click(
        fn=infer,
        inputs=[image_in, chosen_model],
        outputs=[audio_o],
    )

demo.queue(max_size=10).launch(debug=True, show_error=True, ssr_mode=False, mcp_server=True)