# Image to SFX — compare sound-effect generation models from an image caption.
import json
import os
import random
import re

import numpy as np
import gradio as gr
from gradio_client import Client, handle_file
from moviepy.editor import VideoFileClip
from moviepy.audio.AudioClip import AudioClip
# Hugging Face token used to authenticate calls to private/gated Spaces.
hf_token = os.getenv("HF_TKN")

# Upper bound for randomly drawn generation seeds (largest signed 32-bit int).
MAX_SEED = np.iinfo(np.int32).max
def extract_audio(video_in, output_audio='audio.wav'):
    """Extract the audio track from a video file and save it as a WAV file.

    Args:
        video_in (str): Path to the input video file.
        output_audio (str): Path of the WAV file to write (default 'audio.wav',
            matching the previous hard-coded behavior).

    Returns:
        str: Path to the written WAV file.

    Raises:
        gr.Error: If the video has no audio track.
    """
    video_clip = VideoFileClip(video_in)
    try:
        audio_clip = video_clip.audio
        if audio_clip is None:
            # moviepy returns None for videos without an audio stream.
            raise gr.Error("The provided video has no audio track.")
        # 44100 Hz is the conventional CD-quality sample rate for .wav files.
        audio_clip.write_audiofile(output_audio, fps=44100)
    finally:
        # Release the ffmpeg reader resources held by moviepy (previously leaked).
        video_clip.close()
    print("Audio extraction complete.")
    return output_audio
def get_caption_from_kosmos(image_in):
    """Generate a detailed caption for an image via the Kosmos-2 Space API.

    Args:
        image_in (str): Path to the input image.

    Returns:
        str: Caption assembled by concatenating the tokens returned by the API.
    """
    kosmos2_client = Client("fffiloni/Kosmos-2-API", hf_token=hf_token)
    kosmos2_result = kosmos2_client.predict(
        image_input=handle_file(image_in),
        text_input="Detailed",
        api_name="/generate_predictions"
    )
    print(f"KOSMOS2 RETURNS: {kosmos2_result}")
    # The second element of the result holds the token list.
    data = kosmos2_result[1]
    # Join tokens starting from the second element; the first entry is skipped
    # (appears to be metadata rather than caption text — TODO confirm).
    sentence = ''.join(item['token'] for item in data[1:])
    return sentence
def get_caption(image_in):
    """Ask the moondream1 Space for a one-sentence description of an image.

    Args:
        image_in (str): Path to the input image.

    Returns:
        The caption returned by the Space's /predict endpoint.
    """
    moondream_client = Client("fffiloni/moondream1", hf_token=hf_token)
    caption = moondream_client.predict(
        image=handle_file(image_in),
        question="Describe precisely the image in one sentence.",
        api_name="/predict"
    )
    print(caption)
    return caption
def get_magnet(prompt):
    """Generate a sound effect from a text prompt via the MAGNet Space API.

    Args:
        prompt (str): Text description of the desired audio.

    Returns:
        The generated audio (second element of the Space's result tuple).

    Raises:
        gr.Error: If the MAGNet Space call fails for any reason.
    """
    amended_prompt = f"{prompt}"
    print(amended_prompt)
    try:
        client = Client("https://fffiloni-magnet.hf.space/")
        result = client.predict(
            "facebook/audio-magnet-medium",  # 'Model' Radio component
            "",                              # 'Model Path (custom models)' Textbox
            amended_prompt,                  # 'Input Text' Textbox
            3,                               # 'Temperature' Number
            0.9,                             # 'Top-p' Number
            10,                              # 'Max CFG coefficient' Number
            1,                               # 'Min CFG coefficient' Number
            20,                              # 'Decoding Steps (stage 1)' Number
            10,                              # 'Decoding Steps (stage 2)' Number
            10,                              # 'Decoding Steps (stage 3)' Number
            10,                              # 'Decoding Steps (stage 4)' Number
            "prod-stride1 (new!)",           # 'Span Scoring' Radio component
            api_name="/predict_full"
        )
        print(result)
        return result[1]
    except Exception as e:
        # Narrowed from a bare `except:` so KeyboardInterrupt/SystemExit are
        # not swallowed; chain the cause for easier debugging.
        raise gr.Error("MAGNet space API is not ready, please try again in few minutes ") from e
def get_audioldm(prompt):
    """Generate audio from a text prompt via the AudioLDM-2 Space API.

    Args:
        prompt (str): Text description of the desired audio.

    Returns:
        The result returned by the Space's /text2audio endpoint.

    Raises:
        gr.Error: If the AudioLDM Space call fails for any reason.
    """
    try:
        client = Client("fffiloni/audioldm2-text2audio-text2music-API", hf_token=hf_token)
        # Fresh random seed per call so repeated requests vary.
        seed = random.randint(0, MAX_SEED)
        result = client.predict(
            text=prompt,
            negative_prompt="Low quality. Music.",
            duration=10,          # seconds, valid range 5-15
            guidance_scale=6.5,   # valid range 0-7
            random_seed=seed,
            n_candidates=3,       # waveforms to generate, valid range 1-5
            api_name="/text2audio"
        )
        print(result)
        return result
    except Exception as e:
        # Narrowed from a bare `except:`; chain the cause for debugging.
        raise gr.Error("AudioLDM space API is not ready, please try again in few minutes ") from e
def get_audiogen(prompt):
    """Generate audio from a text prompt via the AudioGen Space API.

    Args:
        prompt (str): Text description of the desired audio.

    Returns:
        The result returned by the Space's /infer endpoint.

    Raises:
        gr.Error: If the AudioGen Space call fails for any reason.
    """
    try:
        client = Client("https://fffiloni-audiogen.hf.space/")
        result = client.predict(
            prompt,
            10,  # duration in seconds
            api_name="/infer"
        )
        return result
    except Exception as e:
        # Narrowed from a bare `except:`; chain the cause for debugging.
        raise gr.Error("AudioGen space API is not ready, please try again in few minutes ") from e
def get_tango(prompt):
    """Generate audio from a text prompt via the Tango Space API.

    Args:
        prompt (str): Text description of the desired audio.

    Returns:
        The result returned by the Space's /predict endpoint.

    Raises:
        gr.Error: If the Tango Space call fails for any reason.
    """
    try:
        client = Client("fffiloni/tango", hf_token=hf_token)
        result = client.predict(
            prompt=prompt,
            steps=100,
            guidance=3,
            api_name="/predict"
        )
        print(result)
        return result
    except Exception as e:
        # Narrowed from a bare `except:`; chain the cause for debugging.
        raise gr.Error("Tango space API is not ready, please try again in few minutes ") from e
def get_tango2(prompt):
    """Generate audio from a text prompt via the Tango 2 Space API.

    Args:
        prompt (str): Text description of the desired audio.

    Returns:
        The result returned by the Space's /predict endpoint.

    Raises:
        gr.Error: If the Tango 2 Space call fails for any reason.
    """
    try:
        client = Client("declare-lab/tango2")
        result = client.predict(
            prompt=prompt,
            output_format="wav",
            steps=100,
            guidance=3,
            api_name="/predict"
        )
        print(result)
        return result
    except Exception as e:
        # Narrowed from a bare `except:`; chain the cause for debugging.
        raise gr.Error("Tango2 space API is not ready, please try again in few minutes ") from e
def get_stable_audio_open(prompt):
    """Generate audio from a text prompt via the Stable Audio Open Space API.

    Args:
        prompt (str): Text description of the desired audio.

    Returns:
        The result returned by the Space's /predict endpoint.

    Raises:
        gr.Error: If the Stable Audio Open Space call fails for any reason.
    """
    try:
        client = Client("fffiloni/Stable-Audio-Open-A10", hf_token=hf_token)
        result = client.predict(
            prompt=prompt,
            seconds_total=30,
            steps=100,
            cfg_scale=7,
            api_name="/predict"
        )
        print(result)
        return result
    except Exception as e:
        # Narrowed from a bare `except:`; chain the cause for debugging.
        raise gr.Error("Stable Audio Open space API is not ready, please try again in few minutes ") from e
def get_ezaudio(prompt):
    """Generate audio from a text prompt via the EzAudio Space API.

    Args:
        prompt (str): Text description of the desired audio.

    Returns:
        The result returned by the Space's /generate_audio endpoint.

    Raises:
        gr.Error: If the EzAudio Space call fails for any reason.
    """
    try:
        client = Client("OpenSound/EzAudio")
        result = client.predict(
            text=prompt,
            length=10,
            guidance_scale=5,
            guidance_rescale=0.75,
            ddim_steps=50,
            eta=1,
            random_seed=0,
            randomize_seed=True,
            api_name="/generate_audio"
        )
        print(result)
        return result
    except Exception as e:
        # Narrowed from a bare `except:`; chain the cause for debugging.
        raise gr.Error("EzAudio space API is not ready, please try again in few minutes ") from e
def infer(image_in, chosen_model):
    """
    Generate an audio clip (sound effect) from an input image using the selected generative model.
    This function first generates a caption from the provided image using a vision-language model.
    The caption is then used as a text prompt for the chosen audio generation model.
    Args:
        image_in (str): File path to the input image. The image will be processed to generate a descriptive caption.
        chosen_model (str): The name of the audio generation model to use. Supported options:
            "MAGNet", "AudioLDM-2", "AudioGen", "Tango", "Tango 2", "Stable Audio Open", "EzAudio".
    Returns:
        str | dict: The path or result object of the generated audio clip, depending on the model used.
    Raises:
        gr.Error: If `chosen_model` is not one of the supported options.
    """
    caption = get_caption_from_kosmos(image_in)
    # Dispatch table replaces the previous if/elif chain, which silently
    # returned None for an unrecognized model name.
    generators = {
        "MAGNet": get_magnet,
        "AudioLDM-2": get_audioldm,
        "AudioGen": get_audiogen,
        "Tango": get_tango,
        "Tango 2": get_tango2,
        "Stable Audio Open": get_stable_audio_open,
        "EzAudio": get_ezaudio,
    }
    try:
        generate = generators[chosen_model]
    except KeyError:
        raise gr.Error(f"Unknown model: {chosen_model}") from None
    return generate(caption)
# Page-level CSS: center the main layout column and cap its width at 800px.
css = """
#col-container{
margin: 0 auto;
max-width: 800px;
}
"""
# ---------------------------------------------------------------------------
# Gradio UI: image input + model picker, audio output, wired to infer().
# ---------------------------------------------------------------------------
with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Title / description banner.
        gr.HTML("""
        <h2 style="text-align: center;">
        Image to SFX
        </h2>
        <p style="text-align: center;">
        Compare sound effects generation models from image caption.
        </p>
        """)
        with gr.Column():
            image_in = gr.Image(sources=["upload"], type="filepath", label="Image input")
            with gr.Row():
                # Commented-out entries are models currently disabled in the UI.
                chosen_model = gr.Dropdown(
                    label="Choose a model",
                    choices=[
                        #"MAGNet",
                        "AudioLDM-2",
                        #"AudioGen",
                        "Tango",
                        #"Tango 2",
                        "Stable Audio Open",
                        #"EzAudio"
                    ],
                    value="AudioLDM-2",
                )
                submit_btn = gr.Button("Submit")
        with gr.Column():
            audio_o = gr.Audio(label="Audio output")
        gr.Examples(
            examples=[["oiseau.png", "AudioLDM-2"]],
            inputs=[image_in, chosen_model],
        )
    submit_btn.click(
        fn=infer,
        inputs=[image_in, chosen_model],
        outputs=[audio_o],
    )

demo.queue(max_size=10).launch(debug=True, show_error=True, ssr_mode=False, mcp_server=True)