Files changed (3) hide show
  1. README.md +1 -1
  2. app.py +3 -21
  3. requirements.txt +1 -1
README.md CHANGED
@@ -4,7 +4,7 @@ emoji: 👂
4
  colorFrom: green
5
  colorTo: pink
6
  sdk: gradio
7
- sdk_version: 5.35.0
8
  app_file: app.py
9
  pinned: false
10
  short_description: Generates audio environment from an image
 
4
  colorFrom: green
5
  colorTo: pink
6
  sdk: gradio
7
+ sdk_version: 5.0.1
8
  app_file: app.py
9
  pinned: false
10
  short_description: Generates audio environment from an image
app.py CHANGED
@@ -26,7 +26,6 @@ def extract_audio(video_in):
26
  return 'audio.wav'
27
 
28
  def get_caption_from_kosmos(image_in):
29
- gr.Info("Generating image caption with Kosmos2...")
30
  kosmos2_client = Client("fffiloni/Kosmos-2-API", hf_token=hf_token)
31
  kosmos2_result = kosmos2_client.predict(
32
  image_input=handle_file(image_in),
@@ -87,7 +86,6 @@ def get_magnet(prompt):
87
  raise gr.Error("MAGNet space API is not ready, please try again in few minutes ")
88
 
89
  def get_audioldm(prompt):
90
- gr.Info("Now calling AudioLDM2 for SFX ...")
91
  try:
92
  client = Client("fffiloni/audioldm2-text2audio-text2music-API", hf_token=hf_token)
93
  seed = random.randint(0, MAX_SEED)
@@ -119,7 +117,6 @@ def get_audiogen(prompt):
119
  raise gr.Error("AudioGen space API is not ready, please try again in few minutes ")
120
 
121
  def get_tango(prompt):
122
- gr.Info("Now calling AudioGen for SFX ...")
123
  try:
124
  client = Client("fffiloni/tango", hf_token=hf_token)
125
  result = client.predict(
@@ -153,7 +150,6 @@ def get_tango2(prompt):
153
 
154
 
155
  def get_stable_audio_open(prompt):
156
- gr.Info("Now calling Stable-Audio for SFX ...")
157
  try:
158
  client = Client("fffiloni/Stable-Audio-Open-A10", hf_token=hf_token)
159
  result = client.predict(
@@ -188,20 +184,6 @@ def get_ezaudio(prompt):
188
  raise gr.Error("EzAudio space API is not ready, please try again in few minutes ")
189
 
190
  def infer(image_in, chosen_model):
191
- """
192
- Generate an audio clip (sound effect) from an input image using the selected generative model.
193
-
194
- This function first generates a caption from the provided image using a vision-language model.
195
- The caption is then used as a text prompt for various audio generation models.
196
-
197
- Args:
198
- image_in (str): File path to the input image. The image will be processed to generate a descriptive caption.
199
- chosen_model (str): The name of the audio generation model to use. Supported options include: "AudioLDM-2", "Tango", "Stable Audio Open".
200
-
201
- Returns:
202
- str | dict: The path or result object of the generated audio clip, depending on the model used.
203
-
204
- """
205
  caption = get_caption_from_kosmos(image_in)
206
  if chosen_model == "MAGNet" :
207
  magnet_result = get_magnet(caption)
@@ -251,9 +233,9 @@ with gr.Blocks(css=css) as demo:
251
  "AudioLDM-2",
252
  #"AudioGen",
253
  "Tango",
254
- #"Tango 2",
255
  "Stable Audio Open",
256
- #"EzAudio"
257
  ], value="AudioLDM-2")
258
  submit_btn = gr.Button("Submit")
259
  with gr.Column():
@@ -270,4 +252,4 @@ with gr.Blocks(css=css) as demo:
270
  outputs=[audio_o],
271
  )
272
 
273
- demo.queue(max_size=10).launch(debug=True, show_error=True, ssr_mode=False, mcp_server=True)
 
26
  return 'audio.wav'
27
 
28
  def get_caption_from_kosmos(image_in):
 
29
  kosmos2_client = Client("fffiloni/Kosmos-2-API", hf_token=hf_token)
30
  kosmos2_result = kosmos2_client.predict(
31
  image_input=handle_file(image_in),
 
86
  raise gr.Error("MAGNet space API is not ready, please try again in few minutes ")
87
 
88
  def get_audioldm(prompt):
 
89
  try:
90
  client = Client("fffiloni/audioldm2-text2audio-text2music-API", hf_token=hf_token)
91
  seed = random.randint(0, MAX_SEED)
 
117
  raise gr.Error("AudioGen space API is not ready, please try again in few minutes ")
118
 
119
  def get_tango(prompt):
 
120
  try:
121
  client = Client("fffiloni/tango", hf_token=hf_token)
122
  result = client.predict(
 
150
 
151
 
152
  def get_stable_audio_open(prompt):
 
153
  try:
154
  client = Client("fffiloni/Stable-Audio-Open-A10", hf_token=hf_token)
155
  result = client.predict(
 
184
  raise gr.Error("EzAudio space API is not ready, please try again in few minutes ")
185
 
186
  def infer(image_in, chosen_model):
 
 
 
 
 
 
 
 
 
 
 
 
 
 
187
  caption = get_caption_from_kosmos(image_in)
188
  if chosen_model == "MAGNet" :
189
  magnet_result = get_magnet(caption)
 
233
  "AudioLDM-2",
234
  #"AudioGen",
235
  "Tango",
236
+ "Tango 2",
237
  "Stable Audio Open",
238
+ "EzAudio"
239
  ], value="AudioLDM-2")
240
  submit_btn = gr.Button("Submit")
241
  with gr.Column():
 
252
  outputs=[audio_o],
253
  )
254
 
255
+ demo.queue(max_size=10).launch(debug=True, show_error=True)
requirements.txt CHANGED
@@ -1 +1 @@
1
- moviepy<2
 
1
+ moviepy