Spaces:

fffiloni
/

Image2SFX-comparison

Running

App Files Files Community

fffiloni committed on Oct 10, 2024

Commit

3740288

verified ·

1 Parent(s): b20b9d1

update APIs

Browse files

Files changed (1) hide show

app.py +42 -49

app.py CHANGED Viewed

@@ -23,46 +23,29 @@ def extract_audio(video_in):
     return 'audio.wav'
 def get_caption_from_kosmos(image_in):
-    kosmos2_client = Client("https://ydshieh-kosmos-2.hf.space/")
     kosmos2_result = kosmos2_client.predict(
-        image_in,	# str (filepath or URL to image) in 'Test Image' Image component
-        "Detailed",	# str in 'Description Type' Radio component
-        fn_index=4
     )
     print(f"KOSMOS2 RETURNS: {kosmos2_result}")
-    with open(kosmos2_result[1], 'r') as f:
-        data = json.load(f)
-    reconstructed_sentence = []
-    for sublist in data:
-        reconstructed_sentence.append(sublist[0])
-    full_sentence = ' '.join(reconstructed_sentence)
-    #print(full_sentence)
-    # Find the pattern matching the expected format ("Describe this image in detail:" followed by optional space and then the rest)...
-    pattern = r'^Describe this image in detail:\s*(.*)$'
-    # Apply the regex pattern to extract the description text.
-    match = re.search(pattern, full_sentence)
-    if match:
-        description = match.group(1)
-        print(description)
-    else:
-        print("Unable to locate valid description.")
     # Find the last occurrence of "."
-    last_period_index = description.rfind('.')
     # Truncate the string up to the last period
-    truncated_caption = description[:last_period_index + 1]
     # print(truncated_caption)
-    print(f"\n—\nIMAGE CAPTION: {truncated_caption}")
-    return truncated_caption
 def get_caption(image_in):
     client = Client("fffiloni/moondream1", hf_token=hf_token)
@@ -101,19 +84,20 @@ def get_magnet(prompt):
 def get_audioldm(prompt):
     try:
-        client = Client("https://haoheliu-audioldm2-text2audio-text2music.hf.space/")
         result = client.predict(
-            prompt,	# str in 'Input text' Textbox component
-            "Low quality. Music.",	# str in 'Negative prompt' Textbox component
-            10,	# int | float (numeric value between 5 and 15) in 'Duration (seconds)' Slider component
-            3.5,	# int | float (numeric value between 0 and 7) in 'Guidance scale' Slider component
-            45,	# int | float in 'Seed' Number component
-            3,	# int | float (numeric value between 1 and 5) in 'Number waveforms to generate' Slider component
-            fn_index=1
         )
         print(result)
-        audio_result = extract_audio(result)
-        return audio_result
     except:
         raise gr.Error("AudioLDM space API is not ready, please try again in few minutes ")
@@ -133,10 +117,10 @@ def get_tango(prompt):
     try:
         client = Client("fffiloni/tango")
         result = client.predict(
-				prompt,	# str representing string value in 'Prompt' Textbox component
-				100,	# int | float representing numeric value between 100 and 200 in 'Steps' Slider component
-				4,	# int | float representing numeric value between 1 and 10 in 'Guidance Scale' Slider component
-				api_name="/predict"
         )
         print(result)
         return result
@@ -149,10 +133,11 @@ def get_tango2(prompt):
     try:
         client = Client("declare-lab/tango2")
         result = client.predict(
-    		prompt,
-    		100,
-    		4,
-    		api_name="/predict"
         )
         print(result)
         return result
@@ -196,7 +181,7 @@ def get_ezaudio(prompt):
         raise gr.Error("EzAudio space API is not ready, please try again in few minutes ")
 def infer(image_in, chosen_model):
-    caption = get_caption(image_in)
     if chosen_model == "MAGNet" :
         magnet_result = get_magnet(caption)
         return magnet_result
@@ -240,7 +225,15 @@ with gr.Blocks(css=css) as demo:
         with gr.Column():
             image_in = gr.Image(sources=["upload"], type="filepath", label="Image input", value="oiseau.png")
             with gr.Row():
-                chosen_model = gr.Dropdown(label="Choose a model", choices=["MAGNet", "AudioLDM-2", "AudioGen", "Tango", "Tango 2", "Stable Audio Open", "EzAudio"], value="AudioLDM-2")
                 submit_btn = gr.Button("Submit")
         with gr.Column():
             audio_o = gr.Audio(label="Audio output")

     return 'audio.wav'
 def get_caption_from_kosmos(image_in):
+    kosmos2_client = Client("fffiloni/Kosmos-2-API", hf_token=hf_token)
     kosmos2_result = kosmos2_client.predict(
+		image_input=handle_file(image_in),
+		text_input="Detailed",
+		api_name="/generate_predictions"
     )
     print(f"KOSMOS2 RETURNS: {kosmos2_result}")
+    data = kosmos2_result[1]
+    # Extract and combine tokens starting from the second element
+    sentence = ''.join(item['token'] for item in data[1:])
     # Find the last occurrence of "."
+    #last_period_index = full_sentence.rfind('.')
     # Truncate the string up to the last period
+    #truncated_caption = full_sentence[:last_period_index + 1]
     # print(truncated_caption)
+    #print(f"\n—\nIMAGE CAPTION: {truncated_caption}")
+    return sentence
 def get_caption(image_in):
     client = Client("fffiloni/moondream1", hf_token=hf_token)
 def get_audioldm(prompt):
     try:
+        client = Client("fffiloni/audioldm2-text2audio-text2music-API", hf_token=hf_token)
+        seed = random.randint(0, MAX_SEED)
         result = client.predict(
+            text=prompt,	# str in 'Input text' Textbox component
+            negative_prompt="Low quality. Music.",	# str in 'Negative prompt' Textbox component
+            duration=10,	# int | float (numeric value between 5 and 15) in 'Duration (seconds)' Slider component
+            guidance_scale=6.5,	# int | float (numeric value between 0 and 7) in 'Guidance scale' Slider component
+            random_seed=seed,	# int | float in 'Seed' Number component
+            n_candidates=3,	# int | float (numeric value between 1 and 5) in 'Number waveforms to generate' Slider component
+            api_name="/text2audio"
         )
         print(result)
+        return result
     except:
         raise gr.Error("AudioLDM space API is not ready, please try again in few minutes ")
     try:
         client = Client("fffiloni/tango")
         result = client.predict(
+        		prompt=prompt,
+        		steps=100,
+        		guidance=3,
+        		api_name="/predict"
         )
         print(result)
         return result
     try:
         client = Client("declare-lab/tango2")
         result = client.predict(
+        		prompt=prompt,
+        		output_format="wav",
+        		steps=100,
+        		guidance=3,
+        		api_name="/predict"
         )
         print(result)
         return result
         raise gr.Error("EzAudio space API is not ready, please try again in few minutes ")
 def infer(image_in, chosen_model):
+    caption = get_caption_from_kosmos(image_in)
     if chosen_model == "MAGNet" :
         magnet_result = get_magnet(caption)
         return magnet_result
         with gr.Column():
             image_in = gr.Image(sources=["upload"], type="filepath", label="Image input", value="oiseau.png")
             with gr.Row():
+                chosen_model = gr.Dropdown(label="Choose a model", choices=[
+                    #"MAGNet",
+                    "AudioLDM-2",
+                    #"AudioGen",
+                    "Tango",
+                    "Tango 2",
+                    "Stable Audio Open",
+                    "EzAudio"
+                ], value="AudioLDM-2")
                 submit_btn = gr.Button("Submit")
         with gr.Column():
             audio_o = gr.Audio(label="Audio output")