"""Gradio demo: suggest an outfit prompt that matches the mood of an audio clip.

Flow: the uploaded audio is described by the ``fffiloni/SALMONN-7B-gradio``
app (called through ``gradio_client``), then a local Zephyr-7B
text-generation pipeline turns that description into an outfit prompt.
"""
import json
import random
import re

import gradio as gr
import numpy as np
import spaces
import torch
from gradio_client import Client, handle_file
from transformers import pipeline

MAX_SEED = np.iinfo(np.int32).max

zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
mixtral_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"  # defined but not used below

# Loaded once at import time; every request reuses this pipeline.
pipe = pipeline(
    "text-generation",
    model=zephyr_model,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# System prompt sent to the language model. The wording is kept verbatim.
standard_sys = (
    "You are an Art Director AI whose job is to help users create their own "
    "outfit which will reflect the musical mood or audio described by users. "
    "In particular, you need to respond succintly with an outfit idea, in a "
    "friendly tone, write a prompt for an image including your outfit idea. "
    'For example, if a user says, "This song features a female vocalist '
    "singing a beautiful and emotional melody. The melody is accompanied by "
    "the sound of a piano playing a slow and melancholic tune. The song has a "
    "dreamy and ethereal feel to it. The lyrics of the song are about the "
    "beauty of love and the joy it brings to one's life. Overall, this song "
    "is a perfect example of the power of music to evoke strong emotions and "
    'create a sense of wonder and awe in the listener.", provide immediately '
    "a prompt corresponding to the audio description. "
    "Immediately STOP after that. It should be EXACTLY in this format: "
    '"A lady dressed with a flowy, pastel-colored dress paired with strappy '
    "sandals and a wide-brimmed hat, accessorized with delicate jewelry, such "
    'as dainty earrings and a necklace."'
)

# Matches the echoed prompt (everything from the system tag up to and
# including the assistant tag) so it can be removed from the generated text.
_PROMPT_ECHO_RE = re.compile(r"<\|system\|>(.*?)<\|assistant\|>", re.DOTALL)


@spaces.GPU(enable_queue=True)
def get_outfit_prompt(user_prompt):
    """Turn an audio description into an outfit prompt using the local LLM.

    Args:
        user_prompt: Text describing the audio (here, the SALMONN answer).

    Returns:
        The generated text with the echoed prompt removed and surrounding
        whitespace stripped.
    """
    # End the prompt with an explicit assistant tag: it cues the model to
    # answer, and it guarantees that _PROMPT_ECHO_RE finds its terminator.
    # Without it the regex only matched when the model happened to emit the
    # tag itself, and otherwise the whole system prompt leaked into the result.
    prompt = (
        f"<|system|>\n{standard_sys}\n"
        f"<|user|>\n{user_prompt}\n"
        "<|assistant|>\n"
    )
    outputs = pipe(
        prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
    )
    # "generated_text" contains the prompt followed by the completion.
    generated = outputs[0]["generated_text"]
    cleaned_text = _PROMPT_ECHO_RE.sub("", generated).strip()
    print(f"SUGGESTED outfit prompt: {cleaned_text}")
    return cleaned_text


def get_salmonn(audio_in, prompt):
    """Query the remote SALMONN app about an audio file.

    Args:
        audio_in: Path of the audio file to send.
        prompt: Text instruction sent along with the audio.

    Returns:
        Whatever the ``/gradio_answer_1`` endpoint returns (used as text by
        the caller).
    """
    client = Client("fffiloni/SALMONN-7B-gradio")
    result = client.predict(
        speech=handle_file(audio_in),
        text_input=prompt,
        num_beams=4,
        temperature=1,
        top_p=0.9,
        api_name="/gradio_answer_1",
    )
    print(result)
    return result


def infer(audio_in):
    """Run the full audio -> description -> outfit prompt chain.

    Args:
        audio_in: File path provided by the ``gr.Audio`` component, or
            ``None`` when nothing has been uploaded.

    Returns:
        A ``gr.update`` that fills the caption textbox and makes it editable.

    Raises:
        gr.Error: If no audio file was provided.
    """
    if not audio_in:
        # Fail with a readable message instead of an obscure error raised
        # while preparing the upload for the remote call.
        raise gr.Error("Please provide an audio file first.")

    salmonn_prompt = "Please describe the audio in detail."
    salmonn_res = get_salmonn(audio_in, salmonn_prompt)
    outfit_prompt = get_outfit_prompt(salmonn_res)
    return gr.update(value=outfit_prompt, interactive=True)


demo_title = "Music to Outfit"
description = "Get an outfit idea from audio"

css = """
#col-container {
    margin: 0 auto;
    max-width: 980px;
    text-align: left;
}
#inspi-prompt textarea {
    font-size: 20px;
    line-height: 24px;
    font-weight: 600;
}
/* fix examples gallery width on mobile */
div#component-11 > .gallery > .gallery-item > .container > img {
    width: auto!important;
}
"""

with gr.Blocks(css=css) as demo:
    with gr.Column(elem_id="col-container"):
        # Show the title as well as the description; demo_title was
        # previously defined but never displayed.
        gr.HTML(f"""
        <h1>{demo_title}</h1>
        <p>{description}</p>
        """)
        with gr.Row():
            with gr.Column():
                audio_in = gr.Audio(
                    label="Audio reference",
                    type="filepath",
                    elem_id="audio-in",
                )
                submit_btn = gr.Button("Suggest an outfit from my audio")
            with gr.Column():
                caption = gr.Textbox(
                    label="Inspirational outfit prompt",
                    interactive=False,
                    elem_id="inspi-prompt",
                )
                # NOTE: this image is displayed but not wired to any output
                # yet; infer() only produces the text prompt.
                result = gr.Image(
                    label="Outfit",
                )

    submit_btn.click(
        fn=infer,
        inputs=[audio_in],
        outputs=[caption],
        concurrency_limit=4,
    )

demo.queue(max_size=16).launch(show_api=False, show_error=True)