import gradio as gr
import spaces
import json
import re
import random
import numpy as np
from gradio_client import Client, handle_file

# Largest value of a signed 32-bit integer (2**31 - 1).
# NOTE(review): not referenced elsewhere in the visible file; the name suggests
# an upper bound for random seeds - confirm before removing.
MAX_SEED = np.iinfo(np.dtype("int32")).max
    
import re
import torch
from transformers import pipeline

# Hub identifiers of the chat models; only the Zephyr one is loaded below.
zephyr_model = "HuggingFaceH4/zephyr-7b-beta"
mixtral_model = "mistralai/Mixtral-8x7B-Instruct-v0.1"

# Text-generation pipeline shared by every request; it is built once, when the
# module is imported.
pipe = pipeline(
    task="text-generation",
    model=zephyr_model,
    torch_dtype=torch.bfloat16,
    device_map="auto",
)

# System prompt given to the language model. It contains no placeholders, so a
# plain (non-f) string literal is enough; the text itself is unchanged.
standard_sys = """
You are an Art Director AI whose job is to help users create their own outfit which will reflect the musical mood or audio described by users.
In particular, you need to respond succintly with an outfit idea, in a friendly tone, write a prompt for an image including your outfit idea.

For example, if a user says, "This song features a female vocalist singing a beautiful and emotional melody. The melody is accompanied by the sound of a piano playing a slow and melancholic tune. The song has a dreamy and ethereal feel to it. The lyrics of the song are about the beauty of love and the joy it brings to one's life. Overall, this song is a perfect example of the power of music to evoke strong emotions and create a sense of wonder and awe in the listener.", provide immediately a prompt corresponding to the audio description. 
Immediately STOP after that. It should be EXACTLY in this format:
"A lady dressed with a flowy, pastel-colored dress paired with strappy sandals and a wide-brimmed hat, accessorized with delicate jewelry, such as dainty earrings and a necklace."
"""

@spaces.GPU(enable_queue=True)
def get_outfit_prompt(user_prompt):
    """Ask the Zephyr pipeline for an outfit image prompt matching an audio description.

    Args:
        user_prompt: Text describing the audio; it is sent as the user turn.

    Returns:
        The text generated by the model, with leading newlines removed.
    """
    # Zephyr chat layout: system turn, user turn, then an *open* assistant turn.
    # Ending the prompt with the assistant tag makes the model answer directly,
    # instead of hoping it emits the tag on its own before answering.
    prompt = (
        f"<|system|>\n{standard_sys}</s>\n"
        f"<|user|>\n{user_prompt}</s>\n"
        "<|assistant|>\n"
    )

    # return_full_text=False yields only the completion, so the system prompt
    # and the user text can never leak into the value shown to the user.
    outputs = pipe(
        prompt,
        max_new_tokens=256,
        do_sample=True,
        temperature=0.7,
        top_k=50,
        top_p=0.95,
        return_full_text=False,
    )
    cleaned_text = outputs[0]["generated_text"]

    print(f"SUGGESTED outfit prompt: {cleaned_text}")
    return cleaned_text.lstrip("\n")

def get_salmonn(audio_in, prompt):
    """Send an audio file and a text prompt to the remote SALMONN Space.

    Args:
        audio_in: Audio file reference handed to ``handle_file``.
        prompt: Text instruction passed as ``text_input``.

    Returns:
        Whatever the ``/gradio_answer_1`` endpoint returns (also printed).
    """
    salmonn_client = Client("fffiloni/SALMONN-7B-gradio")

    # Request payload for the endpoint; decoding settings are fixed.
    request = {
        "speech": handle_file(audio_in),
        "text_input": prompt,
        "num_beams": 4,
        "temperature": 1,
        "top_p": 0.9,
    }
    answer = salmonn_client.predict(api_name="/gradio_answer_1", **request)

    print(answer)
    return answer

def infer(audio_in):
    """Describe the given audio, then turn that description into an outfit prompt.

    Args:
        audio_in: Value of the audio input component (declared with
            ``type="filepath"``); empty/``None`` when no audio was supplied.

    Returns:
        A ``gr.update`` that sets the outfit prompt as the value of the output
        component and makes it interactive.

    Raises:
        gr.Error: If no audio was supplied.
    """
    # Fail early with a readable message instead of an opaque error raised
    # deep inside the remote-client call.
    if not audio_in:
        raise gr.Error("Please provide an audio file first.")

    audio_description = get_salmonn(audio_in, "Please describe the audio in detail.")
    outfit_prompt = get_outfit_prompt(audio_description)

    return gr.update(value=outfit_prompt, interactive=True)

# Texts shown in the page header.
demo_title = "Music to Outfit"
description = "Get an outfit idea from audio"

# Custom stylesheet passed to gr.Blocks.
# NOTE(review): the last rule targets an auto-generated component id
# ("component-11") and an examples gallery; the visible file defines no
# examples, so that rule may be a leftover - confirm before removing.
css = """
#col-container {
    margin: 0 auto;
    max-width: 980px;
    text-align: left;
}
#inspi-prompt textarea {
    font-size: 20px;
    line-height: 24px;
    font-weight: 600;
}
/* fix examples gallery width on mobile */
div#component-11 > .gallery > .gallery-item > .container > img {
    width: auto!important;
}
"""

# Page layout: audio input and trigger button in the first column, generated
# prompt and an image component in the second one.
with gr.Blocks(css=css) as demo:

    with gr.Column(elem_id="col-container"):

        gr.HTML(f"""
        <h2 style="text-align: center;">{demo_title}</h2>
        <p style="text-align: center;">{description}</p>
        """)

        with gr.Row():

            with gr.Column():
                audio_in = gr.Audio(
                    label="Audio reference",
                    type="filepath",
                    elem_id="audio-in",
                )

                # The app turns audio into an outfit idea, so the button label
                # states that.
                submit_btn = gr.Button("Make an outfit from my music !")

            with gr.Column():
                caption = gr.Textbox(
                    label="Inspirational outfit prompt",
                    interactive=False,
                    elem_id="inspi-prompt",
                )

                # Shown on the page, but not listed as an output of the click
                # handler below, so nothing fills it yet.
                result = gr.Image(label="Outfit")

    # Run the audio -> description -> outfit prompt chain on click.
    submit_btn.click(
        fn=infer,
        inputs=[audio_in],
        outputs=[caption],
        concurrency_limit=4,
    )

# Enable the request queue (at most 16 waiting requests), then start the app
# with the API page hidden and errors surfaced in the UI.
demo.queue(max_size=16)
demo.launch(show_api=False, show_error=True)