import gradio as gr
import open_clip
import torch
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import io

# --- 1. Model Initialization (run only once) ---
print("Loading HQ-CLIP model...")
model_hq, _, preprocess_hq = open_clip.create_model_and_transforms(
    'hf-hub:zhixiangwei/hqclip-openai-large-ft-vlm1b'
)
tokenizer_hq = open_clip.get_tokenizer('hf-hub:zhixiangwei/hqclip-openai-large-ft-vlm1b')
print("HQ-CLIP model loaded.")

print("Loading standard OpenAI CLIP model...")
model_openai, _, preprocess_openai = open_clip.create_model_and_transforms(
    'ViT-L-14-quickgelu', pretrained='openai'
)
tokenizer_openai = open_clip.get_tokenizer('ViT-L-14-quickgelu')
print("OpenAI CLIP model loaded.")

device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "cpu"  # uncomment to force CPU inference
model_hq.to(device)
model_openai.to(device)
print(f"Models moved to {device}.")


# --- 2. Core Logic: Similarity Calculation and Comparison ---
def calculate_similarities(image, *texts):
    texts = [t for t in texts if t and t.strip()]
    if not texts:
        # Return empty/default values for all outputs
        return "Please enter at least one valid text description.", pd.DataFrame(), None

    # --- Helper function to get scores from a model ---
    def get_scores(model, preprocess, tokenizer, img_input, text_inputs):
        # Convert the numpy array supplied by Gradio into a PIL image and preprocess it.
        img = Image.fromarray(img_input.astype('uint8'))
        img_tensor = preprocess(img).unsqueeze(0).to(device)

        # Both open_clip tokenizers accept a list of strings and use the default
        # context length (77), so no per-model special casing is needed.
        tokenized_texts = tokenizer(text_inputs).to(device)

        with torch.no_grad():
            image_features = model.encode_image(img_tensor)
            text_features = model.encode_text(tokenized_texts)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            text_features /= text_features.norm(dim=-1, keepdim=True)

            # Standard practice: scale by 100 for better score separation
            similarities = (100.0 * image_features @ text_features.T).squeeze(0)
            probs = torch.softmax(similarities, dim=-1)

        return similarities.cpu().numpy(), probs.cpu().numpy()

    # --- Calculate scores for both models ---
    similarities_hq, probs_hq = get_scores(model_hq, preprocess_hq, tokenizer_hq, image, texts)
    similarities_openai, probs_openai = get_scores(model_openai, preprocess_openai, tokenizer_openai, image, texts)

    # --- Identify Best Match for each model ---
    best_idx_hq = np.argmax(similarities_hq)
    best_idx_openai = np.argmax(similarities_openai)

    best_match_output = f"""
### 🏆 Best Match Analysis

**HQ-CLIP's Choice:** **'{texts[best_idx_hq]}'** (Score: {similarities_hq[best_idx_hq]:.2f})

**OpenAI CLIP's Choice:** **'{texts[best_idx_openai]}'** (Score: {similarities_openai[best_idx_openai]:.2f})
"""

    # --- Create Results DataFrame ---
    results_data = []
    for i, text in enumerate(texts):
        results_data.append({
            "Text Description": text,
            "HQ-CLIP Score": f"{similarities_hq[i]:.4f}",
            "OpenAI-CLIP Score": f"{similarities_openai[i]:.4f}",
            "HQ-CLIP Probability": f"{probs_hq[i]:.2%}",
            "OpenAI-CLIP Probability": f"{probs_openai[i]:.2%}"
        })
    df = pd.DataFrame(results_data)

    # --- Create Grouped Bar Chart for Comparison ---
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(14, 8))
    x = np.arange(len(texts))
    width = 0.35

    rects1 = ax.bar(x - width / 2, similarities_hq, width, label='HQ-CLIP',
                    color='#4A90E2', edgecolor='white', linewidth=1.5)
    rects2 = ax.bar(x + width / 2, similarities_openai, width, label='OpenAI CLIP',
                    color='#F5A623', edgecolor='white', linewidth=1.5)

    ax.set_ylabel('Similarity Score', fontsize=14, fontweight='bold')
    ax.set_title('Comparison: HQ-CLIP vs. OpenAI CLIP Similarity Scores',
                 fontsize=18, fontweight='bold', pad=20)
    ax.set_xticks(x)
    display_texts = [text[:25] + '...' if len(text) > 25 else text for text in texts]
    ax.set_xticklabels(display_texts, rotation=40, ha="right", fontsize=12)
    ax.legend(fontsize=12)
    ax.bar_label(rects1, padding=3, fmt='%.2f', fontsize=10, fontweight='bold')
    ax.bar_label(rects2, padding=3, fmt='%.2f', fontsize=10, fontweight='bold')

    max_val = max(np.max(similarities_hq), np.max(similarities_openai)) if len(texts) > 0 else 100
    ax.set_ylim(0, max_val * 1.2)
    fig.tight_layout()

    # Render the figure to a PIL image so it can be shown in a gr.Image component.
    buf = io.BytesIO()
    plt.savefig(buf, format='png', dpi=200)
    buf.seek(0)
    plot_img = Image.open(buf)
    plt.close(fig)

    return best_match_output, df, plot_img


# --- 3. Gradio Interface ---
css = """
.gradio-container { font-family: 'Inter', sans-serif; }
.main-header { text-align: center; }
.results-group { background: #F8F9FA; border-radius: 15px; padding: 20px; }
.custom-button {
    background: linear-gradient(45deg, #4A90E2 0%, #007BFF 100%);
    border-radius: 25px;
    font-size: 1.1em;
    padding: 12px 30px;
}
"""

with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="blue"), title="HQ-CLIP vs OpenAI CLIP") as demo:
    gr.Markdown(
        """

        # 🎨 HQ-CLIP vs. OpenAI CLIP: A Visual Comparison

        Upload an image and provide text descriptions to see how the fine-tuned HQ-CLIP model
        compares against the standard OpenAI CLIP. The goal is to see whether HQ-CLIP provides
        more nuanced and accurate similarity scores.

""" ) with gr.Row(equal_height=False): with gr.Column(scale=1): with gr.Group(): image_input = gr.Image(label="🖼️ Upload Your Image", type="numpy", height=350) with gr.Accordion("📝 Enter Text Descriptions (up to 4)", open=True): text_inputs = [ gr.Textbox(label=f"Text Description {i+1}", placeholder="e.g., 'a photo of a cat sleeping'") for i in range(4) ] submit_btn = gr.Button("🔍 Analyze & Compare", variant="primary", elem_classes="custom-button") with gr.Column(scale=2): best_match_output = gr.Markdown(label="🏆 Best Match") with gr.Tabs(): with gr.TabItem("📊 Comparison Chart"): plot_output = gr.Image(label="Similarity Visualization", type="pil", show_label=False, height=500) with gr.TabItem("📋 Detailed Scores & Probabilities"): table_output = gr.Dataframe( label="Detailed Results", headers=["Text Description", "HQ-CLIP Score", "OpenAI-CLIP Score", "HQ-CLIP Probability", "OpenAI-CLIP Probability"], datatype=["str", "str", "str", "str", "str"], wrap=True, ) gr.Markdown("---") gr.Markdown("### ✨ Try Some Examples") gr.Examples( examples=[ ["examples/cat.webp", "a cat sitting by the window", "a dog playing in the yard", "a fluffy feline resting peacefully", "a sleeping kitten"], ["examples/dog.jpg", "a golden retriever dog", "a siberian husky", "a dog playing on grass", "a cute cat"], ["examples/sunset.jpg", "a beautiful sunset over the mountains", "a city skyline at night", "a vibrant landscape during golden hour", "ocean waves crashing on the shore"] ], inputs=[image_input] + text_inputs, label="Click an example to run the analysis" ) submit_btn.click( fn=calculate_similarities, inputs=[image_input] + text_inputs, outputs=[best_match_output, table_output, plot_output] ) if __name__ == "__main__": demo.launch(share=True)