import gradio as gr
import open_clip
import torch
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import io
# --- 1. Model Initialization (run only once) ---
print("Loading HQ-CLIP model...")
model_hq, _, preprocess_hq = open_clip.create_model_and_transforms('hf-hub:zhixiangwei/hqclip-openai-large-ft-vlm1b')
tokenizer_hq = open_clip.get_tokenizer('hf-hub:zhixiangwei/hqclip-openai-large-ft-vlm1b')
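# The 'hf-hub:' prefix tells open_clip to fetch the checkpoint from the Hugging Face Hub
# (downloaded on first run and cached locally afterwards).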
print("HQ-CLIP model loaded.")
print("Loading standard OpenAI CLIP model...")
model_openai, _, preprocess_openai = open_clip.create_model_and_transforms('ViT-L-14-quickgelu', pretrained='openai')
tokenizer_openai = open_clip.get_tokenizer('ViT-L-14-quickgelu')
print("OpenAI CLIP model loaded.")
device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "cpu"  # uncomment to force CPU-only inference
model_hq.to(device)
model_openai.to(device)
print(f"Models moved to {device}.")
# --- 2. Core Logic: Similarity Calculation and Comparison ---
def calculate_similarities(image, *texts):
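    """Score the uploaded image against up to four text descriptions with both models.

    Returns a Markdown summary of each model's best match, a DataFrame of raw
    similarity scores and softmax probabilities, and a grouped bar chart as a PIL image.
    """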
    if image is None:
        return "Please upload an image first.", pd.DataFrame(), None
    texts = [t for t in texts if t and t.strip()]
    if not texts:
        # Return empty/default values for all outputs
        return "Please enter at least one valid text description.", pd.DataFrame(), None
# --- Helper function to get scores from a model ---
    def get_scores(model, preprocess, tokenizer, img_input, text_inputs):
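        """Return (similarity_scores, softmax_probs) for one model over all text inputs."""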
        # Convert the numpy array from Gradio into a PIL image
        # (passing mode='RGB' to Image.fromarray is deprecated, so it is omitted)
img = Image.fromarray(img_input.astype('uint8'))
img_tensor = preprocess(img).unsqueeze(0).to(device)
        # Tokenize all descriptions in one batch; open_clip tokenizers accept a list
        # of strings and pad/truncate to the model's context length (77 for these models)
        tokenized_texts = tokenizer(text_inputs).to(device)
with torch.no_grad():
image_features = model.encode_image(img_tensor)
text_features = model.encode_text(tokenized_texts)
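            # L2-normalize both embeddings so the dot product below is a cosine similarity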
image_features /= image_features.norm(dim=-1, keepdim=True)
text_features /= text_features.norm(dim=-1, keepdim=True)
        # Scale by 100 (roughly CLIP's learned logit scale) so the softmax below yields meaningful probabilities
similarities = (100.0 * image_features @ text_features.T).squeeze(0)
probs = torch.softmax(similarities, dim=-1)
return similarities.cpu().numpy(), probs.cpu().numpy()
    # --- Calculate scores for both models ---
    similarities_hq, probs_hq = get_scores(model_hq, preprocess_hq, tokenizer_hq, image, texts)
    similarities_openai, probs_openai = get_scores(model_openai, preprocess_openai, tokenizer_openai, image, texts)
# --- Identify Best Match for each model ---
best_idx_hq = np.argmax(similarities_hq)
best_idx_openai = np.argmax(similarities_openai)
best_match_output = f"""
### 🏆 Best Match Analysis
**HQ-CLIP's Choice:** **'{texts[best_idx_hq]}'** (Score: {similarities_hq[best_idx_hq]:.2f})
**OpenAI CLIP's Choice:** **'{texts[best_idx_openai]}'** (Score: {similarities_openai[best_idx_openai]:.2f})
"""
# --- Create Results DataFrame ---
results_data = []
for i, text in enumerate(texts):
results_data.append({
"Text Description": text,
"HQ-CLIP Score": f"{similarities_hq[i]:.4f}",
"OpenAI-CLIP Score": f"{similarities_openai[i]:.4f}",
"HQ-CLIP Probability": f"{probs_hq[i]:.2%}",
"OpenAI-CLIP Probability": f"{probs_openai[i]:.2%}"
})
df = pd.DataFrame(results_data)
# --- Create Grouped Bar Chart for Comparison ---
plt.style.use('seaborn-v0_8-whitegrid')
fig, ax = plt.subplots(figsize=(14, 8))
x = np.arange(len(texts))
width = 0.35
rects1 = ax.bar(x - width/2, similarities_hq, width, label='HQ-CLIP', color='#4A90E2', edgecolor='white', linewidth=1.5)
rects2 = ax.bar(x + width/2, similarities_openai, width, label='OpenAI CLIP', color='#F5A623', edgecolor='white', linewidth=1.5)
ax.set_ylabel('Similarity Score', fontsize=14, fontweight='bold')
ax.set_title('Comparison: HQ-CLIP vs. OpenAI CLIP Similarity Scores', fontsize=18, fontweight='bold', pad=20)
ax.set_xticks(x)
display_texts = [text[:25] + '...' if len(text) > 25 else text for text in texts]
ax.set_xticklabels(display_texts, rotation=40, ha="right", fontsize=12)
ax.legend(fontsize=12)
ax.bar_label(rects1, padding=3, fmt='%.2f', fontsize=10, fontweight='bold')
ax.bar_label(rects2, padding=3, fmt='%.2f', fontsize=10, fontweight='bold')
max_val = max(np.max(similarities_hq), np.max(similarities_openai)) if len(texts) > 0 else 100
ax.set_ylim(0, max_val * 1.2)
fig.tight_layout()
buf = io.BytesIO()
plt.savefig(buf, format='png', dpi=200)
buf.seek(0)
plot_img = Image.open(buf)
plt.close(fig)
return best_match_output, df, plot_img
# --- 3. Gradio Interface ---
css = """
.gradio-container { font-family: 'Inter', sans-serif; }
.main-header { text-align: center; }
.results-group { background: #F8F9FA; border-radius: 15px; padding: 20px; }
.custom-button {
background: linear-gradient(45deg, #4A90E2 0%, #007BFF 100%);
border-radius: 25px;
font-size: 1.1em;
padding: 12px 30px;
}
"""
with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="blue"), title="HQ-CLIP vs OpenAI CLIP") as demo:
    gr.Markdown(
        """
        # 🎨 HQ-CLIP vs. OpenAI CLIP: A Visual Comparison
        Upload an image and provide text descriptions to see how the fine-tuned HQ-CLIP model compares against the standard OpenAI CLIP.
        The goal is to see whether HQ-CLIP provides more nuanced and accurate similarity scores.
        """,
        elem_classes="main-header",
    )
with gr.Row(equal_height=False):
with gr.Column(scale=1):
with gr.Group():
image_input = gr.Image(label="🖼️ Upload Your Image", type="numpy", height=350)
with gr.Accordion("📝 Enter Text Descriptions (up to 4)", open=True):
text_inputs = [
gr.Textbox(label=f"Text Description {i+1}", placeholder="e.g., 'a photo of a cat sleeping'")
for i in range(4)
]
submit_btn = gr.Button("🔍 Analyze & Compare", variant="primary", elem_classes="custom-button")
with gr.Column(scale=2):
best_match_output = gr.Markdown(label="🏆 Best Match")
with gr.Tabs():
with gr.TabItem("📊 Comparison Chart"):
plot_output = gr.Image(label="Similarity Visualization", type="pil", show_label=False, height=500)
with gr.TabItem("📋 Detailed Scores & Probabilities"):
table_output = gr.Dataframe(
label="Detailed Results",
headers=["Text Description", "HQ-CLIP Score", "OpenAI-CLIP Score", "HQ-CLIP Probability", "OpenAI-CLIP Probability"],
datatype=["str", "str", "str", "str", "str"],
wrap=True,
)
gr.Markdown("---")
gr.Markdown("### ✨ Try Some Examples")
gr.Examples(
examples=[
["examples/cat.webp", "a cat sitting by the window", "a dog playing in the yard", "a fluffy feline resting peacefully", "a sleeping kitten"],
["examples/dog.jpg", "a golden retriever dog", "a siberian husky", "a dog playing on grass", "a cute cat"],
["examples/sunset.jpg", "a beautiful sunset over the mountains", "a city skyline at night", "a vibrant landscape during golden hour", "ocean waves crashing on the shore"]
],
inputs=[image_input] + text_inputs,
label="Click an example to run the analysis"
)
submit_btn.click(
fn=calculate_similarities,
inputs=[image_input] + text_inputs,
outputs=[best_match_output, table_output, plot_output]
)
if __name__ == "__main__":
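    # Optional: call demo.queue() before launch to queue concurrent requests
    # when the demo is shared publicly (share=True) or hosted on limited hardware.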
demo.launch(share=True)