import gradio as gr
import open_clip
import torch
import numpy as np
import pandas as pd
from PIL import Image
import matplotlib.pyplot as plt
import io

# --- 1. Model Initialization (run only once) ---
print("Loading HQ-CLIP model...")
model_hq, _, preprocess_hq = open_clip.create_model_and_transforms(
    'hf-hub:zhixiangwei/hqclip-openai-large-ft-vlm1b'
)
tokenizer_hq = open_clip.get_tokenizer('hf-hub:zhixiangwei/hqclip-openai-large-ft-vlm1b')
print("HQ-CLIP model loaded.")

print("Loading standard OpenAI CLIP model...")
model_openai, _, preprocess_openai = open_clip.create_model_and_transforms(
    'ViT-L-14-quickgelu', pretrained='openai'
)
tokenizer_openai = open_clip.get_tokenizer('ViT-L-14-quickgelu')
print("OpenAI CLIP model loaded.")

device = "cuda" if torch.cuda.is_available() else "cpu"
# device = "cpu"  # uncomment to force CPU inference
model_hq.to(device)
model_openai.to(device)
print(f"Models moved to {device}.")


# --- 2. Core Logic: Similarity Calculation and Comparison ---
def calculate_similarities(image, *texts):
    texts = [t for t in texts if t and t.strip()]
    if not texts:
        # Return empty/default values for all outputs
        return "Please enter at least one valid text description.", pd.DataFrame(), None

    # --- Helper function to get scores from a model ---
    def get_scores(model, preprocess, tokenizer, img_input, text_inputs):
        # Convert the numpy array supplied by Gradio into a PIL image and preprocess it.
        img = Image.fromarray(img_input.astype('uint8'))
        img_tensor = preprocess(img).unsqueeze(0).to(device)

        # Both open_clip tokenizers accept a list of strings and use the default
        # context length (77), so no per-model special casing is needed.
        tokenized_texts = tokenizer(text_inputs).to(device)

        with torch.no_grad():
            image_features = model.encode_image(img_tensor)
            text_features = model.encode_text(tokenized_texts)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            text_features /= text_features.norm(dim=-1, keepdim=True)

            # Standard practice: scale by 100 for better score separation
            similarities = (100.0 * image_features @ text_features.T).squeeze(0)
            probs = torch.softmax(similarities, dim=-1)

        return similarities.cpu().numpy(), probs.cpu().numpy()

    # --- Calculate scores for both models ---
    similarities_hq, probs_hq = get_scores(model_hq, preprocess_hq, tokenizer_hq, image, texts)
    similarities_openai, probs_openai = get_scores(model_openai, preprocess_openai, tokenizer_openai, image, texts)

    # --- Identify Best Match for each model ---
    best_idx_hq = np.argmax(similarities_hq)
    best_idx_openai = np.argmax(similarities_openai)

    best_match_output = f"""
### 🏆 Best Match Analysis

**HQ-CLIP's Choice:** **'{texts[best_idx_hq]}'** (Score: {similarities_hq[best_idx_hq]:.2f})

**OpenAI CLIP's Choice:** **'{texts[best_idx_openai]}'** (Score: {similarities_openai[best_idx_openai]:.2f})
"""

    # --- Create Results DataFrame ---
    results_data = []
    for i, text in enumerate(texts):
        results_data.append({
            "Text Description": text,
            "HQ-CLIP Score": f"{similarities_hq[i]:.4f}",
            "OpenAI-CLIP Score": f"{similarities_openai[i]:.4f}",
            "HQ-CLIP Probability": f"{probs_hq[i]:.2%}",
            "OpenAI-CLIP Probability": f"{probs_openai[i]:.2%}"
        })
    df = pd.DataFrame(results_data)

    # --- Create Grouped Bar Chart for Comparison ---
    plt.style.use('seaborn-v0_8-whitegrid')
    fig, ax = plt.subplots(figsize=(14, 8))
    x = np.arange(len(texts))
    width = 0.35

    rects1 = ax.bar(x - width / 2, similarities_hq, width, label='HQ-CLIP',
                    color='#4A90E2', edgecolor='white', linewidth=1.5)
    rects2 = ax.bar(x + width / 2, similarities_openai, width, label='OpenAI CLIP',
                    color='#F5A623', edgecolor='white', linewidth=1.5)

    ax.set_ylabel('Similarity Score', fontsize=14, fontweight='bold')
    ax.set_title('Comparison: HQ-CLIP vs. OpenAI CLIP Similarity Scores',
                 fontsize=18, fontweight='bold', pad=20)
    ax.set_xticks(x)
    display_texts = [text[:25] + '...' if len(text) > 25 else text for text in texts]
    ax.set_xticklabels(display_texts, rotation=40, ha="right", fontsize=12)
    ax.legend(fontsize=12)
    ax.bar_label(rects1, padding=3, fmt='%.2f', fontsize=10, fontweight='bold')
    ax.bar_label(rects2, padding=3, fmt='%.2f', fontsize=10, fontweight='bold')

    max_val = max(np.max(similarities_hq), np.max(similarities_openai)) if len(texts) > 0 else 100
    ax.set_ylim(0, max_val * 1.2)
    fig.tight_layout()

    # Render the figure to a PIL image so it can be shown in a gr.Image component.
    buf = io.BytesIO()
    plt.savefig(buf, format='png', dpi=200)
    buf.seek(0)
    plot_img = Image.open(buf)
    plt.close(fig)

    return best_match_output, df, plot_img


# --- 3. Gradio Interface ---
css = """
.gradio-container { font-family: 'Inter', sans-serif; }
.main-header { text-align: center; }
.results-group { background: #F8F9FA; border-radius: 15px; padding: 20px; }
.custom-button {
    background: linear-gradient(45deg, #4A90E2 0%, #007BFF 100%);
    border-radius: 25px;
    font-size: 1.1em;
    padding: 12px 30px;
}
"""

with gr.Blocks(css=css, theme=gr.themes.Soft(primary_hue="blue"), title="HQ-CLIP vs OpenAI CLIP") as demo:
    gr.Markdown(
        """

        # 🎨 HQ-CLIP vs. OpenAI CLIP: A Visual Comparison

        Upload an image and provide text descriptions to see how the fine-tuned HQ-CLIP model
        compares against the standard OpenAI CLIP. The goal is to see whether HQ-CLIP provides
        more nuanced and accurate similarity scores.

""" ) with gr.Row(equal_height=False): with gr.Column(scale=1): with gr.Group(): image_input = gr.Image(label="🖼️ Upload Your Image", type="numpy", height=350) with gr.Accordion("📝 Enter Text Descriptions (up to 4)", open=True): text_inputs = [ gr.Textbox(label=f"Text Description {i+1}", placeholder="e.g., 'a photo of a cat sleeping'") for i in range(4) ] submit_btn = gr.Button("🔍 Analyze & Compare", variant="primary", elem_classes="custom-button") with gr.Column(scale=2): best_match_output = gr.Markdown(label="🏆 Best Match") with gr.Tabs(): with gr.TabItem("📊 Comparison Chart"): plot_output = gr.Image(label="Similarity Visualization", type="pil", show_label=False, height=500) with gr.TabItem("📋 Detailed Scores & Probabilities"): table_output = gr.Dataframe( label="Detailed Results", headers=["Text Description", "HQ-CLIP Score", "OpenAI-CLIP Score", "HQ-CLIP Probability", "OpenAI-CLIP Probability"], datatype=["str", "str", "str", "str", "str"], wrap=True, ) gr.Markdown("---") gr.Markdown("### ✨ Try Some Examples") gr.Examples( examples=[ ["examples/cat.webp", "a cat sitting by the window", "a dog playing in the yard", "a fluffy feline resting peacefully", "a sleeping kitten"], ["examples/dog.jpg", "a golden retriever dog", "a siberian husky", "a dog playing on grass", "a cute cat"], ["examples/sunset.jpg", "a beautiful sunset over the mountains", "a city skyline at night", "a vibrant landscape during golden hour", "ocean waves crashing on the shore"] ], inputs=[image_input] + text_inputs, label="Click an example to run the analysis" ) submit_btn.click( fn=calculate_similarities, inputs=[image_input] + text_inputs, outputs=[best_match_output, table_output, plot_output] ) if __name__ == "__main__": demo.launch(share=True)