import gradio as gr
import open_clip
import torch
import numpy as np
from PIL import Image

# --- 1. Model Initialization (unchanged) ---
print("Loading HQ-CLIP model...")
model_hq, _, preprocess_hq = open_clip.create_model_and_transforms('hf-hub:zhixiangwei/hqclip-openai-large-ft-vlm1b')
tokenizer_hq = open_clip.get_tokenizer('hf-hub:zhixiangwei/hqclip-openai-large-ft-vlm1b')
print("HQ-CLIP model loaded.")

print("Loading standard OpenAI CLIP model...")
model_openai, _, preprocess_openai = open_clip.create_model_and_transforms('ViT-L-14-quickgelu', 'openai')
tokenizer_openai = open_clip.get_tokenizer('ViT-L-14-quickgelu')
print("OpenAI CLIP model loaded.")

device = "cuda" if torch.cuda.is_available() else "cpu"
device = 'cpu'  # keep the forced-CPU setting
model_hq.to(device)
model_openai.to(device)
print(f"Models moved to {device}.")


# --- 2. Core Logic: Refactored for Simplicity ---
def calculate_similarities(image, texts_str):
    """
    Refactored core function:
    - Takes a newline-separated string as the text input.
    - Returns a concise analysis string plus two dicts that directly
      drive the gr.Label components.
    """
    # Parse the newline-separated string into a list of texts.
    texts = [t.strip() for t in texts_str.split('\n') if t.strip()]

    # Fix: check whether image is None rather than calling image.any(),
    # to avoid an error on initial load.
    if image is None or not texts:
        return "Please upload an image and enter at least one text description.", None, None

    # --- Internal helper (unchanged) ---
    def get_scores(model, preprocess, tokenizer, img_input, text_inputs):
        img = Image.fromarray(img_input.astype('uint8'))
        img_tensor = preprocess(img).unsqueeze(0).to(device)
        tokenized_texts = torch.cat([tokenizer(text) for text in text_inputs]).to(device)
        with torch.no_grad():
            image_features = model.encode_image(img_tensor)
            text_features = model.encode_text(tokenized_texts)
            image_features /= image_features.norm(dim=-1, keepdim=True)
            text_features /= text_features.norm(dim=-1, keepdim=True)
            similarities = (100.0 * image_features @ text_features.T).squeeze(0)
            probs = torch.softmax(similarities, dim=-1)
        return similarities.cpu().numpy(), probs.cpu().numpy()

    # --- Compute scores and probabilities for both models ---
    similarities_hq, probs_hq = get_scores(model_hq, preprocess_hq, tokenizer_hq, image, texts)
    similarities_openai, probs_openai = get_scores(model_openai, preprocess_openai, tokenizer_openai, image, texts)

    # --- Prepare the gr.Label outputs ---
    hq_results = {text: float(prob) for text, prob in zip(texts, probs_hq)}
    openai_results = {text: float(prob) for text, prob in zip(texts, probs_openai)}

    # --- Prepare the best-match analysis text shown at the top ---
    best_idx_hq = np.argmax(similarities_hq)
    best_idx_openai = np.argmax(similarities_openai)
    best_match_output = f"""
### 🏆 Best Match Analysis
**HQ-CLIP's Choice:** **'{texts[best_idx_hq]}'** (Probability: {probs_hq[best_idx_hq]:.2%}) | **OpenAI CLIP's Choice:** **'{texts[best_idx_openai]}'** (Probability: {probs_openai[best_idx_openai]:.2%})
"""

    # Fix: also return best_match_output, matching the three-value early
    # return above and the docstring.
    return best_match_output, hq_results, openai_results


# --- 3. Gradio Interface: Rebuilt with Default Loading ---
# Step 1: extract the example data into a variable.
examples_list = [
    ["examples/mnls.jpeg", "An oil painting of a smiling, long-haired woman\nAn oil painting of a sad, long-haired woman\nA sketch of a smiling, long-haired woman\nA photo of a smiling, long-haired woman"],
    ["examples/su7s.jpg", "a blue car, with black wheels\na blue car, with blue wheels\na black car, with blue wheels\na black car, with black wheels"],
]


# Step 2: create a function that runs when the app loads.
def load_default_example():
    print("Loading default example...")
    # Grab the first example's data.
    image_path, texts_str = examples_list[0]
    # Load the image file as a numpy array.
    image = np.array(Image.open(image_path))
    # Run the main computation and prepend the raw inputs.
    return [image, texts_str] + list(calculate_similarities(image, texts_str))


with gr.Blocks(theme=gr.themes.Soft(primary_hue="blue"), title="HQ-CLIP vs OpenAI CLIP") as demo:
    gr.Markdown(
        """
Upload an image and provide text descriptions (one per line) to compare models.
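        """
    )

    # NOTE: a minimal sketch of the remaining interface wiring, inferred
    # from the function signatures above (calculate_similarities returns a
    # Markdown string and two gr.Label dicts; load_default_example prepends
    # the raw image and text inputs). Component labels, layout, and the use
    # of a button rather than live .change events are assumptions.
    with gr.Row():
        image_input = gr.Image(label="Input Image")
        text_input = gr.Textbox(label="Text Descriptions (one per line)", lines=4)

    run_button = gr.Button("Compare", variant="primary")
    best_match_md = gr.Markdown()
    with gr.Row():
        hq_label = gr.Label(label="HQ-CLIP Probabilities")
        openai_label = gr.Label(label="OpenAI CLIP Probabilities")

    run_button.click(
        calculate_similarities,
        inputs=[image_input, text_input],
        outputs=[best_match_md, hq_label, openai_label],
    )

    gr.Examples(examples=examples_list, inputs=[image_input, text_input])

    # Populate the interface with the first example when the app loads.
    demo.load(
        load_default_example,
        outputs=[image_input, text_input, best_match_md, hq_label, openai_label],
    )

demo.launch()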