aki-0421/clip-anime-patch400-10k-v1

This is a CLIP-style model for anime character retrieval: it embeds anime images and Japanese text queries into a shared space for similarity search.

Example

import math
from PIL import Image
from sentence_transformers import SentenceTransformer

def resize_image_for_patch(image: Image.Image, patch_size: int = 14, max_patches: int = 400) -> Image.Image:
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # Choose the largest width and height (in whole patches) whose patch grid
    # stays within max_patches while preserving the aspect ratio.

    if aspect_ratio >= 1:
        # Landscape or square orientation
        target_width = patch_size * int(math.floor(math.sqrt(max_patches * aspect_ratio)))
        target_height = int(target_width / aspect_ratio)
    else:
        # Portrait orientation
        target_height = patch_size * int(math.floor(math.sqrt(max_patches / aspect_ratio)))
        target_width = int(target_height * aspect_ratio)

    # Snap down to multiples of patch_size (but never below one patch)
    target_width = max(target_width - target_width % patch_size, patch_size)
    target_height = max(target_height - target_height % patch_size, patch_size)

    return image.resize((target_width, target_height), Image.BICUBIC)
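
# Quick sanity check (illustrative only, not required for inference): a
# 1920x1080 input should resize to a grid of at most 400 patches of size 14.
_probe = resize_image_for_patch(Image.new("RGB", (1920, 1080)))
assert (_probe.width // 14) * (_probe.height // 14) <= 400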

# Init model
model = SentenceTransformer("aki-0421/clip-anime-patch400-10k-v1", device="cuda")

images = [
     resize_image_for_patch(Image.open("/home/aki0421/Share/images/00085.png"))
]
image_embeddings = model.encode(images, convert_to_tensor=True)

sentences = [
    "女の子が悲しんでいる。",
    "落ち込んでる人",
    "泣いている",
    "笑っている",
    "ピンクの髪の女の子",
    "赤い髪の女の子",
    "茶色の髪の女の子",
    "赤い目",
    "青い目",
    "曇っている",
    "雨が降っている",
    "晴れている",
    "キッチンにいます。",
    "学校にいる",
    "魔法少女のようだ",
    "戦闘しますか?",
    "男性ですか?",
    "茶色い髪の女の子が悲しんでいるシーン",
    "ピンクの髪の女の子が笑っているシーン"
]
text_embeddings = model.encode(sentences, convert_to_tensor=True)
similarities = model.similarity(text_embeddings, image_embeddings)

print(similarities)
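
For retrieval, you typically rank the text queries against each image by similarity score. A minimal sketch of that step, assuming the similarities tensor and sentences list from the example above and the library's default cosine similarity:

import torch

# similarities has shape (num_texts, num_images); take the column for the
# single image and rank the text queries by score.
scores = similarities[:, 0]
top = torch.topk(scores, k=5)
for score, idx in zip(top.values.tolist(), top.indices.tolist()):
    print(f"{score:.3f}  {sentences[idx]}")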

Citation

@misc{qwen2.5-VL,
    title = {Qwen2.5-VL},
    url = {https://qwenlm.github.io/blog/qwen2.5-vl/},
    author = {Qwen Team},
    month = {January},
    year = {2025}
}

@misc{Ruri,
    title = {{Ruri: Japanese General Text Embeddings}},
    author = {Hayato Tsukagoshi and Ryohei Sasano},
    year = {2024},
    eprint = {2409.07737},
    archivePrefix = {arXiv},
    primaryClass = {cs.CL},
    url = {https://arxiv.org/abs/2409.07737}
}

@misc{oshizo2024clipqwen,
    author = {Oshizo},
    title = {japanese-clip-qwen2\_vl},
    year = {2024},
    howpublished = {\url{https://github.com/oshizo/japanese-clip-qwen2_vl}},
    note = {Accessed: 2025-06-08}
}