AoiNoGeso committed on
Commit 1457867 · verified · 1 Parent(s): 88ff7b2

Upload Japanese CLIP model with custom modeling file

README.md ADDED
@@ -0,0 +1,123 @@
+ ---
+ language: ja
+ license: apache-2.0
+ tags:
+ - clip
+ - japanese
+ - multimodal
+ - image-text
+ - computer-vision
+ - natural-language-processing
+ datasets:
+ - stair-captions
+ library_name: transformers
+ ---
+
+ # japanese-clip-stair
+
+ A CLIP model specialized for Japanese, trained on the STAIR Captions dataset.
+
+ ## Model Overview
+
+ This is a multimodal model that computes the similarity between images and text.
+ - Image encoder: ResNet50
+ - Text encoder: cl-tohoku/bert-base-japanese-v3
+ - Training data: STAIR Captions
+ - Embedding dimension: 512
+
+ ## Usage
+
+ ### Basic example
+
+ ```python
+ from transformers import AutoTokenizer, AutoModel
+ from PIL import Image
+ import torch
+ from torchvision import transforms
+ import requests
+ from io import BytesIO
+
+ # Load the model and tokenizer
+ model = AutoModel.from_pretrained("AoiNoGeso/japanese-clip-stair", trust_remote_code=True)
+ tokenizer = AutoTokenizer.from_pretrained("AoiNoGeso/japanese-clip-stair")
+
+ # Image preprocessing function
+ def preprocess_image(image, size=224):
+     transform = transforms.Compose([
+         transforms.Resize((size, size)),
+         transforms.ToTensor(),
+         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+     ])
+     if image.mode != 'RGB':
+         image = image.convert('RGB')
+     return transform(image).unsqueeze(0)
+
+ # Prepare the image and candidate texts
+ image_url = "https://images.pexels.com/photos/2253275/pexels-photo-2253275.jpeg"
+ image = Image.open(BytesIO(requests.get(image_url).content))
+ pixel_values = preprocess_image(image)
+
+ texts = ["犬", "猫", "象", "鳥"]
+ text_inputs = tokenizer(texts, padding=True, return_tensors="pt")
+
+ # Run inference
+ with torch.no_grad():
+     outputs = model(
+         pixel_values=pixel_values,
+         input_ids=text_inputs.input_ids,
+         attention_mask=text_inputs.attention_mask
+     )
+
+ # Compute probabilities
+ probs = outputs['logits_per_image'].softmax(dim=-1)
+
+ # Display results
+ for text, prob in zip(texts, probs[0]):
+     print(f"{text}: {prob:.4f} ({prob*100:.2f}%)")
+ ```
+
+ ### Extracting features separately
+
+ ```python
+ with torch.no_grad():
+     # Image features only
+     image_features = model.get_image_features(pixel_values)
+
+     # Text features only
+     text_features = model.get_text_features(
+         text_inputs.input_ids,
+         text_inputs.attention_mask
+     )
+
+     # Compute similarity manually
+     similarity = torch.matmul(image_features, text_features.T)
+     probs = similarity.softmax(dim=-1)
+ ```
+
+ ## Performance
+
+ The model is trained on the STAIR Captions dataset and optimized for Japanese image-caption tasks.
+
+ ## Limitations
+
+ - Images are resized to 224x224
+ - Optimized for Japanese text
+ - Requires PyTorch and torchvision
+
+ ## License
+
+ Apache 2.0
+
+ ## Citation
+
+ ```bibtex
+ @dataset{stair_captions,
+   title={STAIR Captions: Constructing a Large-Scale Japanese Image Caption Dataset},
+   author={Yoshikawa, Yuya and Shigeto, Yutaro and Takeuchi, Akikazu},
+   year={2017}
+ }
+ ```
+
+ ## Examples
+
+ See `usage_example.py` for a more detailed usage example.
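For reference, the probabilities in the README's basic example come from temperature-scaled cosine similarities between the two encoders' 512-dimensional embeddings. A minimal sketch of that computation, reusing `model`, `pixel_values`, and `text_inputs` from the README code above; the scale factor is the checkpoint's learned temperature parameter, which is initialized from `temperature` in config.json:

```python
import torch

with torch.no_grad():
    image_features = model.get_image_features(pixel_values)   # (1, 512), L2-normalized
    text_features = model.get_text_features(
        text_inputs.input_ids, text_inputs.attention_mask
    )                                                          # (N, 512), L2-normalized
    logit_scale = model.temperature.exp()                      # parameter stores log(1/T), so exp() recovers 1/T
    logits_per_image = image_features @ text_features.T * logit_scale
    probs = logits_per_image.softmax(dim=-1)                   # matches outputs['logits_per_image'].softmax(dim=-1)
```

This mirrors what `JapaneseCLIPModel.forward` in modeling_japanese_clip.py computes when both modalities are passed in; the manual `similarity.softmax(...)` in the README omits the temperature scaling and so yields flatter probabilities.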
__pycache__/modeling_japanese_clip.cpython-310.pyc ADDED
Binary file (3.63 kB).
 
config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "architectures": [
+     "JapaneseCLIPModel"
+   ],
+   "image_embed_dim": 512,
+   "model_type": "japanese-clip",
+   "temperature": 0.07,
+   "text_embed_dim": 512,
+   "text_model_name": "cl-tohoku/bert-base-japanese-v3",
+   "torch_dtype": "float32",
+   "transformers_version": "4.52.4"
+ }
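These fields are consumed by `JapaneseCLIPConfig` and `JapaneseCLIPModel` in modeling_japanese_clip.py further down in this commit. A minimal sketch of what each one controls, assuming config.json has been downloaded locally:

```python
import json

# Inspect the custom configuration fields used by JapaneseCLIPModel
with open("config.json") as f:
    cfg = json.load(f)

print(cfg["text_model_name"])   # BERT backbone loaded as the text encoder
print(cfg["image_embed_dim"])   # output size of the ResNet50 head that replaces the classifier
print(cfg["text_embed_dim"])    # shared projection dimension for both modalities
print(cfg["temperature"])       # initial softmax temperature; the model stores log(1/T) as a learnable parameter
```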
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0fe9c3eeeda4442656dcc3e87ac855d2607f7dfe47de268fece777c8ca6eac13
+ size 545962740
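The checkpoint itself is tracked with Git LFS; this file is only the pointer. A hedged sketch for inspecting the weights once pulled, assuming the `safetensors` package is installed; the expected key prefixes follow the module attributes defined in modeling_japanese_clip.py:

```python
from safetensors import safe_open

# List the tensors in the checkpoint without materializing the full model
with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    keys = list(f.keys())

print(len(keys), "tensors")
# Expect prefixes such as image_encoder, text_encoder, image_projection, text_projection
print(sorted({k.split(".")[0] for k in keys}))
```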
model_card.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "model_name": "japanese-clip-stair",
+   "model_type": "japanese-clip",
+   "language": "ja",
+   "license": "apache-2.0",
+   "datasets": [
+     "stair-captions"
+   ],
+   "tags": [
+     "clip",
+     "japanese",
+     "multimodal",
+     "image-text"
+   ],
+   "architecture": {
+     "image_encoder": "ResNet50",
+     "text_encoder": "cl-tohoku/bert-base-japanese-v3",
+     "embedding_dim": 512
+   }
+ }
modeling_japanese_clip.py ADDED
@@ -0,0 +1,121 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import numpy as np
+ from transformers import (
+     AutoTokenizer,
+     AutoModel,
+     PreTrainedModel,
+     PretrainedConfig
+ )
+ from torchvision.models import resnet50
+ from typing import Optional, Dict, Any
+
+ class JapaneseCLIPConfig(PretrainedConfig):
+     """Configuration class for the Japanese CLIP model."""
+     model_type = "japanese-clip"
+
+     def __init__(
+         self,
+         text_model_name="cl-tohoku/bert-base-japanese-v3",
+         image_embed_dim=512,
+         text_embed_dim=512,
+         temperature=0.07,
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+         self.text_model_name = text_model_name
+         self.image_embed_dim = image_embed_dim
+         self.text_embed_dim = text_embed_dim
+         self.temperature = temperature
+
+ class JapaneseCLIPModel(PreTrainedModel):
+     """Hugging Face-compatible Japanese CLIP model."""
+     config_class = JapaneseCLIPConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+
+         # Image encoder (ResNet50 backbone with a new projection head)
+         self.image_encoder = resnet50(pretrained=True)
+         self.image_encoder.fc = nn.Linear(
+             self.image_encoder.fc.in_features,
+             config.image_embed_dim
+         )
+
+         # Text encoder (Japanese BERT)
+         self.text_encoder = AutoModel.from_pretrained(config.text_model_name)
+
+         # Projection layers into the shared embedding space
+         self.text_projection = nn.Linear(
+             self.text_encoder.config.hidden_size,
+             config.text_embed_dim
+         )
+         self.image_projection = nn.Linear(
+             config.image_embed_dim,
+             config.text_embed_dim
+         )
+
+         # Normalization layers
+         self.image_norm = nn.LayerNorm(config.text_embed_dim)
+         self.text_norm = nn.LayerNorm(config.text_embed_dim)
+
+         # Learnable temperature, stored as log(1 / temperature)
+         self.temperature = nn.Parameter(
+             torch.ones([]) * np.log(1 / config.temperature)
+         )
+
+     def encode_image(self, pixel_values):
+         """Encode images into L2-normalized embeddings."""
+         image_features = self.image_encoder(pixel_values)
+         image_features = self.image_projection(image_features)
+         image_features = self.image_norm(image_features)
+         return F.normalize(image_features, dim=-1)
+
+     def encode_text(self, input_ids, attention_mask):
+         """Encode text into L2-normalized embeddings using the [CLS] token."""
+         text_outputs = self.text_encoder(
+             input_ids=input_ids,
+             attention_mask=attention_mask
+         )
+         text_features = text_outputs.last_hidden_state[:, 0, :]
+         text_features = self.text_projection(text_features)
+         text_features = self.text_norm(text_features)
+         return F.normalize(text_features, dim=-1)
+
+     def get_image_features(self, pixel_values):
+         """Return image features."""
+         return self.encode_image(pixel_values)
+
+     def get_text_features(self, input_ids, attention_mask):
+         """Return text features."""
+         return self.encode_text(input_ids, attention_mask)
+
+     def forward(
+         self,
+         pixel_values: Optional[torch.Tensor] = None,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         **kwargs
+     ) -> Dict[str, torch.Tensor]:
+         """Forward pass."""
+         outputs = {}
+
+         if pixel_values is not None:
+             outputs['image_features'] = self.encode_image(pixel_values)
+
+         if input_ids is not None and attention_mask is not None:
+             outputs['text_features'] = self.encode_text(input_ids, attention_mask)
+
+         if 'image_features' in outputs and 'text_features' in outputs:
+             # Temperature-scaled cosine similarity between all image-text pairs
+             similarity = torch.matmul(
+                 outputs['image_features'],
+                 outputs['text_features'].T
+             )
+             temperature = self.temperature.exp()
+             outputs['logits_per_image'] = similarity * temperature
+             outputs['logits_per_text'] = outputs['logits_per_image'].T
+             outputs['temperature'] = temperature
+
+         return outputs
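As a sanity check, the classes above can be instantiated directly. This is only a sketch with randomly initialized weights: it assumes modeling_japanese_clip.py is importable and that the ResNet50 and cl-tohoku/bert-base-japanese-v3 backbones can be downloaded; the released checkpoint is instead loaded with `AutoModel.from_pretrained(..., trust_remote_code=True)` as shown in the README.

```python
import torch
from modeling_japanese_clip import JapaneseCLIPConfig, JapaneseCLIPModel

# Defaults mirror config.json: 512-d shared space, initial temperature 0.07
config = JapaneseCLIPConfig()
model = JapaneseCLIPModel(config).eval()

with torch.no_grad():
    out = model(pixel_values=torch.randn(1, 3, 224, 224))

print(out["image_features"].shape)  # torch.Size([1, 512]), L2-normalized image embedding
```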
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "do_subword_tokenize": true,
+   "do_word_tokenize": true,
+   "extra_special_tokens": {},
+   "jumanpp_kwargs": null,
+   "mask_token": "[MASK]",
+   "mecab_kwargs": {
+     "mecab_dic": "unidic_lite"
+   },
+   "model_max_length": 512,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "subword_tokenizer_type": "wordpiece",
+   "sudachi_kwargs": null,
+   "tokenizer_class": "BertJapaneseTokenizer",
+   "unk_token": "[UNK]",
+   "word_tokenizer_type": "mecab"
+ }
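This configuration selects `BertJapaneseTokenizer` with MeCab (unidic-lite) word segmentation followed by WordPiece. A minimal sketch of what it produces, assuming `fugashi` and `unidic-lite` are installed as the MeCab settings above require:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("AoiNoGeso/japanese-clip-stair")

# Batch encoding pads with [PAD] (id 0) and wraps each text in [CLS] ... [SEP]
enc = tokenizer(["犬が公園を走る", "猫"], padding=True, return_tensors="pt")
print(tokenizer.convert_ids_to_tokens(enc["input_ids"][0]))
print(enc["attention_mask"])  # mask passed to the text encoder alongside input_ids
```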
usage_example.py ADDED
@@ -0,0 +1,71 @@
+ #!/usr/bin/env python3
+ """
+ Japanese CLIP usage example
+ """
+
+ import io
+ import requests
+ from PIL import Image
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ from torchvision import transforms
+
+ def preprocess_image(image, size=224):
+     """Preprocess an image for the ResNet50 encoder."""
+     transform = transforms.Compose([
+         transforms.Resize((size, size)),
+         transforms.ToTensor(),
+         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+     ])
+     if image.mode != 'RGB':
+         image = image.convert('RGB')
+     return transform(image).unsqueeze(0)
+
+ def main():
+     # Select device
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     print(f"Using device: {device}")
+
+     # Load the model and tokenizer
+     print("Loading model and tokenizer...")
+     model = AutoModel.from_pretrained("AoiNoGeso/japanese-clip-stair", trust_remote_code=True).to(device)
+     tokenizer = AutoTokenizer.from_pretrained("AoiNoGeso/japanese-clip-stair")
+
+     # Fetch the image
+     print("Loading image...")
+     image_url = "https://images.pexels.com/photos/2253275/pexels-photo-2253275.jpeg"
+     response = requests.get(image_url)
+     image = Image.open(io.BytesIO(response.content))
+     pixel_values = preprocess_image(image).to(device)
+
+     # Candidate texts
+     texts = ["犬", "猫", "象", "鳥", "魚", "花", "車", "建物"]
+     text_inputs = tokenizer(texts, padding=True, return_tensors="pt")
+     text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
+
+     # Run inference
+     print("Running inference...")
+     with torch.no_grad():
+         outputs = model(
+             pixel_values=pixel_values,
+             input_ids=text_inputs['input_ids'],
+             attention_mask=text_inputs['attention_mask']
+         )
+
+     probs = outputs['logits_per_image'].softmax(dim=-1)
+
+     # Display results, sorted by probability
+     print("\n" + "="*50)
+     print("RESULTS")
+     print("="*50)
+
+     probs_cpu = probs.cpu().numpy()[0]
+     sorted_indices = probs_cpu.argsort()[::-1]
+
+     for i, idx in enumerate(sorted_indices):
+         text = texts[idx]
+         prob = probs_cpu[idx]
+         print(f"{i+1:2d}. {text:6s}: {prob:.4f} ({prob*100:.2f}%)")
+
+ if __name__ == "__main__":
+     main()
vocab.txt ADDED
The diff for this file is too large to render.