AoiNoGeso committed on
Commit 1457867 · verified · 1 Parent(s): 88ff7b2

Upload Japanese CLIP model with custom modeling file

README.md ADDED
@@ -0,0 +1,123 @@
+ ---
+ language: ja
+ license: apache-2.0
+ tags:
+ - clip
+ - japanese
+ - multimodal
+ - image-text
+ - computer-vision
+ - natural-language-processing
+ datasets:
+ - stair-captions
+ library_name: transformers
+ ---
+
+ # japanese-clip-stair
+
+ A CLIP model specialized for Japanese, trained on the STAIR Captions dataset.
+
+ ## Model Overview
+
+ This is a multimodal model that computes the similarity between images and text.
+ - Image encoder: ResNet50
+ - Text encoder: cl-tohoku/bert-base-japanese-v3
+ - Training data: STAIR Captions
+ - Embedding dimension: 512
+
+ ## Usage
+
+ ### Basic example
+
+ ```python
+ from transformers import AutoTokenizer, AutoModel
+ from PIL import Image
+ import torch
+ from torchvision import transforms
+ import requests
+ from io import BytesIO
+
+ # Load the model and tokenizer
+ model = AutoModel.from_pretrained("AoiNoGeso/japanese-clip-stair", trust_remote_code=True)
+ tokenizer = AutoTokenizer.from_pretrained("AoiNoGeso/japanese-clip-stair")
+
+ # Image preprocessing function
+ def preprocess_image(image, size=224):
+     transform = transforms.Compose([
+         transforms.Resize((size, size)),
+         transforms.ToTensor(),
+         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+     ])
+     if image.mode != 'RGB':
+         image = image.convert('RGB')
+     return transform(image).unsqueeze(0)
+
+ # Prepare the image and candidate texts
+ image_url = "https://images.pexels.com/photos/2253275/pexels-photo-2253275.jpeg"
+ image = Image.open(BytesIO(requests.get(image_url).content))
+ pixel_values = preprocess_image(image)
+
+ texts = ["犬", "猫", "象", "鳥"]
+ text_inputs = tokenizer(texts, padding=True, return_tensors="pt")
+
+ # Run inference
+ with torch.no_grad():
+     outputs = model(
+         pixel_values=pixel_values,
+         input_ids=text_inputs.input_ids,
+         attention_mask=text_inputs.attention_mask
+     )
+
+ # Compute probabilities
+ probs = outputs['logits_per_image'].softmax(dim=-1)
+
+ # Display results
+ for text, prob in zip(texts, probs[0]):
+     print(f"{text}: {prob:.4f} ({prob*100:.2f}%)")
+ ```
+
+ ### Extracting features separately
+
+ ```python
+ with torch.no_grad():
+     # Image features only
+     image_features = model.get_image_features(pixel_values)
+
+     # Text features only
+     text_features = model.get_text_features(
+         text_inputs.input_ids,
+         text_inputs.attention_mask
+     )
+
+     # Compute similarity manually
+     similarity = torch.matmul(image_features, text_features.T)
+     probs = similarity.softmax(dim=-1)
+ ```
+
+ ## Performance
+
+ The model is trained on the STAIR Captions dataset and optimized for Japanese image-caption tasks.
+
+ ## Limitations
+
+ - Images are resized to 224x224
+ - Optimized for Japanese text
+ - Requires PyTorch and torchvision
+
+ ## License
+
+ Apache 2.0
+
+ ## Citation
+
+ ```bibtex
+ @dataset{stair_captions,
+   title={STAIR Captions: Constructing a Large-Scale Japanese Image Caption Dataset},
+   author={Yoshikawa, Yuya and Shigeto, Yutaro and Takeuchi, Akikazu},
+   year={2017}
+ }
+ ```
+
+ ## Examples
+
+ See `usage_example.py` for a more detailed usage example.
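For reference, the probabilities in the README's basic example come from temperature-scaled cosine similarities between the two encoders' 512-dimensional embeddings. A minimal sketch of that computation, reusing `model`, `pixel_values`, and `text_inputs` from the README code above; the scale factor is the checkpoint's learned temperature parameter, which is initialized from `temperature` in config.json:

```python
import torch

with torch.no_grad():
    image_features = model.get_image_features(pixel_values)   # (1, 512), L2-normalized
    text_features = model.get_text_features(
        text_inputs.input_ids, text_inputs.attention_mask
    )                                                          # (N, 512), L2-normalized
    logit_scale = model.temperature.exp()                      # parameter stores log(1/T), so exp() recovers 1/T
    logits_per_image = image_features @ text_features.T * logit_scale
    probs = logits_per_image.softmax(dim=-1)                   # matches outputs['logits_per_image'].softmax(dim=-1)
```

This mirrors what `JapaneseCLIPModel.forward` in modeling_japanese_clip.py computes when both modalities are passed in; the manual `similarity.softmax(...)` in the README omits the temperature scaling and so yields flatter probabilities.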
__pycache__/modeling_japanese_clip.cpython-310.pyc ADDED
Binary file (3.63 kB).
 
config.json ADDED
@@ -0,0 +1,12 @@
+ {
+   "architectures": [
+     "JapaneseCLIPModel"
+   ],
+   "image_embed_dim": 512,
+   "model_type": "japanese-clip",
+   "temperature": 0.07,
+   "text_embed_dim": 512,
+   "text_model_name": "cl-tohoku/bert-base-japanese-v3",
+   "torch_dtype": "float32",
+   "transformers_version": "4.52.4"
+ }
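These fields are consumed by `JapaneseCLIPConfig` and `JapaneseCLIPModel` in modeling_japanese_clip.py further down in this commit. A minimal sketch of what each one controls, assuming config.json has been downloaded locally:

```python
import json

# Inspect the custom configuration fields used by JapaneseCLIPModel
with open("config.json") as f:
    cfg = json.load(f)

print(cfg["text_model_name"])   # BERT backbone loaded as the text encoder
print(cfg["image_embed_dim"])   # output size of the ResNet50 head that replaces the classifier
print(cfg["text_embed_dim"])    # shared projection dimension for both modalities
print(cfg["temperature"])       # initial softmax temperature; the model stores log(1/T) as a learnable parameter
```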
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:0fe9c3eeeda4442656dcc3e87ac855d2607f7dfe47de268fece777c8ca6eac13
+ size 545962740
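The checkpoint itself is tracked with Git LFS; this file is only the pointer. A hedged sketch for inspecting the weights once pulled, assuming the `safetensors` package is installed; the expected key prefixes follow the module attributes defined in modeling_japanese_clip.py:

```python
from safetensors import safe_open

# List the tensors in the checkpoint without materializing the full model
with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    keys = list(f.keys())

print(len(keys), "tensors")
# Expect prefixes such as image_encoder, text_encoder, image_projection, text_projection
print(sorted({k.split(".")[0] for k in keys}))
```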
model_card.json ADDED
@@ -0,0 +1,20 @@
+ {
+   "model_name": "japanese-clip-stair",
+   "model_type": "japanese-clip",
+   "language": "ja",
+   "license": "apache-2.0",
+   "datasets": [
+     "stair-captions"
+   ],
+   "tags": [
+     "clip",
+     "japanese",
+     "multimodal",
+     "image-text"
+   ],
+   "architecture": {
+     "image_encoder": "ResNet50",
+     "text_encoder": "cl-tohoku/bert-base-japanese-v3",
+     "embedding_dim": 512
+   }
+ }
modeling_japanese_clip.py ADDED
@@ -0,0 +1,121 @@
+ import torch
+ import torch.nn as nn
+ import torch.nn.functional as F
+ import numpy as np
+ from transformers import (
+     AutoTokenizer,
+     AutoModel,
+     PreTrainedModel,
+     PretrainedConfig
+ )
+ from torchvision.models import resnet50
+ from typing import Optional, Dict, Any
+
+ class JapaneseCLIPConfig(PretrainedConfig):
+     """Configuration class for the Japanese CLIP model."""
+     model_type = "japanese-clip"
+
+     def __init__(
+         self,
+         text_model_name="cl-tohoku/bert-base-japanese-v3",
+         image_embed_dim=512,
+         text_embed_dim=512,
+         temperature=0.07,
+         **kwargs
+     ):
+         super().__init__(**kwargs)
+         self.text_model_name = text_model_name
+         self.image_embed_dim = image_embed_dim
+         self.text_embed_dim = text_embed_dim
+         self.temperature = temperature
+
+ class JapaneseCLIPModel(PreTrainedModel):
+     """Hugging Face-compatible Japanese CLIP model."""
+     config_class = JapaneseCLIPConfig
+
+     def __init__(self, config):
+         super().__init__(config)
+
+         # Image encoder (ResNet50 backbone with a new projection head)
+         self.image_encoder = resnet50(pretrained=True)
+         self.image_encoder.fc = nn.Linear(
+             self.image_encoder.fc.in_features,
+             config.image_embed_dim
+         )
+
+         # Text encoder (Japanese BERT)
+         self.text_encoder = AutoModel.from_pretrained(config.text_model_name)
+
+         # Projection layers into the shared embedding space
+         self.text_projection = nn.Linear(
+             self.text_encoder.config.hidden_size,
+             config.text_embed_dim
+         )
+         self.image_projection = nn.Linear(
+             config.image_embed_dim,
+             config.text_embed_dim
+         )
+
+         # Normalization layers
+         self.image_norm = nn.LayerNorm(config.text_embed_dim)
+         self.text_norm = nn.LayerNorm(config.text_embed_dim)
+
+         # Learnable temperature, stored as log(1 / temperature)
+         self.temperature = nn.Parameter(
+             torch.ones([]) * np.log(1 / config.temperature)
+         )
+
+     def encode_image(self, pixel_values):
+         """Encode images into L2-normalized embeddings."""
+         image_features = self.image_encoder(pixel_values)
+         image_features = self.image_projection(image_features)
+         image_features = self.image_norm(image_features)
+         return F.normalize(image_features, dim=-1)
+
+     def encode_text(self, input_ids, attention_mask):
+         """Encode text into L2-normalized embeddings using the [CLS] token."""
+         text_outputs = self.text_encoder(
+             input_ids=input_ids,
+             attention_mask=attention_mask
+         )
+         text_features = text_outputs.last_hidden_state[:, 0, :]
+         text_features = self.text_projection(text_features)
+         text_features = self.text_norm(text_features)
+         return F.normalize(text_features, dim=-1)
+
+     def get_image_features(self, pixel_values):
+         """Return image features."""
+         return self.encode_image(pixel_values)
+
+     def get_text_features(self, input_ids, attention_mask):
+         """Return text features."""
+         return self.encode_text(input_ids, attention_mask)
+
+     def forward(
+         self,
+         pixel_values: Optional[torch.Tensor] = None,
+         input_ids: Optional[torch.Tensor] = None,
+         attention_mask: Optional[torch.Tensor] = None,
+         **kwargs
+     ) -> Dict[str, torch.Tensor]:
+         """Forward pass."""
+         outputs = {}
+
+         if pixel_values is not None:
+             outputs['image_features'] = self.encode_image(pixel_values)
+
+         if input_ids is not None and attention_mask is not None:
+             outputs['text_features'] = self.encode_text(input_ids, attention_mask)
+
+         if 'image_features' in outputs and 'text_features' in outputs:
+             # Temperature-scaled cosine similarity between all image-text pairs
+             similarity = torch.matmul(
+                 outputs['image_features'],
+                 outputs['text_features'].T
+             )
+             temperature = self.temperature.exp()
+             outputs['logits_per_image'] = similarity * temperature
+             outputs['logits_per_text'] = outputs['logits_per_image'].T
+             outputs['temperature'] = temperature
+
+         return outputs
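As a sanity check, the classes above can be instantiated directly. This is only a sketch with randomly initialized weights: it assumes modeling_japanese_clip.py is importable and that the ResNet50 and cl-tohoku/bert-base-japanese-v3 backbones can be downloaded; the released checkpoint is instead loaded with `AutoModel.from_pretrained(..., trust_remote_code=True)` as shown in the README.

```python
import torch
from modeling_japanese_clip import JapaneseCLIPConfig, JapaneseCLIPModel

# Defaults mirror config.json: 512-d shared space, initial temperature 0.07
config = JapaneseCLIPConfig()
model = JapaneseCLIPModel(config).eval()

with torch.no_grad():
    out = model(pixel_values=torch.randn(1, 3, 224, 224))

print(out["image_features"].shape)  # torch.Size([1, 512]), L2-normalized image embedding
```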
special_tokens_map.json ADDED
@@ -0,0 +1,7 @@
+ {
+   "cls_token": "[CLS]",
+   "mask_token": "[MASK]",
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "unk_token": "[UNK]"
+ }
tokenizer_config.json ADDED
@@ -0,0 +1,64 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "1": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "2": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "3": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "4": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": false,
+   "cls_token": "[CLS]",
+   "do_lower_case": false,
+   "do_subword_tokenize": true,
+   "do_word_tokenize": true,
+   "extra_special_tokens": {},
+   "jumanpp_kwargs": null,
+   "mask_token": "[MASK]",
+   "mecab_kwargs": {
+     "mecab_dic": "unidic_lite"
+   },
+   "model_max_length": 512,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "subword_tokenizer_type": "wordpiece",
+   "sudachi_kwargs": null,
+   "tokenizer_class": "BertJapaneseTokenizer",
+   "unk_token": "[UNK]",
+   "word_tokenizer_type": "mecab"
+ }
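This configuration selects `BertJapaneseTokenizer` with MeCab (unidic-lite) word segmentation followed by WordPiece. A minimal sketch of what it produces, assuming `fugashi` and `unidic-lite` are installed as the MeCab settings above require:

```python
from transformers import AutoTokenizer

tokenizer = AutoTokenizer.from_pretrained("AoiNoGeso/japanese-clip-stair")

# Batch encoding pads with [PAD] (id 0) and wraps each text in [CLS] ... [SEP]
enc = tokenizer(["犬が公園を走る", "猫"], padding=True, return_tensors="pt")
print(tokenizer.convert_ids_to_tokens(enc["input_ids"][0]))
print(enc["attention_mask"])  # mask passed to the text encoder alongside input_ids
```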
usage_example.py ADDED
@@ -0,0 +1,71 @@
+ #!/usr/bin/env python3
+ """
+ Japanese CLIP usage example
+ """
+
+ import io
+ import requests
+ from PIL import Image
+ import torch
+ from transformers import AutoTokenizer, AutoModel
+ from torchvision import transforms
+
+ def preprocess_image(image, size=224):
+     """Preprocess an image for the ResNet50 encoder."""
+     transform = transforms.Compose([
+         transforms.Resize((size, size)),
+         transforms.ToTensor(),
+         transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
+     ])
+     if image.mode != 'RGB':
+         image = image.convert('RGB')
+     return transform(image).unsqueeze(0)
+
+ def main():
+     # Select device
+     device = "cuda" if torch.cuda.is_available() else "cpu"
+     print(f"Using device: {device}")
+
+     # Load the model and tokenizer
+     print("Loading model and tokenizer...")
+     model = AutoModel.from_pretrained("AoiNoGeso/japanese-clip-stair", trust_remote_code=True).to(device)
+     tokenizer = AutoTokenizer.from_pretrained("AoiNoGeso/japanese-clip-stair")
+
+     # Fetch the image
+     print("Loading image...")
+     image_url = "https://images.pexels.com/photos/2253275/pexels-photo-2253275.jpeg"
+     response = requests.get(image_url)
+     image = Image.open(io.BytesIO(response.content))
+     pixel_values = preprocess_image(image).to(device)
+
+     # Candidate texts
+     texts = ["犬", "猫", "象", "鳥", "魚", "花", "車", "建物"]
+     text_inputs = tokenizer(texts, padding=True, return_tensors="pt")
+     text_inputs = {k: v.to(device) for k, v in text_inputs.items()}
+
+     # Run inference
+     print("Running inference...")
+     with torch.no_grad():
+         outputs = model(
+             pixel_values=pixel_values,
+             input_ids=text_inputs['input_ids'],
+             attention_mask=text_inputs['attention_mask']
+         )
+
+     probs = outputs['logits_per_image'].softmax(dim=-1)
+
+     # Display results, sorted by probability
+     print("\n" + "="*50)
+     print("RESULTS")
+     print("="*50)
+
+     probs_cpu = probs.cpu().numpy()[0]
+     sorted_indices = probs_cpu.argsort()[::-1]
+
+     for i, idx in enumerate(sorted_indices):
+         text = texts[idx]
+         prob = probs_cpu[idx]
+         print(f"{i+1:2d}. {text:6s}: {prob:.4f} ({prob*100:.2f}%)")
+
+ if __name__ == "__main__":
+     main()
vocab.txt ADDED
The diff for this file is too large to render.