LDanielBlueway commited on
Commit
a30baa4
·
verified ·
1 Parent(s): c4dbae8

Upload 11 files

config.json ADDED
@@ -0,0 +1,71 @@
+ {
+   "_name_or_path": "jinaai/jina-clip-v1",
+   "add_projections": false,
+   "architectures": [
+     "JinaCLIPModel"
+   ],
+   "auto_map": {
+     "AutoConfig": "jinaai/jina-clip-implementation--configuration_clip.JinaCLIPConfig",
+     "AutoModel": "jinaai/jina-clip-implementation--modeling_clip.JinaCLIPModel"
+   },
+   "initializer_factor": 1.0,
+   "logit_scale_init_value": 2.6592,
+   "model_type": "jina_clip",
+   "projection_dim": 768,
+   "text_config": {
+     "_name_or_path": "",
+     "embed_dim": 768,
+     "hf_model_config_kwargs": {
+       "use_flash_attn": false
+     },
+     "hf_model_name_or_path": "jinaai/jina-bert-flash-implementation",
+     "model_type": "jina_clip_text",
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "pooler_type": "mean_pooler",
+     "proj_bias": false,
+     "proj_type": null,
+     "transformers_version": "4.36.2",
+     "use_bfloat16": false
+   },
+   "torch_dtype": "float32",
+   "transformers_version": null,
+   "vision_config": {
+     "_name_or_path": "",
+     "embed_dim": 768,
+     "fused_layer_norm": false,
+     "head_width": 64,
+     "image_size": 224,
+     "intp_freq": false,
+     "layers": 12,
+     "ls_init_value": null,
+     "mlp_ratio": 2.6667,
+     "model_type": "jina_clip_vision",
+     "naive_swiglu": true,
+     "output_attentions": false,
+     "output_hidden_states": false,
+     "output_scores": false,
+     "pad_token_id": null,
+     "patch_dropout": 0.1,
+     "patch_size": 16,
+     "post_norm": false,
+     "prefix": null,
+     "problem_type": null,
+     "proj_type": null,
+     "pruned_heads": {},
+     "pt_hw_seq_len": 14,
+     "qkv_bias": true,
+     "remove_invalid_values": false,
+     "return_dict": true,
+     "return_dict_in_generate": false,
+     "rope_embeddings": true,
+     "subln": true,
+     "tie_word_embeddings": true,
+     "transformers_version": "4.36.2",
+     "use_bfloat16": false,
+     "width": 768,
+     "x_attention": false
+   }
+ }
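The "auto_map" entries above route AutoConfig and AutoModel through the remote jinaai/jina-clip-implementation code, so loading needs trust_remote_code=True. A minimal loading sketch, assuming the published repo id "jinaai/jina-clip-v1" taken from "_name_or_path":

    from transformers import AutoModel, AutoTokenizer

    # trust_remote_code=True lets transformers fetch the JinaCLIP classes named in auto_map
    model = AutoModel.from_pretrained("jinaai/jina-clip-v1", trust_remote_code=True)
    tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-clip-v1")

    inputs = tokenizer(["a photo of a cat"], return_tensors="pt")
    # get_text_features is the same entry point custom_st.py calls further down
    text_emb = model.get_text_features(input_ids=inputs["input_ids"])
    print(text_emb.shape)  # expected (1, 768), matching projection_dim / embed_dim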
config_sentence_transformers.json ADDED
@@ -0,0 +1,10 @@
+ {
+   "__version__": {
+     "sentence_transformers": "3.1.0",
+     "transformers": "4.41.2",
+     "pytorch": "2.3.1+cu121"
+   },
+   "prompts": {},
+   "default_prompt_name": null,
+   "similarity_fn_name": "cosine"
+ }
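This sentence-transformers config records the library versions used at export time and sets cosine as the similarity function, so SentenceTransformer.similarity() returns cosine scores. A usage sketch, again assuming the repo id jinaai/jina-clip-v1:

    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("jinaai/jina-clip-v1", trust_remote_code=True)
    embeddings = model.encode(["a photo of a cat", "a photo of a dog"])
    # similarity_fn_name = "cosine" -> a 2x2 cosine-similarity matrix
    print(model.similarity(embeddings, embeddings))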
custom_st.py ADDED
@@ -0,0 +1,203 @@
+ import base64
+ import json
+ import os
+ from io import BytesIO
+ from typing import Any, Dict, List, Literal, Optional, Union
+
+ import requests
+ import torch
+ from PIL import Image
+ from torch import nn
+ from transformers import AutoConfig, AutoImageProcessor, AutoModel, AutoTokenizer
+
+
+ class Transformer(nn.Module):
+     """Huggingface AutoModel to generate token embeddings.
+     Loads the correct class, e.g. BERT / RoBERTa etc.
+
+     Args:
+         model_name_or_path: Huggingface models name
+             (https://huggingface.co/models)
+         max_seq_length: Truncate any inputs longer than max_seq_length
+         model_args: Keyword arguments passed to the Huggingface
+             Transformers model
+         tokenizer_args: Keyword arguments passed to the Huggingface
+             Transformers tokenizer
+         config_args: Keyword arguments passed to the Huggingface
+             Transformers config
+         cache_dir: Cache dir for Huggingface Transformers to store/load
+             models
+         do_lower_case: If true, lowercases the input (independent of
+             whether the model is cased or not)
+         tokenizer_name_or_path: Name or path of the tokenizer. When
+             None, then model_name_or_path is used
+     """
+
+     def __init__(
+         self,
+         model_name_or_path: str,
+         max_seq_length: Optional[int] = None,
+         model_args: Optional[Dict[str, Any]] = None,
+         tokenizer_args: Optional[Dict[str, Any]] = None,
+         config_args: Optional[Dict[str, Any]] = None,
+         cache_dir: Optional[str] = None,
+         do_lower_case: bool = False,
+         tokenizer_name_or_path: Optional[str] = None,
+         backend: Literal['torch', 'onnx', 'openvino'] = 'torch',
+         **_,
+     ) -> None:
+         super(Transformer, self).__init__()
+         if backend != 'torch':
+             raise ValueError(
+                 f'Backend \'{backend}\' is not supported, please use \'torch\' instead'
+             )
+
+         self.config_keys = ["max_seq_length", "do_lower_case"]
+         self.do_lower_case = do_lower_case
+         if model_args is None:
+             model_args = {}
+         if tokenizer_args is None:
+             tokenizer_args = {}
+         if config_args is None:
+             config_args = {}
+
+         config = AutoConfig.from_pretrained(
+             model_name_or_path, **config_args, cache_dir=cache_dir
+         )
+         self.jina_clip = AutoModel.from_pretrained(
+             model_name_or_path, config=config, cache_dir=cache_dir, **model_args
+         )
+
+         if max_seq_length is not None and "model_max_length" not in tokenizer_args:
+             tokenizer_args["model_max_length"] = max_seq_length
+         self.tokenizer = AutoTokenizer.from_pretrained(
+             (
+                 tokenizer_name_or_path
+                 if tokenizer_name_or_path is not None
+                 else model_name_or_path
+             ),
+             cache_dir=cache_dir,
+             **tokenizer_args,
+         )
+         self.preprocessor = AutoImageProcessor.from_pretrained(
+             (
+                 tokenizer_name_or_path
+                 if tokenizer_name_or_path is not None
+                 else model_name_or_path
+             ),
+             cache_dir=cache_dir,
+             **tokenizer_args,
+         )
+
+         # No max_seq_length set. Try to infer from model
+         if max_seq_length is None:
+             if (
+                 hasattr(self.jina_clip, "config")
+                 and hasattr(self.jina_clip.config, "max_position_embeddings")
+                 and hasattr(self.tokenizer, "model_max_length")
+             ):
+                 max_seq_length = min(
+                     self.jina_clip.config.max_position_embeddings,
+                     self.tokenizer.model_max_length,
+                 )
+
+         self.max_seq_length = max_seq_length
+
+         if tokenizer_name_or_path is not None:
+             self.jina_clip.config.tokenizer_class = self.tokenizer.__class__.__name__
+
+     def forward(
+         self, features: Dict[str, torch.Tensor]
+     ) -> Dict[str, torch.Tensor]:
+         """Returns the sentence embedding for either a text or an image batch"""
+         if "input_ids" in features:
+             embedding = self.jina_clip.get_text_features(
+                 input_ids=features["input_ids"]
+             )
+         else:
+             embedding = self.jina_clip.get_image_features(
+                 pixel_values=features["pixel_values"]
+             )
+         return {"sentence_embedding": embedding}
+
+     def get_word_embedding_dimension(self) -> int:
+         return self.jina_clip.config.text_config.embed_dim
+
+     def decode_data_image(self, data_image_str):
+         header, data = data_image_str.split(',', 1)
+         image_data = base64.b64decode(data)
+         return Image.open(BytesIO(image_data))
+
+     def tokenize(
+         self, batch: Union[List[str]], padding: Union[str, bool] = True
+     ) -> Dict[str, torch.Tensor]:
+         """Tokenizes texts or preprocesses images, depending on the batch contents"""
+         images = []
+         texts = []
+         for sample in batch:
+             if isinstance(sample, str):
+                 if sample.startswith('http'):
+                     response = requests.get(sample)
+                     images.append(Image.open(BytesIO(response.content)).convert('RGB'))
+                 elif sample.startswith('data:image/'):
+                     images.append(self.decode_data_image(sample).convert('RGB'))
+                 else:
+                     # TODO: Make sure that Image.open fails for non-image files
+                     try:
+                         images.append(Image.open(sample).convert('RGB'))
+                     except Exception:
+                         texts.append(sample)
+             elif isinstance(sample, Image.Image):
+                 images.append(sample.convert('RGB'))
+
+         if images and texts:
+             raise ValueError('Batch must contain either images or texts, not both')
+
+         if texts:
+             return self.tokenizer(
+                 texts,
+                 padding=padding,
+                 truncation="longest_first",
+                 return_tensors="pt",
+                 max_length=self.max_seq_length,
+             )
+         elif images:
+             return self.preprocessor(images)
+         return {}
+
+     def save(self, output_path: str, safe_serialization: bool = True) -> None:
+         self.jina_clip.save_pretrained(
+             output_path, safe_serialization=safe_serialization
+         )
+         self.tokenizer.save_pretrained(output_path)
+         self.preprocessor.save_pretrained(output_path)
+
+     @staticmethod
+     def load(input_path: str) -> "Transformer":
+         # Old classes used other config names than 'sentence_bert_config.json'
+         for config_name in [
+             "sentence_bert_config.json",
+             "sentence_roberta_config.json",
+             "sentence_distilbert_config.json",
+             "sentence_camembert_config.json",
+             "sentence_albert_config.json",
+             "sentence_xlm-roberta_config.json",
+             "sentence_xlnet_config.json",
+         ]:
+             sbert_config_path = os.path.join(input_path, config_name)
+             if os.path.exists(sbert_config_path):
+                 break
+
+         with open(sbert_config_path) as fIn:
+             config = json.load(fIn)
+         # Don't allow configs to set trust_remote_code
+         if "model_args" in config and "trust_remote_code" in config["model_args"]:
+             config["model_args"].pop("trust_remote_code")
+         if (
+             "tokenizer_args" in config
+             and "trust_remote_code" in config["tokenizer_args"]
+         ):
+             config["tokenizer_args"].pop("trust_remote_code")
+         if "config_args" in config and "trust_remote_code" in config["config_args"]:
+             config["config_args"].pop("trust_remote_code")
+         return Transformer(model_name_or_path=input_path, **config)
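In this module, tokenize() decides where each batch item goes: http(s) URLs, data:image/ URIs, openable image paths, and PIL images are routed to the image preprocessor, any other string goes to the text tokenizer, and mixing the two in one batch raises a ValueError. A small sketch of that routing through sentence-transformers (the in-memory image below is just a stand-in for a real photo, repo id assumed as above):

    from PIL import Image
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("jinaai/jina-clip-v1", trust_remote_code=True)

    text_emb = model.encode(["blue cat"])            # plain string -> text branch
    image = Image.new("RGB", (224, 224), "blue")     # stand-in PIL image
    image_emb = model.encode([image])                # PIL.Image -> image branch
    print(text_emb.shape, image_emb.shape)           # both land in the shared 768-d space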
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:5140d6df851d217296a10b3961ece2850d22b35bff37af948f2d9db33ae4aec2
+ size 890733860
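This is a git-lfs pointer, not the weights themselves; the real model.safetensors is fetched by LFS and should match the oid and size recorded above. A quick integrity-check sketch, assuming the file has already been downloaded next to the script:

    import hashlib
    import os

    path = "model.safetensors"  # assumed local download location
    sha = hashlib.sha256()
    with open(path, "rb") as f:
        for chunk in iter(lambda: f.read(1 << 20), b""):
            sha.update(chunk)
    assert os.path.getsize(path) == 890733860
    assert sha.hexdigest() == "5140d6df851d217296a10b3961ece2850d22b35bff37af948f2d9db33ae4aec2"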
modules.json ADDED
@@ -0,0 +1,14 @@
+ [
+   {
+     "idx":0,
+     "name":"0",
+     "path":"",
+     "type":"custom_st.Transformer"
+   },
+   {
+     "idx":2,
+     "name":"2",
+     "path":"2_Normalize",
+     "type":"sentence_transformers.models.Normalize"
+   }
+ ]
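modules.json wires the pipeline: module 0 is the custom Transformer from custom_st.py (producing a sentence_embedding) and module 2 is sentence_transformers' Normalize, which L2-normalizes the output. With unit-norm vectors, cosine similarity is just a dot product, as this sketch illustrates (repo id assumed as before):

    import torch
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer("jinaai/jina-clip-v1", trust_remote_code=True)
    emb = torch.from_numpy(model.encode(["blue cat", "red dog"]))

    print(emb.norm(dim=-1))   # ~1.0 for every row, thanks to the Normalize module
    print(emb @ emb.T)        # equals the cosine-similarity matrix for unit-norm vectors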
preprocessor_config.json ADDED
@@ -0,0 +1,22 @@
+ {
+   "auto_map": {
+     "AutoImageProcessor": "jinaai/jina-clip-implementation--processing_clip.JinaCLIPImageProcessor",
+     "AutoProcessor": "jinaai/jina-clip-implementation--processing_clip.JinaCLIPProcessor"
+   },
+   "fill_color": 0,
+   "image_processor_type": "JinaCLIPImageProcessor",
+   "interpolation": "bicubic",
+   "mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "processor_class": "JinaCLIPProcessor",
+   "resize_mode": "shortest",
+   "size": 224,
+   "std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ]
+ }
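The actual JinaCLIPImageProcessor referenced in auto_map lives in the remote jina-clip-implementation code; for illustration only, the values above (shortest-side resize to 224, bicubic interpolation, CLIP mean/std) can be approximated with torchvision. This is a sketch under those assumptions, and the center crop in particular is a guess at how the final 224x224 input is produced:

    from PIL import Image
    from torchvision import transforms

    preprocess = transforms.Compose([
        transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),  # shortest side -> 224
        transforms.CenterCrop(224),   # assumption about how the square crop is obtained
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.48145466, 0.4578275, 0.40821073],
                             std=[0.26862954, 0.26130258, 0.27577711]),
    ])
    pixel_values = preprocess(Image.new("RGB", (320, 240), "gray")).unsqueeze(0)
    print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])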
pytorch_model.bin ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:cc0a69757ff58c708bc773bd911d99a6d1e25be2b0eb4fb2e06640848dbb44a9
+ size 890820542
special_tokens_map.json ADDED
@@ -0,0 +1,37 @@
+ {
+   "cls_token": {
+     "content": "[CLS]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "mask_token": {
+     "content": "[MASK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "pad_token": {
+     "content": "[PAD]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "sep_token": {
+     "content": "[SEP]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   },
+   "unk_token": {
+     "content": "[UNK]",
+     "lstrip": false,
+     "normalized": false,
+     "rstrip": false,
+     "single_word": false
+   }
+ }
tokenizer.json ADDED
The diff for this file is too large to render. See raw diff
 
tokenizer_config.json ADDED
@@ -0,0 +1,57 @@
+ {
+   "added_tokens_decoder": {
+     "0": {
+       "content": "[PAD]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "100": {
+       "content": "[UNK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "101": {
+       "content": "[CLS]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "102": {
+       "content": "[SEP]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     },
+     "103": {
+       "content": "[MASK]",
+       "lstrip": false,
+       "normalized": false,
+       "rstrip": false,
+       "single_word": false,
+       "special": true
+     }
+   },
+   "clean_up_tokenization_spaces": true,
+   "cls_token": "[CLS]",
+   "do_basic_tokenize": true,
+   "do_lower_case": true,
+   "mask_token": "[MASK]",
+   "model_max_length": 8192,
+   "never_split": null,
+   "pad_token": "[PAD]",
+   "sep_token": "[SEP]",
+   "strip_accents": null,
+   "tokenize_chinese_chars": true,
+   "tokenizer_class": "BertTokenizer",
+   "unk_token": "[UNK]"
+ }
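The text side uses a stock BertTokenizer with lowercasing, the special tokens from special_tokens_map.json, and an 8192-token window, which is what allows the text tower to embed long passages. A short sketch of what that looks like in practice; the decoded tokens are shown as a rough expectation:

    from transformers import AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("jinaai/jina-clip-v1")
    print(tokenizer.model_max_length)  # 8192

    enc = tokenizer("Blue Cat", padding="max_length", max_length=8)
    print(tokenizer.convert_ids_to_tokens(enc["input_ids"]))
    # roughly: ['[CLS]', 'blue', 'cat', '[SEP]', '[PAD]', '[PAD]', '[PAD]', '[PAD]']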
vocab.txt ADDED
The diff for this file is too large to render. See raw diff