yaswanthgali committed
Commit 2162fbb · verified · 1 Parent(s): acad9e1

Upload 6 files
.gitattributes CHANGED
@@ -33,3 +33,4 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
  *.zip filter=lfs diff=lfs merge=lfs -text
  *.zst filter=lfs diff=lfs merge=lfs -text
  *tfevents* filter=lfs diff=lfs merge=lfs -text
+ aimv2_overview_light.png filter=lfs diff=lfs merge=lfs -text
LICENSE ADDED
@@ -0,0 +1,88 @@
+ Disclaimer: IMPORTANT: This Apple Machine Learning Research Model is
+ specifically developed and released by Apple Inc. ("Apple") for the sole purpose
+ of scientific research of artificial intelligence and machine-learning
+ technology. “Apple Machine Learning Research Model” means the model, including
+ but not limited to algorithms, formulas, trained model weights, parameters,
+ configurations, checkpoints, and any related materials (including
+ documentation).
+
+ This Apple Machine Learning Research Model is provided to You by
+ Apple in consideration of your agreement to the following terms, and your use,
+ modification, creation of Model Derivatives, and or redistribution of the Apple
+ Machine Learning Research Model constitutes acceptance of this Agreement. If You
+ do not agree with these terms, please do not use, modify, create Model
+ Derivatives of, or distribute this Apple Machine Learning Research Model or
+ Model Derivatives.
+
+ * License Scope: In consideration of your agreement to abide by the following
+   terms, and subject to these terms, Apple hereby grants you a personal,
+   non-exclusive, worldwide, non-transferable, royalty-free, revocable, and
+   limited license, to use, copy, modify, distribute, and create Model
+   Derivatives (defined below) of the Apple Machine Learning Research Model
+   exclusively for Research Purposes. You agree that any Model Derivatives You
+   may create or that may be created for You will be limited to Research Purposes
+   as well. “Research Purposes” means non-commercial scientific research and
+   academic development activities, such as experimentation, analysis, testing
+   conducted by You with the sole intent to advance scientific knowledge and
+   research. “Research Purposes” does not include any commercial exploitation,
+   product development or use in any commercial product or service.
+
+ * Distribution of Apple Machine Learning Research Model and Model Derivatives:
+   If you choose to redistribute Apple Machine Learning Research Model or its
+   Model Derivatives, you must provide a copy of this Agreement to such third
+   party, and ensure that the following attribution notice be provided: “Apple
+   Machine Learning Research Model is licensed under the Apple Machine Learning
+   Research Model License Agreement.” Additionally, all Model Derivatives must
+   clearly be identified as such, including disclosure of modifications and
+   changes made to the Apple Machine Learning Research Model. The name,
+   trademarks, service marks or logos of Apple may not be used to endorse or
+   promote Model Derivatives or the relationship between You and Apple. “Model
+   Derivatives” means any models or any other artifacts created by modifications,
+   improvements, adaptations, alterations to the architecture, algorithm or
+   training processes of the Apple Machine Learning Research Model, or by any
+   retraining, fine-tuning of the Apple Machine Learning Research Model.
+
+ * No Other License: Except as expressly stated in this notice, no other rights
+   or licenses, express or implied, are granted by Apple herein, including but
+   not limited to any patent, trademark, and similar intellectual property rights
+   worldwide that may be infringed by the Apple Machine Learning Research Model,
+   the Model Derivatives or by other works in which the Apple Machine Learning
+   Research Model may be incorporated.
+
+ * Compliance with Laws: Your use of Apple Machine Learning Research Model must
+   be in compliance with all applicable laws and regulations.
+
+ * Term and Termination: The term of this Agreement will begin upon your
+   acceptance of this Agreement or use of the Apple Machine Learning Research
+   Model and will continue until terminated in accordance with the following
+   terms. Apple may terminate this Agreement at any time if You are in breach of
+   any term or condition of this Agreement. Upon termination of this Agreement,
+   You must cease to use all Apple Machine Learning Research Models and Model
+   Derivatives and permanently delete any copy thereof. Sections 3, 6 and 7 will
+   survive termination.
+
+ * Disclaimer and Limitation of Liability: This Apple Machine Learning Research
+   Model and any outputs generated by the Apple Machine Learning Research Model
+   are provided on an “AS IS” basis. APPLE MAKES NO WARRANTIES, EXPRESS OR
+   IMPLIED, INCLUDING WITHOUT LIMITATION THE IMPLIED WARRANTIES OF
+   NON-INFRINGEMENT, MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE,
+   REGARDING THE APPLE MACHINE LEARNING RESEARCH MODEL OR OUTPUTS GENERATED BY
+   THE APPLE MACHINE LEARNING RESEARCH MODEL. You are solely responsible for
+   determining the appropriateness of using or redistributing the Apple Machine
+   Learning Research Model and any outputs of the Apple Machine Learning Research
+   Model and assume any risks associated with Your use of the Apple Machine
+   Learning Research Model and any output and results. IN NO EVENT SHALL APPLE BE
+   LIABLE FOR ANY SPECIAL, INDIRECT, INCIDENTAL OR CONSEQUENTIAL DAMAGES ARISING
+   IN ANY WAY OUT OF THE USE, REPRODUCTION, MODIFICATION AND/OR DISTRIBUTION OF
+   THE APPLE MACHINE LEARNING RESEARCH MODEL AND ANY OUTPUTS OF THE APPLE MACHINE
+   LEARNING RESEARCH MODEL, HOWEVER CAUSED AND WHETHER UNDER THEORY OF CONTRACT,
+   TORT (INCLUDING NEGLIGENCE), STRICT LIABILITY OR OTHERWISE, EVEN IF APPLE HAS
+   BEEN ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ * Governing Law: This Agreement will be governed by and construed under the laws
+   of the State of California without regard to its choice of law principles. The
+   Convention on Contracts for the International Sale of Goods shall not apply to
+   the Agreement except that the arbitration clause and any arbitration hereunder
+   shall be governed by the Federal Arbitration Act, Chapters 1 and 2.
+
+ Copyright (C) 2025 Apple Inc. All Rights Reserved.
README.md CHANGED
@@ -1,3 +1,239 @@
- ---
- license: apple-amlr
- ---
+ ---
+ library_name: transformers
+ license: apple-amlr
+ metrics:
+ - accuracy
+ pipeline_tag: image-feature-extraction
+ tags:
+ - vision
+ - image-feature-extraction
+ - mlx
+ - pytorch
+ model-index:
+ - name: aimv2-large-patch14-224
+   results:
+   - task:
+       type: classification
+       name: Classification
+     dataset:
+       name: imagenet-1k
+       type: imagenet-1k
+     metrics:
+     - type: accuracy
+       value: 86.6
+       name: Accuracy
+       verified: false
+   - task:
+       type: classification
+       name: Classification
+     dataset:
+       name: inaturalist-18
+       type: inaturalist-18
+     metrics:
+     - type: accuracy
+       value: 76.0
+       name: Accuracy
+       verified: false
+   - task:
+       type: classification
+       name: Classification
+     dataset:
+       name: cifar10
+       type: cifar10
+     metrics:
+     - type: accuracy
+       value: 99.1
+       name: Accuracy
+       verified: false
+   - task:
+       type: classification
+       name: Classification
+     dataset:
+       name: cifar100
+       type: cifar100
+     metrics:
+     - type: accuracy
+       value: 92.2
+       name: Accuracy
+       verified: false
+   - task:
+       type: classification
+       name: Classification
+     dataset:
+       name: food101
+       type: food101
+     metrics:
+     - type: accuracy
+       value: 95.7
+       name: Accuracy
+       verified: false
+   - task:
+       type: classification
+       name: Classification
+     dataset:
+       name: dtd
+       type: dtd
+     metrics:
+     - type: accuracy
+       value: 87.9
+       name: Accuracy
+       verified: false
+   - task:
+       type: classification
+       name: Classification
+     dataset:
+       name: oxford-pets
+       type: oxford-pets
+     metrics:
+     - type: accuracy
+       value: 96.3
+       name: Accuracy
+       verified: false
+   - task:
+       type: classification
+       name: Classification
+     dataset:
+       name: stanford-cars
+       type: stanford-cars
+     metrics:
+     - type: accuracy
+       value: 96.3
+       name: Accuracy
+       verified: false
+   - task:
+       type: classification
+       name: Classification
+     dataset:
+       name: camelyon17
+       type: camelyon17
+     metrics:
+     - type: accuracy
+       value: 93.7
+       name: Accuracy
+       verified: false
+   - task:
+       type: classification
+       name: Classification
+     dataset:
+       name: patch-camelyon
+       type: patch-camelyon
+     metrics:
+     - type: accuracy
+       value: 89.3
+       name: Accuracy
+       verified: false
+   - task:
+       type: classification
+       name: Classification
+     dataset:
+       name: rxrx1
+       type: rxrx1
+     metrics:
+     - type: accuracy
+       value: 5.6
+       name: Accuracy
+       verified: false
+   - task:
+       type: classification
+       name: Classification
+     dataset:
+       name: eurosat
+       type: eurosat
+     metrics:
+     - type: accuracy
+       value: 98.4
+       name: Accuracy
+       verified: false
+   - task:
+       type: classification
+       name: Classification
+     dataset:
+       name: fmow
+       type: fmow
+     metrics:
+     - type: accuracy
+       value: 60.7
+       name: Accuracy
+       verified: false
+   - task:
+       type: classification
+       name: Classification
+     dataset:
+       name: domainnet-infographic
+       type: domainnet-infographic
+     metrics:
+     - type: accuracy
+       value: 69.0
+       name: Accuracy
+       verified: false
+ ---
+ # Introduction
+ [[`AIMv2 Paper`](https://arxiv.org/abs/2411.14402)] [[`BibTeX`](#citation)]
+
+ We introduce the AIMv2 family of vision models pre-trained with a multimodal autoregressive objective.
+ AIMv2 pre-training is simple to implement, and the resulting models train and scale effectively. Some AIMv2 highlights include:
+
+ 1. Outperforms OpenAI CLIP and SigLIP on the majority of multimodal understanding benchmarks.
+ 2. Outperforms DINOv2 on open-vocabulary object detection and referring expression comprehension.
+ 3. Exhibits strong recognition performance, with AIMv2-3B achieving *89.5% on ImageNet using a frozen trunk*.
+
+ <img src="aimv2_overview_light.png" alt="AIMv2 Overview"/>
+
+ ## Usage
+
+ ### PyTorch
+ ```python
+ import requests
+ from PIL import Image
+ from transformers import AutoImageProcessor, AutoModel
+
+ # Fetch a sample COCO image.
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ image = Image.open(requests.get(url, stream=True).raw)
+
+ processor = AutoImageProcessor.from_pretrained(
+     "apple/aimv2-large-patch14-224",
+ )
+ # The modeling code ships with the repository, hence trust_remote_code=True.
+ model = AutoModel.from_pretrained(
+     "apple/aimv2-large-patch14-224",
+     trust_remote_code=True,
+ )
+
+ inputs = processor(images=image, return_tensors="pt")
+ outputs = model(**inputs)
+ ```
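+
+ The config sets `use_head: false`, so the forward pass returns trunk features rather than logits. As a minimal follow-on sketch (assuming the remote code exposes a standard `last_hidden_state`; mean pooling is an illustrative choice here, not the paper's prescribed probe), the patch features can be reduced to one embedding per image:
+
+ ```python
+ # last_hidden_state: (batch, num_patches, hidden_size) = (1, 256, 1024) here,
+ # since a 224x224 input with 14x14 patches yields a 16x16 token grid.
+ features = outputs.last_hidden_state
+ print(features.shape)  # torch.Size([1, 256, 1024])
+
+ # Mean-pool over the patch axis (an assumption chosen for simplicity).
+ image_embedding = features.mean(dim=1)  # shape (1, 1024)
+ ```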
+
+ ### JAX
+ ```python
+ import requests
+ from PIL import Image
+ from transformers import AutoImageProcessor, FlaxAutoModel
+
+ # Fetch a sample COCO image.
+ url = "http://images.cocodataset.org/val2017/000000039769.jpg"
+ image = Image.open(requests.get(url, stream=True).raw)
+
+ processor = AutoImageProcessor.from_pretrained(
+     "apple/aimv2-large-patch14-224",
+ )
+ # Flax modeling code also ships with the repository.
+ model = FlaxAutoModel.from_pretrained(
+     "apple/aimv2-large-patch14-224",
+     trust_remote_code=True,
+ )
+
+ inputs = processor(images=image, return_tensors="jax")
+ outputs = model(**inputs)
+ ```
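+
+ The Flax outputs mirror the PyTorch ones: assuming the remote Flax code follows the same interface, `outputs.last_hidden_state` is a `(1, 256, 1024)` array that can be mean-pooled over axis 1 exactly as in the PyTorch sketch above.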
+
+ ## Citation
+ If you find our work useful, please consider citing us as:
+ ```bibtex
+ @misc{fini2024multimodalautoregressivepretraininglarge,
+   author = {Fini, Enrico and Shukor, Mustafa and Li, Xiujun and Dufter, Philipp and Klein, Michal and Haldimann, David and Aitharaju, Sai and da Costa, Victor Guilherme Turrisi and Béthune, Louis and Gan, Zhe and Toshev, Alexander T and Eichner, Marcin and Nabi, Moin and Yang, Yinfei and Susskind, Joshua M. and El-Nouby, Alaaeldin},
+   url = {https://arxiv.org/abs/2411.14402},
+   eprint = {2411.14402},
+   eprintclass = {cs.CV},
+   eprinttype = {arXiv},
+   title = {Multimodal Autoregressive Pre-training of Large Vision Encoders},
+   year = {2024},
+ }
+ ```
aimv2_overview_light.png ADDED

Git LFS Details

  • SHA256: 524b6eb5049fb4bac6303ecee386d0e885fa69a96756557d843084ba4caae08f
  • Pointer size: 131 Bytes
  • Size of remote file: 336 kB
config.json ADDED
@@ -0,0 +1,29 @@
+ {
+   "architectures": [
+     "AIMv2VisionModel"
+   ],
+   "attention_dropout": 0.0,
+   "auto_map": {
+     "AutoConfig": "apple/aimv2-large-patch14-224--configuration_aimv2.AIMv2Config",
+     "AutoModel": "apple/aimv2-large-patch14-224--modeling_aimv2.AIMv2Model",
+     "FlaxAutoModel": "apple/aimv2-large-patch14-224--modeling_flax_aimv2.FlaxAIMv2Model"
+   },
+   "hidden_act": "silu",
+   "hidden_size": 1024,
+   "image_size": 224,
+   "initializer_range": 0.02,
+   "intermediate_size": 2816,
+   "is_causal": false,
+   "model_type": "aimv2_vision_model",
+   "num_attention_heads": 8,
+   "num_channels": 3,
+   "num_hidden_layers": 24,
+   "patch_size": 14,
+   "projection_dropout": 0.0,
+   "qkv_bias": false,
+   "rms_norm_eps": 1e-05,
+   "torch_dtype": "float32",
+   "transformers_version": "4.51.0.dev0",
+   "use_bias": false,
+   "use_head": false
+ }
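
Taken together, these settings describe a ViT-Large-scale trunk: 24 transformer layers, hidden size 1024, SiLU MLPs with intermediate size 2816, 14×14 patches at 224×224 input resolution, RMSNorm, and no classification head (`use_head: false`). A minimal sketch for sanity-checking the geometry after loading; the token-grid arithmetic (224 / 14 = 16 patches per side) is the only addition here:

```python
from transformers import AutoConfig

# Load the configuration added in this commit (modeling code is remote).
config = AutoConfig.from_pretrained(
    "apple/aimv2-large-patch14-224",
    trust_remote_code=True,
)

# 224 / 14 = 16 patches per side -> a 16 x 16 = 256 token grid.
num_patches = (config.image_size // config.patch_size) ** 2
assert num_patches == 256
print(config.hidden_size, config.num_hidden_layers)  # 1024 24
```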
model.safetensors ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:40a9777acd7cad34a97ab28d933fc43d43f9fb55e00056964806bc82065ef43e
+ size 1236815864
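
The entry above is the Git LFS pointer; the 1,236,815,864-byte payload works out to roughly 309M float32 parameters (1236815864 / 4 ≈ 3.09 × 10⁸), consistent with a ViT-Large-scale encoder. A minimal sketch for verifying the count from a local checkout (assumes the `safetensors` package and that `git lfs pull` has replaced the pointer with the real weights):

```python
from safetensors import safe_open

# Sum element counts across all tensors in the checkpoint.
with safe_open("model.safetensors", framework="pt", device="cpu") as f:
    total = sum(f.get_tensor(key).numel() for key in f.keys())

print(f"{total:,} parameters")  # roughly 309M for this checkpoint
```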
preprocessor_config.json ADDED
@@ -0,0 +1,32 @@
+ {
+   "crop_size": {
+     "height": 224,
+     "width": 224
+   },
+   "data_format": "channels_first",
+   "default_to_square": false,
+   "device": null,
+   "do_center_crop": true,
+   "do_convert_rgb": true,
+   "do_normalize": true,
+   "do_rescale": true,
+   "do_resize": true,
+   "image_mean": [
+     0.48145466,
+     0.4578275,
+     0.40821073
+   ],
+   "image_processor_type": "CLIPImageProcessorFast",
+   "image_std": [
+     0.26862954,
+     0.26130258,
+     0.27577711
+   ],
+   "input_data_format": null,
+   "resample": 3,
+   "rescale_factor": 0.00392156862745098,
+   "return_tensors": null,
+   "size": {
+     "shortest_edge": 224
+   }
+ }
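
This is the standard CLIP preprocessing recipe: resize the shortest edge to 224 with bicubic resampling (`"resample": 3` is PIL's bicubic code), center-crop to 224×224, rescale by 1/255 (`0.00392…`), and normalize with the CLIP mean/std. A minimal torchvision sketch of an equivalent transform, offered as an assumption rather than the repo's own code (`example.jpg` is a placeholder):

```python
from PIL import Image
from torchvision import transforms

# Mirrors preprocessor_config.json: bicubic shortest-edge resize, center
# crop, [0, 1] rescale (ToTensor), then CLIP mean/std normalization.
preprocess = transforms.Compose([
    transforms.Resize(224, interpolation=transforms.InterpolationMode.BICUBIC),
    transforms.CenterCrop(224),
    transforms.ToTensor(),  # rescale_factor 1/255, channels_first layout
    transforms.Normalize(
        mean=(0.48145466, 0.4578275, 0.40821073),
        std=(0.26862954, 0.26130258, 0.27577711),
    ),
])

pixel_values = preprocess(Image.open("example.jpg").convert("RGB")).unsqueeze(0)
print(pixel_values.shape)  # torch.Size([1, 3, 224, 224])
```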