Commit 1af7e96 · 0 Parent(s): Initial commit
committed by roman-bachmann

Files changed (4):
  1. .gitattributes +35 -0
  2. README.md +79 -0
  3. config.json +297 -0
  4. model.safetensors +3 -0
.gitattributes ADDED
@@ -0,0 +1,35 @@
*.7z filter=lfs diff=lfs merge=lfs -text
*.arrow filter=lfs diff=lfs merge=lfs -text
*.bin filter=lfs diff=lfs merge=lfs -text
*.bz2 filter=lfs diff=lfs merge=lfs -text
*.ckpt filter=lfs diff=lfs merge=lfs -text
*.ftz filter=lfs diff=lfs merge=lfs -text
*.gz filter=lfs diff=lfs merge=lfs -text
*.h5 filter=lfs diff=lfs merge=lfs -text
*.joblib filter=lfs diff=lfs merge=lfs -text
*.lfs.* filter=lfs diff=lfs merge=lfs -text
*.mlmodel filter=lfs diff=lfs merge=lfs -text
*.model filter=lfs diff=lfs merge=lfs -text
*.msgpack filter=lfs diff=lfs merge=lfs -text
*.npy filter=lfs diff=lfs merge=lfs -text
*.npz filter=lfs diff=lfs merge=lfs -text
*.onnx filter=lfs diff=lfs merge=lfs -text
*.ot filter=lfs diff=lfs merge=lfs -text
*.parquet filter=lfs diff=lfs merge=lfs -text
*.pb filter=lfs diff=lfs merge=lfs -text
*.pickle filter=lfs diff=lfs merge=lfs -text
*.pkl filter=lfs diff=lfs merge=lfs -text
*.pt filter=lfs diff=lfs merge=lfs -text
*.pth filter=lfs diff=lfs merge=lfs -text
*.rar filter=lfs diff=lfs merge=lfs -text
*.safetensors filter=lfs diff=lfs merge=lfs -text
saved_model/**/* filter=lfs diff=lfs merge=lfs -text
*.tar.* filter=lfs diff=lfs merge=lfs -text
*.tar filter=lfs diff=lfs merge=lfs -text
*.tflite filter=lfs diff=lfs merge=lfs -text
*.tgz filter=lfs diff=lfs merge=lfs -text
*.wasm filter=lfs diff=lfs merge=lfs -text
*.xz filter=lfs diff=lfs merge=lfs -text
*.zip filter=lfs diff=lfs merge=lfs -text
*.zst filter=lfs diff=lfs merge=lfs -text
*tfevents* filter=lfs diff=lfs merge=lfs -text
README.md ADDED
@@ -0,0 +1,79 @@
---
license: apple-amlr
---

# FlexTok: Resampling Images into 1D Token Sequences of Flexible Length

[`Website`](https://flextok.epfl.ch) | [`arXiv`](https://arxiv.org/abs/2502.13967) | [`GitHub`](https://github.com/apple/ml-flextok) | [`🤗 Demo`](https://huggingface.co/spaces/EPFL-VILAB/FlexTok) | [`BibTeX`](#citation)

Official implementation and pre-trained models for: <br>
[**FlexTok: Resampling Images into 1D Token Sequences of Flexible Length**](https://arxiv.org/abs/2502.13967), arXiv 2025 <br>
*[Roman Bachmann](https://roman-bachmann.github.io/)\*, [Jesse Allardice](https://github.com/JesseAllardice)\*, [David Mizrahi](https://dmizrahi.com/)\*, [Enrico Fini](https://scholar.google.com/citations?user=OQMtSKIAAAAJ), [Oğuzhan Fatih Kar](https://ofkar.github.io/), [Elmira Amirloo](https://elamirloo.github.io/), [Alaaeldin El-Nouby](https://aelnouby.github.io/), [Amir Zamir](https://vilab.epfl.ch/zamir/), [Afshin Dehghan](https://scholar.google.com/citations?user=wcX-UW4AAAAJ)*


## Installation
For install instructions, please see https://github.com/apple/ml-flextok.


## Usage

To load the `FlexTok d18-d18 ImageNet-1k` model directly from HuggingFace Hub, call:
```python
from flextok.flextok_wrapper import FlexTokFromHub
model = FlexTokFromHub.from_pretrained('EPFL-VILAB/flextok_d18_d18_in1k').eval()
```
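The returned wrapper behaves like any other PyTorch module. Moving it to an accelerator is optional and purely illustrative (the snippets below also run on CPU); if you do, the inputs passed to it must live on the same device:
```python
import torch

# Optional: run on a GPU when one is available.
# Inputs passed to tokenize()/detokenize() must be on the same device as the model.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
model = model.to(device)
```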

The model can also be loaded by manually downloading the `model.safetensors` checkpoint from this repository and loading it with our helper functions:
```python
from hydra.utils import instantiate
from flextok.utils.checkpoint import load_safetensors

ckpt, config = load_safetensors('/path/to/model.safetensors')
model = instantiate(config).eval()
model.load_state_dict(ckpt)
```
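To fetch the checkpoint programmatically rather than through the browser, the `huggingface_hub` client (assumed to be installed; it is not used in the snippet above) can download the file before the same two-step load:
```python
from huggingface_hub import hf_hub_download

# Download model.safetensors from this repository into the local cache.
ckpt_path = hf_hub_download(
    repo_id='EPFL-VILAB/flextok_d18_d18_in1k',
    filename='model.safetensors',
)
ckpt, config = load_safetensors(ckpt_path)
model = instantiate(config).eval()
model.load_state_dict(ckpt)
```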

After loading a FlexTok model, image batches can be encoded using:
```python
from flextok.utils.demo import imgs_from_urls
# Load example images of shape (B, 3, 256, 256), normalized to [-1,1]
imgs = imgs_from_urls(urls=['https://storage.googleapis.com/flextok_site/nb_demo_images/0.png'])

# tokens_list is a list of [1, 256] discrete token sequences
tokens_list = model.tokenize(imgs)
```
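`imgs_from_urls` is only a demo helper; any image tensor of shape `(B, 3, 256, 256)` normalized to `[-1,1]` can be tokenized. A minimal sketch for a local file, assuming PIL and torchvision are installed and with a placeholder file path:
```python
from PIL import Image
import torchvision.transforms.functional as TF

# Load a local image, resize to 256x256, and rescale from [0,1] to [-1,1].
img = Image.open('my_image.png').convert('RGB').resize((256, 256))
imgs = (TF.to_tensor(img) * 2.0 - 1.0).unsqueeze(0)  # (1, 3, 256, 256)

tokens_list = model.tokenize(imgs)
```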

The list of token sequences can be truncated in a nested fashion:
```python
k_keep = 64 # For example, only keep the first 64 out of 256 tokens
tokens_list = [t[:,:k_keep] for t in tokens_list]
```

To decode the tokens with FlexTok's rectified flow decoder, call:
```python
# tokens_list is a list of [1, l] discrete token sequences, with l <= 256
# reconst is a [B, 3, 256, 256] tensor, normalized to [-1,1]
reconst = model.detokenize(
    tokens_list,
    timesteps=20, # Number of denoising steps
    guidance_scale=7.5, # Classifier-free guidance scale
    perform_norm_guidance=True, # See https://arxiv.org/abs/2410.02416
)
```
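Because the token sequence is nested and the decoder's nested token dropout was configured with power-of-two keep sizes (`"size_sampling_mode": "pow2"` in `config.json`), a natural experiment is to decode the same tokens at several budgets. A sketch, assuming `torchvision` is available for writing the outputs:
```python
from torchvision.utils import save_image

# Decode at increasing token budgets; fewer tokens yield coarser reconstructions.
for k_keep in [1, 4, 16, 64, 256]:
    reconst_k = model.detokenize(
        [t[:, :k_keep] for t in tokens_list],
        timesteps=20,
        guidance_scale=7.5,
        perform_norm_guidance=True,
    )
    # detokenize returns values in [-1,1]; map to [0,1] before saving PNGs.
    save_image(reconst_k * 0.5 + 0.5, f'reconst_k{k_keep}.png')
```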


## Citation

If you find this repository helpful, please consider citing our work:
```
@article{flextok,
  title={{FlexTok}: Resampling Images into 1D Token Sequences of Flexible Length},
  author={Roman Bachmann and Jesse Allardice and David Mizrahi and Enrico Fini and O{\u{g}}uzhan Fatih Kar and Elmira Amirloo and Alaaeldin El-Nouby and Amir Zamir and Afshin Dehghan},
  journal={arXiv 2025},
  year={2025},
}
```

## License

The model weights in this repository are released under the Apple Model License for Research.
config.json ADDED
@@ -0,0 +1,297 @@
{
  "regularizer": {
    "_target_": "flextok.regularizers.quantize_fsq.FSQ",
    "latents_read_key": "enc_registers",
    "quants_write_key": "enc_registers_quant",
    "tokens_write_key": "tokens",
    "levels": [
      8,
      8,
      8,
      5,
      5,
      5
    ],
    "drop_quant_p": 0.0,
    "packed_call": false
  },
  "decoder": {
    "_target_": "flextok.model.utils.wrappers.SequentialModuleDictWrapper",
    "module_dict": {
      "dec_from_latents": {
        "_target_": "flextok.model.preprocessors.linear.LinearLayer",
        "read_key": "enc_registers_quant",
        "write_key": "dec_registers_proj",
        "dim_in": 6,
        "dim": 1152
      },
      "dec_registers_posemb_module": {
        "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
        "read_key": "dec_registers_proj",
        "write_key": "dec_registers_proj",
        "dim": 1152,
        "max_sizes": [
          256
        ],
        "posemb_type": "learnable_sum",
        "posemb_scaling": "absolute"
      },
      "dec_nested_dropout": {
        "_target_": "flextok.model.preprocessors.token_dropout.MaskedNestedDropout",
        "read_write_key": "dec_registers_proj",
        "dim": 1152,
        "size_sampling_mode": "pow2"
      },
      "dec_latent_dropout": {
        "_target_": "flextok.model.preprocessors.nullcond.LearnedNullCond",
        "read_write_key": "dec_registers_proj",
        "dim": 1152,
        "dropout_prob": 0.2
      },
      "dec_noise_channels_to_last": {
        "_target_": "flextok.model.utils.dict_ops.PerSampleOp",
        "read_key": "vae_latents_noised",
        "write_key": "vae_latents_noised_bhwc",
        "per_sample_op": {
          "_target_": "flextok.model.utils.dict_ops.channels_first_to_last",
          "_partial_": true
        }
      },
      "dec_noise_patch_emb": {
        "_target_": "flextok.model.preprocessors.patching.PatchEmbedder",
        "input_tensor_list_read_key": "vae_latents_noised_bhwc",
        "patches_list_write_key": "vae_latents_noised_patched",
        "n_patches_write_key": "dec_n_patches",
        "channels_in": 16,
        "dim": 1152,
        "patch_sizes": [
          2,
          2
        ],
        "flatten_patches": false
      },
      "dec_patches_posemb_module": {
        "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
        "read_key": "vae_latents_noised_patched",
        "write_key": "dec_patches",
        "dim": 1152,
        "max_sizes": [
          16,
          16
        ],
        "posemb_type": "sincos",
        "posemb_scaling": "absolute"
      },
      "dec_seq_packer": {
        "_target_": "flextok.model.preprocessors.flex_seq_packing.BlockWiseSequencePacker",
        "input_list_read_keys": [
          "dec_patches",
          "dec_registers_proj"
        ],
        "packed_seq_write_key": "dec_packed_seq",
        "block_mask_write_key": "dec_block_mask",
        "inner_packed_shapes_write_key": "dec_ps_inner",
        "outer_packed_shapes_write_key": "dec_ps_outer",
        "emb_packing_fn_write_key": "emb_packing_fn",
        "mask_mode": "full",
        "pad_to_multiple": 128,
        "per_subseq_embs": true
      },
      "dec_time_embedder": {
        "_target_": "flextok.model.preprocessors.time_embedding.TimestepEmbedder",
        "timesteps_read_key": "timesteps",
        "time_embedding_write_key": "dec_temb",
        "dim": 1152,
        "frequency_embedding_size": 256,
        "max_timestep": 1000.0
      },
      "dec_transformer": {
        "_target_": "flextok.model.trunks.transformers.FlexTransformer",
        "input_seq_read_key": "dec_packed_seq",
        "output_seq_write_key": "dec_packed_seq",
        "dim": 1152,
        "depth": 18,
        "block_mask_read_key": "dec_block_mask",
        "adaLN_emb_read_key": "dec_temb",
        "adaLN_packing_fn_read_key": "emb_packing_fn",
        "adaLN_expansion": 2,
        "intermediate_layer_write_key": "dec_packed_seq_repa_layer",
        "intermediate_layers": [
          1
        ],
        "use_act_checkpoint": false
      },
      "dec_unpacker": {
        "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
        "packed_seq_read_key": "dec_packed_seq",
        "inner_seq_write_keys": [
          "dec_patches",
          "dec_registers_proj"
        ],
        "inner_packed_shapes_read_key": "dec_ps_inner",
        "outer_packed_shapes_read_key": "dec_ps_outer"
      },
      "dec_repa_unpacker": {
        "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
        "packed_seq_read_key": "dec_packed_seq_repa_layer",
        "inner_seq_write_keys": [
          "dec_patches_repa_layer",
          "dec_registers_repa_layer"
        ],
        "inner_packed_shapes_read_key": "dec_ps_inner",
        "outer_packed_shapes_read_key": "dec_ps_outer"
      },
      "dec_to_patches": {
        "_target_": "flextok.model.postprocessors.heads.ToPatchesLinearHead",
        "read_key": "dec_patches",
        "write_key": "dec_patches",
        "dim": 1152,
        "channels_out": 16,
        "patch_sizes": [
          2,
          2
        ],
        "use_mup_readout": false,
        "weight_init_style": "zero",
        "adaLN_emb_read_key": "dec_temb"
      },
      "dec_channels_to_first": {
        "_target_": "flextok.model.utils.dict_ops.PerSampleOp",
        "read_key": "dec_patches",
        "write_key": "vae_latents_reconst",
        "per_sample_op": {
          "_target_": "flextok.model.utils.dict_ops.channels_last_to_first",
          "_partial_": true
        }
      }
    }
  },
  "vae": {
    "_target_": "flextok.vae_wrapper.StableDiffusionVAE",
    "images_read_key": "rgb",
    "vae_latents_read_key": "vae_latents_reconst",
    "vae_latents_write_key": "vae_latents",
    "images_reconst_write_key": "rgb_reconst",
    "vae_kl_loss_write_key": "kl_loss",
    "dtype_override": null,
    "sample_posterior": true,
    "compile_encode_fn": false,
    "force_vae_encode": true,
    "latent_channels": 16,
    "scaling_factor": 0.88
  },
  "pipeline": {
    "_target_": "flextok.flow_matching.pipelines.MinRFPipeline",
    "_partial_": true,
    "target_sizes_read_key": null,
    "latents_read_key": "enc_registers_quant",
    "timesteps_read_key": "timesteps",
    "noised_images_read_key": "vae_latents_noised",
    "reconst_write_key": "vae_latents_reconst",
    "out_channels": 16
  },
  "flow_matching_noise_module": {
    "_target_": "flextok.flow_matching.noise_modules.MinRFNoiseModule",
    "clean_images_read_key": "vae_latents",
    "noised_images_write_key": "vae_latents_noised",
    "noise_write_key": "flow_noise",
    "timesteps_write_key": "timesteps",
    "sigmas_write_key": "sigmas",
    "ln": false,
    "stratisfied": false,
    "mode_scale": 0.25
  },
  "_target_": "flextok.flextok_wrapper.FlexTok",
  "encoder": {
    "_target_": "flextok.model.utils.wrappers.SequentialModuleDictWrapper",
    "module_dict": {
      "enc_channels_to_last": {
        "_target_": "flextok.model.utils.dict_ops.PerSampleOp",
        "read_key": "vae_latents",
        "write_key": "vae_latents_bhwc",
        "per_sample_op": {
          "_target_": "flextok.model.utils.dict_ops.channels_first_to_last",
          "_partial_": true
        }
      },
      "enc_patch_emb": {
        "_target_": "flextok.model.preprocessors.patching.PatchEmbedder",
        "input_tensor_list_read_key": "vae_latents_bhwc",
        "patches_list_write_key": "enc_vae_latents_patched",
        "n_patches_write_key": "enc_n_patches",
        "channels_in": 16,
        "dim": 1152,
        "patch_sizes": [
          2,
          2
        ],
        "flatten_patches": false
      },
      "enc_posemb_module": {
        "_target_": "flextok.model.utils.posembs.PositionalEmbeddingAdder",
        "read_key": "enc_vae_latents_patched",
        "write_key": "enc_vae_latents_patched",
        "dim": 1152,
        "max_sizes": [
          16,
          16
        ],
        "posemb_type": "sincos",
        "posemb_scaling": "absolute"
      },
      "enc_register_module": {
        "_target_": "flextok.model.preprocessors.registers.Registers1D",
        "input_tensor_list_read_key": "enc_vae_latents_patched",
        "register_sizes_read_write_key": "register_sizes",
        "registers_write_key": "enc_registers",
        "dim": 1152,
        "n_min": 256,
        "n_max": 256,
        "size_sampling_mode": "uniform",
        "ordering_mode": "nested"
      },
      "enc_seq_packer": {
        "_target_": "flextok.model.preprocessors.flex_seq_packing.BlockWiseSequencePacker",
        "input_list_read_keys": [
          "enc_vae_latents_patched",
          "enc_registers"
        ],
        "packed_seq_write_key": "enc_packed_seq",
        "block_mask_write_key": "enc_block_mask",
        "inner_packed_shapes_write_key": "enc_ps_inner",
        "outer_packed_shapes_write_key": "enc_ps_outer",
        "mask_mode": "causal_last",
        "pad_to_multiple": 128
      },
      "enc_transformer": {
        "_target_": "flextok.model.trunks.transformers.FlexTransformer",
        "input_seq_read_key": "enc_packed_seq",
        "output_seq_write_key": "enc_packed_seq",
        "dim": 1152,
        "depth": 18,
        "block_mask_read_key": "enc_block_mask",
        "use_act_checkpoint": false
      },
      "enc_unpacker": {
        "_target_": "flextok.model.postprocessors.seq_unpacking.SequenceUnpacker",
        "packed_seq_read_key": "enc_packed_seq",
        "inner_seq_write_keys": [
          "enc_vae_latents_patched",
          "enc_registers"
        ],
        "inner_packed_shapes_read_key": "enc_ps_inner",
        "outer_packed_shapes_read_key": "enc_ps_outer"
      },
      "enc_to_latents": {
        "_target_": "flextok.model.postprocessors.heads.LinearHead",
        "read_key": "enc_registers",
        "write_key": "enc_registers",
        "dim": 1152,
        "dim_out": 6,
        "use_mup_readout": false,
        "weight_init_style": "zero",
        "dtype_override": null
      }
    }
  }
}
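This config is the Hydra-instantiable description of the FlexTok module graph (via the `_target_` entries), which is what `instantiate(config)` in the Usage section consumes. One detail worth noting: with finite scalar quantization (FSQ), the number of distinct token ids is the product of the per-channel `levels`, so the `[8, 8, 8, 5, 5, 5]` setting of the regularizer corresponds to a 64k-entry vocabulary. A quick check (illustrative, not part of the config):
```python
import math

# FSQ levels from the regularizer config above.
levels = [8, 8, 8, 5, 5, 5]
vocab_size = math.prod(levels)
print(vocab_size)  # 64000 possible discrete token ids per register
```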
model.safetensors ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:996036ef14ff39da93ef56b967ae2948538f68b7e37815e8065afa74442c37a9
size 3799019068
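The checkpoint itself is stored through Git LFS; the pointer above records its SHA-256 and size in bytes. If you download `model.safetensors` manually, a quick integrity check using only the standard library is:
```python
import hashlib
import os

# Compare a local download against the LFS pointer's oid and size.
path = 'model.safetensors'
h = hashlib.sha256()
with open(path, 'rb') as f:
    for chunk in iter(lambda: f.read(1 << 20), b''):
        h.update(chunk)

assert os.path.getsize(path) == 3799019068
assert h.hexdigest() == '996036ef14ff39da93ef56b967ae2948538f68b7e37815e8065afa74442c37a9'
```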