diff --git a/.gitattributes b/.gitattributes old mode 100644 new mode 100755 diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 38006366fa5497155cb061bc7ef6ac58d30100c0..719bea262b19b85180d9363124dae088e8e5a37b --- a/README.md +++ b/README.md @@ -1,14 +1,113 @@ --- -title: MSAE -emoji: 🌖 -colorFrom: gray -colorTo: pink -sdk: gradio -sdk_version: 5.38.0 -app_file: app.py -pinned: false license: mit -short_description: Application of MSAE from https://github.com/WolodjaZ/MSAE +datasets: +- pixparse/cc3m-wds +base_model: +- openai/clip-vit-large-patch14 +- openai/clip-vit-base-patch16 +tags: +- clip +- vision +- text --- -Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference +# Matryoshka Sparse Autoencoders (MSAE) for CLIP + +This repository provides PyTorch implementations of Matryoshka Sparse Autoencoders (MSAEs) trained on the image encoder of CLIP (ViT-L/14 and ViT-B/16). These models are designed to learn interpretable, hierarchical features from complex multimodal representations. + +For a deeper dive into the underlying theory and the full research implementation, please see the original [MSAE repository](https://github.com/WolodjaZ/MSAE) and the accompanying paper. + +## What is a Sparse Autoencoder (SAE)? + +Sparse autoencoders (SAEs) are useful for detecting and steering interpretable features within complex neural networks. They learn to represent complex data in a sparse manner, meaning that only a small number of neurons are activated at any given time while still allowing the input data to be reconstructed. This sparsity leads to more interpretable representations, as each active neuron can be associated with a specific feature or concept. As a result, SAEs can be used to identify and manipulate specific features in the data, making them powerful tools for understanding and controlling the behavior of neural networks. 
+ +## Key Features + +- **Interpretability**: SAEs learn to decompose complex representations into sparse, interpretable features. This allows for a better understanding of what the model has learned. +- **Hierarchical Features**: The Matryoshka SAE (MSAE) architecture learns features at multiple granularities simultaneously, from fine-grained details to high-level concepts. +- **Model Steering**: By identifying and manipulating specific features, you can steer the behavior of the CLIP model. +- **Simple Integration**: The provided `sae.py` module allows for easy loading and integration of the trained models into your own projects. + +## Repository Structure + +The repository is organized as follows: + +- `sae.py`: A self-contained Python module with the SAE and MSAE model implementations to run the inference. +- `clip_disect_20k.txt`: A vocabulary file containing 20,000 concept names used for interpreting the learned features. +- `ViT-L_14/`: Contains the trained SAE models for the CLIP ViT-L/14 image encoder. +- `ViT-B_16/`: Contains the trained SAE models for the CLIP ViT-B/16 image encoder. + +Each model directory (`ViT-L_14` and `ViT-B_16`) is further subdivided into: + +- `centered/`: Models trained on mean-centered features. +- `not_centered/`: Models trained on non-centered features. + +Additionally, each directory contains `.pth` files for the model weights and `.npy` files for the concept matching scores. + +## Understanding the Model Names + +The model filenames follow a consistent naming convention that encodes the model's hyperparameters. Here's how to interpret a typical filename: + +`{n_latents}_{n_inputs}_{activation}_{k}_{weighting}_{tied}_{normalized}_{soft_cap}_{dataset}.pth` + +Where: + +- `n_latents`: The number of latent features in the SAE. +- `n_inputs`: The input dimensionality (e.g., 768 for ViT-L/14, 512 for ViT-B/16). +- `activation`: The activation function used (e.g., `TopKReLU`). 
+- `k`: The smallest number of active latents used by the `TopK` activation during training. +- `weighting`: Whether the model was trained with uniform weighting (UW) or reverse weighting (RW). +- `tied`: Indicates if the model encoder is tied to the decoder. +- `normalized`: Indicates if the model was trained with normalized inputs. +- `soft_cap`: Indicates if the model uses soft capping for the latent features. +- `dataset`: The dataset used for training (e.g., `cc3m`). + +The concept matching scores are stored in `.npy` files with a similar naming convention: `Concept_Interpreter_{model_name}_{vocab_name}.npy`, where `vocab_name` indicates the vocabulary used for concept matching. + +## How to Use + +To get started, you'll need to have PyTorch, NumPy, and `huggingface_hub` installed. + +```bash +pip install torch numpy huggingface_hub +``` + +First, copy the `sae.py` file to your working directory. Then, you can load a model and its corresponding concept vocabulary as follows: + +```python +import torch +import numpy as np +from sae import SAE +from huggingface_hub import hf_hub_download + +# Download the SAE model weights +weights_path = hf_hub_download( +    repo_id="WolodjaZ/MSAE", +    filename="ViT-L_14/centered/6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth" +) +sae_model = SAE(weights_path) + +# Download the concept matching scores for the model +vocab_path = hf_hub_download( +    repo_id="WolodjaZ/MSAE", +    filename="ViT-L_14/centered/Concept_Interpreter_6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy" +) +concept_match_scores = np.load(vocab_path) + +# Load the vocabulary names +with open('clip_disect_20k.txt', 'r') as f: +    vocab_names = [line.strip() for line in f.readlines()] + +print(f"Concept match scores shape: {concept_match_scores.shape}") +print(f"Vocabulary size: {len(vocab_names)}") + +# Now you can use the model to encode and decode your own data +# For a detailed example, please refer to the demo 
notebook in the original repository: +# https://github.com/WolodjaZ/MSAE/blob/main/demo.ipynb +``` + +This example demonstrates how to load a specific SAE model and its associated concept names. You can adapt the `filename` in `hf_hub_download` to load any of the other available models. For a complete guide on how to use the model for feature extraction and steering, please refer to the [demo notebook](https://github.com/WolodjaZ/MSAE/blob/main/demo.ipynb) in the original MSAE repository. + +## Citation + +Paper: https://arxiv.org/abs/2502.20578 \ No newline at end of file diff --git a/ViT-B_16/centered/16384_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth b/ViT-B_16/centered/16384_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth new file mode 100755 index 0000000000000000000000000000000000000000..65c737cfe6ae2609650d9cf3971b49cd32b72281 --- /dev/null +++ b/ViT-B_16/centered/16384_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f87530f9bba1b8366b2213b3ad23d497427d9bddefb4862de0030955c92e98f2 +size 67314062 diff --git a/ViT-B_16/centered/16384_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth b/ViT-B_16/centered/16384_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth new file mode 100755 index 0000000000000000000000000000000000000000..b1c538fbfef15249efa9befa65f6a2a5ffce95a4 --- /dev/null +++ b/ViT-B_16/centered/16384_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8780986a61a4ef0c4181f5204966027980e34f8dbbfc3ad0fa0b723f70fa5509 +size 67314062 diff --git a/ViT-B_16/centered/4096_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth b/ViT-B_16/centered/4096_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth new 
file mode 100755 index 0000000000000000000000000000000000000000..6a8dc2af70fcaeb7ad402fd2cc80ed534715e670 --- /dev/null +++ b/ViT-B_16/centered/4096_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28d923c96b7d26a4fab04e20f9dbd4d223cf1f512fc847104dd7a53dc7d73035 +size 16834948 diff --git a/ViT-B_16/centered/4096_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth b/ViT-B_16/centered/4096_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth new file mode 100755 index 0000000000000000000000000000000000000000..dcf4409943aa6f09f9b802aef9553739b42aa605 --- /dev/null +++ b/ViT-B_16/centered/4096_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1794297e3b88c6a1dee2affa50e28d2e5d132c388dbaf7904dd61e32547b3815 +size 16834948 diff --git a/ViT-B_16/centered/8192_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth b/ViT-B_16/centered/8192_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth new file mode 100755 index 0000000000000000000000000000000000000000..dcde75f6fa7a384e9fd53e37c926a59bc23b21bf --- /dev/null +++ b/ViT-B_16/centered/8192_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26ad0cec5b81140885e9331ff0367956149f76b472b11fb7ee335e7e840c6bd3 +size 33661316 diff --git a/ViT-B_16/centered/8192_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth b/ViT-B_16/centered/8192_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth new file mode 100755 index 0000000000000000000000000000000000000000..4af1074bf41a822d2a1834b90987570994db89df --- /dev/null +++ 
b/ViT-B_16/centered/8192_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd80b19a40cfd5dd9b28feeddff95c6429e2929c6fe280fed4c191b24f7ca1ad +size 33661316 diff --git a/ViT-B_16/centered/Concept_Interpreter_16384_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy b/ViT-B_16/centered/Concept_Interpreter_16384_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy new file mode 100755 index 0000000000000000000000000000000000000000..03ac9143f70751fcc8229d9ea085d54d9a6da50d --- /dev/null +++ b/ViT-B_16/centered/Concept_Interpreter_16384_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41064a0812be9559cb5c1e789f8cf4ec02732fa167ff92d014b047757779d749 +size 1310720128 diff --git a/ViT-B_16/centered/Concept_Interpreter_16384_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy b/ViT-B_16/centered/Concept_Interpreter_16384_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy new file mode 100755 index 0000000000000000000000000000000000000000..656f28eeb9cf25145d5e2bf182863c606db650e1 --- /dev/null +++ b/ViT-B_16/centered/Concept_Interpreter_16384_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06c9013aae027f6c206a5423da29bd92cf7b81ef63bdf1bb23c170a7e98c3b5a +size 1310720128 diff --git a/ViT-B_16/centered/Concept_Interpreter_4096_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy 
b/ViT-B_16/centered/Concept_Interpreter_4096_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy new file mode 100755 index 0000000000000000000000000000000000000000..13c35f8ec34c12011c10b569b012c44a7b2a59db --- /dev/null +++ b/ViT-B_16/centered/Concept_Interpreter_4096_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1aefad1d81ba571c859493d094890902fb388a2950404bfccb98b2b4a31a4ad8 +size 327680128 diff --git a/ViT-B_16/centered/Concept_Interpreter_4096_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy b/ViT-B_16/centered/Concept_Interpreter_4096_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy new file mode 100755 index 0000000000000000000000000000000000000000..5ce882b0adc3f42b5de01a60d40c868c0c9e7050 --- /dev/null +++ b/ViT-B_16/centered/Concept_Interpreter_4096_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95e4b4c20c15212778fa109da604e2ce015d977aa20414d74274407afef0f4de +size 327680128 diff --git a/ViT-B_16/centered/Concept_Interpreter_8192_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy b/ViT-B_16/centered/Concept_Interpreter_8192_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy new file mode 100755 index 0000000000000000000000000000000000000000..ceddefce2779bc9f6d8750189814609c2889ffad --- /dev/null +++ b/ViT-B_16/centered/Concept_Interpreter_8192_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:871db72478420c54dc686816152f82e8f3bd9e75bf88fb5848aa74c372eddf51 +size 655360128 diff --git a/ViT-B_16/centered/Concept_Interpreter_8192_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy b/ViT-B_16/centered/Concept_Interpreter_8192_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy new file mode 100755 index 0000000000000000000000000000000000000000..45f0c6125c54fe8782656e06dec281e7e0be643b --- /dev/null +++ b/ViT-B_16/centered/Concept_Interpreter_8192_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ecdefa4e94a864a6f108281143c73e72b592865cf3b891e1324f05de218fc4da +size 655360128 diff --git a/ViT-B_16/not_centered/16384_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth b/ViT-B_16/not_centered/16384_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth new file mode 100755 index 0000000000000000000000000000000000000000..5cbf019049a233c8b60df9ef890a8ff77bd07dca --- /dev/null +++ b/ViT-B_16/not_centered/16384_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8883a4cc01604f24f76d379f304e1427bbfb22fed33344210f1cf70f26938a58 +size 67314062 diff --git a/ViT-B_16/not_centered/16384_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth b/ViT-B_16/not_centered/16384_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth new file mode 100755 index 0000000000000000000000000000000000000000..46151bf592c4b32530e33027d6928a90fcc23564 --- /dev/null +++ b/ViT-B_16/not_centered/16384_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth @@ -0,0 +1,3 @@ +version 
https://git-lfs.github.com/spec/v1 +oid sha256:8e30463d780a1e61739335d8c6c9dc703831704af30c222f796fa16b2b1abeae +size 67314062 diff --git a/ViT-B_16/not_centered/4096_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth b/ViT-B_16/not_centered/4096_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth new file mode 100755 index 0000000000000000000000000000000000000000..69abe751190fdf3329784c303654a7aee713b792 --- /dev/null +++ b/ViT-B_16/not_centered/4096_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a4815f602c2fbd9bdbc5fc401d778b14ac69218639f5c180ddc528c733a3965 +size 16834948 diff --git a/ViT-B_16/not_centered/4096_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth b/ViT-B_16/not_centered/4096_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth new file mode 100755 index 0000000000000000000000000000000000000000..1dbd4d2492578e1d7285062a7251072b6943084b --- /dev/null +++ b/ViT-B_16/not_centered/4096_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:acbcb059adb3ad773f5790fe479ad35ef3ef8ea0a2b8ef7703106f8f7dfda6ef +size 16834948 diff --git a/ViT-B_16/not_centered/8192_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth b/ViT-B_16/not_centered/8192_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth new file mode 100755 index 0000000000000000000000000000000000000000..b31aa35eaa1ac22ece2134c1487ff106e55bcde2 --- /dev/null +++ b/ViT-B_16/not_centered/8192_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a2e8919d2b6ae1ba055c7cb1111039e9083e1bcd9c2ff79864e8943b4897ff1 +size 33661316 diff --git 
a/ViT-B_16/not_centered/8192_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth b/ViT-B_16/not_centered/8192_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth new file mode 100755 index 0000000000000000000000000000000000000000..ceb09fb98acc2d79ccb5124d56fd6b79f3b70a66 --- /dev/null +++ b/ViT-B_16/not_centered/8192_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa782f88b1a37a218ccbc40399b6d1394aa42747e84433dd776aaacc5888a344 +size 33661316 diff --git a/ViT-B_16/not_centered/Concept_Interpreter_16384_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy b/ViT-B_16/not_centered/Concept_Interpreter_16384_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy new file mode 100755 index 0000000000000000000000000000000000000000..d308b397dff6a747f11f4e5ca954deaae646fbb1 --- /dev/null +++ b/ViT-B_16/not_centered/Concept_Interpreter_16384_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cc88a52b473247d088f0f65a1ee58a4864da31af215a9076821b0fbe59b5814 +size 1310720128 diff --git a/ViT-B_16/not_centered/Concept_Interpreter_16384_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy b/ViT-B_16/not_centered/Concept_Interpreter_16384_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy new file mode 100755 index 0000000000000000000000000000000000000000..c27b4c622e1ec9c0c56fb0d5af094310735d2a37 --- /dev/null +++ b/ViT-B_16/not_centered/Concept_Interpreter_16384_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy @@ 
-0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ec9aab849549e3fce328a8bfbb93f65039021402ea5c35504991d46205752b1 +size 1310720128 diff --git a/ViT-B_16/not_centered/Concept_Interpreter_4096_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy b/ViT-B_16/not_centered/Concept_Interpreter_4096_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy new file mode 100755 index 0000000000000000000000000000000000000000..49d077186e4cea1583c06b58fd0df56716a0f387 --- /dev/null +++ b/ViT-B_16/not_centered/Concept_Interpreter_4096_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4fe591941506764938b9e3ba5bc52133765f2911b24c232e202a70773b899e7 +size 327680128 diff --git a/ViT-B_16/not_centered/Concept_Interpreter_4096_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy b/ViT-B_16/not_centered/Concept_Interpreter_4096_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy new file mode 100755 index 0000000000000000000000000000000000000000..bfd96f2e126b1b2c4c28e4d53207849209bae8a8 --- /dev/null +++ b/ViT-B_16/not_centered/Concept_Interpreter_4096_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95aa6a46a5302eac986b6e21b17f712c2266c2ac0d2d5536c915221c1d7d8cb2 +size 327680128 diff --git a/ViT-B_16/not_centered/Concept_Interpreter_8192_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy 
b/ViT-B_16/not_centered/Concept_Interpreter_8192_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy new file mode 100755 index 0000000000000000000000000000000000000000..5609884a36abcec4f628e80ccdc953bc48ad6d97 --- /dev/null +++ b/ViT-B_16/not_centered/Concept_Interpreter_8192_512_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72fb8fe9b90785663d7d73cd92683c6ed912b69a9f7a8db2a08f9a367ced18dd +size 655360128 diff --git a/ViT-B_16/not_centered/Concept_Interpreter_8192_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy b/ViT-B_16/not_centered/Concept_Interpreter_8192_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy new file mode 100755 index 0000000000000000000000000000000000000000..d390eb2bc21a3fc979db9d899c9dcc05c87b9e8a --- /dev/null +++ b/ViT-B_16/not_centered/Concept_Interpreter_8192_512_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-B~16_train_image_2905936_512_disect_ViT-B~16_-1_text_20000_512.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc2b0f649d84caa867172d3fb9d141cc9d833aa4f5160e426de82c51b066dc2f +size 655360128 diff --git a/ViT-L_14/centered/12288_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth b/ViT-L_14/centered/12288_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth new file mode 100755 index 0000000000000000000000000000000000000000..d5492583912ed7818276b18a0c1e656eb982fefe --- /dev/null +++ b/ViT-L_14/centered/12288_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d0f6016b5914c24a178d07241fe7c20cc74936fe8dec60afac16101dbd393e5 +size 75655630 diff --git 
a/ViT-L_14/centered/12288_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth b/ViT-L_14/centered/12288_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth new file mode 100755 index 0000000000000000000000000000000000000000..4e6c6acdc8c7bea37d50b50548f871a7f9aa799f --- /dev/null +++ b/ViT-L_14/centered/12288_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31e37f1f1510451ac51805bd66119e0bdae8138441ca1ded369d5f206479ded7 +size 75655630 diff --git a/ViT-L_14/centered/24576_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth b/ViT-L_14/centered/24576_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth new file mode 100755 index 0000000000000000000000000000000000000000..c108c0466dc0e820a3c4d34d1c8eda555ed0764d --- /dev/null +++ b/ViT-L_14/centered/24576_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d5fc87254b011b312fbef112df2d239e649f8de2173cae25ac3c39099e29c6e +size 151300558 diff --git a/ViT-L_14/centered/24576_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth b/ViT-L_14/centered/24576_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth new file mode 100755 index 0000000000000000000000000000000000000000..2fbbd7548baa6fc5bd3672ed5605116b2cd5bd65 --- /dev/null +++ b/ViT-L_14/centered/24576_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb8b13eb302c8494b8af4c82f76d4af1fe54455b2aa6dc8af7deb6e5ffe8ab15 +size 151300558 diff --git a/6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth b/ViT-L_14/centered/6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth old mode 
100644 new mode 100755 similarity index 100% rename from 6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth rename to ViT-L_14/centered/6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth diff --git a/ViT-L_14/centered/6144_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth b/ViT-L_14/centered/6144_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth new file mode 100755 index 0000000000000000000000000000000000000000..322c88755c05b3012a2d35797baf322d5d4b4795 --- /dev/null +++ b/ViT-L_14/centered/6144_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:816390b726fff214280ae527f16caa2d27274ecd85c1be0ee93a1506de8a6941 +size 37833156 diff --git a/ViT-L_14/centered/Concept_Interpreter_12288_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy b/ViT-L_14/centered/Concept_Interpreter_12288_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy new file mode 100755 index 0000000000000000000000000000000000000000..fc69cd2dc369f0430d2271422e04a920d376ee8f --- /dev/null +++ b/ViT-L_14/centered/Concept_Interpreter_12288_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5d7412d7020c8a52052b821e9428bd8e798d0dbdf05b3f9cd2832db27c11dee +size 983040128 diff --git a/ViT-L_14/centered/Concept_Interpreter_12288_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy b/ViT-L_14/centered/Concept_Interpreter_12288_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy new file mode 100755 index 
0000000000000000000000000000000000000000..db7fd7adef1d9a6b2d0cc8727aa8b5a3ee5297bf --- /dev/null +++ b/ViT-L_14/centered/Concept_Interpreter_12288_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:923f2eadbca66a0b872e344b6e41098248ec1052cfee114080b7b09b12a0882d +size 983040128 diff --git a/ViT-L_14/centered/Concept_Interpreter_24576_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy b/ViT-L_14/centered/Concept_Interpreter_24576_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy new file mode 100755 index 0000000000000000000000000000000000000000..fc42eb693460754d35de039160268d61224b8e71 --- /dev/null +++ b/ViT-L_14/centered/Concept_Interpreter_24576_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8363b945516e42a38f9e86f02300051cc9e132039f53bca7b40684a06557adc +size 1966080128 diff --git a/ViT-L_14/centered/Concept_Interpreter_24576_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy b/ViT-L_14/centered/Concept_Interpreter_24576_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy new file mode 100755 index 0000000000000000000000000000000000000000..1423281761999d28b6d7e24546b86378a5508518 --- /dev/null +++ b/ViT-L_14/centered/Concept_Interpreter_24576_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93b8c7c2a58d6546684a448b3a163d55fa6a1b07fd454c385203eacf00826e82 +size 1966080128 diff --git 
a/Concept_Interpreter_6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy b/ViT-L_14/centered/Concept_Interpreter_6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy old mode 100644 new mode 100755 similarity index 100% rename from Concept_Interpreter_6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy rename to ViT-L_14/centered/Concept_Interpreter_6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy diff --git a/ViT-L_14/centered/Concept_Interpreter_6144_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy b/ViT-L_14/centered/Concept_Interpreter_6144_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy new file mode 100755 index 0000000000000000000000000000000000000000..9b0b4d61e488850c03e84e0ade052d91b4751eb1 --- /dev/null +++ b/ViT-L_14/centered/Concept_Interpreter_6144_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27481bc1557944d8ba6d8b36d405d4019e49285b5aafc80733c898ef62c315e4 +size 491520128 diff --git a/ViT-L_14/not_centered/._.DS_Store b/ViT-L_14/not_centered/._.DS_Store new file mode 100755 index 0000000000000000000000000000000000000000..28c42fb20a1f27e695fb64323501fc6476578b13 Binary files /dev/null and b/ViT-L_14/not_centered/._.DS_Store differ diff --git a/ViT-L_14/not_centered/12288_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth b/ViT-L_14/not_centered/12288_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth new file mode 100755 index 
0000000000000000000000000000000000000000..e7b8ec9f60cad91e776ef551207146bc5c8d857b --- /dev/null +++ b/ViT-L_14/not_centered/12288_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8386aa9529c4833f22188a51c5689be63f093d2a9a1e8d41df3b2440ce4c685f +size 75655630 diff --git a/ViT-L_14/not_centered/12288_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth b/ViT-L_14/not_centered/12288_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth new file mode 100755 index 0000000000000000000000000000000000000000..d059fe754bea0ffa4b520014b7c1afd38f78da9f --- /dev/null +++ b/ViT-L_14/not_centered/12288_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:030daf3813fdcea96093ed7e4d1c0297b090cadce911409047005484b524be88 +size 75655630 diff --git a/ViT-L_14/not_centered/24576_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth b/ViT-L_14/not_centered/24576_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth new file mode 100755 index 0000000000000000000000000000000000000000..8c35add6a767cc6f3137b2c97c459ed4dda582be --- /dev/null +++ b/ViT-L_14/not_centered/24576_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a4b154fa581c5d7e793ffffbcc46dfc03c4b6cfb7f8ad474f5c327125f4d091 +size 151300558 diff --git a/ViT-L_14/not_centered/24576_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth b/ViT-L_14/not_centered/24576_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth new file mode 100755 index 0000000000000000000000000000000000000000..378229881c65ac0231f11f0f7d930a1737f1389d --- /dev/null +++ 
b/ViT-L_14/not_centered/24576_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c368c8ea0d9c1b14c2c58c03689dcfdd6258833f6ab41103a30ccdff3c91ec43 +size 151300558 diff --git a/ViT-L_14/not_centered/6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth b/ViT-L_14/not_centered/6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth new file mode 100755 index 0000000000000000000000000000000000000000..3eff8aa54c49cf81733fff60a736867948afbe47 --- /dev/null +++ b/ViT-L_14/not_centered/6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed5f569cea3cd0da932e3fd92bf326d9fc8d3291f4ebf50f20a38558ea33ada2 +size 37833156 diff --git a/ViT-L_14/not_centered/6144_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth b/ViT-L_14/not_centered/6144_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth new file mode 100755 index 0000000000000000000000000000000000000000..2a74a2193686155a85aa44c6503bd9609888e4ee --- /dev/null +++ b/ViT-L_14/not_centered/6144_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:975b8526c5f0bc38f8e8ceeb496403d8710b988f548e8ad43c811ea1498e5d7c +size 37833156 diff --git a/ViT-L_14/not_centered/Concept_Interpreter_12288_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy b/ViT-L_14/not_centered/Concept_Interpreter_12288_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy new file mode 100755 index 0000000000000000000000000000000000000000..74656f30b732776108a27ab7d75734065fb5c239 --- /dev/null +++ 
b/ViT-L_14/not_centered/Concept_Interpreter_12288_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b23db51cc483b1e5079a8a70ba72216209c23b75d24ffa479986f9d9389b3fa6 +size 983040128 diff --git a/ViT-L_14/not_centered/Concept_Interpreter_12288_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy b/ViT-L_14/not_centered/Concept_Interpreter_12288_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy new file mode 100755 index 0000000000000000000000000000000000000000..61926191c7d70ad93ef30be446a863f489089dc2 --- /dev/null +++ b/ViT-L_14/not_centered/Concept_Interpreter_12288_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63e47558d5805593c7e448838eaae13ed68301322d97d4769fa6ebb3abec3455 +size 983040128 diff --git a/ViT-L_14/not_centered/Concept_Interpreter_24576_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy b/ViT-L_14/not_centered/Concept_Interpreter_24576_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy new file mode 100755 index 0000000000000000000000000000000000000000..4ac93495a495757b2b11c2b41ea1ea23d0e730c3 --- /dev/null +++ b/ViT-L_14/not_centered/Concept_Interpreter_24576_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dfd037dbd6b6bc76cf61e2e1060ce0a253ca1e76f43c2a440f0679df95f432b1 +size 1966080128 diff --git 
a/ViT-L_14/not_centered/Concept_Interpreter_24576_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy b/ViT-L_14/not_centered/Concept_Interpreter_24576_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy new file mode 100755 index 0000000000000000000000000000000000000000..67716a4a227de50bd6f297d99b05d2e54adbb1d2 --- /dev/null +++ b/ViT-L_14/not_centered/Concept_Interpreter_24576_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4afa8726f7d317476dee2a881b737a5f7a38955cf17e609d10d3d9a6c0254073 +size 1966080128 diff --git a/ViT-L_14/not_centered/Concept_Interpreter_6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy b/ViT-L_14/not_centered/Concept_Interpreter_6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy new file mode 100755 index 0000000000000000000000000000000000000000..d251712d001d078065ed310587d23e6b2ba72aec --- /dev/null +++ b/ViT-L_14/not_centered/Concept_Interpreter_6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:757bd2a3200eb2e5c9eef08bd916a84bc1db2ba3f1c6b682c5879810a4cf1f87 +size 491520128 diff --git a/ViT-L_14/not_centered/Concept_Interpreter_6144_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy b/ViT-L_14/not_centered/Concept_Interpreter_6144_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy new file mode 100755 index 0000000000000000000000000000000000000000..94c3d1117168fa1b90709b688a0eac11b509f49f --- /dev/null +++ 
b/ViT-L_14/not_centered/Concept_Interpreter_6144_768_TopKReLU_64_UW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a26c2aa9049d278654bb12fe0c96a625c61037f0f57b1b1e79ead161f5a89ba +size 491520128 diff --git a/app.py b/app.py deleted file mode 100644 index 39877b1711f843ac1ca52b56bf0aa42fd5c26abe..0000000000000000000000000000000000000000 --- a/app.py +++ /dev/null @@ -1,219 +0,0 @@ -import torch -import numpy as np -import gradio as gr -import matplotlib.pyplot as plt -import clip -from sae import SAE -import os - -# --- 1. Setup and Model Loading --- - -# Use GPU if available, otherwise CPU -device = "cuda" if torch.cuda.is_available() else "cpu" -print(f"Using device: {device}") - -# Define file paths for clarity -SAE_MODEL_PATH = '6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768.pth' -VOCAB_SCORES_PATH = 'Concept_Interpreter_6144_768_TopKReLU_64_RW_False_False_0.0_cc3m_ViT-L~14_train_image_2905936_768_disect_ViT-L~14_-1_text_20000_768.npy' -VOCAB_NAMES_PATH = 'clip_disect_20k.txt' - -# Load models and data -try: - # Load CLIP model - model, preprocess = clip.load("ViT-L/14", device=device) - - # Load Sparse Autoencoder (SAE) model - # Ensure the SAE class correctly handles moving the model to the specified device - sae_model = SAE(SAE_MODEL_PATH).to(device).eval() - - # Load pre-computed vocabulary scores and names - vocab_scores = np.load(VOCAB_SCORES_PATH) - with open(VOCAB_NAMES_PATH, 'r') as f: - vocab_names = [line.strip() for line in f.readlines()] - -except FileNotFoundError as e: - print(f"ERROR: A required file was not found: {e.filename}") - print("Please ensure all model and vocabulary files are present in the correct paths.") - # Exit if essential files are missing - exit() - -# Pre-calculate mappings for faster lookup -# For a given feature index, what is the best concept name? 
-feature_to_concept_score = np.max(vocab_scores, axis=0) -feature_to_concept_name_idx = np.argmax(vocab_scores, axis=0) - -# For a given concept name, what is the best feature index? -concept_to_feature_score = np.max(vocab_scores, axis=1) -concept_to_feature_idx = np.argmax(vocab_scores, axis=1) - - -# --- 2. Helper and Core Logic Functions --- - -def calculate_fvu(original_input, reconstruction): - """Calculates the Fraction of Variance Unexplained (FVU).""" - variance = (original_input - original_input.mean(dim=-1, keepdim=True)).var(dim=-1) - recon_error_variance = (original_input - reconstruction).var(dim=-1) - # Clamp to avoid division by zero or tiny numbers - fvu_val = (recon_error_variance / (variance + 1e-8)).mean() - return fvu_val.item() - -def predict(input_img, top_k, concept, neg_concept, max_strength): - """ - Main function to process an image, identify top concepts, and visualize concept manipulation. - """ - if not input_img: - raise gr.Error("Please provide an input image.") - - # --- Part A: Top Concepts Analysis --- - - # Preprocess the input image and move to the correct device - image_input_processed = preprocess(input_img.convert("RGB")).unsqueeze(0).to(device) - - with torch.no_grad(): - # Encode the image with CLIP - image_features = model.encode_image(image_input_processed).to(torch.float32) - - # Get SAE reconstruction and latent activations - reconstructed_features, _, full_latents = sae_model(image_features) - - fvu_score = calculate_fvu(image_features, reconstructed_features) - - # Get the top K activating SAE features for the image - full_latents = full_latents.cpu().flatten() - top_sae_values, top_sae_indices = full_latents.topk(k=top_k) - - # Create the bar plot for top concepts - fig_bar, ax_bar = plt.subplots(figsize=(10, 6)) - concept_labels = [ - f"{vocab_names[feature_to_concept_name_idx[i]]} ({feature_to_concept_score[i]:.2f})" - for i in top_sae_indices - ] - ax_bar.barh(range(top_k), top_sae_values.numpy(), color='skyblue') 
- ax_bar.set_yticks(range(top_k)) - ax_bar.set_yticklabels(concept_labels) - ax_bar.invert_yaxis() # Display top concept at the top - ax_bar.set_xlabel("SAE Feature Activation") - ax_bar.set_title(f"Top {top_k} Concepts (FVU: {fvu_score:.2f})") - plt.tight_layout() - - # --- Part B: Concept Manipulation --- - - # Validate the user-provided concept - if concept not in vocab_names: - raise gr.Error(f"Concept '{concept}' not found in vocabulary. Please choose a valid concept.") - - # Get the feature index corresponding to the chosen concept - concept_feature_id = concept_to_feature_idx[vocab_names.index(concept)] - concept_assign_score = concept_to_feature_score[vocab_names.index(concept)] - - # Create positive and negative text prompts - if not neg_concept: - neg_concept_prompt = f"a photo without {concept}" - else: - neg_concept_prompt = f"a photo with {neg_concept}" - - pos_concept_prompt = f"a photo with {concept}" - - # Tokenize prompts and encode with CLIP - text_labels = clip.tokenize([pos_concept_prompt, neg_concept_prompt]).to(device) - with torch.no_grad(): - text_features = model.encode_text(text_labels) - text_features /= text_features.norm(dim=-1, keepdim=True) - - # Define the range of strengths to test - strengths = torch.linspace(0.0, max_strength, 11).to(device) - pos_concept_probs, neg_concept_probs, cos_sims = [], [], [] - - original_reconstructed_norm = reconstructed_features / reconstructed_features.norm(dim=-1, keepdim=True) - - for st in strengths: - with torch.no_grad(): - # Create a copy of latents and modify the target concept feature - modified_latents = full_latents.clone().to(device).reshape(1, -1) - modified_latents[:, concept_feature_id] = st - - # Decode the modified latents back into feature space - modified_reconstructed = sae_model.model.decode(modified_latents) - - # Normalize for comparison - modified_reconstructed_norm = modified_reconstructed / modified_reconstructed.norm(dim=-1, keepdim=True) - - # Calculate similarity to the 
text prompts (probabilities) - probs = (100.0 * modified_reconstructed_norm @ text_features.T).softmax(dim=-1) - pos_concept_probs.append(probs[0, 0].item()) - neg_concept_probs.append(probs[0, 1].item()) - - # Calculate cosine similarity to the original reconstructed image - cos_sims.append( - torch.nn.functional.cosine_similarity(modified_reconstructed_norm, original_reconstructed_norm).item() - ) - - # Create the line plot for concept manipulation - fig_line, ax_line = plt.subplots(figsize=(10, 6)) - strengths_cpu = strengths.cpu().numpy() - ax_line.plot(strengths_cpu, pos_concept_probs, 'o-', label=f'"{pos_concept_prompt}"') - ax_line.plot(strengths_cpu, neg_concept_probs, 'o-', label=f'"{neg_concept_prompt}"') - - # Add cosine similarity on a secondary y-axis - ax2 = ax_line.twinx() - ax2.plot(strengths_cpu, cos_sims, 'x-', color='green', label='Similarity to Original') - ax2.set_ylabel('Cosine Similarity', color='green') - ax2.tick_params(axis='y', labelcolor='green') - - ax_line.set_xlabel("Strength of Concept Feature") - ax_line.set_ylabel("CLIP Probability") - ax_line.set_title(f"Effect of Modifying '{concept}' (Assignment Score: {concept_assign_score:.2f})") - fig_line.legend(loc="upper right", bbox_to_anchor=(1, 1), bbox_transform=ax_line.transAxes) - plt.tight_layout() - - # Close figures to free memory - plt.close(fig_bar) - plt.close(fig_line) - - return input_img, fig_bar, fig_line - - -# --- 3. Gradio Interface --- - -with gr.Blocks(theme=gr.themes.Soft(), title="Matryoshka Sparse Autoencoder (MSAE) Example") as demo: - gr.Markdown( - "Based on the paper: [Interpreting CLIP with Hierarchical Sparse Autoencoders](https://openreview.net/forum?id=5MQQsenQBm). " - "Upload an image to see its top activating concepts from a sparse autoencoder. Then, choose a concept (from `clip_disect_20k.txt`) to visualize how manipulating its corresponding concept magnitude affects the image representation." 
- ) - - with gr.Row(): - with gr.Column(scale=1): - # Input controls - image_input = gr.Image(label="Input Image", sources=['upload', 'webcam'], type="pil") - gr.Markdown("### Analysis & Manipulation Controls") - top_k_slider = gr.Slider(minimum=3, maximum=20, value=10, step=1, label="Top K Concepts") - concept_input = gr.Textbox(label="Concept to Manipulate", value="hair", placeholder="e.g., hair") - neg_concept_input = gr.Textbox(label="Negative Concept (Optional)", placeholder="e.g., a frown") - max_strength_slider = gr.Slider(minimum=1.0, maximum=20.0, value=10.0, step=0.5, label="Max Concept Strength") - submit_btn = gr.Button("Analyze and Interpret", variant="primary") - - with gr.Column(scale=2): - # Output displays - gr.Markdown("### Results") - output_image = gr.Image(label="Original Image", interactive=False) - output_bar_plot = gr.Plot(label="Top Activating Concepts") - output_line_plot = gr.Plot(label="Concept Manipulation Analysis") - - gr.Examples( - inputs=[image_input, top_k_slider, concept_input, neg_concept_input, max_strength_slider], - outputs=[output_image, output_bar_plot, output_line_plot], - fn=predict, - cache_examples=True # Set to True for faster loading on HF Spaces - ) - - # Wire up the button to the function - submit_btn.click( - fn=predict, - inputs=[image_input, top_k_slider, concept_input, neg_concept_input, max_strength_slider], - outputs=[output_image, output_bar_plot, output_line_plot] - ) - - -if __name__ == "__main__": - demo.launch(debug=True) - diff --git a/clip_disect_20k.txt b/clip_disect_20k.txt old mode 100644 new mode 100755 diff --git a/requirements.txt b/requirements.txt deleted file mode 100644 index ebfc3aec87f41eeab60b175bfed8351fe3dcc6ae..0000000000000000000000000000000000000000 --- a/requirements.txt +++ /dev/null @@ -1,6 +0,0 @@ -torch==2.2.2 -numpy==1.26.4 -torchvision==0.17.2 -matplotlib -gradio -git+https://github.com/openai/CLIP.git