---
name: EVA-CLIP-8B-SFT
base_model: BAAI/EVA-CLIP-8B
license: apache-2.0
pipeline_tag: image-to-text
size:
  - 30100210576
  - 30GB
  - 10GB x 3
  - 3GB x 1
tasks:
  - image-to-text
  - image-text-to-text
  - visual-question-answering
  - zero-shot-image-classification
language: en
datasets:
  - laion/laion2B-en
  - kakaobrain/coyo-700m
---

> [!IMPORTANT]
> Original Model Link: [https://huggingface.co/BAAI/EVA-CLIP-8B](https://huggingface.co/BAAI/EVA-CLIP-8B)

# EVA-CLIP-8B-SFT

EVA-CLIP-8B in Safetensors format with no quantization.

## Usage
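The weights ship as sharded Safetensors files (roughly 10 GB x 3 plus 3 GB x 1, about 30 GB in total). As a minimal sketch, you could fetch the shards and sanity-check their total size with `huggingface_hub` before running the examples below; the repo id here is a placeholder for wherever this SFT export is hosted.

```python
from huggingface_hub import HfApi, snapshot_download

repo_id = "your-namespace/EVA-CLIP-8B-SFT"  # placeholder: replace with the actual repo id

# total size of the *.safetensors shards as reported by the Hub
info = HfApi().model_info(repo_id, files_metadata=True)
total = sum((f.size or 0) for f in info.siblings if f.rfilename.endswith(".safetensors"))
print(f"safetensors shards: {total / 1e9:.1f} GB")

# download everything locally and point model_name_or_path at this directory
local_dir = snapshot_download(repo_id)
```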
### Huggingface Version

```python
from PIL import Image
from transformers import AutoModel
from transformers import CLIPImageProcessor, CLIPTokenizer
import torch
import torchvision.transforms as T
from torchvision.transforms import InterpolationMode

image_path = "CLIP.png"
model_name_or_path = "BAAI/EVA-CLIP-8B"  # or /path/to/local/EVA-CLIP-8B
image_size = 224

processor = CLIPImageProcessor.from_pretrained("openai/clip-vit-large-patch14")

# use the image processor with a custom config
# processor = CLIPImageProcessor(size={"shortest_edge": image_size}, do_center_crop=True, crop_size=image_size)

## you can also build the image transform directly with torchvision
## squash
# processor = T.Compose(
#     [
#         T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
#         T.Resize((image_size, image_size), interpolation=InterpolationMode.BICUBIC),
#         T.ToTensor(),
#         T.Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
#     ]
# )

## shortest
# processor = T.Compose(
#     [
#         T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
#         T.Resize(image_size, interpolation=InterpolationMode.BICUBIC),
#         T.CenterCrop(image_size),
#         T.ToTensor(),
#         T.Normalize(mean=(0.48145466, 0.4578275, 0.40821073), std=(0.26862954, 0.26130258, 0.27577711))
#     ]
# )

model = AutoModel.from_pretrained(
    model_name_or_path,
    torch_dtype=torch.float16,
    trust_remote_code=True).to('cuda').eval()

image = Image.open(image_path)
captions = ["a diagram", "a dog", "a cat"]

tokenizer = CLIPTokenizer.from_pretrained(model_name_or_path)
input_ids = tokenizer(captions, return_tensors="pt", padding=True).input_ids.to('cuda')
input_pixels = processor(images=image, return_tensors="pt", padding=True).pixel_values.to('cuda')

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.encode_image(input_pixels)
    text_features = model.encode_text(input_ids)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

label_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
print(f"Label probs: {label_probs}")
```
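If you only want the predicted caption rather than the full probability tensor, a minimal follow-up (assuming the `captions` list and `label_probs` tensor from the snippet above) is:

```python
# pick the caption with the highest zero-shot probability
top_idx = label_probs.argmax(dim=-1).item()
print(f"Predicted caption: {captions[top_idx]} ({label_probs[0, top_idx].item():.4f})")
```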
### PyTorch Version

[GitHub](https://github.com/baaivision/EVA/tree/master/EVA-CLIP-8B)

```python
import torch
from eva_clip import create_model_and_transforms, get_tokenizer
from PIL import Image

model_name = "EVA-CLIP-8B"
pretrained = "eva_clip"  # or "/path/to/EVA_CLIP_18B_psz14_s6B.fp16.pt"
image_path = "CLIP.png"
caption = ["a diagram", "a dog", "a cat"]
device = "cuda" if torch.cuda.is_available() else "cpu"

model, _, processor = create_model_and_transforms(model_name, pretrained, force_custom_clip=True)
tokenizer = get_tokenizer(model_name)
model = model.to(device)

image = processor(Image.open(image_path)).unsqueeze(0).to(device)
text = tokenizer(caption).to(device)

with torch.no_grad(), torch.cuda.amp.autocast():
    image_features = model.encode_image(image)
    text_features = model.encode_text(text)
    image_features /= image_features.norm(dim=-1, keepdim=True)
    text_features /= text_features.norm(dim=-1, keepdim=True)

text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)

print("Label probs:", text_probs)
```

You can leverage [deepspeed.zero.Init()](https://deepspeed.readthedocs.io/en/stable/zero3.html#constructing-massive-models) with DeepSpeed ZeRO stage 3 if you have limited CPU memory. When loading a pretrained checkpoint inside a `deepspeed.zero.Init()` context, it is advised to use the `load_zero_partitions()` function in `eva_clip/factory.py`.
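For reference, here is a minimal sketch of that setup. The ZeRO config values are placeholders, `deepspeed.zero.Init(config_dict_or_path=...)` is the standard DeepSpeed API for partitioned model construction, and the checkpoint itself would then be loaded through `load_zero_partitions()` in `eva_clip/factory.py` rather than the `pretrained` argument.

```python
import deepspeed
from eva_clip import create_model_and_transforms

# placeholder ZeRO stage-3 config; tune micro-batch size, offloading, etc. for your setup
ds_config = {
    "train_micro_batch_size_per_gpu": 1,
    "zero_optimization": {"stage": 3},
}

# run under a distributed launcher (e.g. `deepspeed your_script.py`) so that parameters
# are partitioned across ranks at construction time, instead of materializing the full
# 8B-parameter model on every CPU
with deepspeed.zero.Init(config_dict_or_path=ds_config):
    model, _, processor = create_model_and_transforms("EVA-CLIP-8B", force_custom_clip=True)

# afterwards, load the pretrained weights with load_zero_partitions()
# from eva_clip/factory.py (see that file for the exact arguments)
```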
## BibTeX & Citation

```bibtex
@article{EVA-CLIP-18B,
  title={EVA-CLIP-18B: Scaling CLIP to 18 Billion Parameters},
  author={Quan Sun and Jinsheng Wang and Qiying Yu and Yufeng Cui and Fan Zhang and Xiaosong Zhang and Xinlong Wang},
  journal={arXiv preprint arXiv:2402.04252},
  year={2024}
}
```