Zero-Shot Image Classification
PerceptionEncoder
berniebear committed on
Commit
7fe502b
·
verified ·
1 Parent(s): 3338906

Update README.md

Browse files
Files changed (1) hide show
  1. README.md +12 -14
README.md CHANGED
@@ -68,27 +68,25 @@ This will install an editable version of repo, allowing you to make changes to t
68
  ## Image and Text Feature extraction with a Trained Model
69
  ```python
70
  import torch
71
- from core.vision_encoder.factory import create_model_and_transforms, get_tokenizer
72
  from PIL import Image
 
 
73
 
74
- model_name = 'PEv1-B16-224'
75
- pretrained = 'PATH_TO_PE_Core_B16_224'
76
 
77
- model, _, preprocess = create_model_and_transforms(
78
- model_name,
79
- pretrained=pretrained,
80
- )
81
  model = model.cuda()
82
- tokenizer = get_tokenizer(model_name)
83
- image = preprocess(Image.open("docs/cat.png")).unsqueeze(0).cuda()
 
 
 
84
  text = tokenizer(["a diagram", "a dog", "a cat"]).cuda()
85
 
86
  with torch.no_grad(), torch.autocast("cuda"):
87
- image_features = model.encode_image(image)
88
- text_features = model.encode_text(text)
89
- image_features /= image_features.norm(dim=-1, keepdim=True)
90
- text_features /= text_features.norm(dim=-1, keepdim=True)
91
- text_probs = (100.0 * image_features @ text_features.T).softmax(dim=-1)
92
 
93
  print("Label probs:", text_probs) # prints: [[0.0, 0.0, 1.0]]
94
  ```
 
68
  ## Image and Text Feature extraction with a Trained Model
69
  ```python
70
  import torch
 
71
  from PIL import Image
72
+ import core.vision_encoder.pe as pe
73
+ import core.vision_encoder.transforms as transforms
74
 
75
+ print("CLIP configs:", pe.CLIP.available_configs())
76
+ # CLIP configs: ['PE-Core-G14-448', 'PE-Core-L14-336', 'PE-Core-B16-224']
77
 
78
+ model = pe.CLIP.from_config("PE-Core-B16-224", pretrained=True) # Downloads from HF
 
 
 
79
  model = model.cuda()
80
+
81
+ preprocess = transforms.get_image_transform(model.image_size)
82
+ tokenizer = transforms.get_text_tokenizer(model.context_length)
83
+
84
+ image = preprocess(Image.open("docs/assets/cat.png")).unsqueeze(0).cuda()
85
  text = tokenizer(["a diagram", "a dog", "a cat"]).cuda()
86
 
87
  with torch.no_grad(), torch.autocast("cuda"):
88
+ image_features, text_features, logit_scale = model(image, text)
89
+ text_probs = (logit_scale * image_features @ text_features.T).softmax(dim=-1)
 
 
 
90
 
91
  print("Label probs:", text_probs) # prints: [[0.0, 0.0, 1.0]]
92
  ```