from transformers import ViTFeatureExtractor, ViTForImageClassification
from PIL import Image
import numpy as np
import torch


class VitBase:
    """Wrap a pretrained ViT feature extractor and image classifier.

    Both components are loaded from the same checkpoint name when the
    object is constructed.
    """

    def __init__(self, model_name='google/vit-base-patch16-224'):
        """Load the feature extractor and the classification model.

        Args:
            model_name: Checkpoint identifier passed to ``from_pretrained``
                for both the feature extractor and the model. Defaults to
                the checkpoint this class originally hard-coded.
        """
        self.feature_extractor = ViTFeatureExtractor.from_pretrained(model_name)
        self.model = ViTForImageClassification.from_pretrained(model_name)

    def extract_feature(self, imgs):
        """Preprocess each image, run it through the model, and collect inputs.

        Args:
            imgs: Iterable of images in whatever form the feature extractor
                accepts (presumably PIL images or arrays -- confirm against
                the caller).

        Returns:
            A list with one NumPy array per element of ``imgs``: the
            preprocessed ``pixel_values`` with singleton dimensions squeezed
            out. An empty ``imgs`` yields an empty list.

        NOTE(review): the model output is computed and its type printed, but
        it is not part of the return value; the returned arrays are the
        preprocessed pixels, not model embeddings or logits. Confirm that
        this is what callers expect from a method named ``extract_feature``.
        """
        features = []
        for img in imgs:
            # ViTForImageClassification is the PyTorch model class, so the
            # extractor must produce PyTorch tensors (not TensorFlow ones).
            inputs = self.feature_extractor(images=img, return_tensors="pt")
            print('keys: ', inputs.keys())
            # Inference only: skip autograd bookkeeping. The extractor output
            # is a mapping, so it is unpacked into keyword arguments rather
            # than passed positionally as ``pixel_values``.
            with torch.no_grad():
                outputs = self.model(**inputs)
            print('--> f', type(outputs))
            features.append(np.squeeze(inputs['pixel_values'].numpy()))
        # Guard the debug print so an empty input does not raise IndexError.
        if features:
            print('shape:::', features[0].shape)
        return features