```python import torch import requests from PIL import Image from transformers import AutoModel, AutoProcessor, AutoImageProcessor from transformers import Qwen2VLForConditionalGeneration image_processor = AutoImageProcessor.from_pretrained("shilinxu/Qwen2-VL-2B-ViT", trust_remote_code=True) vit = AutoModel.from_pretrained("shilinxu/Qwen2-VL-2B-ViT", trust_remote_code=True, device_map='auto',torch_dtype=torch.bfloat16, attn_implementation='flash_attention_2') url = "https://qianwen-res.oss-cn-beijing.aliyuncs.com/Qwen-VL/assets/demo.jpeg" image = Image.open(requests.get(url, stream=True).raw).convert("RGB") images = [image] inputs = image_processor(images, return_tensors="pt") pixel_values = inputs['pixel_values'].to(device=vit.device, dtype=vit.dtype) image_grid_thw = inputs['image_grid_thw'] image_embeds = vit(pixel_values, grid_thw=image_grid_thw) ```