import os
import sys

# Set before torch is imported, preserving the original statement ordering.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch

# Presumably makes the local `videollama3` package importable when the script
# is launched from the repository root -- TODO confirm.
sys.path.append('./')
from videollama3 import disable_torch_init, model_init, mm_infer, get_model_output
from videollama3.mm_utils import load_video
import numpy as np
from PIL import Image


def infer_image(model, tokenizer):
    """Run the single-image demo query and print the model's answer.

    Reads the demo image ``demo/images/1.jpg`` and the mask stored in
    ``demo/masks/demo0.npy``, wraps the mask into a one-element ``uint8``
    tensor batch, and forwards everything to ``get_model_output`` with
    ``modal='image'``. The returned value is printed; nothing is returned.

    Args:
        model: Model object handed through to ``get_model_output``.
        tokenizer: Tokenizer object handed through to ``get_model_output``.
    """
    # Image is read first, then the mask (same file-access order as before).
    pixels = np.array(Image.open('demo/images/1.jpg'))

    # NOTE(review): the prompt reads "describe the in the image" -- a
    # placeholder token may be missing here; confirm against the model's
    # expected prompt format before changing the literal.
    prompt = '\nPlease describe the in the image in detail.'

    # A single mask becomes a batch of one along a new leading axis.
    region = np.load('demo/masks/demo0.npy')
    mask_batch = torch.from_numpy(np.array([region])).to(torch.uint8)

    # Every mask in the batch is tagged with id 0.
    ids = [0] * len(mask_batch)

    answer = get_model_output(
        [pixels],
        prompt,
        model=model,
        tokenizer=tokenizer,
        masks=mask_batch,
        mask_ids=ids,
        modal='image',
        image_downsampling=1,
    )
    print(answer)