# Spaces: Running on Zero  (page-header text captured with the source; not part of the program)
import os

# Expose only CUDA device 1 to this process. This assignment is deliberately
# kept ahead of `import torch` so the variable is already in the environment
# when torch is loaded.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import torch
import sys

# Put the current working directory on the import path so the `videollama3`
# package below can be resolved when the script is run from the repo root.
sys.path.append('./')

from videollama3 import disable_torch_init, model_init, mm_infer, get_model_output
from videollama3.mm_utils import load_video
import numpy as np
from PIL import Image
def infer_image(model, tokenizer, image_path='demo/images/1.jpg',
                mask_path='demo/masks/demo0.npy'):
    """Ask the model to describe a masked region of one image and print the answer.

    Args:
        model: Model handle forwarded unchanged to ``get_model_output``.
        tokenizer: Tokenizer forwarded unchanged to ``get_model_output``.
        image_path: Path of the image file to load.
            Defaults to ``'demo/images/1.jpg'``.
        mask_path: Path of the ``.npy`` file holding the region mask.
            Defaults to ``'demo/masks/demo0.npy'``.

    Returns:
        None. The model output is written to stdout.
    """
    # Image.open is lazy; building the array inside the ``with`` block forces
    # the pixel data to be read before the file handle is released.
    with Image.open(image_path) as image:
        image_data = np.array(image)

    question = '<image>\nPlease describe the <region> in the image in detail.'

    # Wrap the single mask in a list so the resulting array gains a leading
    # axis of length one, then hand it to torch as uint8.
    # NOTE(review): the expected mask shape/dtype is defined by
    # get_model_output, which is not visible here — confirm.
    masks = torch.from_numpy(np.array([np.load(mask_path)])).to(torch.uint8)

    # One id per mask, all zero.
    # NOTE(review): presumably the index of the image each mask belongs to —
    # verify against get_model_output.
    mask_ids = [0] * len(masks)

    output = get_model_output(
        [image_data],
        question,
        model=model,
        tokenizer=tokenizer,
        masks=masks,
        mask_ids=mask_ids,
        modal='image',
        image_downsampling=1,
    )
    print(output)
def infer_video(model, tokenizer, video_path='demo/videos/1.mp4',
                mask_path='demo/masks/demo1.npy', frame_idx=0):
    """Ask the model to describe a masked region of a video and print the answer.

    Args:
        model: Model handle forwarded unchanged to ``get_model_output``.
        tokenizer: Tokenizer forwarded unchanged to ``get_model_output``.
        video_path: Path of the video file to load.
            Defaults to ``'demo/videos/1.mp4'``.
        mask_path: Path of the ``.npy`` file holding the region mask.
            Defaults to ``'demo/masks/demo1.npy'``.
        frame_idx: Frame index passed to ``load_video`` as ``frame_ids``.
            The default ``0`` means the mask is taken from the first frame.

    Returns:
        None. The model output is written to stdout.
    """
    question = '<video>\nPlease describe the <region> in the video in detail.'

    # NOTE(review): the structure of load_video's return value is not visible
    # here; it is passed straight through to get_model_output.
    video_tensor = load_video(video_path, fps=1, max_frames=768, frame_ids=[frame_idx])

    # Wrap the single mask in a list so the resulting array gains a leading
    # axis of length one, then hand it to torch as uint8.
    masks = torch.from_numpy(np.array([np.load(mask_path)])).to(torch.uint8)

    # One id per mask, all zero.
    # NOTE(review): presumably an index into the ``frame_ids`` list given to
    # load_video — verify against get_model_output.
    mask_ids = [0] * len(masks)

    output = get_model_output(
        video_tensor,
        question,
        model=model,
        tokenizer=tokenizer,
        masks=masks,
        mask_ids=mask_ids,
        modal='video',
    )
    print(output)
def main(model_path=None):
    """Load the model once and run the image demo followed by the video demo.

    Args:
        model_path: Checkpoint location passed to ``model_init``. When omitted,
            the path hard-coded below is used.
    """
    disable_torch_init()

    if model_path is None:
        # fill in the model path here
        model_path = '/mnt/workspace/workgroup/yuanyq/code/videollama3/ProjectX_region/work_dirs/VideoRefer-VideoLLaMA3-7B'

    # model_init returns three values; the middle one (processor) is not used
    # by either demo.
    model, _processor, tokenizer = model_init(model_path)

    # image
    infer_image(model, tokenizer)

    # video
    infer_video(model, tokenizer)
# Run the demos only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()