File size: 2,017 Bytes
44d8da2
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
import os
# Expose only GPU index 1 to this process. The variable is assigned before
# `import torch` below so that it is already in place when CUDA is initialised.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"

import os
import torch
import sys
# Add the current working directory to the import path so the `videollama3`
# package imported below can be resolved (presumably the script is meant to be
# launched from the repository root -- confirm).
sys.path.append('./')
from videollama3 import disable_torch_init, model_init, mm_infer, get_model_output
from videollama3.mm_utils import load_video

import numpy as np
from PIL import Image

def infer_image(
    model,
    tokenizer,
    *,
    image_path='demo/images/1.jpg',
    mask_path='demo/masks/demo0.npy',
    question='<image>\nPlease describe the <region> in the image in detail.',
):
    """Ask the model to describe a masked region of one image and print the answer.

    Args:
        model: Model object as returned by ``model_init``.
        tokenizer: Tokenizer as returned by ``model_init``.
        image_path: Path of the image file to load. Defaults to the demo image.
        mask_path: Path of a ``.npy`` file holding the mask of the region to
            describe. Defaults to the demo mask.
        question: Prompt passed to the model. Defaults to the demo prompt.

    Returns:
        The value returned by ``get_model_output`` (it is also printed).
    """
    # Image.open is lazy; convert to an array inside the `with` so the pixel
    # data is read while the file is open and the handle is always released.
    with Image.open(image_path) as image:
        image_data = np.array(image)

    # Wrap the single mask in a list so the array gains a leading axis of
    # length 1, then hand it to torch as uint8.
    mask = np.load(mask_path)
    masks = torch.from_numpy(np.array([mask])).to(torch.uint8)

    # One id per mask, all 0 -- presumably the index of the image each mask
    # refers to; confirm against get_model_output.
    mask_ids = [0] * len(masks)

    output = get_model_output(
        [image_data],
        question,
        model=model,
        tokenizer=tokenizer,
        masks=masks,
        mask_ids=mask_ids,
        modal='image',
        image_downsampling=1,
    )
    print(output)
    return output

def infer_video(
    model,
    tokenizer,
    *,
    video_path='demo/videos/1.mp4',
    mask_path='demo/masks/demo1.npy',
    question='<video>\nPlease describe the <region> in the video in detail.',
):
    """Ask the model to describe a masked region of a video and print the answer.

    Args:
        model: Model object as returned by ``model_init``.
        tokenizer: Tokenizer as returned by ``model_init``.
        video_path: Path of the video file to load. Defaults to the demo video.
        mask_path: Path of a ``.npy`` file holding the region mask drawn on the
            first frame of the video. Defaults to the demo mask.
        question: Prompt passed to the model. Defaults to the demo prompt.

    Returns:
        The value returned by ``get_model_output`` (it is also printed).
    """
    frame_idx = 0  # the mask is taken from the first frame
    video_tensor = load_video(video_path, fps=1, max_frames=768, frame_ids=[frame_idx])

    # Wrap the single mask in a list so the array gains a leading axis of
    # length 1, then hand it to torch as uint8.
    mask = np.load(mask_path)
    masks = torch.from_numpy(np.array([mask])).to(torch.uint8)

    # One id per mask, all 0 -- presumably the position of the masked frame
    # among the frames requested via frame_ids; confirm against get_model_output.
    mask_ids = [0] * len(masks)

    output = get_model_output(
        video_tensor,
        question,
        model=model,
        tokenizer=tokenizer,
        masks=masks,
        mask_ids=mask_ids,
        modal='video',
    )
    print(output)
    return output

def main(model_path='/mnt/workspace/workgroup/yuanyq/code/videollama3/ProjectX_region/work_dirs/VideoRefer-VideoLLaMA3-7B'):
    """Load the model once and run the image demo followed by the video demo.

    Args:
        model_path: Location of the model checkpoint passed to ``model_init``.
            The default is the path used by the original author; pass your
            own path (or edit the default) to run the demo elsewhere.
    """
    disable_torch_init()

    # model_init also returns a processor, which this demo does not use.
    model, _processor, tokenizer = model_init(model_path)

    # image
    infer_image(model, tokenizer)

    # video
    infer_video(model, tokenizer)


if __name__=='__main__':
    main()