Spaces:

lixin4ever
/

VideoRefer-VideoLLaMA3

Running on Zero

App Files Files Community

VideoRefer-VideoLLaMA3 / videollama3 /infer.py

lixin4ever

init (#1)

44d8da2 verified 10 days ago

raw

history blame

2.02 kB

	import os

	# Default to GPU index 1, but respect a CUDA_VISIBLE_DEVICES value the caller
	# has already exported instead of silently overwriting it. This is set before
	# torch is imported so the selection is in place before any CUDA setup.
	os.environ.setdefault("CUDA_VISIBLE_DEVICES", "1")

	import torch
	import sys

	# Make the current working directory importable so that `videollama3`
	# resolves when the script is launched from the repository root.
	sys.path.append('./')
	from videollama3 import disable_torch_init, model_init, mm_infer, get_model_output
	from videollama3.mm_utils import load_video

	import numpy as np
	from PIL import Image

	def infer_image(model, tokenizer):
	    """Run the region-description demo on a single image and print the answer.

	    Loads ``demo/images/1.jpg`` and the region mask ``demo/masks/demo0.npy``,
	    passes both to ``get_model_output`` with ``modal='image'``, and prints
	    whatever that call returns.

	    Args:
	        model: Model object returned by ``model_init``.
	        tokenizer: Tokenizer object returned by ``model_init``.
	    """
	    image_path = 'demo/images/1.jpg'
	    # Decode the pixels while the file is open, then release the handle.
	    with Image.open(image_path) as image:
	        image_data = np.array(image)

	    question = '<image>\nPlease describe the <region> in the image in detail.'

	    # Wrap the single mask in a leading axis of length 1 (one entry per
	    # region) and convert it to a uint8 tensor.
	    mask = np.load('demo/masks/demo0.npy')
	    masks = torch.from_numpy(np.array([mask])).to(torch.uint8)

	    # One id per mask, all 0 -- presumably the index of the image each mask
	    # refers to; confirm against get_model_output.
	    mask_ids = [0] * len(masks)

	    output = get_model_output(
	        [image_data],
	        question,
	        model=model,
	        tokenizer=tokenizer,
	        masks=masks,
	        mask_ids=mask_ids,
	        modal='image',
	        image_downsampling=1,
	    )
	    print(output)

	def infer_video(model, tokenizer):
	    """Run the region-description demo on a video clip and print the answer.

	    Loads ``demo/videos/1.mp4`` through ``load_video`` and the region mask
	    ``demo/masks/demo1.npy``, passes both to ``get_model_output`` with
	    ``modal='video'``, and prints whatever that call returns.

	    Args:
	        model: Model object returned by ``model_init``.
	        tokenizer: Tokenizer object returned by ``model_init``.
	    """
	    video_path = 'demo/videos/1.mp4'
	    question = '<video>\nPlease describe the <region> in the video in detail.'

	    # The demo mask was drawn on the first frame, so frame 0 is passed
	    # to the loader as the annotated frame id.
	    annotated_frame = 0
	    video_tensor = load_video(
	        video_path,
	        fps=1,
	        max_frames=768,
	        frame_ids=[annotated_frame],
	    )

	    # A single region mask, batched along a new leading axis as uint8.
	    region_mask = np.load('demo/masks/demo1.npy')
	    masks = torch.from_numpy(np.array([region_mask])).to(torch.uint8)

	    # Every mask gets id 0 (there is exactly one mask here).
	    mask_ids = [0 for _ in range(len(masks))]

	    answer = get_model_output(
	        video_tensor,
	        question,
	        model=model,
	        tokenizer=tokenizer,
	        masks=masks,
	        mask_ids=mask_ids,
	        modal='video',
	    )
	    print(answer)

	def main(model_path=None):
	    """Load the model once and run the image demo followed by the video demo.

	    Args:
	        model_path: Location of the VideoRefer-VideoLLaMA3 checkpoint handed
	            to ``model_init``. When omitted, the path hard-coded below is
	            used, so existing ``main()`` calls behave as before.
	    """
	    disable_torch_init()

	    if model_path is None:
	        # fill in the model path here
	        model_path = '/mnt/workspace/workgroup/yuanyq/code/videollama3/ProjectX_region/work_dirs/VideoRefer-VideoLLaMA3-7B'
	    # The processor returned by model_init is not needed by the demos.
	    model, _processor, tokenizer = model_init(model_path)

	    # image
	    infer_image(model, tokenizer)

	    # video
	    infer_video(model, tokenizer)


	# Run the demos only when this file is executed as a script, not on import.
	if __name__ == "__main__":
	    main()