Visual Question Answering
Transformers
Safetensors
English
videollama2_mistral
text-generation
multimodal large language model
large video-language model
ccclemenfff committed on
Commit
d4a7de9
·
1 Parent(s): 9d75944

Add custom handler and requirements

Browse files
Files changed (2) hide show
  1. handler.py +73 -0
  2. requirements.txt +39 -0
handler.py ADDED
@@ -0,0 +1,73 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from typing import Dict, Any
2
+ import base64
3
+ import tempfile
4
+ import os
5
+ import sys
6
+
7
+ # Ensure the videollama2 module is importable (model code must live in the same directory or be installed)
8
+ sys.path.append('./')
9
+
10
+ from videollama2 import model_init, mm_infer
11
+ from videollama2.utils import disable_torch_init
12
+
class EndpointHandler:
    """Inference-endpoint handler for VideoLLaMA2.

    Accepts a base64-encoded video or image plus a natural-language prompt
    and returns the model's text answer. Designed for the Hugging Face
    Inference Endpoints custom-handler protocol (``__init__(path)`` +
    ``__call__(data) -> dict``).
    """

    def __init__(self, path=""):
        # Skip torch's default random weight initialization — the weights
        # are immediately overwritten by the pretrained checkpoint.
        disable_torch_init()
        # Model path; when HF passes an empty path, fall back to the
        # official repository id.
        self.model_path = path or "DAMO-NLP-SG/VideoLLaMA2-7B-16F"
        # Load model, multimodal processor, and tokenizer in one call.
        self.model, self.processor, self.tokenizer = model_init(self.model_path)

    def __call__(self, data: Dict[str, Any]) -> Dict[str, Any]:
        """Run one inference request.

        Expected payload (one of):
            {"video": "<base64 string>", "prompt": "<instruction>"}
            {"image": "<base64 string>", "prompt": "<instruction>"}

        Returns:
            {"modal": ..., "prompt": ..., "result": ...} on success, or
            {"error": ...} for invalid input.
        """
        # Determine the input modality from the provided key.
        if "video" in data:
            modal = "video"
            file_b64 = data["video"]
        elif "image" in data:
            modal = "image"
            file_b64 = data["image"]
        else:
            return {"error": "请求必须包含 'video' 或 'image' 字段"}

        prompt = data.get("prompt", "Describe the content.")

        # Decode up front so malformed base64 produces a clean error
        # response instead of an unhandled exception mid-request.
        # (binascii.Error is a ValueError subclass.)
        try:
            payload = base64.b64decode(file_b64)
        except (ValueError, TypeError) as exc:
            return {"error": f"invalid base64 payload: {exc}"}

        # Write the binary to a temp file so the processor can read it by path.
        suffix = ".mp4" if modal == "video" else ".png"
        with tempfile.NamedTemporaryFile(delete=False, suffix=suffix) as tmp_file:
            tmp_file.write(payload)
            tmp_path = tmp_file.name

        try:
            # Preprocess the media file and run model inference.
            inputs = self.processor[modal](tmp_path)
            output = mm_infer(
                inputs,
                prompt,
                model=self.model,
                tokenizer=self.tokenizer,
                do_sample=False,
                modal=modal,
            )
        finally:
            # Best-effort temp-file cleanup; never let cleanup failure
            # mask an inference exception.
            try:
                os.remove(tmp_path)
            except OSError:
                pass

        # Uniform response structure for easy parsing by callers.
        return {
            "modal": modal,
            "prompt": prompt,
            "result": output,
        }
requirements.txt ADDED
@@ -0,0 +1,39 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ --extra-index-url https://download.pytorch.org/whl/cu118
2
+ # basic dependencies
3
+ torch==2.2.0
4
+ torchvision==0.17.0
5
+ transformers==4.40.0
6
+ tokenizers==0.19.1
7
+ deepspeed==0.13.1
8
+ accelerate==0.26.1
9
+ peft==0.4.0
10
+ timm==1.0.3
11
+ numpy==1.24.4
12
+ # data processing
13
+ decord==0.6.0
14
+ imageio==2.34.0
15
+ imageio-ffmpeg==0.4.9
16
+ moviepy==1.0.3
17
+ opencv-python==4.6.0.66
18
+ pysubs2
19
+ # misc
20
+ scikit-learn==1.2.2
21
+ huggingface_hub==0.23.4
22
+ sentencepiece==0.1.99
23
+ shortuuid
24
+ einops==0.6.1
25
+ einops-exts==0.0.4
26
+ bitsandbytes==0.43.0
27
+ pydantic>=2.0
28
+ markdown2[all]
29
+ gradio==3.50.0
30
+ gradio_client==0.6.1
31
+ httpx==0.24.1
32
+ requests
33
+ openai
34
+ uvicorn
35
+ fastapi
36
+ tensorboard
37
+ wandb
38
+ tabulate
39
+ spaces==0.29.2