tuandunghcmut committed (verified)
Commit ca0b425 · 1 Parent(s): 716a88e

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes.

Files changed (50)
  1. PaddleMIX/comfyui/ComfyUI_ppdiffusers/README.md +50 -0
  2. PaddleMIX/comfyui/ComfyUI_ppdiffusers/__init__.py +43 -0
  3. PaddleMIX/comfyui/ComfyUI_ppdiffusers/basic_nodes.py +128 -0
  4. PaddleMIX/comfyui/ComfyUI_ppdiffusers/requirements.txt +2 -0
  5. PaddleMIX/comfyui/ComfyUI_ppdiffusers/sd_pipe_nodes.py +348 -0
  6. PaddleMIX/comfyui/ComfyUI_ppdiffusers/utils/schedulers.py +75 -0
  7. vlmeval/VLMEvalKit_old/assets/LOGO.svg +24 -0
  8. vlmeval/VLMEvalKit_old/assets/apple.jpg +0 -0
  9. vlmeval/VLMEvalKit_old/docs/en/Contributors.md +21 -0
  10. vlmeval/VLMEvalKit_old/docs/en/_static/css/readthedocs.css +63 -0
  11. vlmeval/VLMEvalKit_old/docs/en/_static/image/logo_icon.svg +31 -0
  12. vlmeval/VLMEvalKit_old/docs/en/_static/js/custom.js +10 -0
  13. vlmeval/VLMEvalKit_old/docs/en/_templates/404.html +18 -0
  14. vlmeval/VLMEvalKit_old/docs/en/_templates/autosummary/class.rst +13 -0
  15. vlmeval/VLMEvalKit_old/docs/en/_templates/callable.rst +14 -0
  16. vlmeval/VLMEvalKit_old/docs/en/docutils.conf +2 -0
  17. vlmeval/VLMEvalKit_old/docs/ja/README_ja.md +177 -0
  18. vlmeval/VLMEvalKit_old/docs/zh-CN/ConfigSystem.md +59 -0
  19. vlmeval/VLMEvalKit_old/docs/zh-CN/Development.md +140 -0
  20. vlmeval/VLMEvalKit_old/docs/zh-CN/Makefile +20 -0
  21. vlmeval/VLMEvalKit_old/docs/zh-CN/README_zh-CN.md +215 -0
  22. vlmeval/VLMEvalKit_old/docs/zh-CN/_static/image/logo_icon.svg +31 -0
  23. vlmeval/VLMEvalKit_old/docs/zh-CN/_static/js/custom.js +10 -0
  24. vlmeval/VLMEvalKit_old/docs/zh-CN/_templates/autosummary/class.rst +13 -0
  25. vlmeval/VLMEvalKit_old/docs/zh-CN/_templates/callable.rst +14 -0
  26. vlmeval/VLMEvalKit_old/docs/zh-CN/conf.py +242 -0
  27. vlmeval/VLMEvalKit_old/docs/zh-CN/index.rst +49 -0
  28. vlmeval/VLMEvalKit_old/scripts/apires_scan.py +55 -0
  29. vlmeval/VLMEvalKit_old/scripts/cover.sh +4 -0
  30. vlmeval/VLMEvalKit_old/scripts/data_browser.py +166 -0
  31. vlmeval/VLMEvalKit_old/scripts/mmb_eval_gradio.py +101 -0
  32. vlmeval/VLMEvalKit_old/scripts/run.sh +4 -0
  33. vlmeval/VLMEvalKit_old/scripts/srun.sh +3 -0
  34. vlmeval/VLMEvalKit_old/scripts/visualize.ipynb +266 -0
  35. vlmeval/VLMEvalKit_old/vlmeval/api/__init__.py +26 -0
  36. vlmeval/VLMEvalKit_old/vlmeval/api/bailingmm.py +90 -0
  37. vlmeval/VLMEvalKit_old/vlmeval/api/base.py +289 -0
  38. vlmeval/VLMEvalKit_old/vlmeval/api/claude.py +111 -0
  39. vlmeval/VLMEvalKit_old/vlmeval/api/cloudwalk.py +107 -0
  40. vlmeval/VLMEvalKit_old/vlmeval/api/gemini.py +119 -0
  41. vlmeval/VLMEvalKit_old/vlmeval/api/glm_vision.py +95 -0
  42. vlmeval/VLMEvalKit_old/vlmeval/api/hf_chat_model.py +246 -0
  43. vlmeval/VLMEvalKit_old/vlmeval/api/hunyuan.py +147 -0
  44. vlmeval/VLMEvalKit_old/vlmeval/api/jt_vl_chat.py +239 -0
  45. vlmeval/VLMEvalKit_old/vlmeval/api/qwen_api.py +75 -0
  46. vlmeval/VLMEvalKit_old/vlmeval/api/qwen_vl_api.py +219 -0
  47. vlmeval/VLMEvalKit_old/vlmeval/api/reka.py +60 -0
  48. vlmeval/VLMEvalKit_old/vlmeval/api/siliconflow.py +185 -0
  49. vlmeval/VLMEvalKit_old/vlmeval/api/stepai.py +87 -0
  50. vlmeval/VLMEvalKit_old/vlmeval/api/taiyi.py +192 -0
PaddleMIX/comfyui/ComfyUI_ppdiffusers/README.md ADDED
@@ -0,0 +1,50 @@
+ # ComfyUI_ppdiffusers
+ This is a [ppdiffusers](https://github.com/PaddlePaddle/PaddleMIX/tree/develop/ppdiffusers) extension developed for ComfyUI. It supports common tasks such as model loading and conversion, text-to-image, image-to-image, and image inpainting.
+ ## Installation
+ ```bash
+ git clone https://github.com/PaddlePaddle/PaddleMIX --depth 1
+ cd PaddleMIX/comfyui/
+ mv ComfyUI_ppdiffusers/ /path/to/your_comfyui/custom_nodes/
+ ```
+
+ ## Usage
+ ### Online demo
+ aistudio: https://aistudio.baidu.com/community/app/106043
+
+ ### Running locally
+ All supported workflows are in the ./workflows directory; the workflow JSON files can be loaded and used directly.
+ Checkpoints in safetensors format are supported natively: they are read and converted to Paddle models on the fly, then run inside ComfyUI through ppdiffusers pipelines.
+
+ ## Supported Nodes
+ Stable Diffusion 1.5 series:
+ - SD1.5 model loading and conversion
+ - SD1.5 text-to-image
+ - SD1.5 image-to-image
+ - SD1.5 inpainting
+
+ Stable Diffusion XL series:
+ - SDXL model loading and conversion
+ - SDXL text-to-image
+ - SDXL image-to-image
+ - SDXL inpainting
+
+ ## Showcase
+ ### SDXL
+ 1. Text-to-image
+ <img width="600" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/1d9d17cd-dd1f-4e05-9c98-c1fc4fca8185">
+
+ 2. Image-to-image
+ <img width="600" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/04e9cc05-9ce4-4207-88c4-3d076aaebff4">
+
+ 3. Inpainting
+ <img width="600" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/36ba7261-1744-41a4-b1cb-c9e99f6931f2">
+
+ ### SD1.5
+ 1. Text-to-image
+ <img width="600" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/a939274e-a23b-4ecf-956c-56fd8343707c">
+
+ 2. Image-to-image
+ <img width="600" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/b8b144bb-0b52-4235-91d9-d7bdfb44a1d8">
+
+ 3. Image inpainting
+ <img width="600" alt="image" src="https://github.com/PaddlePaddle/PaddleMIX/assets/35400185/7a75899d-48ca-4479-9fb8-18d077fc3607">
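
For reference, a minimal standalone sketch of what the checkpoint-loader and text-to-image nodes added in `sd_pipe_nodes.py` do under the hood (loading a single-file safetensors checkpoint through ppdiffusers, then sampling). The checkpoint path and prompt are illustrative placeholders, not files shipped with this extension.

```python
# Sketch of the load-and-convert + text-to-image path used by the SD1.5 nodes;
# the checkpoint path and prompt below are hypothetical placeholders.
from ppdiffusers import StableDiffusionPipeline

ckpt_path = "/path/to/your_comfyui/models/checkpoints/sd15.safetensors"  # placeholder
pipe = StableDiffusionPipeline.from_single_file(ckpt_path)  # reads safetensors, converts to Paddle weights

image = pipe(
    prompt="a red apple on a wooden table",
    negative_prompt="blurry",
    num_inference_steps=20,
    guidance_scale=7.5,
).images[0]
image.save("t2i_result.png")
```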
PaddleMIX/comfyui/ComfyUI_ppdiffusers/__init__.py ADDED
@@ -0,0 +1,43 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ try:
+     import paddle
+ except ImportError:
+     print("Paddle is not installed. Please install it to use this node.")
+     __all__ = []
+ else:
+     from .basic_nodes import NODE_CLASS_MAPPINGS as NODE_CLASS_MAPPINGS_BASIC
+     from .basic_nodes import (
+         NODE_DISPLAY_NAME_MAPPINGS as NODE_DISPLAY_NAME_MAPPINGS_BASIC,
+     )
+     from .sd_pipe_nodes import NODE_CLASS_MAPPINGS as NODE_CLASS_MAPPINGS_SD
+     from .sd_pipe_nodes import (
+         NODE_DISPLAY_NAME_MAPPINGS as NODE_DISPLAY_NAME_MAPPINGS_SD,
+     )
+     from .sdxl_pipe_nodes import NODE_CLASS_MAPPINGS as NODE_CLASS_MAPPINGS_SDXL
+     from .sdxl_pipe_nodes import (
+         NODE_DISPLAY_NAME_MAPPINGS as NODE_DISPLAY_NAME_MAPPINGS_SDXL,
+     )
+
+     NODE_CLASS_MAPPINGS = {**NODE_CLASS_MAPPINGS_BASIC, **NODE_CLASS_MAPPINGS_SD, **NODE_CLASS_MAPPINGS_SDXL}
+     NODE_DISPLAY_NAME_MAPPINGS = {
+         **NODE_DISPLAY_NAME_MAPPINGS_BASIC,
+         **NODE_DISPLAY_NAME_MAPPINGS_SD,
+         **NODE_DISPLAY_NAME_MAPPINGS_SDXL,
+     }
+     __all__ = [
+         "NODE_CLASS_MAPPINGS",
+         "NODE_DISPLAY_NAME_MAPPINGS",
+     ]
PaddleMIX/comfyui/ComfyUI_ppdiffusers/basic_nodes.py ADDED
@@ -0,0 +1,128 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ import base64
+ import json
+ import os
+ from io import BytesIO
+
+ import folder_paths
+ import requests
+ from comfy.cli_args import args
+ from PIL import Image
+ from PIL.PngImagePlugin import PngInfo
+
+
+ class PaddleSaveImage:
+     def __init__(self):
+         self.output_dir = folder_paths.get_output_directory()
+         self.type = "output"
+         self.prefix_append = ""
+         self.compress_level = 4
+         self.serving_web_host = os.environ.get("AISTUDIO_MS_SERVING_WEB_HOST")
+         self.serving_app_token = os.environ.get("AISTUDIO_MS_AIGC_APP_JWT")
+
+     @classmethod
+     def INPUT_TYPES(s):
+         return {
+             "required": {
+                 "images": ("IMAGE",),
+                 "filename_prefix": ("STRING", {"default": "ComfyUI"}),
+                 "censor": ("BOOLEAN", {"default": True})
+             },
+             "hidden": {"prompt": "PROMPT", "extra_pnginfo": "EXTRA_PNGINFO"},
+         }
+
+     RETURN_TYPES = ()
+     FUNCTION = "save_images"
+
+     OUTPUT_NODE = True
+
+     CATEGORY = "🚢 paddlemix/ppdiffusers/output"
+
+     def censor_image(self, image):
+         buffered = BytesIO()
+         image.save(buffered, format="PNG")
+         img_str = base64.b64encode(buffered.getvalue()).decode("utf-8")
+         url = (
+             f"http://{self.serving_web_host}/serving/web/aigc/censor/image?serving_app_token={self.serving_app_token}"
+         )
+         data = {"image": img_str}
+         response = requests.post(url, json=data).json()
+         print(response)
+         return response["result"]["pass"]
+
+     def save_images(self, images, censor=True, filename_prefix="ComfyUI", prompt=None, extra_pnginfo=None):
+         filename_prefix += self.prefix_append
+         full_output_folder, filename, counter, subfolder, filename_prefix = folder_paths.get_save_image_path(
+             filename_prefix, self.output_dir, images[0].shape[1], images[0].shape[0]
+         )
+         results = list()
+         for (batch_number, image) in enumerate(images):
+             img = Image.fromarray(image)
+             if censor:
+                 pass_censor = self.censor_image(img)
+             else:
+                 pass_censor = True
+             # breakpoint()
+             if pass_censor:
+                 metadata = None
+                 if not args.disable_metadata:
+                     metadata = PngInfo()
+                     if prompt is not None:
+                         metadata.add_text("prompt", json.dumps(prompt))
+                     if extra_pnginfo is not None:
+                         for x in extra_pnginfo:
+                             metadata.add_text(x, json.dumps(extra_pnginfo[x]))
+
+                 filename_with_batch_num = filename.replace("%batch_num%", str(batch_number))
+                 file = f"{filename_with_batch_num}_{counter:05}_.png"
+                 img.save(os.path.join(full_output_folder, file), pnginfo=metadata, compress_level=self.compress_level)
+                 results.append({"filename": file, "subfolder": subfolder, "type": self.type})
+             else:
+                 results.append({"filename": "forbidden.png", "subfolder": "", "type": "output"})
+             counter += 1
+
+         return_dict = {"ui": {"images": results}}
+         return return_dict
+
+
+ class PromptInput:
+     @classmethod
+     def INPUT_TYPES(cls):
+         return {
+             "required": {
+                 "prompt": ("STRING", {"multiline": True, "dynamicPrompts": True}),
+             }
+         }
+
+     RETURN_TYPES = ("PROMPT",)
+     RETURN_NAMES = ("prompt",)
+     FUNCTION = "encode"
+     CATEGORY = "🚢 paddlemix/ppdiffusers/input"
+
+     def encode(self, prompt):
+         # TODO: add check for prompt
+         return (prompt,)
+
+
+ NODE_CLASS_MAPPINGS = {
+     "PaddleSaveImage": PaddleSaveImage,
+     "PromptInput": PromptInput,
+ }
+
+ NODE_DISPLAY_NAME_MAPPINGS = {
+     "PromptInput": "Paddle Prompt Input",
+     "PaddleSaveImage": "Paddle Save Image",
+ }
PaddleMIX/comfyui/ComfyUI_ppdiffusers/requirements.txt ADDED
@@ -0,0 +1,2 @@
+ ppdiffusers
+ requests
PaddleMIX/comfyui/ComfyUI_ppdiffusers/sd_pipe_nodes.py ADDED
@@ -0,0 +1,348 @@
1
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
2
+ #
3
+ # Licensed under the Apache License, Version 2.0 (the "License");
4
+ # you may not use this file except in compliance with the License.
5
+ # You may obtain a copy of the License at
6
+ #
7
+ # http://www.apache.org/licenses/LICENSE-2.0
8
+ #
9
+ # Unless required by applicable law or agreed to in writing, software
10
+ # distributed under the License is distributed on an "AS IS" BASIS,
11
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12
+ # See the License for the specific language governing permissions and
13
+ # limitations under the License.
14
+
15
+ import folder_paths
16
+ import numpy as np
17
+ import paddle
18
+ import torch # for convert data
19
+ from comfy.utils import ProgressBar
20
+
21
+ from ppdiffusers import (
22
+ StableDiffusionImg2ImgPipeline,
23
+ StableDiffusionInpaintPipeline,
24
+ StableDiffusionPipeline,
25
+ )
26
+
27
+ from .utils.schedulers import get_scheduler
28
+
29
+
30
+ class PaddleSDCheckpointLoader:
31
+ @classmethod
32
+ def INPUT_TYPES(cls):
33
+ return {"required": {"ckpt_name": (folder_paths.get_filename_list("checkpoints"),)}}
34
+
35
+ RETURN_TYPES = ("PIPELINE",)
36
+ RETURN_NAMES = ("sd_pipe",)
37
+ FUNCTION = "load_checkpoint"
38
+ CATEGORY = "🚢 paddlemix/ppdiffusers/input"
39
+
40
+ def load_checkpoint(self, ckpt_name):
41
+ ckpt_path = folder_paths.get_full_path("checkpoints", ckpt_name)
42
+ pipe = StableDiffusionPipeline.from_single_file(ckpt_path)
43
+ return (pipe,)
44
+
45
+
46
+ class PaddleSDVaeDecoder:
47
+ @classmethod
48
+ def INPUT_TYPES(cls):
49
+ return {"required": {"latent": ("LATENT",), "sd_pipe": ("PIPELINE",)}}
50
+
51
+ RETURN_TYPES = ("IMAGE",)
52
+ RETURN_NAMES = ("image",)
53
+ FUNCTION = "decode"
54
+ CATEGORY = "🚢 paddlemix/ppdiffusers/output"
55
+
56
+ def decode(self, sd_pipe, latent):
57
+ vae = sd_pipe.vae
58
+ latent = 1 / vae.config.scaling_factor * latent
59
+ image = vae.decode(latent, return_dict=False)[0]
60
+ image = (image / 2 + 0.5).clip(0, 1)
61
+ image = image.cast(dtype=paddle.float32).transpose([0, 2, 3, 1]).cpu().numpy()
62
+ image = (image * 255).astype(np.uint8)
63
+
64
+ return (image,)
65
+
66
+
67
+ class PaddleSDText2ImagePipe:
68
+ @classmethod
69
+ def INPUT_TYPES(cls):
70
+ return {
71
+ "required": {
72
+ "sd_pipe": ("PIPELINE",),
73
+ "prompt": ("PROMPT",),
74
+ "negative_prompt": ("PROMPT",),
75
+ "steps": (
76
+ "INT",
77
+ {
78
+ "default": 20,
79
+ "min": 1,
80
+ "max": 1000,
81
+ },
82
+ ),
83
+ "width": ("INT", {"default": 512, "min": 1, "max": 8192}),
84
+ "height": ("INT", {"default": 768, "min": 1, "max": 8192}),
85
+ "number": ("INT", {"default": 1, "min": 1, "max": 100}),
86
+ "seed": ("INT", {"default": 0, "min": 0, "max": 99999999999999999999999}),
87
+ "cfg": (
88
+ "FLOAT",
89
+ {
90
+ "default": 7.5,
91
+ "min": 0.0,
92
+ "max": 1000.0,
93
+ "step": 0.01,
94
+ },
95
+ ),
96
+ "scheduler_type": (
97
+ [
98
+ "euler",
99
+ "euler-ancestral",
100
+ "pndm",
101
+ "lms",
102
+ "heun",
103
+ "dpm-multi",
104
+ "dpm-single",
105
+ "kdpm2-ancestral",
106
+ "kdpm2",
107
+ "unipc-multi",
108
+ "ddim",
109
+ "ddpm",
110
+ "deis-multi",
111
+ ],
112
+ ),
113
+ }
114
+ }
115
+
116
+ RETURN_TYPES = ("LATENT",)
117
+ RETURN_NAMES = ("latent",)
118
+ FUNCTION = "sample"
119
+ CATEGORY = "🚢 paddlemix/ppdiffusers/pipelines"
120
+
121
+ def sample(self, sd_pipe, prompt, negative_prompt, steps, width, height, number, seed, cfg, scheduler_type):
122
+
123
+ pipe = StableDiffusionPipeline(**sd_pipe.components)
124
+ pipe.scheduler = get_scheduler(scheduler_type)
125
+ paddle.seed(seed)
126
+
127
+ progress_bar = ProgressBar(steps)
128
+ latent = pipe(
129
+ prompt=prompt,
130
+ negative_prompt=negative_prompt,
131
+ width=width,
132
+ height=height,
133
+ num_images_per_prompt=number,
134
+ num_inference_steps=steps,
135
+ guidance_scale=cfg,
136
+ output_type="latent",
137
+ callback=lambda step, timestep, latents: progress_bar.update_absolute(
138
+ value=step, total=steps, preview=None
139
+ ),
140
+ callback_steps=1,
141
+ ).images
142
+
143
+ return (latent,)
144
+
145
+
146
+ class PaddleSDImage2ImagePipe:
147
+ @classmethod
148
+ def INPUT_TYPES(cls):
149
+ return {
150
+ "required": {
151
+ "sd_pipe": ("PIPELINE",),
152
+ "image": ("IMAGE",),
153
+ "denoise": (
154
+ "FLOAT",
155
+ {
156
+ "default": 0.7,
157
+ "min": 0.0,
158
+ "max": 1.0,
159
+ "step": 0.01,
160
+ },
161
+ ),
162
+ "prompt": ("PROMPT",),
163
+ "negative_prompt": ("PROMPT",),
164
+ "steps": (
165
+ "INT",
166
+ {
167
+ "default": 20,
168
+ "min": 1,
169
+ "max": 1000,
170
+ },
171
+ ),
172
+ "number": ("INT", {"default": 1, "min": 1, "max": 100}),
173
+ "seed": ("INT", {"default": 0, "min": 0, "max": 99999999999999999999999}),
174
+ "cfg": (
175
+ "FLOAT",
176
+ {
177
+ "default": 7.5,
178
+ "min": 0.0,
179
+ "max": 1000.0,
180
+ "step": 0.01,
181
+ },
182
+ ),
183
+ "scheduler_type": (
184
+ [
185
+ "euler",
186
+ "euler-ancestral",
187
+ "pndm",
188
+ "lms",
189
+ "heun",
190
+ "dpm-multi",
191
+ "dpm-single",
192
+ "kdpm2-ancestral",
193
+ "kdpm2",
194
+ "unipc-multi",
195
+ "ddim",
196
+ "ddpm",
197
+ "deis-multi",
198
+ ],
199
+ ),
200
+ }
201
+ }
202
+
203
+ RETURN_TYPES = ("LATENT",)
204
+ RETURN_NAMES = ("latent",)
205
+ FUNCTION = "sample"
206
+ CATEGORY = "🚢 paddlemix/ppdiffusers/pipelines"
207
+
208
+ def sample(self, sd_pipe, image, denoise, prompt, negative_prompt, steps, number, seed, cfg, scheduler_type):
209
+ # torch -> numpy
210
+ if isinstance(image, torch.Tensor):
211
+ image = image.cpu().numpy()
212
+
213
+ pipe = StableDiffusionImg2ImgPipeline(**sd_pipe.components)
214
+ pipe.scheduler = get_scheduler(scheduler_type)
215
+ paddle.seed(seed)
216
+
217
+ progress_bar = ProgressBar(steps)
218
+ latent = pipe(
219
+ image=image,
220
+ strength=denoise,
221
+ prompt=prompt,
222
+ negative_prompt=negative_prompt,
223
+ num_images_per_prompt=number,
224
+ num_inference_steps=steps,
225
+ guidance_scale=cfg,
226
+ output_type="latent",
227
+ callback=lambda step, timestep, latents: progress_bar.update_absolute(
228
+ value=step, total=steps, preview=None
229
+ ),
230
+ callback_steps=1,
231
+ ).images
232
+
233
+ return (latent,)
234
+
235
+
236
+ class PaddleSDInpaintPipe:
237
+ @classmethod
238
+ def INPUT_TYPES(cls):
239
+ return {
240
+ "required": {
241
+ "sd_pipe": ("PIPELINE",),
242
+ "image": ("IMAGE",),
243
+ "mask": ("MASK",),
244
+ "denoise": (
245
+ "FLOAT",
246
+ {
247
+ "default": 0.7,
248
+ "min": 0.0,
249
+ "max": 1.0,
250
+ "step": 0.01,
251
+ },
252
+ ),
253
+ "prompt": ("PROMPT",),
254
+ "negative_prompt": ("PROMPT",),
255
+ "steps": (
256
+ "INT",
257
+ {
258
+ "default": 20,
259
+ "min": 1,
260
+ "max": 1000,
261
+ },
262
+ ),
263
+ "number": ("INT", {"default": 1, "min": 1, "max": 100}),
264
+ "seed": ("INT", {"default": 0, "min": 0, "max": 99999999999999999999999}),
265
+ "cfg": (
266
+ "FLOAT",
267
+ {
268
+ "default": 7.5,
269
+ "min": 0.0,
270
+ "max": 1000.0,
271
+ "step": 0.01,
272
+ },
273
+ ),
274
+ "scheduler_type": (
275
+ [
276
+ "euler",
277
+ "euler-ancestral",
278
+ "pndm",
279
+ "lms",
280
+ "heun",
281
+ "dpm-multi",
282
+ "dpm-single",
283
+ "kdpm2-ancestral",
284
+ "kdpm2",
285
+ "unipc-multi",
286
+ "ddim",
287
+ "ddpm",
288
+ "deis-multi",
289
+ ],
290
+ ),
291
+ }
292
+ }
293
+
294
+ RETURN_TYPES = ("LATENT",)
295
+ RETURN_NAMES = ("latent",)
296
+ FUNCTION = "sample"
297
+ CATEGORY = "🚢 paddlemix/ppdiffusers/pipelines"
298
+
299
+ def sample(self, sd_pipe, image, mask, denoise, prompt, negative_prompt, steps, number, seed, cfg, scheduler_type):
300
+ # torch -> numpy
301
+ if isinstance(image, torch.Tensor):
302
+ image = image.cpu().numpy()
303
+ if isinstance(mask, torch.Tensor):
304
+ mask = mask.cpu().numpy()
305
+
306
+ height, width = image.shape[1] // 8 * 8, image.shape[2] // 8 * 8
307
+
308
+ pipe = StableDiffusionInpaintPipeline(**sd_pipe.components)
309
+ pipe.scheduler = get_scheduler(scheduler_type)
310
+ paddle.seed(seed)
311
+
312
+ progress_bar = ProgressBar(steps)
313
+ latent = pipe(
314
+ image=image,
315
+ mask_image=mask,
316
+ strength=denoise,
317
+ prompt=prompt,
318
+ negative_prompt=negative_prompt,
319
+ width=width,
320
+ height=height,
321
+ num_images_per_prompt=number,
322
+ num_inference_steps=steps,
323
+ guidance_scale=cfg,
324
+ output_type="latent",
325
+ callback=lambda step, timestep, latents: progress_bar.update_absolute(
326
+ value=step, total=steps, preview=None
327
+ ),
328
+ callback_steps=1,
329
+ ).images
330
+
331
+ return (latent,)
332
+
333
+
334
+ NODE_CLASS_MAPPINGS = {
335
+ "PaddleSDCheckpointLoader": PaddleSDCheckpointLoader,
336
+ "PaddleSDVaeDecoder": PaddleSDVaeDecoder,
337
+ "PaddleSDText2ImagePipe": PaddleSDText2ImagePipe,
338
+ "PaddleSDImage2ImagePipe": PaddleSDImage2ImagePipe,
339
+ "PaddleSDInpaintPipe": PaddleSDInpaintPipe,
340
+ }
341
+
342
+ NODE_DISPLAY_NAME_MAPPINGS = {
343
+ "PaddleSDCheckpointLoader": "Paddle SD1.5 Checkpoint Loader",
344
+ "PaddleSDVaeDecoder": "Paddle SD1.5 VAE Decoder",
345
+ "PaddleSDText2ImagePipe": "Paddle SD1.5 Text2Image Pipe",
346
+ "PaddleSDImage2ImagePipe": "Paddle SD1.5 Image2Image Pipe",
347
+ "PaddleSDInpaintPipe": "Paddle SD1.5 Inpaint Pipe",
348
+ }
PaddleMIX/comfyui/ComfyUI_ppdiffusers/utils/schedulers.py ADDED
@@ -0,0 +1,75 @@
+ # Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
+ #
+ # Licensed under the Apache License, Version 2.0 (the "License");
+ # you may not use this file except in compliance with the License.
+ # You may obtain a copy of the License at
+ #
+ # http://www.apache.org/licenses/LICENSE-2.0
+ #
+ # Unless required by applicable law or agreed to in writing, software
+ # distributed under the License is distributed on an "AS IS" BASIS,
+ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ # See the License for the specific language governing permissions and
+ # limitations under the License.
+
+ from ppdiffusers.schedulers import (
+     DDIMScheduler,
+     DDPMScheduler,
+     DEISMultistepScheduler,
+     DPMSolverMultistepScheduler,
+     DPMSolverSinglestepScheduler,
+     EulerAncestralDiscreteScheduler,
+     EulerDiscreteScheduler,
+     HeunDiscreteScheduler,
+     KDPM2AncestralDiscreteScheduler,
+     KDPM2DiscreteScheduler,
+     LMSDiscreteScheduler,
+     PNDMScheduler,
+     UniPCMultistepScheduler,
+ )
+
+
+ def get_scheduler(scheduler_type):
+     scheduler_type = scheduler_type.lower()
+     scheduler = DDIMScheduler(
+         beta_start=0.00085,
+         beta_end=0.012,
+         beta_schedule="scaled_linear",
+         clip_sample=False,
+         set_alpha_to_one=False,
+     )
+     original_scheduler_config = scheduler.config
+     if scheduler_type == "pndm":
+         scheduler = PNDMScheduler.from_config(original_scheduler_config, skip_prk_steps=True)
+     elif scheduler_type == "lms":
+         scheduler = LMSDiscreteScheduler.from_config(original_scheduler_config)
+     elif scheduler_type == "heun":
+         scheduler = HeunDiscreteScheduler.from_config(original_scheduler_config)
+     elif scheduler_type == "euler":
+         scheduler = EulerDiscreteScheduler.from_config(original_scheduler_config)
+     elif scheduler_type == "euler-ancestral":
+         scheduler = EulerAncestralDiscreteScheduler.from_config(original_scheduler_config)
+     elif scheduler_type == "dpm-multi":
+         scheduler = DPMSolverMultistepScheduler.from_config(original_scheduler_config)
+     elif scheduler_type == "dpm-single":
+         scheduler = DPMSolverSinglestepScheduler.from_config(original_scheduler_config)
+     elif scheduler_type == "kdpm2-ancestral":
+         scheduler = KDPM2AncestralDiscreteScheduler.from_config(original_scheduler_config)
+     elif scheduler_type == "kdpm2":
+         scheduler = KDPM2DiscreteScheduler.from_config(original_scheduler_config)
+     elif scheduler_type == "unipc-multi":
+         scheduler = UniPCMultistepScheduler.from_config(original_scheduler_config)
+     elif scheduler_type == "ddim":
+         scheduler = DDIMScheduler.from_config(
+             original_scheduler_config,
+             steps_offset=1,
+             clip_sample=False,
+             set_alpha_to_one=False,
+         )
+     elif scheduler_type == "ddpm":
+         scheduler = DDPMScheduler.from_config(original_scheduler_config)
+     elif scheduler_type == "deis-multi":
+         scheduler = DEISMultistepScheduler.from_config(original_scheduler_config)
+     else:
+         raise ValueError(f"Scheduler of type {scheduler_type} doesn't exist!")
+     return scheduler
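
For context, a short sketch of how the pipeline nodes in `sd_pipe_nodes.py` consume this helper: they rebuild a ppdiffusers pipeline from the loaded components and then swap in the scheduler returned by `get_scheduler` before sampling. The import path and checkpoint path below are assumptions for a standalone script, not part of the commit.

```python
# Standalone sketch of the scheduler swap performed by the SD pipe nodes
# (cf. `pipe.scheduler = get_scheduler(scheduler_type)` in sd_pipe_nodes.py).
from ppdiffusers import StableDiffusionPipeline

# Assumes the ComfyUI_ppdiffusers folder is importable from the current path.
from ComfyUI_ppdiffusers.utils.schedulers import get_scheduler

pipe = StableDiffusionPipeline.from_single_file("/path/to/sd15.safetensors")  # placeholder path
pipe.scheduler = get_scheduler("euler-ancestral")  # any key handled above, e.g. "ddim", "dpm-multi"
images = pipe(prompt="a lighthouse at dusk", num_inference_steps=20).images
```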
vlmeval/VLMEvalKit_old/assets/LOGO.svg ADDED
vlmeval/VLMEvalKit_old/assets/apple.jpg ADDED
vlmeval/VLMEvalKit_old/docs/en/Contributors.md ADDED
@@ -0,0 +1,21 @@
+ # Contributors
+
+ ## Contributors w. 3+ Major Contributions
+
+ > In this section, we list all the contributors who have made significant contributions (3+) to the development of VLMEvalKit.
+
+ New Qualified Contributors (2024.09):
+
+ 1. [amitbcp](https://github.com/amitbcp): The contributor helped support MUIRBench, Phi-3.5, Idefics3, VILA, and xGen-MM
+ 2. [czczup](https://github.com/czczup): The contributor helped support the InternVL Series (V1.5, Mini-InternVL, V2, etc.)
+ 3. [DseidLi](https://github.com/DseidLi): The contributor helped support LLaVA-OneVision, GQA, and developed the readthedocs site for VLMEvalKit
+ 4. [mayubo2333](https://github.com/mayubo2333): The contributor helped support MMLongBench, SlideVQA, and DUDE
+ 5. [sun-hailong](https://github.com/sun-hailong): The contributor helped support A-OKVQA, Parrot, MMMB, and MTL-MMBench
+ 6. [PhoenixZ810](https://github.com/PhoenixZ810): The contributor helped support Video-ChatGPT, Chat-UniVI, and Llama-VID
+ 7. [Cuiunbo](https://github.com/Cuiunbo): The contributor helped support OmniLMM-12B, MiniCPM-V Series (V1, V2, V2.5)
+
+ ## Full Contributor List
+
+ > In this section, we list all the contributors as well as their corresponding contributions to the development of VLMEvalKit.
+
+ TBD.
vlmeval/VLMEvalKit_old/docs/en/_static/css/readthedocs.css ADDED
@@ -0,0 +1,63 @@
+ .header-logo {
+     background-image: url("../image/logo.svg");
+     background-size: 275px 80px;
+     height: 80px;
+     width: 275px;
+ }
+
+
+ @media screen and (min-width: 1100px) {
+     .header-logo {
+         top: -25px;
+     }
+ }
+
+ pre {
+     white-space: pre;
+ }
+
+ @media screen and (min-width: 2000px) {
+     .pytorch-content-left {
+         width: 1200px;
+         margin-left: 30px;
+     }
+     article.pytorch-article {
+         max-width: 1200px;
+     }
+     .pytorch-breadcrumbs-wrapper {
+         width: 1200px;
+     }
+     .pytorch-right-menu.scrolling-fixed {
+         position: fixed;
+         top: 45px;
+         left: 1580px;
+     }
+ }
+
+
+ article.pytorch-article section code {
+     padding: .2em .4em;
+     background-color: #f3f4f7;
+     border-radius: 5px;
+ }
+
+ /* Disable the change in tables */
+ article.pytorch-article section table code {
+     padding: unset;
+     background-color: unset;
+     border-radius: unset;
+ }
+
+ table.autosummary td {
+     width: 50%
+ }
+
+ img.align-center {
+     display: block;
+     margin-left: auto;
+     margin-right: auto;
+ }
+
+ article.pytorch-article p.rubric {
+     font-weight: bold;
+ }
vlmeval/VLMEvalKit_old/docs/en/_static/image/logo_icon.svg ADDED
vlmeval/VLMEvalKit_old/docs/en/_static/js/custom.js ADDED
@@ -0,0 +1,10 @@
+ var collapsedSections = [];
+
+ $(document).ready(function () {
+     $('.model-summary').DataTable({
+         "stateSave": false,
+         "lengthChange": false,
+         "pageLength": 20,
+         "order": []
+     });
+ });
vlmeval/VLMEvalKit_old/docs/en/_templates/404.html ADDED
@@ -0,0 +1,18 @@
+ {% extends "layout.html" %}
+
+ {% block body %}
+
+ <h1>Page Not Found</h1>
+ <p>
+   The page you are looking for cannot be found.
+ </p>
+ <p>
+   If you just switched documentation versions, it is likely that the page you were on is moved. You can look for it in
+   the content table left, or go to <a href="{{ pathto(root_doc) }}">the homepage</a>.
+ </p>
+ <!-- <p>
+   If you cannot find documentation you want, please <a
+   href="">open an issue</a> to tell us!
+ </p> -->
+
+ {% endblock %}
vlmeval/VLMEvalKit_old/docs/en/_templates/autosummary/class.rst ADDED
@@ -0,0 +1,13 @@
+ .. role:: hidden
+     :class: hidden-section
+ .. currentmodule:: {{ module }}
+
+
+ {{ name | underline}}
+
+ .. autoclass:: {{ name }}
+    :members:
+
+ ..
+   autogenerated from _templates/autosummary/class.rst
+   note it does not have :inherited-members:
vlmeval/VLMEvalKit_old/docs/en/_templates/callable.rst ADDED
@@ -0,0 +1,14 @@
+ .. role:: hidden
+     :class: hidden-section
+ .. currentmodule:: {{ module }}
+
+
+ {{ name | underline}}
+
+ .. autoclass:: {{ name }}
+    :members:
+    :special-members: __call__
+
+ ..
+   autogenerated from _templates/callable.rst
+   note it does not have :inherited-members:
vlmeval/VLMEvalKit_old/docs/en/docutils.conf ADDED
@@ -0,0 +1,2 @@
+ [html writers]
+ table_style: colwidths-auto
vlmeval/VLMEvalKit_old/docs/ja/README_ja.md ADDED
@@ -0,0 +1,177 @@
1
+ <div align="center">
2
+
3
+ ![LOGO](http://opencompass.openxlab.space/utils/MMLB.jpg)
4
+
5
+ <b>VLMEvalKit: 大規模視覚言語モデルの評価ツールキット</b>
6
+
7
+ [![][github-contributors-shield]][github-contributors-link] • [![][github-forks-shield]][github-forks-link] • [![][github-stars-shield]][github-stars-link] • [![][github-issues-shield]][github-issues-link] • [![][github-license-shield]][github-license-link]
8
+
9
+ [English](/README.md) | [简体中文](/docs/zh-CN/README_zh-CN.md) | 日本語
10
+
11
+ <a href="https://rank.opencompass.org.cn/leaderboard-multimodal">🏆 OpenCompass Learderboard </a> •
12
+ <a href="#-datasets-models-and-evaluation-results">📊Datasets & Models </a> •
13
+ <a href="#%EF%B8%8F-quickstart">🏗️Quickstart </a> •
14
+ <a href="#%EF%B8%8F-development-guide">🛠️Development </a> •
15
+ <a href="#-the-goal-of-vlmevalkit">🎯Goal </a> •
16
+ <a href="#%EF%B8%8F-citation">🖊️Citation </a>
17
+
18
+ <a href="https://huggingface.co/spaces/opencompass/open_vlm_leaderboard">🤗 HF Leaderboard</a> •
19
+ <a href="https://huggingface.co/datasets/VLMEval/OpenVLMRecords">🤗 Evaluation Records</a> •
20
+ <a href="https://discord.gg/evDT4GZmxN">🔊 Discord Channel</a> •
21
+ <a href="https://www.arxiv.org/abs/2407.11691">📝 Technical Report</a>
22
+ </div>
23
+
24
+ **VLMEvalKit**(pythonパッケージ名は**vlmeval**)は、**大規模視覚言語モデル(LVLMs)**の**オープンソース評価ツールキット**です。このツールキットは、複数のリポジトリでのデータ準備という重労働なしに、さまざまなベンチマークでLVLMsの**ワンコマンド評価**を可能にします。VLMEvalKitでは、すべてのLVLMsに対して**生成ベースの評価**を採用し、**正確なマッチング**と**LLMベースの回答抽出**の両方で得られた評価結果を提供します。
25
+
26
+ PS: 日本語の README には最新のアップデートがすべて含まれていない場合があります。英語版をご確認ください。
27
+
28
+ ## 📊 データセット、モデル、および評価結果
29
+
30
+ **公式のマルチモーダルリーダーボードでのパフォーマンス数値は、ここからダウンロードできます!**
31
+
32
+ [**OpenVLM Leaderboard**](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard): [すべての詳細な結果をダウンロード](http://opencompass.openxlab.space/assets/OpenVLM.json)。
33
+
34
+ **Supported Image Understanding Dataset**
35
+
36
+ - デフォルトでは、すべての評価結果は[**OpenVLM Leaderboard**](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard)に表示されます。
37
+
38
+ | データセット | データセット名 (run.py用) | タスク | データセット | データセット名 (run.py用) | タスク |
39
+ | ------------------------------------------------------------ | ------------------------------------------------------ | --------- | --------- | --------- | --------- |
40
+ | [**MMBench シリーズ**](https://github.com/open-compass/mmbench/): <br>MMBench, MMBench-CN, CCBench | MMBench\_DEV\_[EN/CN] <br>MMBench\_TEST\_[EN/CN]<br>MMBench\_DEV\_[EN/CN]\_V11<br>MMBench\_TEST\_[EN/CN]\_V11<br>CCBench | 多肢選択問題 (MCQ) | [**MMStar**](https://github.com/MMStar-Benchmark/MMStar) | MMStar | MCQ |
41
+ | [**MME**](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) | MME | はい/いいえ (Y/N) | [**SEEDBench シリーズ**](https://github.com/AILab-CVC/SEED-Bench) | SEEDBench_IMG <br>SEEDBench2 <br>SEEDBench2_Plus | MCQ |
42
+ | [**MM-Vet**](https://github.com/yuweihao/MM-Vet) | MMVet | VQA | [**MMMU**](https://mmmu-benchmark.github.io) | MMMU_[DEV_VAL/TEST] | MCQ |
43
+ | [**MathVista**](https://mathvista.github.io) | MathVista_MINI | VQA | [**ScienceQA_IMG**](https://scienceqa.github.io) | ScienceQA_[VAL/TEST] | MCQ |
44
+ | [**COCO Caption**](https://cocodataset.org) | COCO_VAL | キャプション | [**HallusionBench**](https://github.com/tianyi-lab/HallusionBench) | HallusionBench | Y/N |
45
+ | [**OCRVQA**](https://ocr-vqa.github.io)* | OCRVQA_[TESTCORE/TEST] | VQA | [**TextVQA**](https://textvqa.org)* | TextVQA_VAL | VQA |
46
+ | [**ChartQA**](https://github.com/vis-nlp/ChartQA)* | ChartQA_TEST | VQA | [**AI2D**](https://allenai.org/data/diagrams) | AI2D_[TEST/TEST_NO_MASK] | MCQ |
47
+ | [**LLaVABench**](https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild) | LLaVABench | VQA | [**DocVQA**](https://www.docvqa.org)+ | DocVQA_[VAL/TEST] | VQA |
48
+ | [**InfoVQA**](https://www.docvqa.org/datasets/infographicvqa)+ | InfoVQA_[VAL/TEST] | VQA | [**OCRBench**](https://github.com/Yuliang-Liu/MultimodalOCR) | OCRBench | VQA |
49
+ | [**RealWorldQA**](https://x.ai/blog/grok-1.5v) | RealWorldQA | MCQ | [**POPE**](https://github.com/AoiDragon/POPE) | POPE | Y/N |
50
+ | [**Core-MM**](https://github.com/core-mm/core-mm)- | CORE_MM | VQA | [**MMT-Bench**](https://mmt-bench.github.io) | MMT-Bench_[VAL/VAL_MI/ALL/ALL_MI] | MCQ |
51
+ | [**MLLMGuard**](https://github.com/Carol-gutianle/MLLMGuard) - | MLLMGuard_DS | VQA | [**AesBench**](https://github.com/yipoh/AesBench) | AesBench_[VAL/TEST] | MCQ |
52
+ | [**VCR-wiki**](https://huggingface.co/vcr-org/) + | VCR\_[EN/ZH]\_[EASY/HARD]_[ALL/500/100] | VQA | [**MMLongBench-Doc**](https://mayubo2333.github.io/MMLongBench-Doc/)+ | MMLongBench_DOC | VQA |
53
+ | [**BLINK**](https://zeyofu.github.io/blink/) + | BLINK | MCQ | [**MathVision**](https://mathvision-cuhk.github.io)+ | MathVision<br>MathVision_MINI | VQA |
54
+ | [**MT-VQA**](https://github.com/bytedance/MTVQA)+ | MTVQA_TEST | VQA | [**MMDU**](https://liuziyu77.github.io/MMDU/)+ | MMDU | VQA (multi-turn) |
55
+ | [**Q-Bench1**](https://github.com/Q-Future/Q-Bench)+ | Q-Bench1_[VAL/TEST] | MCQ | [**A-Bench**](https://github.com/Q-Future/A-Bench)+ | A-Bench_[VAL/TEST] | MCQ |
56
+ | [**TaskMeAnything ImageQA Random**](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random)+ | TaskMeAnything_v1_imageqa_random | MCQ | | | |
57
+
58
+ **\*** ゼロショット設定で合理的な結果を出せないVLMの一部の評価結果のみを提供しています
59
+
60
+ **\+** 評価結果はまだ利用できません
61
+
62
+ **\-** VLMEvalKitでは推論のみがサポートされています
63
+
64
+ VLMEvalKitは、キーを設定すると**判定LLM**を使用して出力から回答を抽出し、それ以外の場合は**正確なマッチング**モード(出力文字列で「はい」、「いいえ」、「A」、「B」、「C」...を検索)を使用します。**正確なマッチングは、はい/いいえのタスクと多肢選択問題にのみ適用できます。**
65
+
66
+ **Supported Video Understanding Dataset**
67
+
68
+ | Dataset | Dataset Names (for run.py) | Task | Dataset | Dataset Names (for run.py) | Task |
69
+ | ---------------------------------------------------- | -------------------------- | ---- | --------------------------------------------- | -------------------------- | ---- |
70
+ | [**MMBench-Video**](https://mmbench-video.github.io) | MMBench-Video | VQA | [**Video-MME**](https://video-mme.github.io/) | Video-MME | MCQ |
71
+
72
+ **Supported API Models**
73
+
74
+ | [**GPT-4v (20231106, 20240409)**](https://platform.openai.com/docs/guides/vision) 🎞️🚅 | [**GPT-4o**](https://openai.com/index/hello-gpt-4o/) 🎞️🚅 | [**Gemini-1.0-Pro**](https://platform.openai.com/docs/guides/vision) 🎞️🚅 | [**Gemini-1.5-Pro**](https://platform.openai.com/docs/guides/vision) 🎞️🚅 | [**Step-1V**](https://www.stepfun.com/#step1v) 🎞️🚅 |
75
+ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------- |
76
+ | [**Reka-[Edge / Flash / Core]**](https://www.reka.ai)🚅 | [**Qwen-VL-[Plus / Max]**](https://huggingface.co/spaces/Qwen/Qwen-VL-Max) 🎞️🚅 | [**Claude-3v-[Haiku / Sonnet / Opus]**](https://www.anthropic.com/news/claude-3-family) 🎞️🚅 | [**GLM-4v**](https://open.bigmodel.cn/dev/howuse/glm4v) 🚅 | [**CongRong**](https://mllm.cloudwalk.com/web) 🎞️🚅 |
77
+ | [**Claude3.5-Sonnet**](https://www.anthropic.com/news/claude-3-5-sonnet) 🎞️🚅 | [**GPT-4o-Mini**](https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/) 🎞️🚅 | [**Yi-Vision**](https://platform.lingyiwanwu.com)🎞️🚅 | [**Hunyuan-Vision**](https://cloud.tencent.com/document/product/1729)🎞️🚅 | [**BlueLM-V**](https://developers.vivo.com/) 🎞️🚅 |
78
+
79
+ **Supported PyTorch / HF Models**
80
+
81
+ | [**IDEFICS-[9B/80B/v2-8B]-Instruct**](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct)🎞️🚅 | [**InstructBLIP-[7B/13B]**](https://github.com/salesforce/LAVIS/blob/main/projects/instructblip/README.md) | [**LLaVA-[v1-7B/v1.5-7B/v1.5-13B]**](https://github.com/haotian-liu/LLaVA) | [**MiniGPT-4-[v1-7B/v1-13B/v2-7B]**](https://github.com/Vision-CAIR/MiniGPT-4) |
82
+ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
83
+ | [**mPLUG-Owl2**](https://github.com/X-PLUG/mPLUG-Owl/tree/main/mPLUG-Owl2)🎞️ | [**OpenFlamingo-v2**](https://github.com/mlfoundations/open_flamingo)🎞️ | [**PandaGPT-13B**](https://github.com/yxuansu/PandaGPT) | [**Qwen-VL**](https://huggingface.co/Qwen/Qwen-VL)🎞️🚅, [**Qwen-VL-Chat**](https://huggingface.co/Qwen/Qwen-VL-Chat)🎞️**🚅** |
84
+ | [**VisualGLM-6B**](https://huggingface.co/THUDM/visualglm-6b)🚅 | [**InternLM-XComposer-[1/2]**](https://huggingface.co/internlm/internlm-xcomposer-7b)🚅 | [**ShareGPT4V-[7B/13B]**](https://sharegpt4v.github.io)🚅 | [**TransCore-M**](https://github.com/PCIResearch/TransCore-M) |
85
+ | [**LLaVA (XTuner)**](https://huggingface.co/xtuner/llava-internlm-7b)🚅 | [**CogVLM-[Chat/Llama3]**](https://huggingface.co/THUDM/cogvlm-chat-hf)🚅 | [**ShareCaptioner**](https://huggingface.co/spaces/Lin-Chen/Share-Captioner)🚅 | [**CogVLM-Grounding-Generalist**](https://huggingface.co/THUDM/cogvlm-grounding-generalist-hf)🚅 |
86
+ | [**Monkey**](https://github.com/Yuliang-Liu/Monkey)🚅, [**Monkey-Chat**](https://github.com/Yuliang-Liu/Monkey)🚅 | [**EMU2-Chat**](https://github.com/baaivision/Emu)🚅🎞️ | [**Yi-VL-[6B/34B]**](https://huggingface.co/01-ai/Yi-VL-6B) | [**MMAlaya**](https://huggingface.co/DataCanvas/MMAlaya)🚅 |
87
+ | [**InternLM-XComposer-2.5**](https://github.com/InternLM/InternLM-XComposer)🚅🎞️ | [**MiniCPM-[V1/V2/V2.5/V2.6]**](https://github.com/OpenBMB/MiniCPM-V)🚅🎞️ | [**OmniLMM-12B**](https://huggingface.co/openbmb/OmniLMM-12B) | [**InternVL-Chat-[V1-1/V1-2/V1-5/V2]**](https://github.com/OpenGVLab/InternVL)🚅🎞️, <br>[**Mini-InternVL-Chat-[2B/4B]-V1-5**](https://github.com/OpenGVLab/InternVL)🚅🎞️ |
88
+ | [**DeepSeek-VL**](https://github.com/deepseek-ai/DeepSeek-VL/tree/main)🎞️ | [**LLaVA-NeXT**](https://llava-vl.github.io/blog/2024-01-30-llava-next/)🚅🎞️ | [**Bunny-Llama3**](https://huggingface.co/BAAI/Bunny-v1_1-Llama-3-8B-V)🚅 | [**XVERSE-V-13B**](https://github.com/xverse-ai/XVERSE-V-13B/blob/main/vxverse/models/vxverse.py) |
89
+ | [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448) 🚅 | [**360VL-70B**](https://huggingface.co/qihoo360/360VL-70B) 🚅 | [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)🚅 | [**WeMM**](https://github.com/scenarios/WeMM)🚅 |
90
+ | [**GLM-4v-9B**](https://huggingface.co/THUDM/glm-4v-9b) 🚅 | [**Cambrian-[8B/13B/34B]**](https://cambrian-mllm.github.io/) | [**LLaVA-Next-[Qwen-32B]**](https://huggingface.co/lmms-lab/llava-next-qwen-32b) 🎞️ | [**Chameleon-[7B/30B]**](https://huggingface.co/facebook/chameleon-7b)🚅🎞️ |
91
+ | [**Video-LLaVA-7B-[HF]**](https://github.com/PKU-YuanGroup/Video-LLaVA) 🎬 | [**VILA1.5-[8B/13B/40B]**](https://github.com/NVlabs/VILA/)🎞️ | [**Ovis1.5-Llama3-8B**](https://github.com/AIDC-AI/Ovis) 🚅🎞 | [**Mantis-8B-[siglip-llama3/clip-llama3/Idefics2/Fuyu]**](https://huggingface.co/TIGER-Lab/Mantis-8B-Idefics2) 🎞️ |
92
+
93
+ 🎞️: 複数の画像を入力としてサポートします。
94
+
95
+ 🚅: 追加の設定/操作なしで使用できるモデルです。
96
+
97
+ 🎬: 入力としてビデオをサポート。
98
+
99
+ **Transformersバージョンの推奨事項:**
100
+
101
+ 特定のtransformerバージョンで一部のVLMが実行できない可能性があることに注意してください。各VLMを評価するために、以下の設定を推奨します:
102
+
103
+ - **`transformers==4.33.0`を使用してください**: `Qwenシリーズ`, `Monkeyシリーズ`, `InternLM-XComposerシリーズ`, `mPLUG-Owl2`, `OpenFlamingo v2`, `IDEFICSシリーズ`, `VisualGLM`, `MMAlaya`, `ShareCaptioner`, `MiniGPT-4シリーズ`, `InstructBLIPシリーズ`, `PandaGPT`, `VXVERSE`, `GLM-4v-9B`.
104
+ - **`transformers==4.37.0`を使用してください**: `LLaVAシリーズ`, `ShareGPT4Vシリーズ`, `TransCore-M`, `LLaVA (XTuner)`, `CogVLMシリーズ`, `EMU2シリーズ`, `Yi-VLシリーズ`, `MiniCPM-[V1/V2]`, `OmniLMM-12B`, `DeepSeek-VLシリーズ`, `InternVLシリーズ`, `Cambrianシリーズ`, `VILA-VLシリーズ`.
105
+ - **`transformers==4.40.0`を使用してください**: `IDEFICS2`, `Bunny-Llama3`, `MiniCPM-Llama3-V2.5`, `360VL-70B`, `Phi-3-Vision`, `WeMM`.
106
+ - **`transformers==latest`を使用してください**: `LLaVA-Nextシリーズ`, `PaliGemma-3B`, `Chameleon-VLシリーズ`, `Video-LLaVA-7B-HF`, `Ovis1.5シリーズ`, `Mantisシリーズ`, `MiniCPM-V2.6`.
107
+
108
+ ```python
109
+ # デモ
110
+ from vlmeval.config import supported_VLM
111
+ model = supported_VLM['idefics_9b_instruct']()
112
+ # 単一画像のフォワード
113
+ ret = model.generate(['assets/apple.jpg', 'この画像には何がありますか?'])
114
+ print(ret) # この画像には葉がついた赤いリンゴがあります。
115
+ # 複数画像のフォワード
116
+ ret = model.generate(['assets/apple.jpg', 'assets/apple.jpg', '提供された画像にはリンゴが何個ありますか?'])
117
+ print(ret) # 提供された画像にはリンゴが2個あります。
118
+ ```
119
+
120
+ ## 🏗️ クイックスタート
121
+
122
+ クイックスタートガイドについては、[クイックスタート](/docs/en/Quickstart.md)を参照してください。
123
+
124
+ ## 🛠️ 開発ガイド
125
+
126
+ カスタムベンチマーク、VLMsを開発するか、単に**VLMEvalKit**に他のコードを貢献する場合は、[開発ガイド](/docs/en/Development.md)を参照してください。
127
+
128
+ コミュニティからの共有を奨励し、それに応じたクレジットを共有するために、次回のレポート更新では以下のことを実施します:
129
+
130
+ - 全ての貢献に対して感謝の意を示します
131
+ - 新しいモデル、評価セット、または主要な機能への3つ以上の主要な貢献を持つ貢献者は、テクニカルレポートの著者リストに加わることができます。適格な貢献者は、issueを作成するか、または[VLM評価キット ディスコードチャンネル](https://discord.com/invite/evDT4GZmxN)で kennyutc にDMを送ることができます。私たちはそれに応じてフォローアップします。
132
+
133
+ ## 🎯 VLMEvalKitの目標
134
+
135
+ **このコードベースは以下を目的として設計されています:**
136
+
137
+ 1. 研究者や開発者が既存のLVLMsを評価し、評価結果を**簡単に再現できるようにする**ための**使いやすい**、**オープンソースの評価ツールキット**を提供します。
138
+ 2. VLMの開発者が自分のモデルを簡単に評価できるようにします。複数のサポートされているベンチマークでVLMを評価するには、単一の`generate_inner()`関数を**実装するだけで**、他のすべてのワークロード(データのダウンロード、データの前処理、予測の推論、メトリックの計算)はコードベースによって処理されます。
139
+
140
+ **このコードベースは以下を目的として設計されていません:**
141
+
142
+ 1. すべての**第三者ベンチマーク**の元の論文で報告された正確な精度数値を再現すること。その理由は2つあります:
143
+ 1. VLMEvalKitは、すべてのVLMに対して**生成ベースの評価**を使用します(オプションで**LLMベースの回答抽出**を使用)。一方、一部のベンチマークは異なるアプローチを使用する場合があります(SEEDBenchはPPLベースの評価を使用します)。これらのベンチマークについては、対応する結果で両方のスコアを比較します。開発者には、コードベースで他の評価パラダイムをサポートすることをお勧めします。
144
+ 2. デフォルトでは、すべてのVLMに対して同じプロンプトテンプレートを使用してベンチマークを評価します。一方、**一部のVLMには特定のプロンプトテンプレートがある**場合があります(現時点ではコードベースでカバーされていない場合があります)。VLMの開発者には、現在カバーされていない場合でも、VLMEvalKitで独自のプロンプトテンプレートを実装することをお勧めします。これにより、再現性が向上します。
145
+
146
+ ## 🖊️ 引用
147
+
148
+ この作業が役立つ場合は、このリポジトリに**スター🌟**を付けてください。サポートありがとうございます!
149
+
150
+ [![Stargazers repo roster for @open-compass/VLMEvalKit](https://reporoster.com/stars/open-compass/VLMEvalKit)](https://github.com/open-compass/VLMEvalKit/stargazers)
151
+
152
+ 研究でVLMEvalKitを使用する場合、または公開されたオープンソースの評価結果を参照する場合は、以下のBibTeXエントリと、使用した特定のVLM/ベンチマークに対応するBibTexエントリを使用してください。
153
+
154
+ ```bib
155
+ @misc{duan2024vlmevalkit,
156
+ title={VLMEvalKit: An Open-Source Toolkit for Evaluating Large Multi-Modality Models},
157
+ author={Haodong Duan and Junming Yang and Yuxuan Qiao and Xinyu Fang and Lin Chen and Yuan Liu and Xiaoyi Dong and Yuhang Zang and Pan Zhang and Jiaqi Wang and Dahua Lin and Kai Chen},
158
+ year={2024},
159
+ eprint={2407.11691},
160
+ archivePrefix={arXiv},
161
+ primaryClass={cs.CV},
162
+ url={https://arxiv.org/abs/2407.11691},
163
+ }
164
+ ```
165
+
166
+ <p align="right"><a href="#top">🔝Top に戻る</a></p>
167
+
168
+ [github-contributors-link]: https://github.com/open-compass/VLMEvalKit/graphs/contributors
169
+ [github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/VLMEvalKit?color=c4f042&labelColor=black&style=flat-square
170
+ [github-forks-link]: https://github.com/open-compass/VLMEvalKit/network/members
171
+ [github-forks-shield]: https://img.shields.io/github/forks/open-compass/VLMEvalKit?color=8ae8ff&labelColor=black&style=flat-square
172
+ [github-issues-link]: https://github.com/open-compass/VLMEvalKit/issues
173
+ [github-issues-shield]: https://img.shields.io/github/issues/open-compass/VLMEvalKit?color=ff80eb&labelColor=black&style=flat-square
174
+ [github-license-link]: https://github.com/open-compass/VLMEvalKit/blob/main/LICENSE
175
+ [github-license-shield]: https://img.shields.io/github/license/open-compass/VLMEvalKit?color=white&labelColor=black&style=flat-square
176
+ [github-stars-link]: https://github.com/open-compass/VLMEvalKit/stargazers
177
+ [github-stars-shield]: https://img.shields.io/github/stars/open-compass/VLMEvalKit?color=ffcb47&labelColor=black&style=flat-square
vlmeval/VLMEvalKit_old/docs/zh-CN/ConfigSystem.md ADDED
@@ -0,0 +1,59 @@
1
+
2
+ # 配置系统
3
+
4
+ 默认情况下,VLMEvalKit通过在`run.py`脚本中使用`--model`和`--data`参数设置模型名称(在`/vlmeval/config.py`中定义)和数据集名称(在`vlmeval/dataset/__init__.py`中定义)来启动评估。这种方法在大多数情况下简单且高效,但当用户希望使用不同设置评估多个模型/数据集时,可能不够灵活。
5
+
6
+ 为了解决这个问题,VLMEvalKit提供了一个更灵活的配置系统。用户可以在json文件中指定模型和数据集设置,并通过`--config`参数将配置文件的路径传递给`run.py`脚本。以下是一个示例配置json:
7
+
8
+ ```json
9
+ {
10
+ "model": {
11
+ "GPT4o_20240806_T00_HIGH": {
12
+ "class": "GPT4V",
13
+ "model": "gpt-4o-2024-08-06",
14
+ "temperature": 0,
15
+ "img_detail": "high"
16
+ },
17
+ "GPT4o_20240806_T10_Low": {
18
+ "class": "GPT4V",
19
+ "model": "gpt-4o-2024-08-06",
20
+ "temperature": 1.0,
21
+ "img_detail": "low"
22
+ }
23
+ },
24
+ "data": {
25
+ "MME-RealWorld-Lite": {
26
+ "class": "MMERealWorld",
27
+ "dataset": "MME-RealWorld-Lite"
28
+ },
29
+ "MMBench_DEV_EN_V11": {
30
+ "class": "ImageMCQDataset",
31
+ "dataset": "MMBench_DEV_EN_V11"
32
+ }
33
+ }
34
+ }
35
+ ```
36
+
37
+ 配置json的解释:
38
+
39
+ 1. 现在我们支持两个字段:`model`和`data`,每个字段都是一个字典。字典的键是模型/数据集的名称(由用户设置),值是模型/数据集的设置。
40
+ 2. 对于`model`中的项目,值是一个包含以下键的字典:
41
+ - `class`:模型的类名,应该是`vlmeval/vlm/__init__.py`(开源模型)或`vlmeval/api/__init__.py`(API模型)中定义的类名。
42
+ - 其他kwargs:其他kwargs是模型特定的参数,请参考模型类的定义以获取详细用法。例如,`model`、`temperature`、`img_detail`是`GPT4V`类的参数。值得注意的是,大多数模型类都需要`model`参数。
43
+ 3. 对于字典`data`,我们建议用户使用官方数据集名称作为键(或键的一部分),因为我们经常根据数据集名称确定后处理/判断设置。对于`data`中的项目,值是一个包含以下键的字典:
44
+ - `class`:数据集的类名,应该是`vlmeval/dataset/__init__.py`中定义的类名。
45
+ - 其他kwargs:其他kwargs是数据集特定的参数,请参考数据集类的定义以获取详细用法。通常,大多数数据集类都需要`dataset`参数。
46
+
47
+ 将示例配置json保存为`config.json`,您可以通过以下命令启动评估:
48
+
49
+ ```bash
50
+ python run.py --config config.json
51
+ ```
52
+
53
+ 这将在工作目录`$WORK_DIR`下生成以下输出文件(格式为`{$WORK_DIR}/{$MODEL_NAME}/{$MODEL_NAME}_{$DATASET_NAME}_*`):
54
+
55
+ - `$WORK_DIR/GPT4o_20240806_T00_HIGH/GPT4o_20240806_T00_HIGH_MME-RealWorld-Lite*`
56
+ - `$WORK_DIR/GPT4o_20240806_T10_Low/GPT4o_20240806_T10_Low_MME-RealWorld-Lite*`
57
+ - `$WORK_DIR/GPT4o_20240806_T00_HIGH/GPT4o_20240806_T00_HIGH_MMBench_DEV_EN_V11*`
58
+ - `$WORK_DIR/GPT4o_20240806_T10_Low/GPT4o_20240806_T10_Low_MMBench_DEV_EN_V11*`
59
+ -
vlmeval/VLMEvalKit_old/docs/zh-CN/Development.md ADDED
@@ -0,0 +1,140 @@
1
+ # 🛠️ 如何在 VLMEvalKit 中实现一个新的 Benchmark 或多模态模型(VLM)
2
+
3
+ ## 实现一个新的 benchmark
4
+
5
+ 示例 PR: **添加 Math-Vision Benchmark** ([#292](https://github.com/open-compass/VLMEvalKit/pull/292/files))
6
+
7
+ 目前在 VLMEvalKit 中,benchmark 以数据集类的形式呈现,当你新增一个 benchmark 时,你可以选择复用现有的数据集类 (如单选题 benchmark 可复用 `ImageMCQDataset`),或是实现新的数据集类。你的数据集类必须支持以下两种方法 (复用父类或自行实现):
8
+
9
+ - `build_prompt(self, line)`: 方法输入 `line` 类型为 int (对应数据 index) 或 `pd.Series` (对应数据原始 record)。方法输出一条 `multi-modal message` 作为多模态模型输入,`multi-modal message` 是一个图文交错的列表,如以下格式 (一图一文): `[dict(type='image', value=IMAGE_PTH), dict(type='text', value=prompt)]`。
10
+ - `evaluate(self, eval_file, **judge_kwargs)`: 方法输入 `eval_file` 为多模态模型的预测结果 (多以 `.xlsx` 格式存在),如 benchmark evaluation 需要大语言模型 (一般为 GPT) 辅助,则 `judge_kwargs` 传入大语言模型的参数。方法输出 benchmark 的评测结果,以 `dict` 或 `pd.DataFrame` 的形式。
11
+
12
+ 以下,我们简述新增数据集的通常步骤:
13
+
14
+ ### 1. TSV 数据文件准备 (图文评测集)
15
+
16
+ 目前,我们将每一个 benchmark 数据集设置为一个单独的 TSV 文件。在推理过程中,数据文件将从数据集定义的 `DATASET_URL` 链接地址自动下载到 `$LMUData` 中(如果没有明确设置的话,默认路径是 `$HOME/LMUData`)。你可以将准备好的 TSV 文件上传到一个可下载的地址(如:huggingface),或发送给我们 <[email protected]>,我们将帮助上传数据集到服务器中。此外,你也可以在环境变量中自定义设置下载路径 `LMUData=/path/to/your/data`。
17
+
18
+ TSV 文件中的内容组成为:
19
+
20
+ | 数据集名称 \ 字段 | index | image | image_path | question | hint | multi-choice<br>options | answer | category | l2-category | split |
21
+ | ---------------------- | ----- | ----- | ---------- | -------- | ---- | ----------------------- | ------ | -------- | ----------- | ----- |
22
+ | MMBench_DEV_[CN/EN] | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
23
+ | MMBench_TEST_[CN/EN] | ✅ | ✅ | | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ |
24
+ | CCBench | ✅ | ✅ | | ✅ | | ✅ | ✅ | ✅ | | |
25
+ | SEEDBench_IMG | ✅ | ✅ | | ✅ | | ✅ | ✅ | ✅ | | |
26
+ | MME | ✅ | ✅ | | ✅ | | | ✅ | ✅ | | |
27
+ | CORE_MM | ✅ | ✅ | ✅ | ✅ | | | | ✅ | | |
28
+ | MMVet | ✅ | ✅ | | ✅ | | | ✅ | ✅ | | |
29
+ | MMMU_DEV_VAL | ✅ | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ |
30
+ | COCO_VAL | ✅ | ✅ | | | | | ✅ | | | |
31
+ | OCRVQA_[TEST/TESTCORE] | ✅ | ✅ | | ✅ | | | ✅ | | | |
32
+ | TextVQA_VAL | ✅ | ✅ | | ✅ | | | ✅ | | | |
33
+ | VCR_[EN/ZH]\_[EASY/HARD]_[ALL/500/100] | ✅ | ✅ | | ✅ | | | ✅ | | | |
34
+
35
+ <div align="center"><b>表 1. 支持的数据集的 TSV 字段。</b></div>
36
+
37
+ **TSV 中必须字段的介绍:**
38
+
39
+ - **index:** 一个整数,`tsv` 中每一行的唯一标识
40
+ - **image:** 图片的 base64 编码,你可以使用 `vlmeval/smp/vlm.py` 中实现的API进行编码和解码:
41
+ - 编码:`encode_image_to_base64`(对于PIL Image)/ `encode_image_file_to_base64`(对于图片文件路径)
42
+ - 解码:`decode_base64_to_image`(对于PIL Image)/ `decode_base64_to_image_file`(对于图片文件路径)
43
+ - **question:** 针对图像所提取出的问题,类型为字符串
44
+ - **answer:** 问题的答案,类型为字符串,Test 集可缺失这一字段
45
+
46
+ ### 2. 自定义数据集的 prompt 构建
47
+
48
+ `ImageBaseDataset` 定义了默认的 prompt 格式。如果需要针对数据集添加 prompt,或给模型输入 `Interleave` 的数据格式,可以通过 `build_prompt(line)` 函数实现。该函数输入为,每次给定 TSV 文件中的一行,包含 index, image, question 等内容作为 line。该函数将返回一个多模态消息 `msg` 的字典列表 `[dict(type='image', value=IMAGE_PTH), dict(type='text', value=prompt)]`,包括图片路径和将被输入到 VLMs 的文本 prompt。对于 interleave 类型输入,可以直接将图片路径的字典放置到 image token 位置。
49
+
50
+ ### 3. 自定义数据集的指标实现
51
+
52
+ 增加对 benchmark 的评测需要自定义一个该数据集的 class 对象,从而实现数据集的指标计算。图文多模态数据集均继承自 `vlmeval/dataset/image_base.py` 中的 `ImageBaseDataset` 对象。其中 `TYPE` 定义了数据集的类型;`DATASET_URL` 为数据集的下载地址;`DATASET_MD5` 为数据集文件的 md5 一致性编码检查。
53
+
54
+ 在 class 中**需要实现** `evaluate(eval_file, **judge_kwargs)` 类函数,对自定义的数据集结果进行指标计算和结果输出。函数输入 `eval_file` 为模型预测结果 `{model_name}_{dataset}.xlsx` 的路径。可以通过 `load(eval_file)` 文件将其读取为 panda.DataFrames 类型,其中包含 index, question, answer, category, prediction 等字段。`judge_kwargs` 参数将传递一个评测相关的字典,如:judge 模型的名称,api 请求线程数等。**函数的返回值**为评估完成的准确度等指标,其格式为由 list 组成的字典,并组织成 panda.DataFrames 类型。
55
+
56
+ ## 实现一个新的模型
57
+
58
+ 示例 PR: **支持 LLaVA-Next-Interleave** ([#294](https://github.com/open-compass/VLMEvalKit/pull/294))
59
+
60
+ **1. 支持 `generate_inner` API (必须)**
61
+
62
+ 现有所有的模型都在 `vlmeval/vlm` 中实现。对于一个最基本的模型,你的模型类**应该实现方法** `generate_inner(msgs, dataset=None)`。这个函数将向 VLM 输入一个多模态数据,并返回 VLM 的预测(一个字符串)。可选参数 `dataset` 可以用作模型在不同推理策略之间切换的标志。
63
+
64
+ 其中多模态消息 `msgs` 是一个字典列表,每个字典有两个键:类型和值:
65
+ - `type`:我们目前支持两种类型,选项是 ["image", "text"]。
66
+ - `value`:当类型为 `text` 时,值是文本消息(一个字符串);当类型为 `image` 时,值可以是图像文件的本地路径,或者是图像的URL。
67
+
68
+ > 目前,一个多模态消息可能包含任意交错的图像和文本。如果你的模型不支持这一点,我们推荐的做法是取第一张图像和连接的文本消息作为模型的输入。你可以在模型的 class 中设置 `INTERLEAVE = False` 并调用 `self.message_to_promptimg(message, dataset=dataset)` 函数来获取你的 prompt 和第一张图片的地址。
69
+
70
+ 一些多模态消息的例子:
71
+
72
+ ```python
73
+ IMAGE_PTH = 'assets/apple.jpg'
74
+ IMAGE_URL = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/assets/apple.jpg'
75
+ msg1 = [
76
+ dict(type='image', value=IMAGE_PTH),
77
+ dict(type='text', value='What is in this image?')
78
+ ]
79
+ msg2 = [
80
+ dict(type='image', value=IMAGE_URL),
81
+ dict(type='image', value=IMAGE_URL),
82
+ dict(type='text', value='How many apples are there in these images?')
83
+ ]
84
+ response = model.generate(msg1)
85
+ ```
86
+
87
+ 为了方便起见,我们还支持接受字符串列表作为输入。在这种情况下,我们将检查一个字符串是图像路径还是图像 URL,并自动将其转换为 `list[dict]` 格式:
88
+
89
+ ```python
90
+ IMAGE_PTH = 'assets/apple.jpg'
91
+ IMAGE_URL = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/assets/apple.jpg'
92
+ msg1 = [IMAGE_PTH, 'What is in this image?']
93
+ msg2 = [IMAGE_URL, IMAGE_URL, 'How many apples are there in these images?']
94
+ response = model.generate(msg1)
95
+ ```
96
+
97
+ **2. 支持自定义提示词构建 (可选)**
98
+
99
+ 此外,你的模型可以通过实现两个可选方法来支持自定义提示构建:`use_custom_prompt(dataset)` 和 `build_prompt(line, dataset=None)`。
100
+
101
+ - `use_custom_prompt(dataset)` 将返回一个布尔值,指示模型是否应使用自定义提示构建策略。
102
+ - 如果`use_custom_prompt(dataset)`返回 True,`build_prompt(line, dataset)` 应该为相应的数据集返回一个自定义构建的多模态消息,line 数据是一个包含数据样本所需信息的字典。如果`use_custom_prompt(dataset)` 返回False,则将使用默认的 prompt 构建策略。
103
+
104
+ **3. 支持多轮对话 (可选)**
105
+
106
+ 你可以通过支持 `chat_inner(message, dataset)` API 为你的模型新增多轮对话功能并兼容多轮对话评测。这个 API 输出一个字符串型回复,`message` 包含一个聊天记录的列表,格式如下:
107
+
108
+ ```python
109
+ # Assume msg1, msg2, msg3, ... are multi-modal messages following the previously described format
110
+ # `chat_inner` take the following chat history list as input:
111
+ message = [
112
+ dict(role='user', content=msg1),
113
+ dict(role='assistant', content=msg2),
114
+ dict(role='user', content=msg3),
115
+ dict(role='assistant', content=msg4),
116
+ ......
117
+ dict(role='user', content=msgn),
118
+ ]
119
+ # `message` should contain an odd number of chat utterances, the role of utterances should be interleaved "user" and "assistant", with the role of the last utterance to be "user".
120
+ # The chat function will call `chat_inner`
121
+ response = model.chat(message)
122
+ ```
123
+
124
+ ### 示例 PRs:
125
+
126
+ - 不支持交错的图像和文本,且不使用自定义提示的VLM:[[模型] 支持 glm-4v-9b](https://github.com/open-compass/VLMEvalKit/pull/221)
127
+ - 支持交错的图像和文本及自定义提示的VLM:[添加 MiniCPM-Llama3-V-2.5](https://github.com/open-compass/VLMEvalKit/pull/205)
128
+ - VLM API:[特征添加 glmv](https://github.com/open-compass/VLMEvalKit/pull/201)
129
+
130
+ ## 为 VLMEvalKit 贡献代码
131
+
132
+ 如果你想为 **VLMEvalKit** 贡献代码,请在提交PR之前进行预提交检查。这有助于保持代码整洁。
133
+
134
+ ```bash
135
+ # 在VLMEvalKit的目录下,安装预提交 hook:
136
+ pip install pre-commit
137
+ pre-commit install
138
+ pre-commit run --all-files
139
+ # 然后提交你的代码。
140
+ ```
vlmeval/VLMEvalKit_old/docs/zh-CN/Makefile ADDED
@@ -0,0 +1,20 @@
+ # Minimal makefile for Sphinx documentation
+ #
+
+ # You can set these variables from the command line, and also
+ # from the environment for the first two.
+ SPHINXOPTS ?=
+ SPHINXBUILD ?= sphinx-build
+ SOURCEDIR = .
+ BUILDDIR = _build
+
+ # Put it first so that "make" without argument is like "make help".
+ help:
+ 	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
+
+ .PHONY: help Makefile
+
+ # Catch-all target: route all unknown targets to Sphinx using the new
+ # "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
+ %: Makefile
+ 	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
vlmeval/VLMEvalKit_old/docs/zh-CN/README_zh-CN.md ADDED
@@ -0,0 +1,215 @@
1
+ <div align="center">
2
+
3
+ ![LOGO](http://opencompass.openxlab.space/utils/MMLB.jpg)
4
+
5
+ <b>VLMEvalKit: 一种多模态大模型评测工具 </b>
6
+
7
+ [![][github-contributors-shield]][github-contributors-link] • [![][github-forks-shield]][github-forks-link] • [![][github-stars-shield]][github-stars-link] • [![][github-issues-shield]][github-issues-link] • [![][github-license-shield]][github-license-link]
8
+
9
+ [English](/README.md) | 简体中文 | [日本語](/docs/ja/README_ja.md)
10
+
11
+ <a href="https://rank.opencompass.org.cn/leaderboard-multimodal">🏆 OpenCompass 排行榜 </a> •
12
+ <a href="#%EF%B8%8F-quickstart">🏗️ 快速开始 </a> •
13
+ <a href="#-datasets-models-and-evaluation-results">📊 数据集和模型 </a> •
14
+ <a href="#%EF%B8%8F-development-guide">🛠️ 开发指南 </a> •
15
+ <a href="#-the-goal-of-vlmevalkit">🎯 我们的目标 </a> •
16
+ <a href="#%EF%B8%8F-citation">🖊️ 引用 </a>
17
+
18
+ <a href="https://huggingface.co/spaces/opencompass/open_vlm_leaderboard">🤗 HuggingFace 排行榜 (存档全部性能) </a> •
19
+ <a href="https://huggingface.co/datasets/VLMEval/OpenVLMRecords">🤗 原始评测记录</a> •
20
+ <a href="https://discord.gg/evDT4GZmxN">🔊 Discord</a> •
21
+ <a href="https://www.arxiv.org/abs/2407.11691">📝 技术报告 </a>
22
+ </div>
23
+
24
+ **VLMEvalKit** (python 包名为 **vlmeval**) 是一款专为大型视觉语言模型 (Large Vision-Language Models, LVLMs) 评测而设计的开源工具包。该工具支持在各种基准测试上对大型视觉语言模型进行**一键评估**,无需进行繁重的数据准备工作,让评估过程更加简便。在 VLMEvalKit 中,我们对所有大型视觉语言模型生成的结果进行评测,并提供基于**精确匹配**与基于 **LLM 的答案提取**两种评测结果。
25
+
26
+ ## 🆕 更新
27
+
28
+ - **[2024-11-21]** 集成了一个新的配置系统,以实现更灵活的评估设置。查看[文档](/docs/zh-CN/ConfigSystem.md)或运行`python run.py --help`了解更多详情 🔥🔥🔥
29
+ - **[2024-11-21]** 支持 **[QSpatial](https://andrewliao11.github.io/spatial_prompt/)**,一个用于定量空间推理的多模态基准(例如,确定大小/距离),感谢 **[andrewliao11](https://github.com/andrewliao11)** 提供官方支持 🔥🔥🔥
30
+ - **[2024-11-21]** 支持 **[MM-Math](https://github.com/kge-sun/mm-math)**,一个包含约6K初中多模态推理数学问题的新多模态数学基准。GPT-4o-20240806在该基准上达到了22.5%的准确率 🔥🔥🔥
31
+ - **[2024-11-16]** 支持 **[OlympiadBench](https://github.com/OpenBMB/OlympiadBench)**,一个多模态基准,包含奥林匹克级别的数学和物理问题 🔥🔥🔥
32
+ - **[2024-11-16]** 支持 **[WildVision](https://huggingface.co/datasets/WildVision/wildvision-bench)**,一个基于多模态竞技场数据的主观多模态基准 🔥🔥🔥
33
+ - **[2024-11-13]** 支持 **[MIA-Bench](https://arxiv.org/abs/2407.01509)**,一个多模态指令跟随基准 🔥🔥🔥
34
+ - **[2024-11-08]** 支持 **[Aria](https://arxiv.org/abs/2410.05993)**,一个多模态原生 MoE 模型,感谢 **[teowu](https://github.com/teowu)** 🔥🔥🔥
35
+ - **[2024-11-04]** 支持 **[WorldMedQA-V](https://www.arxiv.org/abs/2410.12722)**,该基准包含 1000 多个医学 VQA 问题,涵盖巴西、以色列、日本、西班牙等四个国家的语言,以及它们的英文翻译 🔥🔥🔥
36
+ - **[2024-11-01]** 支持 `AUTO_SPLIT` 标志 (https://github.com/open-compass/VLMEvalKit/pull/566),用于在低配置 GPU 上进行评估。设置后,模型将自动拆分到多个 GPU(流水线并行)以减少 GPU 内存使用(目前仅支持部分 VLMs:Qwen2-VL、Llama-3.2、LLaVA-OneVision 等) 🔥🔥🔥
37
+ - **[2024-10-30]** 支持评估 **[MLVU](https://github.com/JUNJIE99/MLVU)** 和 **[TempCompass](https://arxiv.org/abs/2403.00476v1)**。这两个基准将很快被纳入 **[OpenVLM 视频排行榜](https://huggingface.co/spaces/opencompass/openvlm_video_leaderboard)** 🔥🔥🔥
38
+
39
+ ## 🏗️ 快速开始 <a id="quickstart"></a>
40
+
41
+ 请参阅[**快速开始**](/docs/zh-CN/Quickstart.md)获取入门指南。
42
+
43
+ ## 📊 评测结果,支持的数据集和模型 <a id="data-model-results"></a>
44
+
45
+ ### 评测结果
46
+
47
+ **[OpenVLM Leaderboard](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard)**: **[下载全部细粒度测试结果](http://opencompass.openxlab.space/assets/OpenVLM.json)**.
48
+
49
+ ### 支持的图文多模态评测集
50
+
51
+ - 默认情况下,我们在 [**OpenVLM Leaderboard**](https://huggingface.co/spaces/opencompass/open_vlm_leaderboard) 提供全部测试结果
52
+ - 使用的缩写:`MCQ`: 单项选择题; `Y/N`: 正误判断题; `MTT`: 多轮对话评测; `MTI`: 多图输入评测
53
53
+
54
+ | Dataset | Dataset Names (for run.py) | Task | Dataset | Dataset Names (for run.py) | Task |
55
+ | ------------------------------------------------------------ | ------------------------------------------------------------ | --------- | ------------------------------------------------------------ | ------------------------------------------------------------ | -------------- |
56
+ | [**MMBench Series**](https://github.com/open-compass/mmbench/): <br>MMBench, MMBench-CN, CCBench | MMBench\_DEV\_[EN/CN] <br>MMBench\_TEST\_[EN/CN]<br>MMBench\_DEV\_[EN/CN]\_V11<br>MMBench\_TEST\_[EN/CN]\_V11<br>CCBench | MCQ | [**MMStar**](https://github.com/MMStar-Benchmark/MMStar) | MMStar | MCQ |
57
+ | [**MME**](https://github.com/BradyFU/Awesome-Multimodal-Large-Language-Models/tree/Evaluation) | MME | Y/N | [**SEEDBench Series**](https://github.com/AILab-CVC/SEED-Bench) | SEEDBench_IMG <br>SEEDBench2 <br>SEEDBench2_Plus | MCQ |
58
+ | [**MM-Vet**](https://github.com/yuweihao/MM-Vet) | MMVet | VQA | [**MMMU**](https://mmmu-benchmark.github.io) | MMMU_[DEV_VAL/TEST] | MCQ |
59
+ | [**MathVista**](https://mathvista.github.io) | MathVista_MINI | VQA | [**ScienceQA_IMG**](https://scienceqa.github.io) | ScienceQA_[VAL/TEST] | MCQ |
60
+ | [**COCO Caption**](https://cocodataset.org) | COCO_VAL | Caption | [**HallusionBench**](https://github.com/tianyi-lab/HallusionBench) | HallusionBench | Y/N |
61
+ | [**OCRVQA**](https://ocr-vqa.github.io)* | OCRVQA_[TESTCORE/TEST] | VQA | [**TextVQA**](https://textvqa.org)* | TextVQA_VAL | VQA |
62
+ | [**ChartQA**](https://github.com/vis-nlp/ChartQA)* | ChartQA_TEST | VQA | [**AI2D**](https://allenai.org/data/diagrams) | AI2D_[TEST/TEST_NO_MASK] | MCQ |
63
+ | [**LLaVABench**](https://huggingface.co/datasets/liuhaotian/llava-bench-in-the-wild) | LLaVABench | VQA | [**DocVQA**](https://www.docvqa.org)+ | DocVQA_[VAL/TEST] | VQA |
64
+ | [**InfoVQA**](https://www.docvqa.org/datasets/infographicvqa)+ | InfoVQA_[VAL/TEST] | VQA | [**OCRBench**](https://github.com/Yuliang-Liu/MultimodalOCR) | OCRBench | VQA |
65
+ | [**RealWorldQA**](https://x.ai/blog/grok-1.5v) | RealWorldQA | MCQ | [**POPE**](https://github.com/AoiDragon/POPE) | POPE | Y/N |
66
+ | [**Core-MM**](https://github.com/core-mm/core-mm)- | CORE_MM (MTI) | VQA | [**MMT-Bench**](https://mmt-bench.github.io) | MMT-Bench\_[VAL/ALL]<br>MMT-Bench\_[VAL/ALL]_MI | MCQ (MTI) |
67
+ | [**MLLMGuard**](https://github.com/Carol-gutianle/MLLMGuard) - | MLLMGuard_DS | VQA | [**AesBench**](https://github.com/yipoh/AesBench)+ | AesBench_[VAL/TEST] | MCQ |
68
+ | [**VCR-wiki**](https://huggingface.co/vcr-org/) + | VCR\_[EN/ZH]\_[EASY/HARD]_[ALL/500/100] | VQA | [**MMLongBench-Doc**](https://mayubo2333.github.io/MMLongBench-Doc/)+ | MMLongBench_DOC | VQA (MTI) |
69
+ | [**BLINK**](https://zeyofu.github.io/blink/) | BLINK | MCQ (MTI) | [**MathVision**](https://mathvision-cuhk.github.io)+ | MathVision<br>MathVision_MINI | VQA |
70
+ | [**MT-VQA**](https://github.com/bytedance/MTVQA) | MTVQA_TEST | VQA | [**MMDU**](https://liuziyu77.github.io/MMDU/)+ | MMDU | VQA (MTT, MTI) |
71
+ | [**Q-Bench1**](https://github.com/Q-Future/Q-Bench) | Q-Bench1_[VAL/TEST] | MCQ | [**A-Bench**](https://github.com/Q-Future/A-Bench) | A-Bench_[VAL/TEST] | MCQ |
72
+ | [**DUDE**](https://arxiv.org/abs/2305.08455)+ | DUDE | VQA (MTI) | [**SlideVQA**](https://arxiv.org/abs/2301.04883)+ | SLIDEVQA<br>SLIDEVQA_MINI | VQA (MTI) |
73
+ | [**TaskMeAnything ImageQA Random**](https://huggingface.co/datasets/weikaih/TaskMeAnything-v1-imageqa-random)+ | TaskMeAnything_v1_imageqa_random | MCQ | [**MMMB and Multilingual MMBench**](https://sun-hailong.github.io/projects/Parrot/)+ | MMMB\_[ar/cn/en/pt/ru/tr]<br>MMBench_dev\_[ar/cn/en/pt/ru/tr]<br>MMMB<br>MTL_MMBench_DEV<br>PS: MMMB & MTL_MMBench_DEV <br>are **all-in-one** names for 6 langs | MCQ |
74
+ | [**A-OKVQA**](https://arxiv.org/abs/2206.01718)+ | A-OKVQA | MCQ | [**MuirBench**](https://muirbench.github.io)+ | MUIRBench | MCQ |
75
+ | [**GMAI-MMBench**](https://huggingface.co/papers/2408.03361)+ | GMAI-MMBench_VAL | MCQ | [**TableVQABench**](https://arxiv.org/abs/2404.19205)+ | TableVQABench | VQA |
76
+ | [**MME-RealWorld**](https://arxiv.org/abs/2408.13257)+ | MME-RealWorld[-CN] | MCQ | [**HRBench**](https://arxiv.org/abs/2408.15556)+ | HRBench[4K/8K] | MCQ |
77
+ | [**MathVerse**](https://mathverse-cuhk.github.io/)+ | MathVerse_MINI<br/>MathVerse_MINI_Vision_Only <br/>MathVerse_MINI_Vision_Dominant<br/>MathVerse_MINI_Vision_Intensive<br/>MathVerse_MINI_Text_Lite<br/>MathVerse_MINI_Text_Dominant | VQA | [**AMBER**](https://github.com/junyangwang0410/AMBER)+ | AMBER | Y/N |
78
+ | [**CRPE**](https://huggingface.co/datasets/OpenGVLab/CRPE)+ | CRPE_[EXIST/RELATION] | VQA | **[MMSearch](https://mmsearch.github.io/)**$$^1$$ | - | **-** |
79
+ | **[R-Bench](https://arxiv.org/abs/2410.05474)**+ | R-Bench-[Dis/Ref] | MCQ | **[WorldMedQA-V](https://www.arxiv.org/abs/2410.12722)**+ | WorldMedQA-V | MCQ |
80
+ | **[GQA](https://cs.stanford.edu/people/dorarad/gqa/about.html)**+ | GQA_TestDev_Balanced | VQA | **[MIA-Bench](https://arxiv.org/abs/2407.01509)**+ | MIA-Bench | VQA |
81
+ | **[WildVision](https://huggingface.co/datasets/WildVision/wildvision-bench)**+ | WildVision | VQA | **[OlympiadBench](https://github.com/OpenBMB/OlympiadBench)** | OlympiadBench | VQA |
82
+
83
+ **\*** 我们只提供了部分模型上的测试结果,剩余模型无法在 zero-shot 设定下测试出合理的精度
84
+
85
+ **\+** 我们尚未提供这个评测集的测试结果
86
+
87
+ **\-** VLMEvalKit 仅支持这个评测集的推理,无法输出最终精度
88
+
89
+ $$^1$$ VLMEvalKit 在评测集的官方代码库中被使用
90
+
91
+ 如果您设置了 API KEY,VLMEvalKit 将使用一个 **LLM** 从输出中提取答案进行匹配判断,否则它将使用**精确匹配**模式 (直接在输出字符串中查找“yes”,“no”,“A”,“B”,“C”等)。**精确匹配只能应用于是或否任务和多选择任务**
92
+
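+ 下面是一个帮助理解**精确匹配**模式的示意性小例子(并非 VLMEvalKit 的真实实现,函数名与正则规则均为演示用的假设):
+
+ ```python
+ import re
+
+ def naive_exact_match(prediction, choices=('A', 'B', 'C', 'D')):
+     """示意:在模型输出中查找独立出现的选项字母。"""
+     for c in choices:
+         # 要求选项字母前后都不是英文字母,避免匹配到英文单词中的字符
+         if re.search(rf'(^|[^A-Za-z]){c}([^A-Za-z]|$)', prediction):
+             return c
+     return None
+
+ print(naive_exact_match('答案是 B。'))  # 输出: B
+ ```
+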
93
+ ### 支持的视频多模态评测集
94
+
95
+ | Dataset | Dataset Names (for run.py) | Task | Dataset | Dataset Names (for run.py) | Task |
96
+ | ------------------------------------------------------------ | -------------------------- | ------------------- | --------------------------------------------- | -------------------------- | --------- |
97
+ | [**MMBench-Video**](https://mmbench-video.github.io) | MMBench-Video | VQA | [**Video-MME**](https://video-mme.github.io/) | Video-MME | MCQ |
98
+ | [**MVBench**](https://github.com/OpenGVLab/Ask-Anything/blob/main/video_chat2/MVBENCH.md) | MVBench/MVBench_MP4 | MCQ | **[MLVU](https://github.com/JUNJIE99/MLVU)** | MLVU | MCQ & VQA |
99
+ | **[TempCompass](https://arxiv.org/abs/2403.00476)** | TempCompass | MCQ & Y/N & Caption | | | |
100
+
101
+ ### 支持的模型
102
+
103
+ **API 模型**
104
+
105
+ | [**GPT-4v (20231106, 20240409)**](https://platform.openai.com/docs/guides/vision) 🎞️🚅 | [**GPT-4o**](https://openai.com/index/hello-gpt-4o/) 🎞️🚅 | [**Gemini-1.0-Pro**](https://platform.openai.com/docs/guides/vision) 🎞️🚅 | [**Gemini-1.5-Pro**](https://platform.openai.com/docs/guides/vision) 🎞️🚅 | [**Step-1V**](https://www.stepfun.com/#step1v) 🎞️🚅 |
106
+ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------- |
107
+ | [**Reka-[Edge / Flash / Core]**](https://www.reka.ai)🚅 | [**Qwen-VL-[Plus / Max]**](https://huggingface.co/spaces/Qwen/Qwen-VL-Max) 🎞️🚅<br>[**Qwen-VL-[Plus / Max]-0809**](https://huggingface.co/spaces/Qwen/Qwen-VL-Max) 🎞️🚅 | [**Claude3-[Haiku / Sonnet / Opus]**](https://www.anthropic.com/news/claude-3-family) 🎞️🚅 | [**GLM-4v**](https://open.bigmodel.cn/dev/howuse/glm4v) 🚅 | [**CongRong**](https://mllm.cloudwalk.com/web) 🎞️🚅 |
108
+ | [**Claude3.5-Sonnet (20240620, 20241022)**](https://www.anthropic.com/news/claude-3-5-sonnet) 🎞️🚅 | [**GPT-4o-Mini**](https://openai.com/index/gpt-4o-mini-advancing-cost-efficient-intelligence/) 🎞️🚅 | [**Yi-Vision**](https://platform.lingyiwanwu.com)🎞️🚅 | [**Hunyuan-Vision**](https://cloud.tencent.com/document/product/1729)🎞️🚅 | [**BlueLM-V**](https://developers.vivo.com/) 🎞️🚅 |
109
+
110
+ **基于 PyTorch / HF 的开源模型**
111
+
112
+ | [**IDEFICS-[9B/80B/v2-8B/v3-8B]-Instruct**](https://huggingface.co/HuggingFaceM4/idefics-9b-instruct)🚅🎞️ | [**InstructBLIP-[7B/13B]**](https://github.com/salesforce/LAVIS/blob/main/projects/instructblip/README.md) | [**LLaVA-[v1-7B/v1.5-7B/v1.5-13B]**](https://github.com/haotian-liu/LLaVA) | [**MiniGPT-4-[v1-7B/v1-13B/v2-7B]**](https://github.com/Vision-CAIR/MiniGPT-4) |
113
+ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ | ------------------------------------------------------------ |
114
+ | [**mPLUG-Owl[2/3]**](https://github.com/X-PLUG/mPLUG-Owl/tree/main/mPLUG-Owl2)🎞️ | [**OpenFlamingo-v2**](https://github.com/mlfoundations/open_flamingo)🎞️ | [**PandaGPT-13B**](https://github.com/yxuansu/PandaGPT) | [**Qwen-VL**](https://huggingface.co/Qwen/Qwen-VL)🚅🎞️ <br>[**Qwen-VL-Chat**](https://huggingface.co/Qwen/Qwen-VL-Chat)🚅🎞️ |
115
+ | [**VisualGLM-6B**](https://huggingface.co/THUDM/visualglm-6b)🚅 | [**InternLM-XComposer-[1/2]**](https://huggingface.co/internlm/internlm-xcomposer-7b)🚅 | [**ShareGPT4V-[7B/13B]**](https://sharegpt4v.github.io)🚅 | [**TransCore-M**](https://github.com/PCIResearch/TransCore-M) |
116
+ | [**LLaVA (XTuner)**](https://huggingface.co/xtuner/llava-internlm-7b)🚅 | [**CogVLM-[Chat/Llama3]**](https://huggingface.co/THUDM/cogvlm-chat-hf)🚅 | [**ShareCaptioner**](https://huggingface.co/spaces/Lin-Chen/Share-Captioner)🚅 | [**CogVLM-Grounding-Generalist**](https://huggingface.co/THUDM/cogvlm-grounding-generalist-hf)🚅 |
117
+ | [**Monkey**](https://github.com/Yuliang-Liu/Monkey)🚅<br>[**Monkey-Chat**](https://github.com/Yuliang-Liu/Monkey)🚅 | [**EMU2-Chat**](https://github.com/baaivision/Emu)🚅🎞️ | [**Yi-VL-[6B/34B]**](https://huggingface.co/01-ai/Yi-VL-6B) | [**MMAlaya**](https://huggingface.co/DataCanvas/MMAlaya)🚅 |
118
+ | [**InternLM-XComposer-2.5**](https://github.com/InternLM/InternLM-XComposer)🚅🎞️ | [**MiniCPM-[V1/V2/V2.5/V2.6]**](https://github.com/OpenBMB/MiniCPM-V)🚅🎞️ | [**OmniLMM-12B**](https://huggingface.co/openbmb/OmniLMM-12B) | [**InternVL-Chat-[V1-1/V1-2/V1-5/V2]**](https://github.com/OpenGVLab/InternVL)🚅🎞️ |
119
+ | [**DeepSeek-VL**](https://github.com/deepseek-ai/DeepSeek-VL/tree/main)🎞️ | [**LLaVA-NeXT**](https://llava-vl.github.io/blog/2024-01-30-llava-next/)🚅🎞️ | [**Bunny-Llama3**](https://huggingface.co/BAAI/Bunny-v1_1-Llama-3-8B-V)🚅 | [**XVERSE-V-13B**](https://github.com/xverse-ai/XVERSE-V-13B/blob/main/vxverse/models/vxverse.py) |
120
+ | [**PaliGemma-3B**](https://huggingface.co/google/paligemma-3b-pt-448) 🚅 | [**360VL-70B**](https://huggingface.co/qihoo360/360VL-70B) 🚅 | [**Phi-3-Vision**](https://huggingface.co/microsoft/Phi-3-vision-128k-instruct)🚅🎞️<br>[**Phi-3.5-Vision**](https://huggingface.co/microsoft/Phi-3.5-vision-instruct)🚅🎞️ | [**WeMM**](https://github.com/scenarios/WeMM)🚅 |
121
+ | [**GLM-4v-9B**](https://huggingface.co/THUDM/glm-4v-9b) 🚅 | [**Cambrian-[8B/13B/34B]**](https://cambrian-mllm.github.io/) | [**LLaVA-Next-[Qwen-32B]**](https://huggingface.co/lmms-lab/llava-next-qwen-32b) 🎞️ | [**Chameleon-[7B/30B]**](https://huggingface.co/facebook/chameleon-7b)🚅🎞️ |
122
+ | [**Video-LLaVA-7B-[HF]**](https://github.com/PKU-YuanGroup/Video-LLaVA) 🎬 | [**VILA1.5-[3B/8B/13B/40B]**](https://github.com/NVlabs/VILA/)🎞️ | [**Ovis[1.5-Llama3-8B/1.5-Gemma2-9B/1.6-Gemma2-9B/1.6-Llama3.2-3B]**](https://github.com/AIDC-AI/Ovis) 🚅🎞️ | [**Mantis-8B-[siglip-llama3/clip-llama3/Idefics2/Fuyu]**](https://huggingface.co/TIGER-Lab/Mantis-8B-Idefics2) 🎞️ |
123
+ | [**Llama-3-MixSenseV1_1**](https://huggingface.co/Zero-Vision/Llama-3-MixSenseV1_1)🚅 | [**Parrot-7B**](https://github.com/AIDC-AI/Parrot) 🚅 | [**OmChat-v2.0-13B-sinlge-beta**](https://huggingface.co/omlab/omchat-v2.0-13B-single-beta_hf) 🚅 | [**Video-ChatGPT**](https://github.com/mbzuai-oryx/Video-ChatGPT) 🎬 |
124
+ | [**Chat-UniVi-7B[-v1.5]**](https://github.com/PKU-YuanGroup/Chat-UniVi) 🎬 | [**LLaMA-VID-7B**](https://github.com/dvlab-research/LLaMA-VID) 🎬 | [**VideoChat2-HD**](https://huggingface.co/OpenGVLab/VideoChat2_HD_stage4_Mistral_7B) 🎬 | [**PLLaVA-[7B/13B/34B]**](https://huggingface.co/ermu2001/pllava-7b) 🎬 |
125
+ | [**RBDash_72b**](https://github.com/RBDash-Team/RBDash) 🚅🎞️ | [**xgen-mm-phi3-[interleave/dpo]-r-v1.5**](https://huggingface.co/Salesforce/xgen-mm-phi3-mini-instruct-interleave-r-v1.5) 🚅🎞️ | [**Qwen2-VL-[2B/7B/72B]**](https://github.com/QwenLM/Qwen2-VL)🚅🎞️ | [**slime_[7b/8b/13b]**](https://github.com/yfzhang114/SliME)🎞️ |
126
+ | [**Eagle-X4-[8B/13B]**](https://github.com/NVlabs/EAGLE)🚅🎞️, <br>[**Eagle-X5-[7B/13B/34B]**](https://github.com/NVlabs/EAGLE)🚅🎞️ | [**Moondream1**](https://github.com/vikhyat/moondream)🚅, <br>[**Moondream2**](https://github.com/vikhyat/moondream)🚅 | [**XinYuan-VL-2B-Instruct**](https://huggingface.co/Cylingo/Xinyuan-VL-2B)🚅🎞️ | [**Llama-3.2-[11B/90B]-Vision-Instruct**](https://huggingface.co/meta-llama/Llama-3.2-11B-Vision-Instruct)🚅 |
127
+ | [**Kosmos2**](https://huggingface.co/microsoft/kosmos-2-patch14-224)🚅 | [**H2OVL-Mississippi-[0.8B/2B]**](https://huggingface.co/h2oai/h2ovl-mississippi-2b)🚅🎞️ | **[Pixtral-12B](https://huggingface.co/mistralai/Pixtral-12B-2409)**🎞️ | **[Falcon2-VLM-11B](https://huggingface.co/tiiuae/falcon-11B-vlm)**🚅 |
128
+ | **[MiniMonkey](https://huggingface.co/mx262/MiniMonkey)**🚅🎞️ | **[LLaVA-OneVision](https://huggingface.co/lmms-lab/llava-onevision-qwen2-72b-ov-sft)**🚅🎞️ | **[LLaVA-Video](https://huggingface.co/collections/lmms-lab/llava-video-661e86f5e8dabc3ff793c944)**🚅🎞️ | **[Aquila-VL-2B](https://huggingface.co/BAAI/Aquila-VL-2B-llava-qwen)**🚅🎞️ |
129
+ | [**Mini-InternVL-Chat-[2B/4B]-V1-5**](https://github.com/OpenGVLab/InternVL)🚅🎞️ | **[InternVL2 Series](https://huggingface.co/OpenGVLab/InternVL2-8B)** 🚅🎞️ | **[Janus-1.3B](https://huggingface.co/deepseek-ai/Janus-1.3B)**🚅🎞️ | **[molmoE-1B/molmo-7B/molmo-72B](https://huggingface.co/allenai/Molmo-7B-D-0924)**🚅 |
130
+ | **[Points-[Yi-1.5-9B/Qwen-2.5-7B]](https://huggingface.co/WePOINTS/POINTS-Yi-1-5-9B-Chat)**🚅 | **[NVLM](https://huggingface.co/nvidia/NVLM-D-72B)**🚅 | **[VIntern](https://huggingface.co/5CD-AI/Vintern-3B-beta)**🚅🎞️ | **[Aria](https://huggingface.co/rhymes-ai/Aria)**🚅🎞️ |
131
+
132
+ 🎞️ 表示支持多图片输入。
133
+
134
+ 🚅 表示模型可以被直接使用,不需任何额外的配置。
135
+
136
+ 🎬 表示支持视频输入。
137
+
138
+ ### 其他
139
+
140
+ **Transformers 的版本推荐:**
141
+
142
+ **请注意**,某些 VLM 可能无法在某些特定的 transformers 版本下运行,我们建议使用以下设置来评估对应的VLM:
143
+
144
+ - **请用** `transformers==4.33.0` **来运行**: `Qwen series`, `Monkey series`, `InternLM-XComposer Series`, `mPLUG-Owl2`, `OpenFlamingo v2`, `IDEFICS series`, `VisualGLM`, `MMAlaya`, `ShareCaptioner`, `MiniGPT-4 series`, `InstructBLIP series`, `PandaGPT`, `VXVERSE`.
145
+ - **请用** `transformers==4.37.0 ` **来运行**: `LLaVA series`, `ShareGPT4V series`, `TransCore-M`, `LLaVA (XTuner)`, `CogVLM Series`, `EMU2 Series`, `Yi-VL Series`, `MiniCPM-[V1/V2]`, `OmniLMM-12B`, `DeepSeek-VL series`, `InternVL series`, `Cambrian Series`, `VILA Series`, `Llama-3-MixSenseV1_1`, `Parrot-7B`, `PLLaVA Series`.
146
+ - **请用** `transformers==4.40.0 ` **来运行**: `IDEFICS2`, `Bunny-Llama3`, `MiniCPM-Llama3-V2.5`, `360VL-70B`, `Phi-3-Vision`, `WeMM`.
147
+ - **请用** `transformers==latest` **来运行**: `LLaVA-Next series`, `PaliGemma-3B`, `Chameleon series`, `Video-LLaVA-7B-HF`, `Ovis series`, `Mantis series`, `MiniCPM-V2.6`, `OmChat-v2.0-13B-sinlge-beta`, `Idefics-3`, `GLM-4v-9B`, `VideoChat2-HD`.
148
+
149
+ **如何测试一个 VLM 是否可以正常运行:**
150
+
151
+ ```python
152
+ from vlmeval.config import supported_VLM
153
+ model = supported_VLM['idefics_9b_instruct']()
154
+ # 前向单张图片
155
+ ret = model.generate(['assets/apple.jpg', 'What is in this image?'])
156
+ print(ret) # 这张图片上有一个带叶子的红苹果
157
+ # 前向多张图片
158
+ ret = model.generate(['assets/apple.jpg', 'assets/apple.jpg', 'How many apples are there in the provided images? '])
159
+ print(ret) # 提供的图片中有两个苹果
160
+ ```
161
+
162
+ ## 🛠️ 开发指南 <a id="development"></a>
163
+
164
+ 要开发自定义评测数据集,支持其他 VLMs,或为 VLMEvalKit 贡献代码,请参阅[**开发指南**](/docs/zh-CN/Development_zh-CN.md)。
165
+
166
+ 为了激励社区贡献并给予相应的 credit,在下一次技术报告更新中,我们将:
167
+
168
+ - 致谢所有的 contribution
169
+ - 具备三个或以上主要贡献(支持新模型、评测集或主要特性)的贡献者将可以加入技术报告的作者列表。符合条件的贡献者可以创建 issue,或在 [VLMEvalKit Discord Channel](https://discord.com/invite/evDT4GZmxN) 私信 kennyutc,我们将进行跟进
170
+
171
+ ## 🎯 VLMEvalKit 的目标 <a id="goal-of-vlmevalkit"></a>
172
+
173
+ **该代码库的设计目标是:**
174
+
175
+ 1. 提供一个**易于使用**的**开源评估工具包**,方便研究人员和开发人员评测现有的多模态大模型,并使评测结果**易于复现**。
176
+ 2. 使 VLM 开发人员能够轻松地评测自己的模型。在多个支持的基准测试上评估 VLM,只需实现一个 `generate_inner()` 函数,所有其他工作负载(数据下载、数据预处理、预测推理、度量计算)都由代码库处理。
177
+
178
+ **该代码库的设计目标不是:**
179
+
180
+ 复现所有**第三方基准测试**原始论文中报告的准确数字。有两个相关的原因:
181
+ 1. VLMEvalKit 对所有 VLMs 使用基于生成的评估(可选使用基于 LLM 的答案提取)。同时,一些基准测试可能官方使用不同的方法(*例如,SEEDBench 使用基于 PPL 的评估*)。对于这些基准测试,我们在相应的结果中比较两个得分。我们鼓励开发人员在代码库中支持其他评估范式。
182
+ 2. 默认情况下,我们对所有多模态模型使用相同的提示模板来评估基准测试。同时,**一些多模态模型可能有他们特定的提示模板**(目前可能未在代码库中涵盖)。我们鼓励 VLM 的开发人员在 VLMEvalKit 中实现自己的提示模板,如果目前未覆盖。这将有助于提高可复现性。
183
+
184
+ ## 🖊️ 引用 <a id="citation"></a>
185
+
186
+ 如果我们的工作对您有所帮助,请考虑 **star🌟** VLMEvalKit。感谢支持!
187
+
188
+ [![Stargazers repo roster for @open-compass/VLMEvalKit](https://reporoster.com/stars/open-compass/VLMEvalKit)](https://github.com/open-compass/VLMEvalKit/stargazers)
189
+
190
+ 如果您在研究中使用了 VLMEvalKit,或希望参考已发布的开源评估结果,请使用以下 BibTeX 条目以及与您使用的特定 VLM / 基准测试相对应的 BibTex 条目。
191
+
192
+ ```bib
193
+ @misc{duan2024vlmevalkit,
194
+ title={VLMEvalKit: An Open-Source Toolkit for Evaluating Large Multi-Modality Models},
195
+ author={Haodong Duan and Junming Yang and Yuxuan Qiao and Xinyu Fang and Lin Chen and Yuan Liu and Xiaoyi Dong and Yuhang Zang and Pan Zhang and Jiaqi Wang and Dahua Lin and Kai Chen},
196
+ year={2024},
197
+ eprint={2407.11691},
198
+ archivePrefix={arXiv},
199
+ primaryClass={cs.CV},
200
+ url={https://arxiv.org/abs/2407.11691},
201
+ }
202
+ ```
203
+
204
+ <p align="right"><a href="#top">🔝回到顶部</a></p>
205
+
206
+ [github-contributors-link]: https://github.com/open-compass/VLMEvalKit/graphs/contributors
207
+ [github-contributors-shield]: https://img.shields.io/github/contributors/open-compass/VLMEvalKit?color=c4f042&labelColor=black&style=flat-square
208
+ [github-forks-link]: https://github.com/open-compass/VLMEvalKit/network/members
209
+ [github-forks-shield]: https://img.shields.io/github/forks/open-compass/VLMEvalKit?color=8ae8ff&labelColor=black&style=flat-square
210
+ [github-issues-link]: https://github.com/open-compass/VLMEvalKit/issues
211
+ [github-issues-shield]: https://img.shields.io/github/issues/open-compass/VLMEvalKit?color=ff80eb&labelColor=black&style=flat-square
212
+ [github-license-link]: https://github.com/open-compass/VLMEvalKit/blob/main/LICENSE
213
+ [github-license-shield]: https://img.shields.io/github/license/open-compass/VLMEvalKit?color=white&labelColor=black&style=flat-square
214
+ [github-stars-link]: https://github.com/open-compass/VLMEvalKit/stargazers
215
+ [github-stars-shield]: https://img.shields.io/github/stars/open-compass/VLMEvalKit?color=ffcb47&labelColor=black&style=flat-square
vlmeval/VLMEvalKit_old/docs/zh-CN/_static/image/logo_icon.svg ADDED
vlmeval/VLMEvalKit_old/docs/zh-CN/_static/js/custom.js ADDED
@@ -0,0 +1,10 @@
1
+ var collapsedSections = [];
2
+
3
+ $(document).ready(function () {
4
+ $('.model-summary').DataTable({
5
+ "stateSave": false,
6
+ "lengthChange": false,
7
+ "pageLength": 20,
8
+ "order": []
9
+ });
10
+ });
vlmeval/VLMEvalKit_old/docs/zh-CN/_templates/autosummary/class.rst ADDED
@@ -0,0 +1,13 @@
1
+ .. role:: hidden
2
+ :class: hidden-section
3
+ .. currentmodule:: {{ module }}
4
+
5
+
6
+ {{ name | underline}}
7
+
8
+ .. autoclass:: {{ name }}
9
+ :members:
10
+
11
+ ..
12
+ autogenerated from _templates/autosummary/class.rst
13
+ note it does not have :inherited-members:
vlmeval/VLMEvalKit_old/docs/zh-CN/_templates/callable.rst ADDED
@@ -0,0 +1,14 @@
1
+ .. role:: hidden
2
+ :class: hidden-section
3
+ .. currentmodule:: {{ module }}
4
+
5
+
6
+ {{ name | underline}}
7
+
8
+ .. autoclass:: {{ name }}
9
+ :members:
10
+ :special-members: __call__
11
+
12
+ ..
13
+ autogenerated from _templates/callable.rst
14
+ note it does not have :inherited-members:
vlmeval/VLMEvalKit_old/docs/zh-CN/conf.py ADDED
@@ -0,0 +1,242 @@
1
+ # flake8: noqa
2
+ # Configuration file for the Sphinx documentation builder.
3
+ #
4
+ # This file only contains a selection of the most common options. For a full
5
+ # list see the documentation:
6
+ # https://www.sphinx-doc.org/en/master/usage/configuration.html
7
+
8
+ # -- Path setup --------------------------------------------------------------
9
+
10
+ # If extensions (or modules to document with autodoc) are in another directory,
11
+ # add these directories to sys.path here. If the directory is relative to the
12
+ # documentation root, use os.path.abspath to make it absolute, like shown here.
13
+ #
14
+ import os
15
+ import ast
16
+ import subprocess
17
+ import sys
18
+
19
+ import pytorch_sphinx_theme
20
+ from sphinx.builders.html import StandaloneHTMLBuilder
21
+
22
+ sys.path.insert(0, os.path.abspath('../../'))
23
+
24
+ # -- Project information -----------------------------------------------------
25
+
26
+ project = 'VLMEvalKit'
27
+ copyright = '2023, VLMEvalKit'
28
+ author = 'VLMEvalKit Authors'
29
+
30
+ # The full version, including alpha/beta/rc tags
31
+ version_file = '../../vlmeval/__init__.py'
32
+
33
+
34
+ def get_version():
35
+ with open(version_file, 'r') as f:
36
+ file_content = f.read()
37
+ # Parse the file content into an abstract syntax tree (AST)
38
+ tree = ast.parse(file_content, filename=version_file)
39
+
40
+ # Iterate through the body of the AST, looking for an assignment to __version__
41
+ for node in tree.body:
42
+ if isinstance(node, ast.Assign):
43
+ for target in node.targets:
44
+ if isinstance(target, ast.Name) and target.id == '__version__':
45
+ return node.value.s
46
+ raise ValueError('__version__ not found')
47
+
48
+
49
+ release = get_version()
50
+
51
+ # -- General configuration ---------------------------------------------------
52
+
53
+ # Add any Sphinx extension module names here, as strings. They can be
54
+ # extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
55
+ # ones.
56
+ extensions = [
57
+ 'sphinx.ext.autodoc',
58
+ 'sphinx.ext.autosummary',
59
+ 'sphinx.ext.intersphinx',
60
+ 'sphinx.ext.napoleon',
61
+ 'sphinx.ext.viewcode',
62
+ 'myst_parser',
63
+ 'sphinx_copybutton',
64
+ 'sphinx_tabs.tabs',
65
+ 'notfound.extension',
66
+ 'sphinxcontrib.jquery',
67
+ 'sphinx_design',
68
+ ]
69
+
70
+ # Add any paths that contain templates here, relative to this directory.
71
+ templates_path = ['_templates']
72
+
73
+ # The suffix(es) of source filenames.
74
+ # You can specify multiple suffix as a list of string:
75
+ #
76
+ source_suffix = {
77
+ '.rst': 'restructuredtext',
78
+ '.md': 'markdown',
79
+ }
80
+
81
+ language = 'cn'
82
+
83
+ # The master toctree document.
84
+ root_doc = 'index'
85
+ html_context = {
86
+ 'github_version': 'latest',
87
+ }
88
+ # List of patterns, relative to source directory, that match files and
89
+ # directories to ignore when looking for source files.
90
+ # This pattern also affects html_static_path and html_extra_path.
91
+ exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
92
+
93
+ # -- Options for HTML output -------------------------------------------------
94
+
95
+ # The theme to use for HTML and HTML Help pages. See the documentation for
96
+ # a list of builtin themes.
97
+ #
98
+ html_theme = 'pytorch_sphinx_theme'
99
+ html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]
100
+
101
+ # Theme options are theme-specific and customize the look and feel of a theme
102
+ # further. For a list of options available for each theme, see the
103
+ # documentation.
104
+ # yapf: disable
105
+ html_theme_options = {
106
+ 'menu': [
107
+ {
108
+ 'name': 'GitHub',
109
+ 'url': 'https://github.com/open-compass/VLMEvalKit'
110
+ },
111
+ ],
112
+ # Specify the language of shared menu
113
+ 'menu_lang': 'cn',
114
+ # Disable the default edit on GitHub
115
+ 'default_edit_on_github': False,
116
+ }
117
+ # yapf: enable
118
+
119
+ # Add any paths that contain custom static files (such as style sheets) here,
120
+ # relative to this directory. They are copied after the builtin static files,
121
+ # so a file named "default.css" will overwrite the builtin "default.css".
122
+ html_static_path = ['_static']
123
+ html_css_files = [
124
+ 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.css',
125
+ 'css/readthedocs.css'
126
+ ]
127
+ html_js_files = [
128
+ 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.js',
129
+ 'js/custom.js'
130
+ ]
131
+
132
+ # -- Options for HTMLHelp output ---------------------------------------------
133
+
134
+ # Output file base name for HTML help builder.
135
+ htmlhelp_basename = 'vlmevalkitdoc'
136
+
137
+ # -- Options for LaTeX output ------------------------------------------------
138
+
139
+ latex_elements = {
140
+ # The paper size ('letterpaper' or 'a4paper').
141
+ #
142
+ # 'papersize': 'letterpaper',
143
+
144
+ # The font size ('10pt', '11pt' or '12pt').
145
+ #
146
+ # 'pointsize': '10pt',
147
+
148
+ # Additional stuff for the LaTeX preamble.
149
+ #
150
+ # 'preamble': '',
151
+ }
152
+
153
+ # Grouping the document tree into LaTeX files. List of tuples
154
+ # (source start file, target name, title,
155
+ # author, documentclass [howto, manual, or own class]).
156
+ latex_documents = [
157
+ (root_doc, 'vlmevalkit.tex', 'VLMEvalKit Documentation', author,
158
+ 'manual'),
159
+ ]
160
+
161
+ # -- Options for manual page output ------------------------------------------
162
+
163
+ # One entry per manual page. List of tuples
164
+ # (source start file, name, description, authors, manual section).
165
+ man_pages = [(root_doc, 'vlmevalkit', 'VLMEvalKit Documentation', [author],
166
+ 1)]
167
+
168
+ # -- Options for Texinfo output ----------------------------------------------
169
+
170
+ # Grouping the document tree into Texinfo files. List of tuples
171
+ # (source start file, target name, title, author,
172
+ # dir menu entry, description, category)
173
+ texinfo_documents = [
174
+ (root_doc, 'vlmevalkit', 'VLMEvalKit Documentation', author,
175
+ 'VLMEvalKit Authors', 'AGI evaluation toolbox and benchmark.',
176
+ 'Miscellaneous'),
177
+ ]
178
+
179
+ # -- Options for Epub output -------------------------------------------------
180
+
181
+ # Bibliographic Dublin Core info.
182
+ epub_title = project
183
+
184
+ # The unique identifier of the text. This can be a ISBN number
185
+ # or the project homepage.
186
+ #
187
+ # epub_identifier = ''
188
+
189
+ # A unique identification for the text.
190
+ #
191
+ # epub_uid = ''
192
+
193
+ # A list of files that should not be packed into the epub file.
194
+ epub_exclude_files = ['search.html']
195
+
196
+ # set priority when building html
197
+ StandaloneHTMLBuilder.supported_image_types = [
198
+ 'image/svg+xml', 'image/gif', 'image/png', 'image/jpeg'
199
+ ]
200
+
201
+ # -- Extension configuration -------------------------------------------------
202
+ # Ignore >>> when copying code
203
+ copybutton_prompt_text = r'>>> |\.\.\. '
204
+ copybutton_prompt_is_regexp = True
205
+
206
+ # Auto-generated header anchors
207
+ myst_heading_anchors = 3
208
+ # Enable "colon_fence" extension of myst.
209
+ myst_enable_extensions = ['colon_fence', 'dollarmath']
210
+
211
+ # Configuration for intersphinx
212
+ intersphinx_mapping = {
213
+ 'python': ('https://docs.python.org/3', None),
214
+ 'numpy': ('https://numpy.org/doc/stable', None),
215
+ 'torch': ('https://pytorch.org/docs/stable/', None),
216
+ 'mmengine': ('https://mmengine.readthedocs.io/en/latest/', None),
217
+ 'transformers':
218
+ ('https://huggingface.co/docs/transformers/main/en/', None),
219
+ }
220
+ napoleon_custom_sections = [
221
+ # Custom sections for data elements.
222
+ ('Meta fields', 'params_style'),
223
+ ('Data fields', 'params_style'),
224
+ ]
225
+
226
+ # Disable docstring inheritance
227
+ autodoc_inherit_docstrings = False
228
+ # Mock some imports during generate API docs.
229
+ autodoc_mock_imports = ['rich', 'attr', 'einops']
230
+ # Disable displaying type annotations, these can be very verbose
231
+ autodoc_typehints = 'none'
232
+
233
+ # The not found page
234
+ notfound_template = '404.html'
235
+
236
+
237
+ def builder_inited_handler(app):
238
+ subprocess.run(['./cp_origin_docs.sh'])
239
+
240
+
241
+ def setup(app):
242
+ app.connect('builder-inited', builder_inited_handler)
vlmeval/VLMEvalKit_old/docs/zh-CN/index.rst ADDED
@@ -0,0 +1,49 @@
1
+ 欢迎来到 VLMEvalKit 中文教程!
2
+ ==========================================
3
+
4
+ VLMEvalKit 上手路线
5
+ -------------------------------
6
+
7
+ 为了用户能够快速上手,我们推荐以下流程:
8
+
9
+ - 对于想要使用 VLMEvalKit 的用户,我们推荐先阅读 快速开始_ 部分来设置环境,并启动一个迷你实验熟悉流程。
10
+
11
+ - 若您想进行更多模块的自定义,例如增加数据集和模型,我们提供了 进阶教程_ 。
12
+
13
+ 我们始终非常欢迎用户的 PRs 和 Issues 来完善 VLMEvalKit!
14
+
15
+ .. _快速开始:
16
+ .. toctree::
17
+ :maxdepth: 1
18
+ :caption: 快速开始
19
+
20
+ Quickstart.md
21
+
22
+
23
+ .. .. _教程:
24
+ .. .. toctree::
25
+ .. :maxdepth: 1
26
+ .. :caption: 教程
27
+
28
+ .. user_guides/framework_overview.md
29
+
30
+ .. _进阶教程:
31
+ .. toctree::
32
+ :maxdepth: 1
33
+ :caption: 进阶教程
34
+
35
+ Development.md
36
+ ConfigSystem.md
37
+
38
+ .. .. _其他说明:
39
+ .. .. toctree::
40
+ .. :maxdepth: 1
41
+ .. :caption: 其他说明
42
+
43
+ .. notes/contribution_guide.md
44
+
45
+ 索引与表格
46
+ ==================
47
+
48
+ * :ref:`genindex`
49
+ * :ref:`search`
vlmeval/VLMEvalKit_old/scripts/apires_scan.py ADDED
@@ -0,0 +1,55 @@
1
+ import sys
2
+ from vlmeval import *
3
+ from vlmeval.dataset import SUPPORTED_DATASETS
4
+ FAIL_MSG = 'Failed to obtain answer via API.'
5
+
6
+ root = sys.argv[1]
7
+ if root[-1] in '/\\':
8
+ root = root[:-1]
9
+
10
+ model_name = root.split('/')[-1]
11
+
12
+ for d in SUPPORTED_DATASETS:
13
+ fname = f'{model_name}_{d}.xlsx'
14
+ pth = osp.join(root, fname)
15
+ if osp.exists(pth):
16
+ data = load(pth)
17
+ # Detect Failure
18
+ assert 'prediction' in data
19
+ data['prediction'] = [str(x) for x in data['prediction']]
20
+ fail = [FAIL_MSG in x for x in data['prediction']]
21
+ if sum(fail):
22
+ nfail = sum(fail)
23
+ ntot = len(fail)
24
+ print(f'Model {model_name} x Dataset {d}: {nfail} out of {ntot} failed. {nfail / ntot * 100: .2f}%. ')
25
+
26
+ eval_files = ls(root, match=f'{model_name}_{d}_')
27
+ eval_files = [x for x in eval_files if listinstr([f'{d}_openai', f'{d}_gpt'], x) and x.endswith('.xlsx')]
28
+
29
+ if len(eval_files) == 0:
30
+ print(f'Model {model_name} x Dataset {d} openai missing')
31
+ continue
32
+
33
+ assert len(eval_files) == 1
34
+ eval_file = eval_files[0]
35
+ data = load(eval_file)
36
+
37
+ if 'MMVet' in d:
38
+ bad = [x for x in data['log'] if 'All 5 retries failed.' in str(x)]
39
+ if len(bad):
40
+ print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.')
41
+ elif 'MathVista' in d:
42
+ bad = [x for x in data['res'] if FAIL_MSG in str(x)]
43
+ if len(bad):
44
+ print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.')
45
+
46
+ elif d == 'LLaVABench':
47
+ sub = data[data['gpt4_score'] == -1]
48
+ sub = sub[sub['gpt4_score'] == -1]
49
+ if len(sub):
50
+ print(f'Model {model_name} x Dataset {d} Evaluation: {len(sub)} out of {len(data)} failed.')
51
+ else:
52
+ bad = [x for x in data['log'] if FAIL_MSG in str(x)]
53
+ if len(bad):
54
+ print(f'Model {model_name} x Dataset {d} Evaluation: {len(bad)} out of {len(data)} failed.')
55
+
vlmeval/VLMEvalKit_old/scripts/cover.sh ADDED
@@ -0,0 +1,4 @@
1
+ #!/bin/bash
2
+ DIR=$( cd -- "$( dirname -- "${BASH_SOURCE[0]}" )" &> /dev/null && pwd )
3
+ cp $DIR/../config.py $DIR/../vlmeval/
4
+ cp $DIR/../misc/* $DIR/../vlmeval/vlm/misc/
vlmeval/VLMEvalKit_old/scripts/data_browser.py ADDED
@@ -0,0 +1,166 @@
1
+ """
2
+ pip install gradio # proxy_on first
3
+ python data_browser.py
4
+ # browse data in http://127.0.0.1:7860 (默认端口,可通过 --port 修改)
5
+ """
6
+
7
+ import os
8
+ import io
9
+ import json
10
+ import copy
11
+ import time
12
+ import gradio as gr
13
+ import base64
14
+ from PIL import Image
15
+ from io import BytesIO
16
+ from argparse import Namespace
17
+ # from llava import conversation as conversation_lib
18
+ from typing import Sequence
19
+ from vlmeval import *
20
+ from vlmeval.dataset import SUPPORTED_DATASETS, build_dataset
21
+
22
+ SYS = "You are a helpful assistant. Your job is to faithfully translate all provided text into Chinese faithfully. "
23
+
24
+ # Translator = SiliconFlowAPI(model='Qwen/Qwen2.5-7B-Instruct', system_prompt=SYS)
25
+ Translator = OpenAIWrapper(model='gpt-4o-mini', system_prompt=SYS)
26
+
27
+
28
+ def image_to_mdstring(image):
29
+ return f"![image](data:image/jpeg;base64,{image})"
30
+
31
+
32
+ def images_to_md(images):
33
+ return '\n\n'.join([image_to_mdstring(image) for image in images])
34
+
35
+
36
+ def mmqa_display(question, target_size=768):
37
+ question = {k.lower() if len(k) > 1 else k: v for k, v in question.items()}
38
+ keys = list(question.keys())
39
+ keys = [k for k in keys if k not in ['index', 'image']]
40
+
41
+ idx = question.pop('index', 'XXX')
42
+ text = f'\n- INDEX: {idx}\n'
43
+
44
+ images = question.pop('image')
45
+ if images[0] == '[' and images[-1] == ']':
46
+ images = eval(images)
47
+ else:
48
+ images = [images]
49
+
50
+ qtext = question.pop('question', None)
51
+ if qtext is not None:
52
+ text += f'- QUESTION: {qtext}\n'
53
+
54
+ if 'A' in question:
55
+ text += f'- Choices: \n'
56
+ for k in string.ascii_uppercase:
57
+ if k in question:
58
+ text += f'\t-{k}: {question.pop(k)}\n'
59
+ answer = question.pop('answer', None)
60
+
61
+ for k in question:
62
+ if not pd.isna(question[k]):
63
+ text += f'- {k.upper()}. {question[k]}\n'
64
+
65
+ if answer is not None:
66
+ text += f'- ANSWER: {answer}\n'
67
+
68
+ image_md = images_to_md(images)
69
+
70
+ return text, image_md
71
+
72
+
73
+ def parse_args():
74
+ parser = argparse.ArgumentParser()
75
+ # Essential Args, Setting the Names of Datasets and Models
76
+ parser.add_argument('--port', type=int, default=7860)
77
+ args = parser.parse_args()
78
+ return args
79
+
80
+
81
+ def gradio_app_vis_dataset(port=7860):
82
+ data, loaded_obj = None, {}
83
+
84
+ def btn_submit_click(filename, ann_id):
85
+ if filename not in loaded_obj:
86
+ return filename_change(filename, ann_id)
87
+ nonlocal data
88
+ data_desc = gr.Markdown(f'Visualizing {filename}, {len(data)} samples in total. ')
89
+ if ann_id < 0 or ann_id >= len(data):
90
+ return filename, ann_id, data_desc, gr.Markdown('Invalid Index'), gr.Markdown(f'Index out of range [0, {len(data) - 1}]')
91
+ item = data.iloc[ann_id]
92
+ text, image_md = mmqa_display(item)
93
+ return filename, ann_id, data_desc, image_md, text
94
+
95
+ def btn_next_click(filename, ann_id):
96
+ return btn_submit_click(filename, ann_id + 1)
97
+
98
+ # def translate_click(anno_en):
99
+ # return gr.Markdown(Translator.generate(anno_en))
100
+
101
+ def filename_change(filename, ann_id):
102
+ nonlocal data, loaded_obj
103
+
104
+ def legal_filename(filename):
105
+ LMURoot = LMUDataRoot()
106
+ if filename in SUPPORTED_DATASETS:
107
+ return build_dataset(filename).data
108
+ elif osp.exists(filename):
109
+ data = load(filename)
110
+ assert 'index' in data and 'image' in data
111
+ image_map = {i: image for i, image in zip(data['index'], data['image'])}
112
+ for k, v in image_map.items():
113
+ if (not isinstance(v, str) or len(v) < 64) and v in image_map:
114
+ image_map[k] = image_map[v]
115
+ data['image'] = [image_map[k] for k in data['index']]
116
+ return data
117
+ elif osp.exists(osp.join(LMURoot, filename)):
118
+ filename = osp.join(LMURoot, filename)
119
+ return legal_filename(filename)
120
+ else:
121
+ return None
122
+
123
+ data = legal_filename(filename)
124
+ if data is None:
125
+ return filename, 0, gr.Markdown(''), gr.Markdown("File not found"), gr.Markdown("File not found")
126
+
127
+ loaded_obj[filename] = data
128
+ return btn_submit_click(filename, 0)
129
+
130
+ with gr.Blocks() as app:
131
+
132
+ filename = gr.Textbox(
133
+ value='Dataset Name (supported by VLMEvalKit) or TSV FileName (Relative under `LMURoot` or Real Path)',
134
+ label='Dataset',
135
+ interactive=True,
136
+ visible=True)
137
+
138
+ with gr.Row():
139
+ ann_id = gr.Number(0, label='Sample Index (Press Enter)', interactive=True, visible=True)
140
+ btn_next = gr.Button("Next")
141
+ # btn_translate = gr.Button('CN Translate')
142
+
143
+ with gr.Row():
144
+ data_desc = gr.Markdown('Dataset Description', label='Dataset Description')
145
+
146
+ with gr.Row():
147
+ image_output = gr.Markdown('Image PlaceHolder', label='Image Visualization')
148
+ anno_en = gr.Markdown('Image Annotation', label='Image Annotation')
149
+ # anno_cn = gr.Markdown('Image Annotation (Chinese)', label='Image Annotation (Chinese)')
150
+
151
+ input_components = [filename, ann_id]
152
+ all_components = [filename, ann_id, data_desc, image_output, anno_en]
153
+
154
+ filename.submit(filename_change, input_components, all_components)
155
+ ann_id.submit(btn_submit_click, input_components, all_components)
156
+ btn_next.click(btn_next_click, input_components, all_components)
157
+ # btn_translate.click(translate_click, anno_en, anno_cn)
158
+
159
+ # app.launch()
160
+ app.launch(server_name='0.0.0.0', debug=True, show_error=True, server_port=port)
161
+
162
+
163
+ if __name__ == "__main__":
164
+ args = parse_args()
165
+ gradio_app_vis_dataset(port=args.port)
166
+
vlmeval/VLMEvalKit_old/scripts/mmb_eval_gradio.py ADDED
@@ -0,0 +1,101 @@
1
+ from vlmeval.smp import *
2
+ from vlmeval.tools import EVAL
3
+ import gradio as gr
4
+
5
+ HEADER = """
6
+ # Welcome to MMBench👏👏
7
+ We are delighted that you are willing to submit the evaluation results to the MMBench official website! The evaluation service currently can handle submissions of MMBench, MMBench-CN, and CCBench. We use `gpt-3.5-turbo-0125` to help answer matching. Evaluation Codes in VLMEvalKit: https://github.com/open-compass/VLMEvalKit. Please adopt / follow the implementation of VLMEvalKit to generate the submission files.
8
+
9
+ The evaluation script is available at https://github.com/open-compass/VLMEvalKit/tree/main/scripts/mmb_eval_gradio.py
10
+ Please contact `[email protected]` for any inquiries about this script.
11
+ """
12
+
13
+ def upload_file(file):
14
+ file_path = file.name
15
+ return file_path
16
+
17
+ def prepare_file(file_name):
18
+ file_md5 = md5(file_name)
19
+ root = LMUDataRoot()
20
+ root = osp.join(root, 'eval_server')
21
+ os.makedirs(root, exist_ok=True)
22
+ suffix = file_name.split('.')[-1]
23
+ if suffix not in ['xlsx', 'tsv', 'csv']:
24
+ return False, "Please submit a file that ends with `.xlsx`, `.tsv`, or `.csv`"
25
+ new_file_name = osp.join(root, f'{file_md5}.{suffix}')
26
+ shutil.move(file_name, new_file_name)
27
+ eval_file = new_file_name
28
+ try:
29
+ data = load(eval_file)
30
+ except:
31
+ return False, "Your excel file can not be successfully loaded by `pd.read_excel`, please double check and submit again. "
32
+ for k in data.keys():
33
+ data[k.lower() if k not in 'ABCD' else k] = data.pop(k)
34
+ if "index" not in data:
35
+ return False, "Your excel file should have a column named `index`, please double check and submit again" , {}
36
+ if "prediction" not in data:
37
+ return False, "Your excel file should have a column named `prediction`, please double check and submit again" , {}
38
+ for ch in 'ABCD':
39
+ if ch not in data:
40
+ return False, f"Your excel file should have a column named `{ch}`, please double check and submit again" , {}
41
+ dump(data, eval_file)
42
+ return True, eval_file
43
+
44
+ def determine_dataset(eval_file):
45
+ data = load(eval_file)
46
+ def cn_ratio(data):
47
+ iscn = [cn_string(x) for x in data['question']]
48
+ return np.mean(iscn)
49
+ max_ind = np.max([int(x) for x in data['index'] if int(x) < 1e5])
50
+ if max_ind < 1000 and 'l2-category' not in data:
51
+ return 'CCBench' if cn_ratio(data) > 0.5 else "Unknown"
52
+ elif max_ind < 3000 :
53
+ return 'MMBench_CN' if cn_ratio(data) > 0.5 else "MMBench"
54
+ else:
55
+ return 'MMBench_CN_V11' if cn_ratio(data) > 0.5 else "MMBench_V11"
56
+
57
+
58
+ def reformat_acc(acc):
59
+ splits = set(acc['split'])
60
+ keys = list(acc.keys())
61
+ keys.remove('split')
62
+ nacc = {'Category': []}
63
+ for sp in splits:
64
+ nacc[sp.upper()] = []
65
+ for k in keys:
66
+ nacc['Category'].append(k)
67
+ for sp in splits:
68
+ nacc[sp.upper()].append(acc[acc['split'] == sp].iloc[0][k] * 100)
69
+ return pd.DataFrame(nacc)
70
+
71
+ def evaluate(file):
72
+ file_name = file.name
73
+ flag, eval_file = prepare_file(file_name)
74
+ if not flag:
75
+ return "Error: " + eval_file
76
+ dataset = determine_dataset(eval_file)
77
+ if dataset == 'Unknown':
78
+ return "Error: Cannot determine the dataset given your submitted file. "
79
+
80
+ eval_id = eval_file.split('/')[-1].split('.')[0]
81
+ ret = f"Evaluation ID: {eval_id}\n"
82
+ timestamp = datetime.datetime.now().strftime('%Y.%m.%d %H:%M:%S')
83
+ ret += f'Evaluation Timestamp: {timestamp}\n'
84
+ acc = EVAL(dataset, eval_file)
85
+ nacc = reformat_acc(acc).round(1)
86
+ return ret, nacc
87
+
88
+ with gr.Blocks() as demo:
89
+ gr.Markdown(HEADER)
90
+ file_output = gr.File()
91
+ upload_button = gr.UploadButton("Click to upload you prediction files for a supported benchmark")
92
+ upload_button.upload(upload_file, upload_button, file_output)
93
+
94
+ btn = gr.Button("🚀 Evaluate")
95
+ eval_log = gr.Textbox(label="Evaluation Log", placeholder="Your evaluation log will be displayed here")
96
+ df_empty = pd.DataFrame([], columns=['Evaluation Result'])
97
+ eval_result = gr.components.DataFrame(value=df_empty)
98
+ btn.click(evaluate, inputs=[file_output], outputs=[eval_log, eval_result])
99
+
100
+ if __name__ == '__main__':
101
+ demo.launch(server_name='0.0.0.0', debug=True, show_error=True)
vlmeval/VLMEvalKit_old/scripts/run.sh ADDED
@@ -0,0 +1,4 @@
1
+ #!/bin/bash
2
+ set -x
3
+ export GPU=$(nvidia-smi --list-gpus | wc -l)
4
+ torchrun --nproc-per-node=$GPU run.py ${@:1}
vlmeval/VLMEvalKit_old/scripts/srun.sh ADDED
@@ -0,0 +1,3 @@
1
+ #!/bin/bash
2
+ set -x
3
+ srun -n1 --ntasks-per-node=1 --partition $1 --gres=gpu:8 --quotatype=reserved --job-name vlmeval --cpus-per-task=64 torchrun --nproc-per-node=8 run.py ${@:2}
vlmeval/VLMEvalKit_old/scripts/visualize.ipynb ADDED
@@ -0,0 +1,266 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "metadata": {},
7
+ "outputs": [],
8
+ "source": [
9
+ "import json\n",
10
+ "import copy as cp\n",
11
+ "import numpy as np\n",
12
+ "import matplotlib.pyplot as plt\n",
13
+ "import matplotlib.font_manager as fm\n",
14
+ "\n",
15
+ "def download_file(url, filename=None):\n",
16
+ " from urllib.request import urlretrieve\n",
17
+ " if filename is None:\n",
18
+ " filename = url.split('/')[-1]\n",
19
+ " urlretrieve(url, filename)\n",
20
+ "\n",
21
+ "font_URL = 'http://opencompass.openxlab.space/utils/Fonts/segoepr.ttf'\n",
22
+ "download_file(font_URL)\n",
23
+ "\n",
24
+ "font12 = fm.FontProperties(fname='segoepr.ttf', size=12)\n",
25
+ "font15 = fm.FontProperties(fname='segoepr.ttf', size=15, weight='bold')\n",
26
+ "font18 = fm.FontProperties(fname='segoepr.ttf', size=18, weight='bold')\n",
27
+ "\n",
28
+ "DATA_URL = 'http://opencompass.openxlab.space/utils/OpenVLM.json'\n",
29
+ "download_file(DATA_URL)"
30
+ ]
31
+ },
32
+ {
33
+ "cell_type": "code",
34
+ "execution_count": null,
35
+ "metadata": {},
36
+ "outputs": [],
37
+ "source": [
38
+ "def pre_normalize(raw_data, labels):\n",
39
+ " data_list = cp.deepcopy(raw_data)\n",
40
+ " minimum, maximum, max_range, range_map = {}, {}, 0, {}\n",
41
+ " for lb in labels:\n",
42
+ " minimum[lb] = min([x[lb] for x in data_list])\n",
43
+ " maximum[lb] = max([x[lb] for x in data_list])\n",
44
+ " max_range = max(max_range, maximum[lb] - minimum[lb])\n",
45
+ " max_range *= 1.25\n",
46
+ " for lb in labels:\n",
47
+ " mid = (minimum[lb] + maximum[lb]) / 2\n",
48
+ " new_range = (mid - max_range / 2, mid + max_range / 2) if (mid + max_range / 2) < 100 else (100 - max_range, 100)\n",
49
+ " range_map[lb] = new_range\n",
50
+ " for item in data_list:\n",
51
+ " assert new_range[0] <= item[lb] <= new_range[1]\n",
52
+ " item[lb] = (item[lb] - new_range[0]) / max_range * 100\n",
53
+ " return data_list, range_map\n",
54
+ "\n",
55
+ "# solve the problem that some benchmark score is too high and out of range\n",
56
+ "def log_normalize(raw_data, labels):\n",
57
+ " data_list = cp.deepcopy(raw_data)\n",
58
+ " minimum, maximum, max_range, range_map = {}, {}, 0, {}\n",
59
+ " for lb in labels:\n",
60
+ " minimum[lb] = min([np.log(x[lb]) for x in data_list])\n",
61
+ " maximum[lb] = max([np.log(x[lb]) for x in data_list])\n",
62
+ " max_range = max(max_range, maximum[lb] - minimum[lb])\n",
63
+ " max_range *= 1.005\n",
64
+ " for lb in labels:\n",
65
+ " mid = (minimum[lb] + maximum[lb]) / 2\n",
66
+ " new_range = (mid - max_range / 2, mid + max_range / 2) if (mid + max_range / 2) < 100 else (100 - max_range, 100)\n",
67
+ " range_map[lb] = new_range\n",
68
+ " for item in data_list:\n",
69
+ " assert new_range[0] <= np.log(item[lb]) <= new_range[1]\n",
70
+ " item[lb] = (np.log(item[lb]) - new_range[0]) / max_range * 100\n",
71
+ " return data_list, range_map"
72
+ ]
73
+ },
74
+ {
75
+ "cell_type": "code",
76
+ "execution_count": null,
77
+ "metadata": {},
78
+ "outputs": [],
79
+ "source": [
80
+ "# Draw MMBench Radar Graph\n",
81
+ "data = json.loads(open('OpenVLM.json').read())['results']\n",
82
+ "models = list(data)\n",
83
+ "print(models)\n",
84
+ "\n",
85
+ "# model2vis = [\n",
86
+ "# 'GPT-4v (detail: low)', 'GeminiProVision', 'Qwen-VL-Plus', \n",
87
+ "# 'InternLM-XComposer2-VL', 'LLaVA-v1.5-13B', 'CogVLM-17B-Chat',\n",
88
+ "# 'mPLUG-Owl2', 'Qwen-VL-Chat', 'IDEFICS-80B-Instruct'\n",
89
+ "# ]\n",
90
+ "\n",
91
+ "model2vis = [\n",
92
+ " # 'GPT-4v (detail: low)', 'GeminiProVision', 'InternLM-XComposer2-VL', \n",
93
+ " 'GPT-4v (1106, detail-low)', 'Gemini-1.0-Pro', 'Gemini-1.5-Pro', #'Gemini-1.5-Flash', 'Qwen-VL-Plus', \n",
94
+ " 'InternLM-XComposer2', 'LLaVA-v1.5-13B', 'CogVLM-17B-Chat',\n",
95
+ " 'mPLUG-Owl2', 'Qwen-VL-Chat', 'IDEFICS-80B-Instruct'\n",
96
+ "]\n",
97
+ "\n",
98
+ "colors = [\n",
99
+ " '#1f77b4', '#ff7f0e', '#2ca02c', '#d62728', '#9467bd', '#8c564b', \n",
100
+ " '#e377c2', '#7f7f7f', '#bcbd22'\n",
101
+ "]"
102
+ ]
103
+ },
104
+ {
105
+ "cell_type": "code",
106
+ "execution_count": null,
107
+ "metadata": {},
108
+ "outputs": [],
109
+ "source": [
110
+ "from collections import defaultdict\n",
111
+ "\n",
112
+ "split = 'MMBench_TEST_EN'\n",
113
+ "# data_sub = {k: v[split] for k, v in data.items()}\n",
114
+ "data_sub = {k: defaultdict(int, v)[split] for k, v in data.items()}\n",
115
+ "# solve the problem that some model lack the evaluation of MMBench_TEST_EN\n",
116
+ "\n",
117
+ "labels = list(data_sub[model2vis[0]])\n",
118
+ "labels.remove('Overall')\n",
119
+ "num_vars = len(labels)\n",
120
+ "\n",
121
+ "raw_data = [data_sub[m] for m in model2vis]\n",
122
+ "data_list, range_map = pre_normalize(raw_data, labels)\n",
123
+ "\n",
124
+ "alpha = 0.25\n",
125
+ "angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()\n",
126
+ "angles_deg = np.linspace(0, 360, num_vars, endpoint=False).tolist()\n",
127
+ "fig, ax_base = plt.subplots(nrows=1, ncols=1, figsize=(10, 10), subplot_kw=dict(polar=True))\n",
128
+ "\n",
129
+ "for i in range(len(data_list)):\n",
130
+ " item = data_list[i]\n",
131
+ " model_name = model2vis[i]\n",
132
+ " color = colors[i]\n",
133
+ " tmp_angles = angles[:] + [angles[0]]\n",
134
+ " tmp_values = [item[lb] for lb in labels] + [item[labels[0]]]\n",
135
+ " ax_base.plot(tmp_angles, tmp_values, color=color, linewidth=1, linestyle='solid', label=model_name)\n",
136
+ " ax_base.fill(tmp_angles, tmp_values, color=color, alpha=alpha)\n",
137
+ " \n",
138
+ "angles += [angles[0]]\n",
139
+ "ax_base.set_ylim(0, 100)\n",
140
+ "ax_base.set_yticks([40, 60, 80, 100])\n",
141
+ "ax_base.set_yticklabels([''] * 4)\n",
142
+ "\n",
143
+ "ax_base.tick_params(pad=25)\n",
144
+ "ax_base.set_xticks(angles[:-1])\n",
145
+ "ax_base.set_xticklabels(labels, fontproperties=font18)\n",
146
+ "\n",
147
+ "leg = ax_base.legend(loc='center right', bbox_to_anchor=(1.6, 0.5), prop=font15, ncol=1, frameon=True, labelspacing=1.2)\n",
148
+ "for line in leg.get_lines():\n",
149
+ " line.set_linewidth(2.5)\n",
150
+ "\n",
151
+ "cx, cy, sz = 0.44, 0.435, 0.34\n",
152
+ "axes = [fig.add_axes([cx - sz, cy - sz, cx + sz, cy + sz], projection='polar', label='axes%d' % i) for i in range(num_vars)]\n",
153
+ " \n",
154
+ "for ax, angle, label in zip(axes, angles_deg, labels):\n",
155
+ " ax.patch.set_visible(False)\n",
156
+ " ax.grid(False)\n",
157
+ " ax.xaxis.set_visible(False)\n",
158
+ " cur_range = range_map[label]\n",
159
+ " label_list = [cur_range[0] + (cur_range[1] - cur_range[0]) / 5 * i for i in range(2, 6)]\n",
160
+ " label_list = [f'{x:.1f}' for x in label_list]\n",
161
+ " ax.set_rgrids(range(40, 120, 20), angle=angle, labels=label_list, font_properties=font12)\n",
162
+ " ax.spines['polar'].set_visible(False)\n",
163
+ " ax.set_ylim(0, 100)\n",
164
+ "\n",
165
+ "title_text = f'{len(model2vis)} Representative VLMs on MMBench Test.'\n",
166
+ "plt.figtext(.7, .95, title_text, fontproperties=font18, ha='center')\n",
167
+ "plt.show()"
168
+ ]
169
+ },
170
+ {
171
+ "cell_type": "code",
172
+ "execution_count": null,
173
+ "metadata": {},
174
+ "outputs": [],
175
+ "source": [
176
+ "labels = ['SEEDBench_IMG', 'CCBench', 'MMBench_TEST_EN', 'MMBench_TEST_CN', 'MME', 'MMVet', 'MMMU_VAL', 'MathVista', 'HallusionBench', 'LLaVABench']\n",
177
+ "num_vars = len(labels)\n",
178
+ "\n",
179
+ "raw_data = [{k: data[m][k]['Overall'] for k in labels} for m in model2vis]\n",
180
+ "data_list, range_map = pre_normalize(raw_data, labels)\n",
181
+ "\n",
182
+ "alpha = 0.25\n",
183
+ "angles = np.linspace(0, 2 * np.pi, num_vars, endpoint=False).tolist()\n",
184
+ "angles_deg = np.linspace(0, 360, num_vars, endpoint=False).tolist()\n",
185
+ "fig, ax_base = plt.subplots(nrows=1, ncols=1, figsize=(10, 10), subplot_kw=dict(polar=True))\n",
186
+ "\n",
187
+ "for i in range(len(data_list)):\n",
188
+ " item = data_list[i]\n",
189
+ " model_name = model2vis[i]\n",
190
+ " color = colors[i]\n",
191
+ " tmp_angles = angles[:] + [angles[0]]\n",
192
+ " tmp_values = [item[lb] for lb in labels] + [item[labels[0]]]\n",
193
+ " ax_base.plot(tmp_angles, tmp_values, color=color, linewidth=1, linestyle='solid', label=model_name)\n",
194
+ " ax_base.fill(tmp_angles, tmp_values, color=color, alpha=alpha)\n",
195
+ " \n",
196
+ "angles += [angles[0]]\n",
197
+ "ax_base.set_ylim(0, 100)\n",
198
+ "ax_base.set_yticks([40, 60, 80, 100])\n",
199
+ "ax_base.set_yticklabels([''] * 4)\n",
200
+ "\n",
201
+ "ax_base.tick_params(pad=15)\n",
202
+ "ax_base.set_xticks(angles[:-1])\n",
203
+ "ax_base.set_xticklabels(labels, fontproperties=font18)\n",
204
+ "\n",
205
+ "dataset_map = {\n",
206
+ " 'MMBench_TEST_EN': 'MMBench (Test)', \n",
207
+ " 'MMBench_TEST_CN': 'MMBenchCN (Test)', \n",
208
+ " 'MathVista': 'MathVista (TestMini)', \n",
209
+ " 'MMMU_VAL': 'MMMU (Val)'\n",
210
+ "}\n",
211
+ "for i, label in enumerate(ax_base.get_xticklabels()):\n",
212
+ " x,y = label.get_position()\n",
213
+ " text = label.get_text()\n",
214
+ " text = dataset_map[text] if text in dataset_map else text\n",
215
+ " lab = ax_base.text(x, y, text, transform=label.get_transform(),\n",
216
+ " ha=label.get_ha(), va=label.get_va(), font_properties=font15)\n",
217
+ " lab.set_rotation(360 / num_vars * i + 270)\n",
218
+ " labels.append(lab)\n",
219
+ "ax_base.set_xticklabels([])\n",
220
+ "\n",
221
+ "leg = ax_base.legend(loc='center right', bbox_to_anchor=(1.6, 0.5), prop=font15, ncol=1, frameon=True, labelspacing=1.2)\n",
222
+ "for line in leg.get_lines():\n",
223
+ " line.set_linewidth(2.5)\n",
224
+ "\n",
225
+ "cx, cy, sz = 0.44, 0.435, 0.34\n",
226
+ "axes = [fig.add_axes([cx - sz, cy - sz, cx + sz, cy + sz], projection='polar', label='axes%d' % i) for i in range(num_vars)]\n",
227
+ " \n",
228
+ "for ax, angle, label in zip(axes, angles_deg, labels):\n",
229
+ " ax.patch.set_visible(False)\n",
230
+ " ax.grid(False)\n",
231
+ " ax.xaxis.set_visible(False)\n",
232
+ " cur_range = range_map[label]\n",
233
+ " label_list = [cur_range[0] + (cur_range[1] - cur_range[0]) / 5 * i for i in range(2, 6)]\n",
234
+ " label_list = [f'{x:.1f}' for x in label_list]\n",
235
+ " ax.set_rgrids(range(40, 120, 20), angle=angle, labels=label_list, font_properties=font12)\n",
236
+ " ax.spines['polar'].set_visible(False)\n",
237
+ " ax.set_ylim(0, 100)\n",
238
+ "\n",
239
+ "title_text = f'{len(model2vis)} Representative VLMs on {num_vars} Benchmarks in OpenCompass Multi-Modal Leaderboard.'\n",
240
+ "plt.figtext(.7, .95, title_text, fontproperties=font18, ha='center')\n",
241
+ "plt.show()"
242
+ ]
243
+ }
244
+ ],
245
+ "metadata": {
246
+ "kernelspec": {
247
+ "display_name": "base",
248
+ "language": "python",
249
+ "name": "python3"
250
+ },
251
+ "language_info": {
252
+ "codemirror_mode": {
253
+ "name": "ipython",
254
+ "version": 3
255
+ },
256
+ "file_extension": ".py",
257
+ "mimetype": "text/x-python",
258
+ "name": "python",
259
+ "nbconvert_exporter": "python",
260
+ "pygments_lexer": "ipython3",
261
+ "version": "3.8.5"
262
+ }
263
+ },
264
+ "nbformat": 4,
265
+ "nbformat_minor": 2
266
+ }
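Note: the radar-chart cells above call a `pre_normalize(raw_data, labels)` helper defined in an earlier cell of `visualize.ipynb` (not part of this hunk). Based on how its outputs are used (`range_map[label]` drives the per-spoke tick labels and the plotted values live in 0-100), a minimal sketch of such a helper could look as follows; the padding factor and exact scaling are assumptions, not the notebook's actual implementation.

```python
# Hypothetical sketch of the pre_normalize contract assumed by the cells above:
# per-label linear rescaling of the raw scores into [0, 100], plus the (lo, hi)
# range that each spoke's tick labels are derived from.
def pre_normalize(raw_data, labels, pad=0.1):
    data_list, range_map = [], {}
    for lb in labels:
        vals = [item[lb] for item in raw_data]
        lo, hi = min(vals), max(vals)
        span = (hi - lo) or 1.0
        range_map[lb] = (lo - pad * span, hi + pad * span)
    for item in raw_data:
        data_list.append({
            lb: 100 * (item[lb] - range_map[lb][0]) / (range_map[lb][1] - range_map[lb][0])
            for lb in labels
        })
    return data_list, range_map
```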
vlmeval/VLMEvalKit_old/vlmeval/api/__init__.py ADDED
@@ -0,0 +1,26 @@
1
+ from .gpt import OpenAIWrapper, GPT4V
2
+ from .hf_chat_model import HFChatModel
3
+ from .gemini import GeminiWrapper, GeminiProVision
4
+ from .qwen_vl_api import QwenVLWrapper, QwenVLAPI, Qwen2VLAPI
5
+ from .qwen_api import QwenAPI
6
+ from .claude import Claude_Wrapper, Claude3V
7
+ from .reka import Reka
8
+ from .glm_vision import GLMVisionAPI
9
+ from .cloudwalk import CWWrapper
10
+ from .sensechat_vision import SenseChatVisionAPI
11
+ from .siliconflow import SiliconFlowAPI, TeleMMAPI
12
+ from .hunyuan import HunyuanVision
13
+ from .bailingmm import bailingMMAPI
14
+ from .bluelm_v_api import BlueLMWrapper, BlueLM_V_API
15
+ from .jt_vl_chat import JTVLChatAPI
16
+ from .taiyi import TaiyiAPI
17
+
18
+
19
+ __all__ = [
20
+ 'OpenAIWrapper', 'HFChatModel', 'GeminiWrapper', 'GPT4V',
21
+ 'GeminiProVision', 'QwenVLWrapper', 'QwenVLAPI', 'QwenAPI',
22
+ 'Claude3V', 'Claude_Wrapper', 'Reka', 'GLMVisionAPI',
23
+ 'CWWrapper', 'SenseChatVisionAPI', 'HunyuanVision', 'Qwen2VLAPI',
24
+ 'BlueLMWrapper', 'BlueLM_V_API', 'JTVLChatAPI', 'bailingMMAPI',
25
+ 'TaiyiAPI', 'TeleMMAPI', 'SiliconFlowAPI'
26
+ ]
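All wrappers exported above share the message contract defined by `BaseAPI` (added below): a plain string, a list of strings (file paths are auto-detected via `parse_file`), or a list of `{'type': ..., 'value': ...}` dicts. A minimal usage sketch, assuming `GOOGLE_API_KEY` is set for the Gemini wrapper chosen here and that `/path/to/image.jpg` is a local image:

```python
# Minimal sketch: any wrapper from __all__ accepts the same message formats.
from vlmeval.api import GeminiProVision

model = GeminiProVision(model='gemini-1.5-flash')
assert model.working()  # quick 'hello' round-trip implemented in BaseAPI

# list-of-strings form: image paths are auto-detected
print(model.generate(['/path/to/image.jpg', 'What is shown in this picture?']))

# explicit list-of-dicts form, as produced by BaseAPI.preproc_content
message = [dict(type='image', value='/path/to/image.jpg'),
           dict(type='text', value='What is shown in this picture?')]
print(model.generate(message))
```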
vlmeval/VLMEvalKit_old/vlmeval/api/bailingmm.py ADDED
@@ -0,0 +1,90 @@
1
+ import base64
2
+ from vlmeval.smp import *
3
+ from vlmeval.api.base import BaseAPI
4
+ from vlmeval.dataset import DATASET_TYPE
5
+ from vlmeval.smp.vlm import encode_image_file_to_base64
6
+ import time
7
+
8
+
9
+ class bailingMMWrapper(BaseAPI):
10
+
11
+ is_api: bool = True
12
+
13
+ def __init__(self,
14
+ model: str,
15
+ retry: int = 5,
16
+ wait: int = 5,
17
+ key: str = None,
18
+ verbose: bool = True,
19
+ system_prompt: str = None,
20
+ max_tokens: int = 1024,
21
+ proxy: str = None,
22
+ **kwargs):
23
+
24
+ self.model = model
25
+ self.fail_msg = 'Failed to obtain answer via bailingMM API.'
26
+ if key is None:
27
+ key = os.environ.get('BAILINGMM_API_KEY', None)
28
+ assert key is not None, ('Please set the API Key for bailingMM.')
29
+ self.key = key
30
+ self.headers = {"Content-Type": "application/json"}
31
+ super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
32
+
33
+ def image_to_base64(self, image_path):
34
+ with open(image_path, 'rb') as image_file:
35
+ encoded_string = str(base64.b64encode(image_file.read()), 'utf-8')
36
+ return encoded_string
37
+
38
+ def prepare_inputs(self, inputs):
39
+ msgs = cp.deepcopy(inputs)
40
+ content = []
41
+ for i, msg in enumerate(msgs):
42
+ if msg['type'] == 'text':
43
+ pass
44
+ else:
45
+ try:
46
+ image_data = self.image_to_base64(msg['value'])
47
+ except Exception as e:
48
+ if self.verbose:
49
+ self.logger.error(e)
50
+ image_data = ''
51
+ msg['value'] = image_data
52
+ content.append(msg)
53
+ return content
54
+
55
+ def generate_inner(self, inputs, **kwargs) -> str:
56
+ assert isinstance(inputs, str) or isinstance(inputs, list)
57
+ start = time.time()
58
+ inputs = [inputs] if isinstance(inputs, str) else inputs
59
+
60
+ messages = self.prepare_inputs(inputs)
61
+
62
+ service_url = "https://bailingchat.alipay.com/api/proxy/eval/antgmm/completions"
63
+
64
+ payload = {
65
+ "structInput": messages,
66
+ "sk": self.key,
67
+ "timeout": 180000
68
+ }
69
+ response = requests.post(service_url, headers=self.headers, json=payload)
70
+ if self.verbose:
71
+ self.logger.info('Time for requesting is:')
72
+ self.logger.info(time.time() - start)
73
+ try:
74
+ assert response.status_code == 200
75
+ output = json.loads(response.text)
76
+ answer = output['preds']['pred']
77
+ if self.verbose:
78
+ self.logger.info(f'inputs: {inputs}\nanswer: {answer}')
79
+ return 0, answer, 'Succeeded! '
80
+ except Exception as e:
81
+ if self.verbose:
82
+ self.logger.error(e)
83
+ self.logger.error(f'The input messages are {inputs}.')
84
+ return -1, self.fail_msg, ''
85
+
86
+
87
+ class bailingMMAPI(bailingMMWrapper):
88
+
89
+ def generate(self, message, dataset=None):
90
+ return super(bailingMMAPI, self).generate(message, dataset=dataset)
vlmeval/VLMEvalKit_old/vlmeval/api/base.py ADDED
@@ -0,0 +1,289 @@
1
+ import time
2
+ import random as rd
3
+ from abc import abstractmethod
4
+ import os.path as osp
5
+ import copy as cp
6
+ from ..smp import get_logger, parse_file, concat_images_vlmeval, LMUDataRoot, md5, decode_base64_to_image_file
7
+
8
+
9
+ class BaseAPI:
10
+
11
+ allowed_types = ['text', 'image']
12
+ INTERLEAVE = True
13
+ INSTALL_REQ = False
14
+
15
+ def __init__(self,
16
+ retry=10,
17
+ wait=3,
18
+ system_prompt=None,
19
+ verbose=True,
20
+ fail_msg='Failed to obtain answer via API.',
21
+ **kwargs):
22
+ """Base Class for all APIs.
23
+
24
+ Args:
25
+ retry (int, optional): The retry times for `generate_inner`. Defaults to 10.
26
+ wait (int, optional): The wait time after each failed retry of `generate_inner`. Defaults to 3.
27
+ system_prompt (str, optional): Defaults to None.
28
+ verbose (bool, optional): Defaults to True.
29
+ fail_msg (str, optional): The message to return when failed to obtain answer.
30
+ Defaults to 'Failed to obtain answer via API.'.
31
+ **kwargs: Other kwargs for `generate_inner`.
32
+ """
33
+
34
+ self.wait = wait
35
+ self.retry = retry
36
+ self.system_prompt = system_prompt
37
+ self.verbose = verbose
38
+ self.fail_msg = fail_msg
39
+ self.logger = get_logger('ChatAPI')
40
+
41
+ if len(kwargs):
42
+ self.logger.info(f'BaseAPI received the following kwargs: {kwargs}')
43
+ self.logger.info('Will try to use them as kwargs for `generate`. ')
44
+ self.default_kwargs = kwargs
45
+
46
+ @abstractmethod
47
+ def generate_inner(self, inputs, **kwargs):
48
+ """The inner function to generate the answer.
49
+
50
+ Returns:
51
+ tuple(int, str, str): ret_code, response, log
52
+ """
53
+ self.logger.warning('For BaseAPI, generate_inner is an abstract method. ')
54
+ assert 0, 'generate_inner not defined'
55
+ ret_code, answer, log = None, None, None
56
+ # if ret_code is 0, means succeed
57
+ return ret_code, answer, log
58
+
59
+ def working(self):
60
+ """If the API model is working, return True, else return False.
61
+
62
+ Returns:
63
+ bool: If the API model is working, return True, else return False.
64
+ """
65
+ self.old_timeout = None
66
+ if hasattr(self, 'timeout'):
67
+ self.old_timeout = self.timeout
68
+ self.timeout = 120
69
+
70
+ retry = 5
71
+ while retry > 0:
72
+ ret = self.generate('hello')
73
+ if ret is not None and ret != '' and self.fail_msg not in ret:
74
+ if self.old_timeout is not None:
75
+ self.timeout = self.old_timeout
76
+ return True
77
+ retry -= 1
78
+
79
+ if self.old_timeout is not None:
80
+ self.timeout = self.old_timeout
81
+ return False
82
+
83
+ def check_content(self, msgs):
84
+ """Check the content type of the input. Four types are allowed: str, dict, liststr, listdict.
85
+
86
+ Args:
87
+ msgs: Raw input messages.
88
+
89
+ Returns:
90
+ str: The message type.
91
+ """
92
+ if isinstance(msgs, str):
93
+ return 'str'
94
+ if isinstance(msgs, dict):
95
+ return 'dict'
96
+ if isinstance(msgs, list):
97
+ types = [self.check_content(m) for m in msgs]
98
+ if all(t == 'str' for t in types):
99
+ return 'liststr'
100
+ if all(t == 'dict' for t in types):
101
+ return 'listdict'
102
+ return 'unknown'
103
+
104
+ def preproc_content(self, inputs):
105
+ """Convert the raw input messages to a list of dicts.
106
+
107
+ Args:
108
+ inputs: raw input messages.
109
+
110
+ Returns:
111
+ list(dict): The preprocessed input messages. Will return None if failed to preprocess the input.
112
+ """
113
+ if self.check_content(inputs) == 'str':
114
+ return [dict(type='text', value=inputs)]
115
+ elif self.check_content(inputs) == 'dict':
116
+ assert 'type' in inputs and 'value' in inputs
117
+ return [inputs]
118
+ elif self.check_content(inputs) == 'liststr':
119
+ res = []
120
+ for s in inputs:
121
+ mime, pth = parse_file(s)
122
+ if mime is None or mime == 'unknown':
123
+ res.append(dict(type='text', value=s))
124
+ else:
125
+ res.append(dict(type=mime.split('/')[0], value=pth))
126
+ return res
127
+ elif self.check_content(inputs) == 'listdict':
128
+ for item in inputs:
129
+ assert 'type' in item and 'value' in item
130
+ mime, s = parse_file(item['value'])
131
+ if mime is None:
132
+ assert item['type'] == 'text', item['value']
133
+ else:
134
+ assert mime.split('/')[0] == item['type']
135
+ item['value'] = s
136
+ return inputs
137
+ else:
138
+ return None
139
+
140
+ # May exceed the context window size, so try with different turn numbers.
141
+ def chat_inner(self, inputs, **kwargs):
142
+ _ = kwargs.pop('dataset', None)
143
+ while len(inputs):
144
+ try:
145
+ return self.generate_inner(inputs, **kwargs)
146
+ except Exception as e:
147
+ if self.verbose:
148
+ self.logger.info(f'{type(e)}: {e}')
149
+ inputs = inputs[1:]
150
+ while len(inputs) and inputs[0]['role'] != 'user':
151
+ inputs = inputs[1:]
152
+ continue
153
+ return -1, self.fail_msg + ': ' + 'Failed with all possible conversation turns.', None
154
+
155
+ def chat(self, messages, **kwargs1):
156
+ """The main function for multi-turn chatting. Will call `chat_inner` with the preprocessed input messages."""
157
+ assert hasattr(self, 'chat_inner'), 'The API model should have the `chat_inner` method. '
158
+ for msg in messages:
159
+ assert isinstance(msg, dict) and 'role' in msg and 'content' in msg, msg
160
+ assert self.check_content(msg['content']) in ['str', 'dict', 'liststr', 'listdict'], msg
161
+ msg['content'] = self.preproc_content(msg['content'])
162
+ # merge kwargs
163
+ kwargs = cp.deepcopy(self.default_kwargs)
164
+ kwargs.update(kwargs1)
165
+
166
+ answer = None
167
+ # a very small random delay [0s - 0.5s]
168
+ T = rd.random() * 0.5
169
+ time.sleep(T)
170
+
171
+ assert messages[-1]['role'] == 'user'
172
+
173
+ for i in range(self.retry):
174
+ try:
175
+ ret_code, answer, log = self.chat_inner(messages, **kwargs)
176
+ if ret_code == 0 and self.fail_msg not in answer and answer != '':
177
+ if self.verbose:
178
+ print(answer)
179
+ return answer
180
+ elif self.verbose:
181
+ if not isinstance(log, str):
182
+ try:
183
+ log = log.text
184
+ except Exception as e:
185
+ self.logger.warning(f'Failed to parse {log} as an http response: {str(e)}. ')
186
+ self.logger.info(f'RetCode: {ret_code}\nAnswer: {answer}\nLog: {log}')
187
+ except Exception as err:
188
+ if self.verbose:
189
+ self.logger.error(f'An error occurred during try {i}: ')
190
+ self.logger.error(f'{type(err)}: {err}')
191
+ # delay before each retry
192
+ T = rd.random() * self.wait * 2
193
+ time.sleep(T)
194
+
195
+ return self.fail_msg if answer in ['', None] else answer
196
+
197
+ def preprocess_message_with_role(self, message):
198
+ system_prompt = ''
199
+ new_message = []
200
+
201
+ for data in message:
202
+ assert isinstance(data, dict)
203
+ role = data.pop('role', 'user')
204
+ if role == 'system':
205
+ system_prompt += data['value'] + '\n'
206
+ else:
207
+ new_message.append(data)
208
+
209
+ if system_prompt != '':
210
+ if self.system_prompt is None:
211
+ self.system_prompt = system_prompt
212
+ else:
213
+ self.system_prompt += '\n' + system_prompt
214
+ return new_message
215
+
216
+ def generate(self, message, **kwargs1):
217
+ """The main function to generate the answer. Will call `generate_inner` with the preprocessed input messages.
218
+
219
+ Args:
220
+ message: raw input messages.
221
+
222
+ Returns:
223
+ str: The generated answer, or the Failed Message if failed to obtain answer.
224
+ """
225
+ if self.check_content(message) == 'listdict':
226
+ message = self.preprocess_message_with_role(message)
227
+
228
+ assert self.check_content(message) in ['str', 'dict', 'liststr', 'listdict'], f'Invalid input type: {message}'
229
+ message = self.preproc_content(message)
230
+ assert message is not None and self.check_content(message) == 'listdict'
231
+ for item in message:
232
+ assert item['type'] in self.allowed_types, f'Invalid input type: {item["type"]}'
233
+
234
+ # merge kwargs
235
+ kwargs = cp.deepcopy(self.default_kwargs)
236
+ kwargs.update(kwargs1)
237
+
238
+ answer = None
239
+ # a very small random delay [0s - 0.5s]
240
+ T = rd.random() * 0.5
241
+ time.sleep(T)
242
+
243
+ for i in range(self.retry):
244
+ try:
245
+ ret_code, answer, log = self.generate_inner(message, **kwargs)
246
+ if ret_code == 0 and self.fail_msg not in answer and answer != '':
247
+ if self.verbose:
248
+ print(answer)
249
+ return answer
250
+ elif self.verbose:
251
+ if not isinstance(log, str):
252
+ try:
253
+ log = log.text
254
+ except Exception as e:
255
+ self.logger.warning(f'Failed to parse {log} as an http response: {str(e)}. ')
256
+ self.logger.info(f'RetCode: {ret_code}\nAnswer: {answer}\nLog: {log}')
257
+ except Exception as err:
258
+ if self.verbose:
259
+ self.logger.error(f'An error occured during try {i}: ')
260
+ self.logger.error(f'{type(err)}: {err}')
261
+ # delay before each retry
262
+ T = rd.random() * self.wait * 2
263
+ time.sleep(T)
264
+
265
+ return self.fail_msg if answer in ['', None] else answer
266
+
267
+ def message_to_promptimg(self, message, dataset=None):
268
+ assert not self.INTERLEAVE
269
+ model_name = self.__class__.__name__
270
+ import warnings
271
+ warnings.warn(
272
+ f'Model {model_name} does not support interleaved input. '
273
+ 'Will use the first image and aggregated texts as prompt. ')
274
+ num_images = len([x for x in message if x['type'] == 'image'])
275
+ if num_images == 0:
276
+ prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
277
+ image = None
278
+ elif num_images == 1:
279
+ prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
280
+ image = [x['value'] for x in message if x['type'] == 'image'][0]
281
+ else:
282
+ prompt = '\n'.join([x['value'] if x['type'] == 'text' else '<image>' for x in message])
283
+ if dataset == 'BLINK':
284
+ image = concat_images_vlmeval(
285
+ [x['value'] for x in message if x['type'] == 'image'],
286
+ target_size=512)
287
+ else:
288
+ image = [x['value'] for x in message if x['type'] == 'image'][0]
289
+ return prompt, image
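`BaseAPI.generate` handles message preprocessing, retries with random back-off, and failure messages, so a concrete wrapper only has to implement `generate_inner` and return the `(ret_code, answer, log)` triple. Below is a minimal sketch of such a subclass; the endpoint URL and response schema are placeholders, not a real service.

```python
# Sketch of a BaseAPI subclass: only generate_inner is implemented; retries,
# preprocessing and back-off come from the base class. The URL is a placeholder.
import json
import requests
from vlmeval.api.base import BaseAPI


class ToyAPIWrapper(BaseAPI):
    is_api: bool = True

    def __init__(self, url='https://example.com/v1/chat', **kwargs):
        self.url = url
        super().__init__(**kwargs)

    def generate_inner(self, inputs, **kwargs):
        # inputs is a list of dicts such as {'type': 'text'/'image', 'value': ...}
        prompt = '\n'.join(x['value'] for x in inputs if x['type'] == 'text')
        resp = requests.post(self.url, json=dict(prompt=prompt), timeout=60)
        ret_code = 0 if 200 <= resp.status_code < 300 else resp.status_code
        try:
            answer = json.loads(resp.text)['answer']
        except Exception:
            answer = self.fail_msg
        return ret_code, answer, resp
```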
vlmeval/VLMEvalKit_old/vlmeval/api/claude.py ADDED
@@ -0,0 +1,111 @@
1
+ from vlmeval.smp import *
2
+ from vlmeval.api.base import BaseAPI
3
+ from time import sleep
4
+ import base64
5
+ import mimetypes
6
+ from PIL import Image
7
+
8
+ url = 'https://openxlab.org.cn/gw/alles-apin-hub/v1/claude/v1/text/chat'
9
+ headers = {
10
+ 'alles-apin-token': '',
11
+ 'Content-Type': 'application/json'
12
+ }
13
+
14
+
15
+ class Claude_Wrapper(BaseAPI):
16
+
17
+ is_api: bool = True
18
+
19
+ def __init__(self,
20
+ model: str = 'claude-3-opus-20240229',
21
+ key: str = None,
22
+ retry: int = 10,
23
+ wait: int = 3,
24
+ system_prompt: str = None,
25
+ verbose: bool = True,
26
+ temperature: float = 0,
27
+ max_tokens: int = 1024,
28
+ **kwargs):
29
+
30
+ self.model = model
31
+ self.headers = headers
32
+ self.temperature = temperature
33
+ self.max_tokens = max_tokens
34
+ if key is not None:
35
+ self.key = key
36
+ else:
37
+ self.key = os.environ.get('ALLES', '')
38
+ self.headers['alles-apin-token'] = self.key
39
+
40
+ super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)
41
+
42
+ # inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
43
+ # content can be a string or a list of image & text
44
+ def prepare_itlist(self, inputs):
45
+ assert np.all([isinstance(x, dict) for x in inputs])
46
+ has_images = np.sum([x['type'] == 'image' for x in inputs])
47
+ if has_images:
48
+ content_list = []
49
+ for msg in inputs:
50
+ if msg['type'] == 'text' and msg['value'] != '':
51
+ content_list.append(dict(type='text', text=msg['value']))
52
+ elif msg['type'] == 'image':
53
+ pth = msg['value']
54
+ suffix = osp.splitext(pth)[-1].lower()
55
+ media_type = mimetypes.types_map.get(suffix, None)
56
+ assert media_type is not None
57
+
58
+ content_list.append(dict(
59
+ type='image',
60
+ source={
61
+ 'type': 'base64',
62
+ 'media_type': media_type,
63
+ 'data': encode_image_file_to_base64(pth, target_size=4096)
64
+ }))
65
+ else:
66
+ assert all([x['type'] == 'text' for x in inputs])
67
+ text = '\n'.join([x['value'] for x in inputs])
68
+ content_list = [dict(type='text', text=text)]
69
+ return content_list
70
+
71
+ def prepare_inputs(self, inputs):
72
+ input_msgs = []
73
+ assert isinstance(inputs, list) and isinstance(inputs[0], dict)
74
+ assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
75
+ if 'role' in inputs[0]:
76
+ assert inputs[-1]['role'] == 'user', inputs[-1]
77
+ for item in inputs:
78
+ input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
79
+ else:
80
+ input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
81
+ return input_msgs
82
+
83
+ def generate_inner(self, inputs, **kwargs) -> str:
84
+
85
+ payload = json.dumps({
86
+ 'model': self.model,
87
+ 'max_tokens': self.max_tokens,
88
+ 'messages': self.prepare_inputs(inputs),
89
+ 'system': self.system_prompt,
90
+ **kwargs
91
+ })
92
+ response = requests.request('POST', url, headers=headers, data=payload)
93
+ ret_code = response.status_code
94
+ ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
95
+ answer = self.fail_msg
96
+
97
+ try:
98
+ resp_struct = json.loads(response.text)
99
+ answer = resp_struct['data']['content'][0]['text'].strip()
100
+ except Exception as err:
101
+ if self.verbose:
102
+ self.logger.error(f'{type(err)}: {err}')
103
+ self.logger.error(response.text if hasattr(response, 'text') else response)
104
+
105
+ return ret_code, answer, response
106
+
107
+
108
+ class Claude3V(Claude_Wrapper):
109
+
110
+ def generate(self, message, dataset=None):
111
+ return super(Claude_Wrapper, self).generate(message)
vlmeval/VLMEvalKit_old/vlmeval/api/cloudwalk.py ADDED
@@ -0,0 +1,107 @@
1
+ from ..smp import *
2
+ import os
3
+ from .base import BaseAPI
4
+
5
+
6
+ class CWWrapper(BaseAPI):
7
+
8
+ is_api: bool = True
9
+
10
+ def __init__(self,
11
+ model: str = 'cw-congrong-v1.5',
12
+ retry: int = 10,
13
+ wait: int = 5,
14
+ key: str = None,
15
+ verbose: bool = True,
16
+ system_prompt: str = None,
17
+ temperature: float = 0,
18
+ timeout: int = 600,
19
+ api_base: str = 'http://cwapi-vlm01.cw_rb.azurebot.tk/v1/chat/completions',
20
+ max_tokens: int = 1024,
21
+ img_size: int = 512,
22
+ img_detail: str = 'low',
23
+ **kwargs):
24
+
25
+ self.model = model
26
+ self.cur_idx = 0
27
+ self.fail_msg = 'Failed to obtain answer via API. '
28
+ self.max_tokens = max_tokens
29
+ self.temperature = temperature
30
+
31
+ base = os.environ.get('CW_API_BASE', None)
32
+ self.api_base = base if base is not None else api_base
33
+
34
+ env_key = os.environ.get('CW_API_KEY', None)
35
+ self.key = env_key if env_key is not None else key
36
+ assert self.key is not None, 'API key not provided. Please set CW_API_KEY environment variable or \
37
+ pass it to the constructor.'
38
+
39
+ assert img_size > 0 or img_size == -1
40
+ self.img_size = -1 # always send the full-size image
41
+ assert img_detail in ['high', 'low']
42
+ self.img_detail = img_detail
43
+
44
+ self.vision = True
45
+ self.timeout = timeout
46
+
47
+ super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
48
+
49
+ # inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
50
+ # content can be a string or a list of image & text
51
+ def prepare_inputs(self, inputs):
52
+ input_msgs = []
53
+ if self.system_prompt is not None:
54
+ input_msgs.append(dict(role='system', content=self.system_prompt))
55
+ has_images = np.sum([x['type'] == 'image' for x in inputs])
56
+ if has_images:
57
+ content_list = []
58
+ for msg in inputs:
59
+ if msg['type'] == 'text':
60
+ content_list.append(dict(type='text', text=msg['value']))
61
+ elif msg['type'] == 'image':
62
+ from PIL import Image
63
+ img = Image.open(msg['value'])
64
+ b64 = encode_image_to_base64(img, target_size=self.img_size)
65
+ img_struct = dict(url=f"data:image/jpeg;base64,{b64}", detail=self.img_detail)
66
+ content_list.append(dict(type='image_url', image_url=img_struct))
67
+ input_msgs.append(dict(role='user', content=content_list))
68
+ else:
69
+ assert all([x['type'] == 'text' for x in inputs])
70
+ text = '\n'.join([x['value'] for x in inputs])
71
+ input_msgs.append(dict(role='user', content=text))
72
+ return input_msgs
73
+
74
+ def generate_inner(self, inputs, **kwargs) -> str:
75
+ input_msgs = self.prepare_inputs(inputs)
76
+ temperature = kwargs.pop('temperature', self.temperature)
77
+ max_tokens = kwargs.pop('max_tokens', self.max_tokens)
78
+
79
+ if 0 < max_tokens <= 100:
80
+ self.logger.warning(
81
+ 'Less than 100 tokens left, '
82
+ 'may exceed the context window with some additional meta symbols. '
83
+ )
84
+ if max_tokens <= 0:
85
+ return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. '
86
+
87
+ headers = {'Content-Type': 'application/json', 'Authorization': f'{self.key}'}
88
+ payload = dict(
89
+ model=self.model,
90
+ messages=input_msgs,
91
+ max_tokens=max_tokens,
92
+ n=1,
93
+ temperature=temperature,
94
+ **kwargs)
95
+ response = requests.post(self.api_base, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
96
+ ret_code = response.status_code
97
+ ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
98
+ answer = self.fail_msg
99
+ try:
100
+ resp_struct = json.loads(response.text)
101
+ answer = resp_struct['choices'][0]['message']['content'].strip()
102
+ except Exception as err:
103
+ if self.verbose:
104
+ self.logger.error(f'{type(err)}: {err}')
105
+ self.logger.error(response.text if hasattr(response, 'text') else response)
106
+
107
+ return ret_code, answer, response
vlmeval/VLMEvalKit_old/vlmeval/api/gemini.py ADDED
@@ -0,0 +1,119 @@
1
+ from vlmeval.smp import *
2
+ from vlmeval.api.base import BaseAPI
3
+
4
+ headers = 'Content-Type: application/json'
5
+
6
+
7
+ class GeminiWrapper(BaseAPI):
8
+
9
+ is_api: bool = True
10
+
11
+ def __init__(self,
12
+ model: str = 'gemini-1.0-pro',
13
+ retry: int = 5,
14
+ wait: int = 5,
15
+ key: str = None,
16
+ verbose: bool = True,
17
+ temperature: float = 0.0,
18
+ system_prompt: str = None,
19
+ max_tokens: int = 1024,
20
+ proxy: str = None,
21
+ backend='genai',
22
+ project_id='vlmeval',
23
+ **kwargs):
24
+
25
+ assert model in ['gemini-1.0-pro', 'gemini-1.5-pro', 'gemini-1.5-flash']
26
+
27
+ self.model = model
28
+ self.fail_msg = 'Failed to obtain answer via API. '
29
+ self.max_tokens = max_tokens
30
+ self.temperature = temperature
31
+ if key is None:
32
+ key = os.environ.get('GOOGLE_API_KEY', None)
33
+ # Try to load backend from environment variable
34
+ be = os.environ.get('GOOGLE_API_BACKEND', None)
35
+ if be is not None and be in ['genai', 'vertex']:
36
+ backend = be
37
+
38
+ assert backend in ['genai', 'vertex']
39
+ if backend == 'genai':
40
+ # We have not evaluated Gemini-1.5 w. GenAI backend
41
+ assert key is not None # Vertex does not require API Key
42
+
43
+ self.backend = backend
44
+ self.project_id = project_id
45
+ self.api_key = key
46
+
47
+ if proxy is not None:
48
+ proxy_set(proxy)
49
+ super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
50
+
51
+ def build_msgs_genai(self, inputs):
52
+ messages = [] if self.system_prompt is None else [self.system_prompt]
53
+ for inp in inputs:
54
+ if inp['type'] == 'text':
55
+ messages.append(inp['value'])
56
+ elif inp['type'] == 'image':
57
+ messages.append(Image.open(inp['value']))
58
+ return messages
59
+
60
+ def build_msgs_vertex(self, inputs):
61
+ from vertexai.generative_models import Part, Image
62
+ messages = [] if self.system_prompt is None else [self.system_prompt]
63
+ for inp in inputs:
64
+ if inp['type'] == 'text':
65
+ messages.append(inp['value'])
66
+ elif inp['type'] == 'image':
67
+ messages.append(Part.from_image(Image.load_from_file(inp['value'])))
68
+ return messages
69
+
70
+ def generate_inner(self, inputs, **kwargs) -> str:
71
+ if self.backend == 'genai':
72
+ import google.generativeai as genai
73
+ assert isinstance(inputs, list)
74
+ pure_text = np.all([x['type'] == 'text' for x in inputs])
75
+ genai.configure(api_key=self.api_key)
76
+
77
+ if pure_text and self.model == 'gemini-1.0-pro':
78
+ model = genai.GenerativeModel('gemini-1.0-pro')
79
+ else:
80
+ assert self.model in ['gemini-1.5-pro', 'gemini-1.5-flash']
81
+ model = genai.GenerativeModel(self.model)
82
+
83
+ messages = self.build_msgs_genai(inputs)
84
+ gen_config = dict(max_output_tokens=self.max_tokens, temperature=self.temperature)
85
+ gen_config.update(kwargs)
86
+ try:
87
+ answer = model.generate_content(
88
+ messages,
89
+ generation_config=genai.types.GenerationConfig(**gen_config)).text
90
+ return 0, answer, 'Succeeded! '
91
+ except Exception as err:
92
+ if self.verbose:
93
+ self.logger.error(f'{type(err)}: {err}')
94
+ self.logger.error(f'The input messages are {inputs}.')
95
+
96
+ return -1, '', ''
97
+ elif self.backend == 'vertex':
98
+ import vertexai
99
+ from vertexai.generative_models import GenerativeModel
100
+ vertexai.init(project=self.project_id, location='us-central1')
101
+ model_name = 'gemini-1.0-pro-vision' if self.model == 'gemini-1.0-pro' else self.model
102
+ model = GenerativeModel(model_name=model_name)
103
+ messages = self.build_msgs_vertex(inputs)
104
+ try:
105
+ resp = model.generate_content(messages)
106
+ answer = resp.text
107
+ return 0, answer, 'Succeeded! '
108
+ except Exception as err:
109
+ if self.verbose:
110
+ self.logger.error(f'{type(err)}: {err}')
111
+ self.logger.error(f'The input messages are {inputs}.')
112
+
113
+ return -1, '', ''
114
+
115
+
116
+ class GeminiProVision(GeminiWrapper):
117
+
118
+ def generate(self, message, dataset=None):
119
+ return super(GeminiProVision, self).generate(message)
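`GeminiWrapper` selects its backend from the `backend` argument (or the `GOOGLE_API_BACKEND` environment variable): `genai` requires a `GOOGLE_API_KEY`, while `vertex` authenticates through the Google Cloud project passed as `project_id`. A short sketch of both paths; the key and project id below are placeholders.

```python
# Sketch of the two Gemini backends; key and project id below are placeholders.
import os
from vlmeval.api import GeminiProVision

# genai backend: needs an API key
os.environ['GOOGLE_API_KEY'] = '<your-google-api-key>'
genai_model = GeminiProVision(model='gemini-1.5-flash', backend='genai')

# vertex backend: relies on ambient gcloud credentials for the given project
vertex_model = GeminiProVision(model='gemini-1.5-pro', backend='vertex', project_id='<gcp-project>')

print(genai_model.generate([dict(type='text', value='Name one multimodal benchmark.')]))
```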
vlmeval/VLMEvalKit_old/vlmeval/api/glm_vision.py ADDED
@@ -0,0 +1,95 @@
1
+ import requests
2
+ requests.packages.urllib3.disable_warnings()
3
+
4
+ from vlmeval.smp import *
5
+ from vlmeval.api.base import BaseAPI
6
+ from vlmeval.dataset import DATASET_TYPE
7
+ from vlmeval.smp.vlm import encode_image_file_to_base64
8
+
9
+
10
+ class GLMVisionWrapper(BaseAPI):
11
+
12
+ is_api: bool = True
13
+
14
+ def __init__(self,
15
+ model: str,
16
+ retry: int = 5,
17
+ wait: int = 5,
18
+ key: str = None,
19
+ verbose: bool = True,
20
+ system_prompt: str = None,
21
+ max_tokens: int = 4096,
22
+ proxy: str = None,
23
+ **kwargs):
24
+
25
+ self.model = model
26
+ self.fail_msg = 'Failed to obtain answer via API. '
27
+ self.default_params = {
28
+ 'top_k': 1,
29
+ 'best_of': 1,
30
+ 'do_sample': False,
31
+ 'stream': False,
32
+ 'max_tokens': max_tokens,
33
+ "skip_moderation": True
34
+ }
35
+ if key is None:
36
+ key = os.environ.get('GLMV_API_KEY', None)
37
+ assert key is not None, (
38
+ 'Please set the API Key (obtain it here: '
39
+ 'https://open.bigmodel.cn/dev/howuse/introduction)'
40
+ )
41
+ self.key = key
42
+ super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
43
+
44
+ def build_msgs(self, msgs_raw, system_prompt=None, dataset=None):
45
+ msgs = cp.deepcopy(msgs_raw)
46
+ content = []
47
+ for i, msg in enumerate(msgs):
48
+ if msg['type'] == 'text':
49
+ content.append(dict(type='text', text=msg['value']))
50
+ elif msg['type'] == 'image':
51
+ content.append(dict(type='image_url', image_url=dict(url=encode_image_file_to_base64(msg['value']))))
52
+ if dataset in {'HallusionBench', 'POPE'}:
53
+ content.append(dict(type="text", text="Please answer yes or no."))
54
+ ret = [dict(role='user', content=content)]
55
+ return ret
56
+
57
+ def generate_inner(self, inputs, **kwargs) -> str:
58
+ assert isinstance(inputs, str) or isinstance(inputs, list)
59
+ inputs = [inputs] if isinstance(inputs, str) else inputs
60
+
61
+ messages = self.build_msgs(msgs_raw=inputs, dataset=kwargs.get('dataset', None))
62
+
63
+ url = 'https://api.chatglm.cn/v1/chat/completions'
64
+ headers = {
65
+ 'Content-Type': 'application/json',
66
+ 'Request-Id': 'remote-test',
67
+ 'Authorization': f'Bearer {self.key}'
68
+ }
69
+ payload = {
70
+ 'model': self.model,
71
+ 'messages': messages,
72
+ **self.default_params
73
+ }
74
+ response = requests.post(url, headers=headers, data=json.dumps(payload), verify=False)
75
+ output = []
76
+ try:
77
+ assert response.status_code == 200
78
+ for line in response.iter_lines():
79
+ data = json.loads(line.decode('utf-8').lstrip('data: '))
80
+ output.append(data['choices'][0]['message']['content'])
81
+ answer = ''.join(output).replace('</s>', '')
82
+ if self.verbose:
83
+ self.logger.info(f'inputs: {inputs}\nanswer: {answer}')
84
+ return 0, answer, 'Succeeded! '
85
+ except Exception as err:
86
+ if self.verbose:
87
+ self.logger.error(f'{type(err)}: {err}')
88
+ self.logger.error(f'The input messages are {inputs}.')
89
+ return -1, self.fail_msg, ''
90
+
91
+
92
+ class GLMVisionAPI(GLMVisionWrapper):
93
+
94
+ def generate(self, message, dataset=None):
95
+ return super(GLMVisionAPI, self).generate(message, dataset=dataset)
vlmeval/VLMEvalKit_old/vlmeval/api/hf_chat_model.py ADDED
@@ -0,0 +1,246 @@
1
+ import os
2
+ import sys
3
+ import os.path as osp
4
+ import torch
5
+ from ..smp import *
6
+
7
+
8
+ def get_gpu_num(model_name):
9
+ model_name = model_name.lower()
10
+ kws = {
11
+ 8: ['65b', '70b'],
12
+ 4: ['30b', '33b', '35b', '40b'],
13
+ 2: ['13b', '14b', '20b'],
14
+ 1: ['6b', '7b', 'moss'],
15
+ }
16
+ for k in [8, 4, 2, 1]:
17
+ for keyword in kws[k]:
18
+ if keyword in model_name:
19
+ return k
20
+ return 8
21
+
22
+
23
+ validated_llms = [
24
+ 'internlm/internlm-chat-7b', 'internlm/internlm-chat-7b-8k', 'internlm/internlm-chat-20b',
25
+ 'Qwen/Qwen-7B-Chat', 'Qwen/Qwen-14B-Chat',
26
+ 'THUDM/chatglm2-6b', 'THUDM/chatglm2-6b-32k', 'THUDM/chatglm3-6b', 'THUDM/chatglm3-6b-32k',
27
+ 'baichuan-inc/Baichuan2-7B-Chat', 'baichuan-inc/Baichuan2-13B-Chat',
28
+ 'lmsys/vicuna-7b-v1.5', 'lmsys/vicuna-13b-v1.5',
29
+ 'meta-llama/Llama-2-7b-chat-hf'
30
+ ]
31
+ Auto_model = ['chatglm']
32
+
33
+
34
+ class HFChatModel:
35
+
36
+ def _get_context_length(self, model, model_path):
37
+ # By default, we use model.config.seq_length
38
+ model_path = model_path.lower()
39
+ if 'baichuan' in model_path:
40
+ context_window = model.config.model_max_length
41
+ elif 'internlm' in model_path or 'llama' in model_path:
42
+ context_window = model.config.max_position_embeddings
43
+ elif 'vicuna' in model_path:
44
+ context_window = model.generation_config.max_length
45
+ else:
46
+ # chatglm & qwen
47
+ context_window = model.config.seq_length
48
+ return context_window
49
+
50
+ def _get_context_length_robust(self, model, model_path):
51
+ try:
52
+ context_window = self._get_context_length(model, model_path)
53
+ return context_window
54
+ except Exception as err:
55
+ self.logger.critical(f'{type(err)}: {err}')
56
+ self.logger.critical(
57
+ 'Failed to extract context_window information from config / generation_config. '
58
+ 'Please read the above code and check if the logic works for you model path'
59
+ )
60
+ raise NotImplementedError
61
+
62
+ def __init__(self,
63
+ model_path,
64
+ system_prompt: str = None,
65
+ **kwargs):
66
+
67
+ self.logger = get_logger('HFChatModel')
68
+ if 'vicuna' in model_path.lower():
69
+ try:
70
+ from fastchat.model import get_conversation_template
71
+ except Exception as err:
72
+ self.logger.critical('Please install fastchat first to use vicuna. ')
73
+ raise err
74
+
75
+ self.explicit_device = kwargs.pop('device', None)
76
+
77
+ if self.explicit_device is None:
78
+ # If CUDA_VISIBLE_DEVICES is not properly set
79
+ if 'CUDA_VISIBLE_DEVICES' not in os.environ or os.environ['CUDA_VISIBLE_DEVICES'] == '0,1,2,3,4,5,6,7':
80
+ num_gpu = get_gpu_num(model_path)
81
+ gpu_offset = kwargs.pop('gpu_offset', 0)
82
+ cuda_visible_devices = ','.join([str(i) for i in range(gpu_offset, gpu_offset + num_gpu)])
83
+ os.environ['CUDA_VISIBLE_DEVICES'] = cuda_visible_devices
84
+
85
+ from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel
86
+ from transformers.generation import GenerationConfig
87
+
88
+ if model_path not in validated_llms:
89
+ self.logger.warning(f'{model_path} not in validated LLMs, may have inference troubles. ')
90
+
91
+ self.model_path = model_path
92
+ if listinstr(Auto_model, model_path):
93
+ LoadModel = AutoModel
94
+ else:
95
+ LoadModel = AutoModelForCausalLM
96
+
97
+ assert osp.exists(model_path) or len(model_path.split('/')) == 2
98
+
99
+ device = self.explicit_device if self.explicit_device else 'auto'
100
+
101
+ precision = {}
102
+ if 'internlm-chat-7b' in model_path:
103
+ precision = {'torch_dtype': torch.float16}
104
+ elif 'internlm-chat-20b' in model_path:
105
+ precision = {'torch_dtype': torch.bfloat16}
106
+
107
+ self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True)
108
+ model = LoadModel.from_pretrained(model_path, trust_remote_code=True, device_map='cpu', **precision)
109
+ model = model.eval()
110
+
111
+ if device != 'cpu':
112
+ model = model.to(f'cuda:{device}' if isinstance(device, int) else 'cuda')
113
+ try:
114
+ model.generation_config = GenerationConfig.from_pretrained(
115
+ model_path, trust_remote_code=True, device_map=device)
116
+ except Exception as err:
117
+ self.logger.warning(f'{type(err)}: {err}')
118
+
119
+ torch.cuda.empty_cache()
120
+ self.model = model
121
+ self.context_length = self._get_context_length_robust(model=model, model_path=model_path)
122
+ self.answer_buffer = 192
123
+ self.system_prompt = system_prompt
124
+ for k, v in kwargs.items():
125
+ self.logger.info(f'Following args will be used for generation (If not set specifically), {k}: {v}. ')
126
+ self.kwargs = kwargs
127
+
128
+ def generate_str(self, input, **kwargs):
129
+ if 'baichuan' in self.model_path.lower():
130
+ messages = []
131
+ messages.append({'role': 'user', 'content': input})
132
+ resp = self.model.chat(self.tokenizer, messages, **kwargs)
133
+ elif 'vicuna' in self.model_path.lower():
134
+ from fastchat.model import get_conversation_template
135
+ conv = get_conversation_template('vicuna')
136
+ conv.append_message(conv.roles[0], input)
137
+ conv.append_message(conv.roles[1], None)
138
+ prompt = conv.get_prompt()
139
+ inputs = self.tokenizer([prompt], return_tensors='pt')
140
+ if torch.cuda.is_available():
141
+ for k in inputs:
142
+ inputs[k] = inputs[k].cuda()
143
+
144
+ params = dict(do_sample=True, temperature=0.7, repetition_penalty=1.0, max_new_tokens=512)
145
+ params.update(self.kwargs)
146
+ params.update(kwargs)
147
+ outputs = self.model.generate(**inputs, **params)
148
+ resp = self.tokenizer.decode(
149
+ outputs[0][len(inputs['input_ids'][0]):],
150
+ skip_special_tokens=True,
151
+ spaces_between_special_tokens=False)
152
+
153
+ else:
154
+ params = self.kwargs
155
+ params.update(kwargs)
156
+ resp, _ = self.model.chat(self.tokenizer, input, history=[], **params)
157
+
158
+ return resp
159
+
160
+ def length_ok(self, inputs):
161
+ tot = len(self.tokenizer.encode(self.system_prompt)) if self.system_prompt is not None else 0
162
+ for s in inputs:
163
+ tot += len(self.tokenizer.encode(s))
164
+ return tot + self.answer_buffer < self.context_length
165
+
166
+ def generate_list(self, full_inputs, offset=0, **kwargs):
167
+ assert isinstance(full_inputs, list)
168
+
169
+ inputs = full_inputs[offset:]
170
+ if not self.length_ok(inputs):
171
+ return self.chat(full_inputs, offset + 1)
172
+
173
+ model_path = self.model_path.lower()
174
+
175
+ if sum([x in model_path for x in ['baichuan']]):
176
+ input_msgs = []
177
+ if self.system_prompt is not None:
178
+ input_msgs.append(dict(role='user', content=self.system_prompt))
179
+ if len(inputs):
180
+ assert isinstance(inputs, list) and isinstance(inputs[0], str)
181
+ roles = ['user', 'assistant'] if len(inputs) % 2 == 1 else ['assistant', 'user']
182
+ roles = roles * len(inputs)
183
+ for role, msg in zip(roles, inputs):
184
+ input_msgs.append(dict(role=role, content=msg))
185
+ response = self.model.chat(self.tokenizer, input_msgs)
186
+ elif sum([x in model_path for x in ['vicuna']]):
187
+ from fastchat.model import get_conversation_template
188
+ conv = get_conversation_template('vicuna')
189
+ assert isinstance(inputs, list) and isinstance(inputs[0], str)
190
+ if len(inputs) % 2 == 1:
191
+ if self.system_prompt is not None:
192
+ conv.append_message(conv.roles[0], self.system_prompt)
193
+ for i in range(len(inputs) // 2):
194
+ conv.append_message(conv.roles[0], inputs[2 * i])
195
+ conv.append_message(conv.roles[1], inputs[2 * i + 1])
196
+ else:
197
+ assert self.system_prompt is not None
198
+ conv.append_message(conv.roles[0], self.system_prompt)
199
+ conv.append_message(conv.roles[1], inputs[0])
200
+ for i in range(len(inputs) // 2 - 1):
201
+ conv.append_message(conv.roles[0], inputs[2 * i + 1])
202
+ conv.append_message(conv.roles[1], inputs[2 * i + 2])
203
+ conv.append_message(conv.roles[0], inputs[-1])
204
+ conv.append_message(conv.roles[1], None)
205
+ prompt = conv.get_prompt()
206
+ inputs = self.tokenizer([prompt], return_tensors='pt')
207
+ if torch.cuda.is_available():
208
+ for k in inputs:
209
+ inputs[k] = inputs[k].cuda()
210
+
211
+ params = dict(do_sample=True, temperature=0.7, repetition_penalty=1.0, max_new_tokens=512)
212
+ params.update(self.kwargs)
213
+ params.update(kwargs)
214
+
215
+ outputs = self.model.generate(**inputs, **params)
216
+ response = self.tokenizer.decode(
217
+ outputs[0][len(inputs['input_ids'][0]):],
218
+ skip_special_tokens=True,
219
+ spaces_between_special_tokens=False)
220
+ response = response.lstrip('\n')
221
+ else:
222
+ # The default option, support internlm, chatglm, qwen
223
+ history, msg = [], None
224
+ if len(inputs) % 2 == 1:
225
+ if self.system_prompt is not None:
226
+ history = [(self.system_prompt, '')]
227
+ for i in range(len(inputs) // 2):
228
+ history.append((inputs[2 * i], inputs[2 * i + 1]))
229
+ else:
230
+ assert self.system_prompt is not None
231
+ history = [(self.system_prompt, inputs[0])]
232
+ for i in range(len(inputs) // 2 - 1):
233
+ history.append((inputs[2 * i + 1], inputs[2 * i + 2]))
234
+ msg = inputs[-1]
235
+
236
+ params = self.kwargs
237
+ params.update(kwargs)
238
+ response, _ = self.model.chat(self.tokenizer, msg, history=history, **params)
239
+
240
+ return response, offset
241
+
242
+ def generate(self, inputs, **kwargs):
243
+ if isinstance(inputs, str):
244
+ return self.generate_str(inputs, **kwargs)
245
+ elif isinstance(inputs, list):
246
+ return self.generate_list(inputs, **kwargs)
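`HFChatModel.generate` dispatches on the input type: a plain string goes through `generate_str`, while a list of alternating user/assistant turns goes through `generate_list`, which drops the earliest turns whenever the tokenized history plus the answer buffer would exceed the model's context window. A short usage sketch; the model path is one of the validated LLMs above, and downloading its weights is assumed to be possible in your environment.

```python
# Sketch: single-turn vs. multi-turn use of HFChatModel (model weights must be available).
from vlmeval.api import HFChatModel

llm = HFChatModel('internlm/internlm-chat-7b', system_prompt='You are a helpful assistant.')

# Single-turn: a plain string returns just the response text.
print(llm.generate('Give me one sentence about radar charts.'))

# Multi-turn: an odd-length list of alternating user/assistant turns ending with the new query;
# returns (response, offset), where offset counts how many early turns were dropped.
history = ['Hi!', 'Hello, how can I help?', 'Summarize our chat in five words.']
answer, offset = llm.generate(history)
print(answer)
```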
vlmeval/VLMEvalKit_old/vlmeval/api/hunyuan.py ADDED
@@ -0,0 +1,147 @@
1
+ from vlmeval.smp import *
2
+ import os
3
+ import sys
4
+ from vlmeval.api.base import BaseAPI
5
+
6
+
7
+ class HunyuanWrapper(BaseAPI):
8
+
9
+ is_api: bool = True
10
+ _apiVersion = '2023-09-01'
11
+ _service = 'hunyuan'
12
+
13
+ def __init__(self,
14
+ model: str = 'hunyuan-vision',
15
+ retry: int = 5,
16
+ wait: int = 5,
17
+ secret_key: str = None,
18
+ secret_id: str = None,
19
+ verbose: bool = True,
20
+ system_prompt: str = None,
21
+ temperature: float = 0,
22
+ timeout: int = 60,
23
+ api_base: str = 'hunyuan.tencentcloudapi.com',
24
+ **kwargs):
25
+
26
+ self.model = model
27
+ self.cur_idx = 0
28
+ self.fail_msg = 'Failed to obtain answer via API. '
29
+ self.temperature = temperature
30
+
31
+ warnings.warn('You may need to set the env variable HUNYUAN_SECRET_ID & HUNYUAN_SECRET_KEY to use Hunyuan. ')
32
+
33
+ secret_key = os.environ.get('HUNYUAN_SECRET_KEY', secret_key)
34
+ assert secret_key is not None, 'Please set the environment variable HUNYUAN_SECRET_KEY. '
35
+ secret_id = os.environ.get('HUNYUAN_SECRET_ID', secret_id)
36
+ assert secret_id is not None, 'Please set the environment variable HUNYUAN_SECRET_ID. '
37
+
38
+ self.model = model
39
+ self.endpoint = api_base
40
+ self.secret_id = secret_id
41
+ self.secret_key = secret_key
42
+ self.timeout = timeout
43
+
44
+ try:
45
+ from tencentcloud.common import credential
46
+ from tencentcloud.common.profile.client_profile import ClientProfile
47
+ from tencentcloud.common.profile.http_profile import HttpProfile
48
+ from tencentcloud.hunyuan.v20230901 import hunyuan_client
49
+ except ImportError as err:
50
+ self.logger.critical('Please install tencentcloud-sdk-python to use Hunyuan API. ')
51
+ raise err
52
+
53
+ super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
54
+
55
+ cred = credential.Credential(self.secret_id, self.secret_key)
56
+ httpProfile = HttpProfile()
57
+ httpProfile.endpoint = self.endpoint
58
+ clientProfile = ClientProfile()
59
+ clientProfile.httpProfile = httpProfile
60
+ self.client = hunyuan_client.HunyuanClient(cred, 'ap-beijing', clientProfile)
61
+ self.logger.info(
62
+ f'Using Endpoint: {self.endpoint}; API Secret ID: {self.secret_id}; API Secret Key: {self.secret_key}'
63
+ )
64
+
65
+ # inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
66
+ # content can be a string or a list of image & text
67
+ def prepare_itlist(self, inputs):
68
+ assert np.all([isinstance(x, dict) for x in inputs])
69
+ has_images = np.sum([x['type'] == 'image' for x in inputs])
70
+ if has_images:
71
+ content_list = []
72
+ for msg in inputs:
73
+ if msg['type'] == 'text':
74
+ content_list.append(dict(Type='text', Text=msg['value']))
75
+ elif msg['type'] == 'image':
76
+ from PIL import Image
77
+ img = Image.open(msg['value'])
78
+ b64 = encode_image_to_base64(img)
79
+ img_struct = dict(Url=f'data:image/jpeg;base64,{b64}')
80
+ content_list.append(dict(Type='image_url', ImageUrl=img_struct))
81
+ else:
82
+ assert all([x['type'] == 'text' for x in inputs])
83
+ text = '\n'.join([x['value'] for x in inputs])
84
+ content_list = [dict(Type='text', Text=text)]
85
+ return content_list
86
+
87
+ def prepare_inputs(self, inputs):
88
+ input_msgs = []
89
+ if self.system_prompt is not None:
90
+ input_msgs.append(dict(Role='system', Content=self.system_prompt))
91
+ assert isinstance(inputs, list) and isinstance(inputs[0], dict)
92
+ assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
93
+ if 'role' in inputs[0]:
94
+ assert inputs[-1]['role'] == 'user', inputs[-1]
95
+ for item in inputs:
96
+ input_msgs.append(dict(Role=item['role'], Contents=self.prepare_itlist(item['content'])))
97
+ else:
98
+ input_msgs.append(dict(Role='user', Contents=self.prepare_itlist(inputs)))
99
+ return input_msgs
100
+
101
+ def generate_inner(self, inputs, **kwargs) -> str:
102
+ from tencentcloud.common.exception.tencent_cloud_sdk_exception import TencentCloudSDKException
103
+ from tencentcloud.hunyuan.v20230901 import models
104
+
105
+ input_msgs = self.prepare_inputs(inputs)
106
+ temperature = kwargs.pop('temperature', self.temperature)
107
+
108
+ payload = dict(
109
+ Model=self.model,
110
+ Messages=input_msgs,
111
+ Temperature=temperature,
112
+ **kwargs)
113
+
114
+ retry_counter = 0
115
+ while retry_counter < 3:
116
+ try:
117
+ req = models.ChatCompletionsRequest()
118
+ req.from_json_string(json.dumps(payload))
119
+ resp = self.client.ChatCompletions(req)
120
+ resp = json.loads(resp.to_json_string())
121
+ answer = resp['Choices'][0]['Message']['Content']
122
+ return 0, answer, resp
123
+ except TencentCloudSDKException as e:
124
+ self.logger.error(f'Got error code: {e.get_code()}')
125
+ if e.get_code() == 'ClientNetworkError':
126
+ return -1, self.fail_msg + e.get_code(), None
127
+ elif e.get_code() in ['InternalError', 'ServerNetworkError']:
128
+ if retry_counter == 3:
129
+ return -1, self.fail_msg + e.get_code(), None
130
+ retry_counter += 1
131
+ continue
132
+ elif e.get_code() in ['LimitExceeded']:
133
+ time.sleep(5)
134
+ if retry_counter == 3:
135
+ return -1, self.fail_msg + e.get_code(), None
136
+ retry_counter += 1
137
+ continue
138
+ else:
139
+ return -1, self.fail_msg + str(e), None
140
+
141
+ return -1, self.fail_msg, None
142
+
143
+
144
+ class HunyuanVision(HunyuanWrapper):
145
+
146
+ def generate(self, message, dataset=None):
147
+ return super(HunyuanVision, self).generate(message)
vlmeval/VLMEvalKit_old/vlmeval/api/jt_vl_chat.py ADDED
@@ -0,0 +1,239 @@
1
+ import pandas as pd
2
+ import requests
3
+ import json
4
+ import os
5
+ import base64
6
+ from vlmeval.smp import *
7
+ from vlmeval.api.base import BaseAPI
8
+ from vlmeval.dataset import DATASET_TYPE
9
+ from vlmeval.dataset import img_root_map
10
+
11
+
12
+ API_ENDPOINT = 'https://jiutian.10086.cn/kunlun/ingress/api/h3t-eeceff/92390745235a40a484d850be19e1f8b4/ai-5d7ae47ec93f4280953273c4001aafee/service-7544ea5ee3e841ad9d01e7af44acef7c/v1/chat/completions' # noqa: E501
13
+ APP_CODE = 'eyJ0eXAiOiJKV1QiLCJhbGciOiJSUzI1NiJ9.eyJzdWIiOiI5ZGQwNmQ2ZjU4YTU0ZGY0OGEzNjRhMjQyNGMwODEyNSIsImlzcyI6ImFwaS1hdXRoLWtleSIsImV4cCI6NDg4MjkwNDA3OX0.k5t_T-955xWMndzBbx4WQQNAgm5DpMos9mHm7vkFipQ3yebCFMfyufpSxORSfEVpBaDS3Nly0dd8ygQYGnDgIQcC72vQ1xtkjCP49LNcqlceoET4rGc1zwRi76XLPSGFES4GcwvEmr7Ilth7XtqZNxcDF_Z7HyHyf1-zF0JIQETYSoxenqLU-gNteNfqRUnlyCgaKh03DscAbYvtoMUxEaFa2ZqyRSwekdHI_SPKCq9aC9G19yDPHTjeiwl1ubtyC5uMy5pERn_ClRsZS3Wyb-GmD5QQsFofrWvCiU_fVJuUiez39pYZvEP8awH0R9B7SkpQ4XOzj3fdytTPYy3g6g' # noqa: E501
14
+
15
+
16
+ class JTVLChatWrapper(BaseAPI):
17
+ is_api: bool = True
18
+ INTERLEAVE = False
19
+
20
+ def __init__(self,
21
+ model: str = 'jt-vl-chat',
22
+ retry: int = 5,
23
+ wait: int = 5,
24
+ api_base: str = API_ENDPOINT,
25
+ key: str = APP_CODE,
26
+ verbose: bool = True,
27
+ system_prompt: str = None,
28
+ temperature: float = 0.7,
29
+ max_tokens: int = 256,
30
+ proxy: str = None,
31
+ **kwargs):
32
+ self.model = model
33
+
34
+ self.temperature = temperature
35
+ self.max_tokens = max_tokens
36
+ self.api_base = api_base
37
+
38
+ if key is None:
39
+ key = os.environ.get('JTVLChat_API_KEY', None)
40
+ assert key is not None, (
41
+ 'Please set the API Key (also called app_code, obtain it here: https://github.com/jiutiancv/JT-VL-Chat)'
42
+ )
43
+
44
+ self.key = key
45
+ super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
46
+
47
+ def dump_image(self, line, dataset):
48
+ """Dump the image(s) of the input line to the corresponding dataset folder.
49
+
50
+ Args:
51
+ line (line of pd.DataFrame): The raw input line.
52
+ dataset (str): The name of the dataset.
53
+
54
+ Returns:
55
+ str | list[str]: The paths of the dumped images.
56
+ """
57
+ ROOT = LMUDataRoot()
58
+ assert isinstance(dataset, str)
59
+
60
+ img_root = os.path.join(ROOT, 'images', img_root_map(dataset) if dataset in img_root_map(dataset) else dataset)
61
+ os.makedirs(img_root, exist_ok=True)
62
+ if 'image' in line:
63
+ if isinstance(line['image'], list):
64
+ tgt_path = []
65
+ assert 'image_path' in line
66
+ for img, im_name in zip(line['image'], line['image_path']):
67
+ path = osp.join(img_root, im_name)
68
+ if not read_ok(path):
69
+ decode_base64_to_image_file(img, path)
70
+ tgt_path.append(path)
71
+ else:
72
+ tgt_path = osp.join(img_root, f"{line['index']}.jpg")
73
+ if not read_ok(tgt_path):
74
+ decode_base64_to_image_file(line['image'], tgt_path)
75
+ tgt_path = [tgt_path]
76
+ else:
77
+ assert 'image_path' in line
78
+ tgt_path = toliststr(line['image_path'])
79
+
80
+ return tgt_path
81
+
82
+ def use_custom_prompt(self, dataset):
83
+ assert dataset is not None
84
+ if listinstr(['MMMU_DEV_VAL','MMMU_TEST'], dataset):
85
+ return False
86
+ else:
87
+ return True
88
+
89
+ def build_multi_choice_prompt(self, line, dataset=None):
90
+ question = line['question']
91
+ hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
92
+ if hint is not None:
93
+ question = hint + '\n' + question
94
+
95
+ options = {
96
+ cand: line[cand]
97
+ for cand in string.ascii_uppercase
98
+ if cand in line and not pd.isna(line[cand])
99
+ }
100
+ for key, item in options.items():
101
+ question += f'\n{key}. {item}'
102
+ prompt = question
103
+
104
+ if len(options):
105
+ prompt += '\n请直接回答选项字母。' if cn_string(
106
+ prompt) else "\nAnswer with the option's letter from the given choices directly."
107
+ else:
108
+ prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
109
+
110
+ return prompt
111
+
112
+ def build_prompt(self, line, dataset=None):
113
+ assert self.use_custom_prompt(dataset)
114
+ assert dataset is None or isinstance(dataset, str)
115
+
116
+ tgt_path = self.dump_image(line, dataset)
117
+
118
+ if dataset is not None and listinstr(['MME'], dataset):
119
+ question = line['question']
120
+ prompt = question + ' Answer the question using a single word or phrase.'
121
+ elif dataset is not None and listinstr(['HallusionBench'], dataset):
122
+ question = line['question']
123
+ prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
124
+ elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
125
+ prompt = self.build_multi_choice_prompt(line, dataset)
126
+ elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
127
+ if listinstr(['MathVista', 'MathVision'], dataset):
128
+ prompt = line['question']
129
+ elif listinstr(['LLaVABench'], dataset):
130
+ question = line['question']
131
+ prompt = question + '\nAnswer this question in detail.'
132
+ elif listinstr(['MMVet'], dataset):
133
+ prompt = line['question']
134
+ else:
135
+ question = line['question']
136
+ prompt = question + '\nAnswer the question using a single word or phrase.'
137
+ else:
138
+ prompt = line['question']
139
+ message = [dict(type='text', value=prompt)]
140
+ message.extend([dict(type='image', value=s) for s in tgt_path])
141
+ return message
142
+
143
+ def message_to_promptimg(self, message, dataset=None):
144
+ assert not self.INTERLEAVE
145
+ model_name = self.__class__.__name__
146
+ import warnings
147
+ warnings.warn(
148
+ f'Model {model_name} does not support interleaved input. '
149
+ 'Will use the first image and aggregated texts as prompt. ')
150
+ num_images = len([x for x in message if x['type'] == 'image'])
151
+ if num_images == 0:
152
+ prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
153
+ image = None
154
+ else:
155
+ prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
156
+ if dataset == 'BLINK':
157
+ image = concat_images_vlmeval(
158
+ [x['value'] for x in message if x['type'] == 'image'],
159
+ target_size=512)
160
+ else:
161
+ image = [x['value'] for x in message if x['type'] == 'image'][0]
162
+ return prompt, image
163
+
164
+ def get_send_data(self, prompt, image_path, temperature, max_tokens):
165
+ image = ''
166
+ with open(image_path, 'rb') as f:
167
+ image = str(base64.b64encode(f.read()), 'utf-8')
168
+ send_data = {
169
+ "messages": [
170
+ {
171
+ "role": "user",
172
+ "content": prompt
173
+ }
174
+ ],
175
+ "image_base64": image,
176
+ "max_tokens": max_tokens,
177
+ "temperature": temperature
178
+ }
179
+ return send_data
180
+
181
+ def get_send_data_no_image(self, prompt, temperature, max_tokens):
182
+ send_data = {
183
+ "messages": [
184
+ {
185
+ "role": "user",
186
+ "content": prompt
187
+ }
188
+ ],
189
+ "max_tokens": max_tokens,
190
+ "temperature": temperature
191
+ }
192
+ return send_data
193
+
194
+ def generate_inner(self, inputs, **kwargs) -> str:
195
+ assert isinstance(inputs, str) or isinstance(inputs, list)
196
+ inputs = [inputs] if isinstance(inputs, str) else inputs
197
+ dataset = kwargs.get('dataset', None)
198
+ prompt, image_path = self.message_to_promptimg(message=inputs, dataset=dataset)
199
+ # print("prompt:",prompt)
200
+ if image_path:
201
+ send_data = self.get_send_data(
202
+ prompt=prompt,
203
+ image_path=image_path,
204
+ temperature=self.temperature,
205
+ max_tokens=self.max_tokens)
206
+ else:
207
+ send_data = self.get_send_data_no_image(
208
+ prompt=prompt,
209
+ temperature=self.temperature,
210
+ max_tokens=self.max_tokens)
211
+
212
+ json_data = json.dumps(send_data)
213
+
214
+ header_dict = {'Content-Type': 'application/json', 'Authorization': 'Bearer ' + self.key}
215
+
216
+ r = requests.post(self.api_base, headers=header_dict, data=json_data, timeout=3000)
217
+ try:
218
+ assert r.status_code == 200
219
+ r_json = r.json()
220
+ output = r_json['choices'][0]['message']['content']
221
+ if self.verbose:
222
+ self.logger.info(f'inputs: {inputs}\nanswer: {output}')
223
+
224
+ return 0, output, 'Succeeded! '
225
+
226
+ except:
227
+ error_msg = f'Error! code {r.status_code} content: {r.content}'
228
+ error_con = r.content.decode('utf-8')
229
+ if self.verbose:
230
+ self.logger.error(error_msg)
231
+ self.logger.error(error_con)
232
+ self.logger.error(f'The input messages are {inputs}.')
233
+ return -1, error_msg, ''
234
+
235
+
236
+ class JTVLChatAPI(JTVLChatWrapper):
237
+
238
+ def generate(self, message, dataset=None):
239
+ return super(JTVLChatAPI, self).generate(message, dataset=dataset)
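
The JSON payload assembled by `get_send_data` above is a simple chat-completions-style body with a single base64-encoded image field. Below is a minimal standalone sketch of that request flow; the endpoint URL, API key, and image file name are placeholders rather than values taken from this upload.

```python
# Standalone sketch of the request body built by JTVLChatWrapper.get_send_data.
# API_BASE, API_KEY and the image path are placeholders for illustration only.
import base64
import json
from typing import Optional

import requests

API_BASE = "https://example.com/api/v1/chat/completions"  # placeholder endpoint
API_KEY = "YOUR_API_KEY"                                   # placeholder key


def build_payload(prompt: str, image_path: Optional[str] = None,
                  temperature: float = 0.0, max_tokens: int = 1024) -> dict:
    """One user turn plus an optional base64 image, mirroring the wrapper above."""
    payload = {
        "messages": [{"role": "user", "content": prompt}],
        "max_tokens": max_tokens,
        "temperature": temperature,
    }
    if image_path is not None:
        with open(image_path, "rb") as f:
            payload["image_base64"] = base64.b64encode(f.read()).decode("utf-8")
    return payload


if __name__ == "__main__":
    body = build_payload("Describe the image in one sentence.", "apple.jpg")
    headers = {"Content-Type": "application/json", "Authorization": f"Bearer {API_KEY}"}
    resp = requests.post(API_BASE, headers=headers, data=json.dumps(body), timeout=60)
    print(resp.status_code, resp.json()["choices"][0]["message"]["content"])
```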
vlmeval/VLMEvalKit_old/vlmeval/api/qwen_api.py ADDED
@@ -0,0 +1,75 @@
1
+ from http import HTTPStatus
2
+ import os
3
+ from vlmeval.api.base import BaseAPI
4
+ from vlmeval.smp import *
5
+
6
+
7
+ # Note: This is a pure language model API.
8
+ class QwenAPI(BaseAPI):
9
+
10
+ is_api: bool = True
11
+
12
+ def __init__(self,
13
+ model: str = 'qwen-max-1201',
14
+ retry: int = 5,
15
+ wait: int = 5,
16
+ verbose: bool = True,
17
+ seed: int = 2680,
18
+ temperature: float = 0.0,
19
+ system_prompt: str = None,
20
+ key: str = None,
21
+ max_tokens: int = 1024,
22
+ proxy: str = None,
23
+ **kwargs):
24
+
25
+ assert model in ['qwen-turbo', 'qwen-plus', 'qwen-max', 'qwen-max-1201', 'qwen-max-longcontext']
26
+ self.model = model
27
+ import dashscope
28
+ self.fail_msg = 'Failed to obtain answer via API. '
29
+ self.max_tokens = max_tokens
30
+ self.temperature = temperature
31
+ self.seed = seed
32
+ if key is None:
33
+ key = os.environ.get('DASHSCOPE_API_KEY', None)
34
+ assert key is not None, (
35
+ 'Please set the API Key (obtain it here: '
36
+ 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
37
+ )
38
+ dashscope.api_key = key
39
+ if proxy is not None:
40
+ proxy_set(proxy)
41
+ super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
42
+
43
+ @staticmethod
44
+ def build_msgs(msgs_raw, system_prompt=None):
45
+ msgs = cp.deepcopy(msgs_raw)
46
+ ret = []
47
+ if system_prompt is not None:
48
+ ret.append(dict(role='system', content=system_prompt))
49
+ for i, msg in enumerate(msgs):
50
+ role = 'user' if i % 2 == 0 else 'assistant'
51
+ ret.append(dict(role=role, content=msg))
52
+ return ret
53
+
54
+ def generate_inner(self, inputs, **kwargs) -> str:
55
+ from dashscope import MultiModalConversation
56
+ assert isinstance(inputs, str) or isinstance(inputs, list)
57
+ inputs = [inputs] if isinstance(inputs, str) else inputs
58
+ messages = self.build_msgs(msgs_raw=inputs, system_prompt=self.system_prompt)
59
+
60
+ import dashscope
61
+ response = dashscope.Generation.call(
62
+ model=self.model,
63
+ messages=messages,
64
+ seed=self.seed,
65
+ temperature=self.temperature,
66
+ max_tokens=self.max_tokens,
67
+ result_format='message', # set the result to be "message" format.
68
+ )
69
+ if response.status_code != HTTPStatus.OK:
70
+ return -1, 'Error: Bad Response Status Code. ', f'The response status code is {response.status_code}. '
71
+
72
+ try:
73
+ return 0, response['output']['choices'][0]['message']['content'].strip(), 'Succeeded! '
74
+ except Exception as err:
75
+ return -1, f'Error: Failed to parse the response. {err}', response
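
A minimal usage sketch for the text-only `QwenAPI` wrapper defined above; it assumes `dashscope` is installed, the `DASHSCOPE_API_KEY` environment variable is set, and the prompt string is purely illustrative.

```python
# Usage sketch: QwenAPI wraps a pure language model, so inputs are text only.
# Assumes `pip install dashscope` and an exported DASHSCOPE_API_KEY.
from vlmeval.api.qwen_api import QwenAPI

model = QwenAPI(model='qwen-max-1201', temperature=0.0, max_tokens=256, verbose=False)
# BaseAPI.generate retries generate_inner and returns the answer text on success.
print(model.generate('Explain in one sentence what a multimodal benchmark measures.'))
```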
vlmeval/VLMEvalKit_old/vlmeval/api/qwen_vl_api.py ADDED
@@ -0,0 +1,219 @@
1
+ from __future__ import annotations
2
+
3
+ import os
4
+ import warnings
5
+
6
+ from vlmeval.smp import *
7
+ from vlmeval.api.base import BaseAPI
8
+ from vlmeval.vlm.qwen2_vl.prompt import Qwen2VLPromptMixin
9
+
10
+
11
+ def ensure_image_url(image: str) -> str:
12
+ prefixes = ['http://', 'https://', 'file://', 'data:image;']
13
+ if any(image.startswith(prefix) for prefix in prefixes):
14
+ return image
15
+ if os.path.exists(image):
16
+ return 'file://' + image
17
+ raise ValueError(f'Invalid image: {image}')
18
+
19
+
20
+ class Qwen2VLAPI(Qwen2VLPromptMixin, BaseAPI):
21
+ is_api: bool = True
22
+
23
+ def __init__(
24
+ self,
25
+ model: str = 'qwen-vl-max-0809',
26
+ key: str | None = None,
27
+ min_pixels: int | None = None,
28
+ max_pixels: int | None = None,
29
+ max_length=1024,
30
+ top_p=0.001,
31
+ top_k=1,
32
+ temperature=0.01,
33
+ repetition_penalty=1.0,
34
+ presence_penalty=0.0,
35
+ seed=3407,
36
+ use_custom_prompt: bool = True,
37
+ **kwargs,
38
+ ):
39
+ import dashscope
40
+
41
+ self.model = model
42
+ self.min_pixels = min_pixels
43
+ self.max_pixels = max_pixels
44
+ self.generate_kwargs = dict(
45
+ max_length=max_length,
46
+ top_p=top_p,
47
+ top_k=top_k,
48
+ temperature=temperature,
49
+ repetition_penalty=repetition_penalty,
50
+ presence_penalty=presence_penalty,
51
+ seed=seed,
52
+ )
53
+
54
+ key = os.environ.get('DASHSCOPE_API_KEY', None) if key is None else key
55
+ assert key is not None, (
56
+ 'Please set the API Key (obtain it here: '
57
+ 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
58
+ )
59
+ dashscope.api_key = key
60
+ super().__init__(use_custom_prompt=use_custom_prompt, **kwargs)
61
+
62
+ def _prepare_content(self, inputs: list[dict[str, str]], dataset: str | None = None) -> list[dict[str, str]]:
63
+ """
64
+ inputs list[dict[str, str]], each dict has keys: ['type', 'value']
65
+ """
66
+ content = []
67
+ for s in inputs:
68
+ if s['type'] == 'image':
69
+ item = {'type': 'image', 'image': ensure_image_url(s['value'])}
70
+ if dataset == 'OCRBench':
71
+ item['min_pixels'] = 10 * 10 * 28 * 28
72
+ warnings.warn(f"OCRBench dataset uses custom min_pixels={item['min_pixels']}")
73
+ if self.max_pixels is not None:
74
+ item['max_pixels'] = self.max_pixels
75
+ else:
76
+ if self.min_pixels is not None:
77
+ item['min_pixels'] = self.min_pixels
78
+ if self.max_pixels is not None:
79
+ item['max_pixels'] = self.max_pixels
80
+ elif s['type'] == 'text':
81
+ item = {'type': 'text', 'text': s['value']}
82
+ else:
83
+ raise ValueError(f"Invalid message type: {s['type']}, {s}")
84
+ content.append(item)
85
+ return content
86
+
87
+ def generate_inner(self, inputs, **kwargs) -> str:
88
+ import dashscope
89
+
90
+ messages = []
91
+ if self.system_prompt is not None:
92
+ messages.append({'role': 'system', 'content': self.system_prompt})
93
+ messages.append(
94
+ {'role': 'user', 'content': self._prepare_content(inputs, dataset=kwargs.get('dataset', None))}
95
+ )
96
+ if self.verbose:
97
+ print(f'\033[31m{messages}\033[0m')
98
+
99
+ # generate
100
+ generation_kwargs = self.generate_kwargs.copy()
101
+ kwargs.pop('dataset', None)
102
+ generation_kwargs.update(kwargs)
103
+ try:
104
+ response = dashscope.MultiModalConversation.call(
105
+ model=self.model,
106
+ messages=messages,
107
+ **generation_kwargs,
108
+ )
109
+ if self.verbose:
110
+ print(response)
111
+ answer = response.output.choices[0]['message']['content'][0]['text']
112
+ return 0, answer, 'Succeeded! '
113
+ except Exception as err:
114
+ if self.verbose:
115
+ self.logger.error(f'{type(err)}: {err}')
116
+ self.logger.error(f'The input messages are {inputs}.')
117
+ return -1, '', ''
118
+
119
+
120
+ class QwenVLWrapper(BaseAPI):
121
+
122
+ is_api: bool = True
123
+
124
+ def __init__(self,
125
+ model: str = 'qwen-vl-plus',
126
+ retry: int = 5,
127
+ wait: int = 5,
128
+ key: str = None,
129
+ verbose: bool = True,
130
+ temperature: float = 0.0,
131
+ system_prompt: str = None,
132
+ max_tokens: int = 1024,
133
+ proxy: str = None,
134
+ **kwargs):
135
+
136
+ assert model in ['qwen-vl-plus', 'qwen-vl-max']
137
+ self.model = model
138
+ import dashscope
139
+ self.fail_msg = 'Failed to obtain answer via API. '
140
+ self.max_tokens = max_tokens
141
+ self.temperature = temperature
142
+ if key is None:
143
+ key = os.environ.get('DASHSCOPE_API_KEY', None)
144
+ assert key is not None, (
145
+ 'Please set the API Key (obtain it here: '
146
+ 'https://help.aliyun.com/zh/dashscope/developer-reference/vl-plus-quick-start)'
147
+ )
148
+ dashscope.api_key = key
149
+ if proxy is not None:
150
+ proxy_set(proxy)
151
+ super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
152
+
153
+ # inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
154
+ # content can be a string or a list of image & text
155
+ def prepare_itlist(self, inputs):
156
+ assert np.all([isinstance(x, dict) for x in inputs])
157
+ has_images = np.sum([x['type'] == 'image' for x in inputs])
158
+ if has_images:
159
+ content_list = []
160
+ for msg in inputs:
161
+ if msg['type'] == 'text':
162
+ content_list.append(dict(text=msg['value']))
163
+ elif msg['type'] == 'image':
164
+ content_list.append(dict(image='file://' + msg['value']))
165
+ else:
166
+ assert all([x['type'] == 'text' for x in inputs])
167
+ text = '\n'.join([x['value'] for x in inputs])
168
+ content_list = [dict(text=text)]
169
+ return content_list
170
+
171
+ def prepare_inputs(self, inputs):
172
+ input_msgs = []
173
+ if self.system_prompt is not None:
174
+ input_msgs.append(dict(role='system', content=self.system_prompt))
175
+ assert isinstance(inputs, list) and isinstance(inputs[0], dict)
176
+ assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
177
+ if 'role' in inputs[0]:
178
+ assert inputs[-1]['role'] == 'user', inputs[-1]
179
+ for item in inputs:
180
+ input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
181
+ else:
182
+ input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
183
+ return input_msgs
184
+
185
+ def generate_inner(self, inputs, **kwargs) -> str:
186
+ from dashscope import MultiModalConversation
187
+ assert isinstance(inputs, str) or isinstance(inputs, list)
188
+
189
+ if 'type' in inputs[0]:
190
+ pure_text = np.all([x['type'] == 'text' for x in inputs])
191
+ else:
192
+ pure_text = True
193
+ for inp in inputs:
194
+ if not np.all([x['type'] == 'text' for x in inp['content']]):
195
+ pure_text = False
196
+ break
197
+
198
+ assert not pure_text
199
+ messages = self.prepare_inputs(inputs)
200
+ gen_config = dict(max_output_tokens=self.max_tokens, temperature=self.temperature)
201
+ gen_config.update(kwargs)
202
+ try:
203
+ response = MultiModalConversation.call(model=self.model, messages=messages)
204
+ if self.verbose:
205
+ print(response)
206
+ answer = response.output.choices[0]['message']['content'][0]['text']
207
+ return 0, answer, 'Succeeded! '
208
+ except Exception as err:
209
+ if self.verbose:
210
+ self.logger.error(f'{type(err)}: {err}')
211
+ self.logger.error(f'The input messages are {inputs}.')
212
+
213
+ return -1, '', ''
214
+
215
+
216
+ class QwenVLAPI(QwenVLWrapper):
217
+
218
+ def generate(self, message, dataset=None):
219
+ return super(QwenVLAPI, self).generate(message)
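
A usage sketch for the DashScope vision wrappers above; it assumes `DASHSCOPE_API_KEY` is exported and that the image path exists locally (the file name is illustrative).

```python
# Usage sketch for the Qwen2VLAPI wrapper defined above.
# Assumes an exported DASHSCOPE_API_KEY and a local image file.
from vlmeval.api.qwen_vl_api import Qwen2VLAPI

model = Qwen2VLAPI(model='qwen-vl-max-0809', max_pixels=1280 * 28 * 28, verbose=False)
message = [
    dict(type='image', value='apple.jpg'),   # local path; ensure_image_url adds file://
    dict(type='text', value='What fruit is shown in this image?'),
]
print(model.generate(message))
```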
vlmeval/VLMEvalKit_old/vlmeval/api/reka.py ADDED
@@ -0,0 +1,60 @@
1
+ from vlmeval.smp import *
2
+ from vlmeval.api.base import BaseAPI
3
+ from time import sleep
4
+ import mimetypes
5
+
6
+
7
+ class Reka_Wrapper(BaseAPI):
8
+
9
+ is_api: bool = True
10
+ INTERLEAVE: bool = False
11
+
12
+ def __init__(self,
13
+ model: str = 'reka-flash-20240226',
14
+ key: str = None,
15
+ retry: int = 10,
16
+ wait: int = 3,
17
+ system_prompt: str = None,
18
+ verbose: bool = True,
19
+ temperature: float = 0,
20
+ max_tokens: int = 1024,
21
+ **kwargs):
22
+
23
+ try:
24
+ import reka
25
+ except ImportError:
26
+ raise ImportError('Please install reka by running "pip install reka-api"')
27
+
28
+ self.model = model
29
+ default_kwargs = dict(temperature=temperature, request_output_len=max_tokens)
30
+ default_kwargs.update(kwargs)
31
+ self.kwargs = default_kwargs
32
+ if key is not None:
33
+ self.key = key
34
+ else:
35
+ self.key = os.environ.get('REKA_API_KEY', '')
36
+ super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)
37
+
38
+ def generate_inner(self, inputs, **kwargs) -> str:
39
+ import reka
40
+ reka.API_KEY = self.key
41
+ dataset = kwargs.pop('dataset', None)
42
+ prompt, image_path = self.message_to_promptimg(inputs, dataset=dataset)
43
+ image_b64 = encode_image_file_to_base64(image_path)
44
+
45
+ response = reka.chat(
46
+ model_name=self.model,
47
+ human=prompt,
48
+ media_url=f'data:image/jpeg;base64,{image_b64}',
49
+ **self.kwargs)
50
+
51
+ try:
52
+ return 0, response['text'], response
53
+ except Exception as err:
54
+ return -1, self.fail_msg + str(err), response
55
+
56
+
57
+ class Reka(Reka_Wrapper):
58
+
59
+ def generate(self, message, dataset=None):
60
+ return super(Reka_Wrapper, self).generate(message)
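
A usage sketch for the Reka wrapper above, which accepts a single image plus text (`INTERLEAVE` is `False`); it assumes `reka-api` is installed, `REKA_API_KEY` is set, and the image path is illustrative.

```python
# Usage sketch for the Reka wrapper defined above.
# Assumes `pip install reka-api` and an exported REKA_API_KEY.
from vlmeval.api.reka import Reka

model = Reka(model='reka-flash-20240226', max_tokens=512, verbose=False)
message = [
    dict(type='image', value='apple.jpg'),
    dict(type='text', value='Describe this image in one sentence.'),
]
print(model.generate(message))
```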
vlmeval/VLMEvalKit_old/vlmeval/api/siliconflow.py ADDED
@@ -0,0 +1,185 @@
1
+ from vlmeval.smp import *
2
+ from vlmeval.api.base import BaseAPI
3
+ from vlmeval.dataset import img_root_map
4
+ from vlmeval.dataset import DATASET_TYPE
5
+
6
+ API_BASE = 'https://api.siliconflow.cn/v1/chat/completions'
7
+
8
+
9
+ class SiliconFlowAPI(BaseAPI):
10
+
11
+ is_api: bool = True
12
+
13
+ def __init__(self,
14
+ model: str = 'deepseek-ai/DeepSeek-V2.5',
15
+ retry: int = 5,
16
+ wait: int = 5,
17
+ key: str = None,
18
+ api_base: str = API_BASE,
19
+ verbose: bool = True,
20
+ system_prompt: str = None,
21
+ timeout: int = 60,
22
+ **kwargs):
23
+
24
+ self.model = model
25
+ self.api_base = api_base
26
+
27
+ default_kwargs = {
28
+ 'stream': False,
29
+ 'temperature': 0,
30
+ 'frequency_penalty': 0,
31
+ 'n': 1,
32
+ 'max_tokens': 1024,
33
+ }
34
+ for k, v in default_kwargs.items():
35
+ if k not in kwargs:
36
+ kwargs[k] = default_kwargs[k]
37
+ if key is not None:
38
+ self.key = key
39
+ else:
40
+ self.key = os.environ.get('SiliconFlow_API_KEY', '')
41
+ headers = {
42
+ "Authorization": 'Bearer {}',
43
+ "Content-Type": "application/json"
44
+ }
45
+ headers['Authorization'] = headers['Authorization'].format(self.key)
46
+ self.headers = headers
47
+ super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
48
+
49
+ @staticmethod
50
+ def build_msgs(msgs_raw):
51
+ messages = []
52
+ message = {'role': 'user', 'content': []}
53
+
54
+ def encode_image_to_base64_PNG(image_dir):
55
+ image = Image.open(image_dir)
56
+ from io import BytesIO
57
+ byte_stream = BytesIO()
58
+ image.save(byte_stream, format="PNG")
59
+ byte_data = byte_stream.getvalue()
60
+ base64_encoded_data = base64.b64encode(byte_data)
61
+ base64_string = base64_encoded_data.decode("utf-8")
62
+
63
+ return base64_string
64
+ image_b64 = None
65
+ for msg in msgs_raw:
66
+ if msg['type'] == 'image' and not image_b64:
67
+ image_b64 = encode_image_to_base64_PNG(msg['value'])
68
+ message['content'].append({
69
+ 'image_url': {'url': image_b64},
70
+ 'type': 'image_url'
71
+ })
72
+ elif msg['type'] == 'text':
73
+ message['content'].append({
74
+ 'text': msg['value'],
75
+ 'type': 'text'
76
+ })
77
+
78
+ messages.append(message)
79
+ return messages
80
+
81
+ def generate_inner(self, inputs, **kwargs) -> str:
82
+ default_kwargs = self.default_kwargs
83
+ default_kwargs.update(kwargs)
84
+
85
+ payload = dict(
86
+ model=self.model,
87
+ messages=self.build_msgs(msgs_raw=inputs),
88
+ **default_kwargs)
89
+
90
+ response = requests.post(self.api_base, headers=self.headers, data=json.dumps(payload))
91
+ ret_code = response.status_code
92
+ ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
93
+
94
+ answer = self.fail_msg
95
+ try:
96
+ resp_struct = json.loads(response.text)
97
+ answer = resp_struct['choices'][0]['message']['content'].strip()
98
+ except:
99
+ pass
100
+ return ret_code, answer, response
101
+
102
+
103
+ class TeleMMAPI(SiliconFlowAPI):
104
+
105
+ is_api: bool = True
106
+
107
+ def __init__(self,
108
+ model: str = 'TeleAI/TeleMM',
109
+ key: str = None,
110
+ **kwargs):
111
+ super().__init__(model=model, key=key, **kwargs)
112
+
113
+ def dump_image(self, line, dataset):
114
+ """Dump the image(s) of the input line to the corresponding dataset folder.
115
+
116
+ Args:
117
+ line (line of pd.DataFrame): The raw input line.
118
+ dataset (str): The name of the dataset.
119
+
120
+ Returns:
121
+ str | list[str]: The paths of the dumped images.
122
+ """
123
+ ROOT = LMUDataRoot()
124
+ assert isinstance(dataset, str)
125
+ # img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset)
126
+ img_root = osp.join(ROOT, 'images', img_root_map(dataset))
127
+ os.makedirs(img_root, exist_ok=True)
128
+ if 'image' in line:
129
+ if isinstance(line['image'], list):
130
+ tgt_path = []
131
+ assert 'image_path' in line
132
+ for img, im_name in zip(line['image'], line['image_path']):
133
+ path = osp.join(img_root, im_name)
134
+ if not read_ok(path):
135
+ decode_base64_to_image_file(img, path)
136
+ tgt_path.append(path)
137
+ else:
138
+ tgt_path = osp.join(img_root, f"{line['index']}.jpg")
139
+ if not read_ok(tgt_path):
140
+ decode_base64_to_image_file(line['image'], tgt_path)
141
+ tgt_path = [tgt_path]
142
+ else:
143
+ assert 'image_path' in line
144
+ tgt_path = toliststr(line['image_path'])
145
+ return tgt_path
146
+
147
+ def use_custom_prompt(self, dataset):
148
+ assert dataset is not None
149
+ if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
150
+ # For Multi-Turn we don't have custom prompt
151
+ return False
152
+ if 'mmmu' in dataset.lower():
153
+ return True
154
+ return False
155
+
156
+ def build_mmmu(self, line):
157
+ question = line['question']
158
+ options = {
159
+ cand: line[cand]
160
+ for cand in string.ascii_uppercase
161
+ if cand in line and not pd.isna(line[cand])
162
+ }
163
+ options_prompt = 'Options:\n'
164
+ for key, item in options.items():
165
+ options_prompt += f'{key}. {item}\n'
166
+ hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
167
+ prompt = ''
168
+ if hint is not None:
169
+ prompt += f'Hint: {hint}\n'
170
+ prompt += f'Question: {question}\n'
171
+ if len(options):
172
+ prompt += options_prompt
173
+ prompt += 'Please select the correct answer from the options above. \n'
174
+ return prompt
175
+
176
+ def build_prompt(self, line, dataset=None):
177
+ assert dataset is None or isinstance(dataset, str)
178
+ assert self.use_custom_prompt(dataset)
179
+ tgt_path = self.dump_image(line, dataset)
180
+ if 'mmmu' in dataset.lower():
181
+ prompt = self.build_mmmu(line)
182
+
183
+ ret = [dict(type='text', value=prompt)]
184
+ ret.extend([dict(type='image', value=s) for s in tgt_path])
185
+ return ret
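
A usage sketch for the SiliconFlow wrappers above; it assumes the `SiliconFlow_API_KEY` environment variable is set, and the model names come from the defaults above while the prompt and image path are illustrative.

```python
# Usage sketch for the SiliconFlow wrappers defined above.
# Assumes an exported SiliconFlow_API_KEY.
from vlmeval.api.siliconflow import SiliconFlowAPI, TeleMMAPI

# Text-only request against the OpenAI-compatible /chat/completions endpoint.
llm = SiliconFlowAPI(model='deepseek-ai/DeepSeek-V2.5', verbose=False)
print(llm.generate('List the main fields of a chat-completions request body.'))

# Multimodal request; TeleMMAPI adds dataset-specific prompt building on top.
vlm = TeleMMAPI(model='TeleAI/TeleMM', verbose=False)
message = [
    dict(type='image', value='apple.jpg'),
    dict(type='text', value='What is shown in this image?'),
]
print(vlm.generate(message))
```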
vlmeval/VLMEvalKit_old/vlmeval/api/stepai.py ADDED
@@ -0,0 +1,87 @@
1
+ from vlmeval.smp import *
2
+ from vlmeval.api.base import BaseAPI
3
+
4
+ url = 'https://api.stepfun.com/v1/chat/completions'
5
+ headers = {
6
+ 'Content-Type': 'application/json',
7
+ 'Authorization': 'Bearer {}',
8
+ }
9
+
10
+
11
+ class StepAPI_INT(BaseAPI):
12
+
13
+ is_api: bool = True
14
+
15
+ def __init__(self,
16
+ model: str = 'step-1v-8k',
17
+ retry: int = 10,
18
+ wait: int = 3,
19
+ key: str = None,
20
+ temperature: float = 0,
21
+ max_tokens: int = 300,
22
+ verbose: bool = True,
23
+ system_prompt: str = None,
24
+ **kwargs):
25
+ self.model = model
26
+ self.fail_msg = 'Failed to obtain answer via API.'
27
+ self.headers = headers
28
+ self.temperature = temperature
29
+ self.max_tokens = max_tokens
30
+ self.system_prompt = system_prompt
31
+ if key is not None:
32
+ self.key = key
33
+ else:
34
+ self.key = os.environ.get('STEPAI_API_KEY', '')
35
+ headers['Authorization'] = headers['Authorization'].format(self.key)
36
+
37
+ super().__init__(retry=retry, wait=wait, verbose=verbose, system_prompt=system_prompt, **kwargs)
38
+
39
+ @staticmethod
40
+ def build_msgs(msgs_raw):
41
+ messages = []
42
+ message = {'role': 'user', 'content': []}
43
+
44
+ for msg in msgs_raw:
45
+ if msg['type'] == 'image':
46
+ image_b64 = encode_image_file_to_base64(msg['value'])
47
+ message['content'].append({
48
+ 'image_url': {'url': 'data:image/webp;base64,%s' % (image_b64)},
49
+ 'type': 'image_url'
50
+ })
51
+ elif msg['type'] == 'text':
52
+ message['content'].append({
53
+ 'text': msg['value'],
54
+ 'type': 'text'
55
+ })
56
+
57
+ messages.append(message)
58
+ return messages
59
+
60
+ def generate_inner(self, inputs, **kwargs) -> str:
61
+ print(inputs, '\n')
62
+ payload = dict(
63
+ model=self.model,
64
+ max_tokens=self.max_tokens,
65
+ temperature=self.temperature,
66
+ messages=self.build_msgs(msgs_raw=inputs),
67
+ **kwargs)
68
+ response = requests.post(url, headers=headers, data=json.dumps(payload))
69
+ ret_code = response.status_code
70
+ ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
71
+
72
+ answer = self.fail_msg
73
+ try:
74
+ resp_struct = json.loads(response.text)
75
+ answer = resp_struct['choices'][0]['message']['content'].strip()
76
+ except Exception as err:
77
+ if self.verbose:
78
+ self.logger.error(f'{type(err)}: {err}')
79
+ self.logger.error(response.text if hasattr(response, 'text') else response)
80
+
81
+ return ret_code, answer, response
82
+
83
+
84
+ class Step1V_INT(StepAPI_INT):
85
+
86
+ def generate(self, message, dataset=None):
87
+ return super(StepAPI_INT, self).generate(message)
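
A usage sketch for the StepFun wrapper above; it assumes the `STEPAI_API_KEY` environment variable is set and the image path is illustrative.

```python
# Usage sketch for the Step1V_INT wrapper defined above.
# Assumes an exported STEPAI_API_KEY.
from vlmeval.api.stepai import Step1V_INT

model = Step1V_INT(model='step-1v-8k', max_tokens=300, verbose=False)
message = [
    dict(type='image', value='apple.jpg'),
    dict(type='text', value='What object is in this picture?'),
]
print(model.generate(message))
```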
vlmeval/VLMEvalKit_old/vlmeval/api/taiyi.py ADDED
@@ -0,0 +1,192 @@
1
+ from vlmeval.smp import *
2
+ from vlmeval.api.base import BaseAPI
3
+ from vlmeval.dataset import DATASET_TYPE, img_root_map
4
+
5
+
6
+ class TaiyiWrapper(BaseAPI):
7
+
8
+ is_api: bool = True
9
+
10
+ def __init__(self,
11
+ model: str = 'taiyi',
12
+ retry: int = 5,
13
+ wait: int = 5,
14
+ key: str = None,
15
+ verbose: bool = False,
16
+ system_prompt: str = None,
17
+ temperature: float = 0,
18
+ timeout: int = 60,
19
+ url: str = "https://taiyi.megvii.com/v1/chat/completions",
20
+ max_tokens: int = 1024,
21
+ **kwargs):
22
+
23
+ self.model = model
24
+ self.fail_msg = 'Failed to obtain answer via API. '
25
+ self.max_tokens = max_tokens
26
+ self.temperature = temperature
27
+
28
+ if key is None:
29
+ key = os.environ.get('TAIYI_API_KEY', None)
30
+ assert key is not None, ('Please set the API Key ')
31
+ self.key = key
32
+
33
+ self.timeout = timeout
34
+ super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
35
+ assert url is not None, ('Please set the url ')
36
+ self.url = url
37
+ self.logger.info(f'Using url: {self.url}; API Key: {self.key}')
38
+
39
+ def use_custom_prompt(self, dataset):
40
+ if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'VQA':
41
+ return True
42
+ return False
43
+
44
+ def prepare_inputs(self, inputs):
45
+ input_msgs = []
46
+ if self.system_prompt is not None:
47
+ input_msgs.append(dict(role='system', content=self.system_prompt))
48
+ has_images = np.sum([x['type'] == 'image' for x in inputs])
49
+ if has_images:
50
+ content_list = []
51
+ for msg in inputs:
52
+ if msg['type'] == 'text':
53
+ content_list.append(dict(type='text', text=msg['value']))
54
+ elif msg['type'] == 'image':
55
+ imgbytes = open(msg['value'],'rb').read()
56
+ b64 = base64.b64encode(imgbytes).decode('ascii')
57
+ img_struct = dict(url=f'data:image/jpeg;base64,{b64}')
58
+ content_list.append(dict(type='image_url', image_url=img_struct))
59
+ input_msgs.append(dict(role='user', content=content_list))
60
+ else:
61
+ assert all([x['type'] == 'text' for x in inputs])
62
+ text = '\n'.join([x['value'] for x in inputs])
63
+ input_msgs.append(dict(role='user', content=text))
64
+ return input_msgs
65
+
66
+ def set_dump_image(self, dump_image_func):
67
+ self.dump_image_func = dump_image_func
68
+
69
+ def dump_image(self, line, dataset):
70
+ return self.dump_image_func(line)
71
+
72
+ def image_first(self, msgs):
73
+ nr_img = 0
74
+ for s in msgs:
75
+ if s['type'] == 'image':
76
+ nr_img += 1
77
+
78
+ if nr_img == 1:
79
+ new_msgs = []
80
+ img_msg = None
81
+ for s in msgs:
82
+ if s['type'] == 'text':
83
+ new_msgs.append(s)
84
+ else:
85
+ img_msg = s
86
+ new_msgs.insert(0, img_msg)
87
+ else:
88
+ new_msgs = msgs
89
+
90
+ return new_msgs
91
+
92
+ def build_multi_choice_prompt(self, line, dataset=None):
93
+ question = line['question']
94
+ hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
95
+ if hint is not None:
96
+ question = hint + '\n' + question
97
+
98
+ options = {
99
+ cand: line[cand]
100
+ for cand in string.ascii_uppercase
101
+ if cand in line and not pd.isna(line[cand])
102
+ }
103
+ for key, item in options.items():
104
+ question += f'\n{key}. {item}'
105
+ prompt = question
106
+
107
+ if len(options):
108
+ prompt += '\n请直接回答选项字母。' if cn_string(
109
+ prompt) else "\nAnswer with the option's letter from the given choices directly."
110
+ else:
111
+ prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
112
+
113
+ return prompt
114
+
115
+ def build_yorn_prompt(self, line, dataset=None):
116
+ if listinstr(['HallusionBench'], dataset):
117
+ pre_prompt = 'Read the following question carefully, think and solve it step by step.\n\n'
118
+ else:
119
+ pre_prompt = ''
120
+
121
+ prompt = pre_prompt + line['question'] + ' Please answer yes or no as the final answer.'
122
+
123
+ return prompt
124
+
125
+ def build_vqa_prompt(self, line, dataset=None):
126
+ if listinstr(['OCRBench'], dataset):
127
+ pre_prompt = 'Carefully identify the text in the image and answer the question.\n\n'
128
+ else:
129
+ pre_prompt = ''
130
+
131
+ if listinstr(['MMVet'], dataset):
132
+ post_prompt = '\nAnswer this question in detail.'
133
+ else:
134
+ post_prompt = ''
135
+
136
+ prompt = pre_prompt + line['question'] + post_prompt
137
+
138
+ return prompt
139
+
140
+ def build_prompt(self, line, dataset=None):
141
+ assert self.use_custom_prompt(dataset)
142
+ assert dataset is None or isinstance(dataset, str)
143
+ tgt_path = self.dump_image(line, dataset)
144
+
145
+ if DATASET_TYPE(dataset) == 'MCQ':
146
+ prompt = self.build_multi_choice_prompt(line, dataset)
147
+ elif DATASET_TYPE(dataset) == 'Y/N':
148
+ prompt = self.build_yorn_prompt(line, dataset)
149
+ elif DATASET_TYPE(dataset) == 'VQA':
150
+ prompt = self.build_vqa_prompt(line, dataset)
151
+ else:
152
+ raise RuntimeError(f'Invalid dataset type: {DATASET_TYPE(dataset)}')
153
+ message = []
154
+ message.extend([dict(type='image', value=s) for s in tgt_path])
155
+ message.extend([dict(type='text', value=prompt)])
156
+
157
+ # interleave dataset
158
+ if dataset.startswith('MMMU_'):
159
+ from .. import MMMUDataset
160
+ message = MMMUDataset.split_MMMU(message)
161
+ message = self.image_first(message)
162
+
163
+ return message
164
+
165
+ def generate_inner(self, inputs, **kwargs) -> str:
166
+
167
+ input_msgs = self.prepare_inputs(inputs)
168
+ temperature = kwargs.pop('temperature', self.temperature)
169
+
170
+ headers = {'Authorization': f'Bearer {self.key}'}
171
+ payload = dict(
172
+ model=self.model,
173
+ messages=input_msgs,
174
+ n=1,
175
+ temperature=temperature,
176
+ **kwargs)
177
+ response = requests.post(self.url, headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
178
+ ret_code = response.status_code
179
+ ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
180
+ answer = self.fail_msg
181
+ try:
182
+ resp_struct = json.loads(response.text)
183
+ answer = resp_struct['choices'][0]['message']['content'].strip()
184
+ except:
185
+ pass
186
+ return ret_code, answer, response
187
+
188
+
189
+ class TaiyiAPI(TaiyiWrapper):
190
+
191
+ def generate(self, message, dataset=None):
192
+ return super(TaiyiAPI, self).generate(message)
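
A usage sketch for the Taiyi wrapper above; it assumes `TAIYI_API_KEY` is set, the default endpoint is reachable, and the image path is illustrative.

```python
# Usage sketch for the TaiyiAPI wrapper defined above.
# Assumes an exported TAIYI_API_KEY and network access to the default URL.
from vlmeval.api.taiyi import TaiyiAPI

model = TaiyiAPI(max_tokens=512, verbose=False)
message = [
    dict(type='image', value='apple.jpg'),
    dict(type='text', value='Is there an apple in this image? Please answer yes or no.'),
]
print(model.generate(message))
```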