Add files using upload-large-folder tool
This view is limited to 50 files because it contains too many changes; see the raw diff for the full change set.
- PaddleMIX/comfyui/ComfyUI_ppdiffusers/utils/callbacks.py +20 -0
- PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SD15/workflow_SD1.5_inpaint.json +515 -0
- PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SD15/workflow_SD1.5_text2img.json +416 -0
- PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SDXL/workflow_SDXL_text2img.json +416 -0
- vlmeval/VLMEvalKit_old/docs/en/.readthedocs.yaml +17 -0
- vlmeval/VLMEvalKit_old/docs/en/ConfigSystem.md +57 -0
- vlmeval/VLMEvalKit_old/docs/en/Development.md +146 -0
- vlmeval/VLMEvalKit_old/docs/en/Makefile +20 -0
- vlmeval/VLMEvalKit_old/docs/en/Quickstart.md +148 -0
- vlmeval/VLMEvalKit_old/docs/en/conf.py +234 -0
- vlmeval/VLMEvalKit_old/docs/en/index.rst +41 -0
- vlmeval/VLMEvalKit_old/docs/zh-CN/_static/css/readthedocs.css +63 -0
- vlmeval/VLMEvalKit_old/docs/zh-CN/_static/image/logo.svg +24 -0
- vlmeval/VLMEvalKit_old/docs/zh-CN/_templates/404.html +18 -0
- vlmeval/VLMEvalKit_old/vlmeval/__pycache__/__init__.cpython-311.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/__pycache__/__init__.cpython-38.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/__pycache__/config.cpython-310.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/__pycache__/inference_mt.cpython-310.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/__pycache__/inference_video.cpython-310.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/api/bluelm_v_api.py +120 -0
- vlmeval/VLMEvalKit_old/vlmeval/api/gpt.py +263 -0
- vlmeval/VLMEvalKit_old/vlmeval/api/sensechat_vision.py +257 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/__init__.py +228 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/__init__.cpython-38.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/dude.cpython-310.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/image_caption.cpython-310.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/image_caption.cpython-38.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/longvideobench.cpython-38.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/mmbench_video.cpython-310.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/slidevqa.cpython-310.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/slidevqa.cpython-38.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/text_base.cpython-38.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/image_caption.py +89 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/image_vqa.py +1333 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/image_yorn.py +95 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/miabench.py +167 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/mmmath.py +446 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/mvbench.py +668 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/text_base.py +88 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/vcr.py +335 -0
- vlmeval/VLMEvalKit_old/vlmeval/dataset/video_concat_dataset.py +83 -0
- vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-310.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-311.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-38.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-310.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-311.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-38.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/log.cpython-310.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/log.cpython-38.pyc +0 -0
- vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/misc.cpython-310.pyc +0 -0
PaddleMIX/comfyui/ComfyUI_ppdiffusers/utils/callbacks.py
ADDED
@@ -0,0 +1,20 @@
```python
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict


def progress_callback(pbar, cls, step, timestep, kwargs) -> Dict:
    pbar.update(1)
    return {}
```
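The callback only advances a progress bar and leaves the per-step kwargs untouched. As a hedged sketch (not part of this commit), it would typically be bound to a bar with `functools.partial` and handed to a pipeline's per-step hook; the `callback_on_step_end` parameter name is an assumption borrowed from diffusers-style pipelines, not confirmed by this diff.

```python
# Hypothetical wiring sketch -- not included in the commit above.
# progress_callback is the function defined in utils/callbacks.py shown earlier.
from functools import partial

from tqdm import tqdm


def run_with_progress(pipe, prompt, steps=20):
    """Run a pipeline while reporting per-step progress via progress_callback."""
    with tqdm(total=steps) as pbar:
        # Assumes a diffusers-style hook: callback(pipe, step, timestep, kwargs) -> dict
        return pipe(
            prompt,
            num_inference_steps=steps,
            callback_on_step_end=partial(progress_callback, pbar),
        )
```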
PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SD15/workflow_SD1.5_inpaint.json
ADDED
@@ -0,0 +1,515 @@
```json
{
  "last_node_id": 23,
  "last_link_id": 33,
  "nodes": [
    {"id": 4, "type": "PaddleSDVaeDecoder", "pos": [1011, 398], "size": {"0": 210, "1": 46},
     "flags": {}, "order": 11, "mode": 0,
     "inputs": [
       {"name": "latent", "type": "LATENT", "link": 33, "label": "latent"},
       {"name": "sd_pipe", "type": "PIPELINE", "link": 4, "label": "sd_pipe"}],
     "outputs": [
       {"name": "image", "type": "IMAGE", "links": [3], "shape": 3, "label": "image", "slot_index": 0}],
     "properties": {"Node name for S&R": "PaddleSDVaeDecoder"}},
    {"id": 5, "type": "PaddleSaveImage", "pos": [1478, 470], "size": {"0": 315, "1": 270},
     "flags": {}, "order": 12, "mode": 0,
     "inputs": [{"name": "images", "type": "IMAGE", "link": 3, "label": "images"}],
     "properties": {"Node name for S&R": "PaddleSaveImage"},
     "widgets_values": ["ComfyUI"]},
    {"id": 7, "type": "LoadImage", "pos": [50, 588], "size": {"0": 315, "1": 314},
     "flags": {}, "order": 0, "mode": 0,
     "outputs": [
       {"name": "IMAGE", "type": "IMAGE", "links": [30], "shape": 3, "label": "IMAGE", "slot_index": 0},
       {"name": "MASK", "type": "MASK", "links": [28], "shape": 3, "label": "MASK", "slot_index": 1}],
     "properties": {"Node name for S&R": "LoadImage"},
     "widgets_values": ["clipspace/clipspace-mask-572957.png [input]", "image"],
     "color": "#322", "bgcolor": "#533"},
    {"id": 15, "type": "PromptInput", "pos": [479, 1004], "size": {"0": 400, "1": 200},
     "flags": {}, "order": 1, "mode": 0,
     "outputs": [{"name": "prompt", "type": "PROMPT", "links": [31], "shape": 3, "label": "prompt", "slot_index": 0}],
     "properties": {"Node name for S&R": "PromptInput"},
     "widgets_values": ["1girl, blue hair"]},
    {"id": 12, "type": "PromptInput", "pos": [965, 954], "size": {"0": 400, "1": 200},
     "flags": {}, "order": 2, "mode": 0,
     "outputs": [{"name": "prompt", "type": "PROMPT", "links": [32], "shape": 3, "label": "prompt", "slot_index": 0}],
     "properties": {"Node name for S&R": "PromptInput"},
     "widgets_values": ["low, error, ugly"]},
    {"id": 19, "type": "Note", "pos": [1406, 968], "size": {"0": 210, "1": 58},
     "flags": {}, "order": 3, "mode": 0, "properties": {"text": ""},
     "widgets_values": ["这里填负向画面提示 (不想要的内容)"], "color": "#432", "bgcolor": "#653"},
    {"id": 18, "type": "Note", "pos": [254, 1013], "size": {"0": 210, "1": 58},
     "flags": {}, "order": 4, "mode": 0, "properties": {"text": ""},
     "widgets_values": ["这里填正向画面提示 (想要的内容)"], "color": "#432", "bgcolor": "#653"},
    {"id": 21, "type": "Note", "pos": [990, 543], "size": {"0": 217.51138305664062, "1": 164.82931518554688},
     "flags": {}, "order": 5, "mode": 0, "properties": {"text": ""},
     "widgets_values": ["- denoise是重绘幅度,越高程度越大\n- steps是画笔绘制的步数\n- number是每次同时绘制的张数\n- cfg可以调整画面细节参数\n- scheduler是不同的去噪声方式"],
     "color": "#432", "bgcolor": "#653"},
    {"id": 22, "type": "Note", "pos": [1835, 498], "size": {"0": 210, "1": 58},
     "flags": {}, "order": 6, "mode": 0, "properties": {"text": ""},
     "widgets_values": ["这里是最终结果"], "color": "#432", "bgcolor": "#653"},
    {"id": 23, "type": "Note", "pos": [324, 227], "size": {"0": 210, "1": 58},
     "flags": {}, "order": 7, "mode": 0, "properties": {"text": ""},
     "widgets_values": ["这里选择喜欢的AIGC大模型"], "color": "#432", "bgcolor": "#653"},
    {"id": 17, "type": "PaddleSDInpaintPipe", "pos": [628, 526], "size": {"0": 315, "1": 282},
     "flags": {}, "order": 10, "mode": 0,
     "inputs": [
       {"name": "sd_pipe", "type": "PIPELINE", "link": 29, "label": "sd_pipe", "slot_index": 0},
       {"name": "image", "type": "IMAGE", "link": 30, "label": "image"},
       {"name": "mask", "type": "MASK", "link": 28, "label": "mask", "slot_index": 2},
       {"name": "prompt", "type": "PROMPT", "link": 31, "label": "prompt", "slot_index": 3},
       {"name": "negative_prompt", "type": "PROMPT", "link": 32, "label": "negative_prompt"}],
     "outputs": [{"name": "latent", "type": "LATENT", "links": [33], "shape": 3, "label": "latent", "slot_index": 0}],
     "properties": {"Node name for S&R": "PaddleSDInpaintPipe"},
     "widgets_values": [0.7000000000000001, 20, 1, 1064456556884681, "randomize", 7.5, "euler"]},
    {"id": 1, "type": "PaddleSDCheckpointLoader", "pos": [-36, 291], "size": {"0": 315, "1": 58},
     "flags": {}, "order": 8, "mode": 0,
     "outputs": [{"name": "sd_pipe", "type": "PIPELINE", "links": [4, 29], "shape": 3, "label": "sd_pipe", "slot_index": 0}],
     "properties": {"Node name for S&R": "PaddleSDCheckpointLoader"},
     "widgets_values": ["sd15/人物写真_majicmixRealistic_v7.safetensors"]},
    {"id": 20, "type": "Note", "pos": [-204, 673], "size": {"0": 210, "1": 58},
     "flags": {}, "order": 9, "mode": 0, "properties": {"text": ""},
     "widgets_values": ["这里上传原图像,右键可以打开MaskEditor进行mask绘制。"], "color": "#432", "bgcolor": "#653"}
  ],
  "links": [
    [3, 4, 0, 5, 0, "IMAGE"],
    [4, 1, 0, 4, 1, "PIPELINE"],
    [28, 7, 1, 17, 2, "MASK"],
    [29, 1, 0, 17, 0, "PIPELINE"],
    [30, 7, 0, 17, 1, "IMAGE"],
    [31, 15, 0, 17, 3, "PROMPT"],
    [32, 12, 0, 17, 4, "PROMPT"],
    [33, 17, 0, 4, 0, "LATENT"]
  ],
  "groups": [],
  "config": {},
  "extra": {"ds": {"scale": 0.6303940863128514, "offset": [628.0768100805229, 63.29978438298349]}},
  "version": 0.4
}
```
PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SD15/workflow_SD1.5_text2img.json
ADDED
@@ -0,0 +1,416 @@
```json
{
  "last_node_id": 25,
  "last_link_id": 42,
  "nodes": [
    {"id": 4, "type": "PaddleSDVaeDecoder", "pos": [1011, 398], "size": {"0": 210, "1": 46},
     "flags": {}, "order": 9, "mode": 0,
     "inputs": [
       {"name": "latent", "type": "LATENT", "link": 42, "label": "latent"},
       {"name": "sd_pipe", "type": "PIPELINE", "link": 4, "label": "sd_pipe"}],
     "outputs": [
       {"name": "image", "type": "IMAGE", "links": [3], "shape": 3, "label": "image", "slot_index": 0}],
     "properties": {"Node name for S&R": "PaddleSDVaeDecoder"}},
    {"id": 19, "type": "Note", "pos": [1406, 968], "size": {"0": 210, "1": 58},
     "flags": {}, "order": 0, "mode": 0, "properties": {"text": ""},
     "widgets_values": ["这里填负向画面提示 (不想要的内容)"], "color": "#432", "bgcolor": "#653"},
    {"id": 18, "type": "Note", "pos": [254, 1013], "size": {"0": 210, "1": 58},
     "flags": {}, "order": 1, "mode": 0, "properties": {"text": ""},
     "widgets_values": ["这里填正向画面提示 (想要的内容)"], "color": "#432", "bgcolor": "#653"},
    {"id": 21, "type": "Note", "pos": [990, 543], "size": {"0": 217.51138305664062, "1": 164.82931518554688},
     "flags": {}, "order": 2, "mode": 0, "properties": {"text": ""},
     "widgets_values": ["- denoise是重绘幅度,越高程度越大\n- steps是画笔绘制的步数\n- number是每次同时绘制的张数\n- cfg可以调整画面细节参数\n- scheduler是不同的去噪声方式"],
     "color": "#432", "bgcolor": "#653"},
    {"id": 22, "type": "Note", "pos": [1835, 498], "size": {"0": 210, "1": 58},
     "flags": {}, "order": 3, "mode": 0, "properties": {"text": ""},
     "widgets_values": ["这里是最终结果"], "color": "#432", "bgcolor": "#653"},
    {"id": 23, "type": "Note", "pos": [324, 227], "size": {"0": 210, "1": 58},
     "flags": {}, "order": 4, "mode": 0, "properties": {"text": ""},
     "widgets_values": ["这里选择喜欢的AIGC大模型"], "color": "#432", "bgcolor": "#653"},
    {"id": 1, "type": "PaddleSDCheckpointLoader", "pos": [-36, 291], "size": {"0": 315, "1": 58},
     "flags": {}, "order": 5, "mode": 0,
     "outputs": [{"name": "sd_pipe", "type": "PIPELINE", "links": [4, 39], "shape": 3, "label": "sd_pipe", "slot_index": 0}],
     "properties": {"Node name for S&R": "PaddleSDCheckpointLoader"},
     "widgets_values": ["sd15/25D风_revAnimated_v122.safetensors"]},
    {"id": 15, "type": "PromptInput", "pos": [479, 1004], "size": {"0": 400, "1": 200},
     "flags": {}, "order": 6, "mode": 0,
     "outputs": [{"name": "prompt", "type": "PROMPT", "links": [40], "shape": 3, "label": "prompt", "slot_index": 0}],
     "properties": {"Node name for S&R": "PromptInput"},
     "widgets_values": ["1boy, blue hair, cute, anime style"]},
    {"id": 12, "type": "PromptInput", "pos": [965, 964], "size": {"0": 400, "1": 200},
     "flags": {}, "order": 7, "mode": 0,
     "outputs": [{"name": "prompt", "type": "PROMPT", "links": [41], "shape": 3, "label": "prompt", "slot_index": 0}],
     "properties": {"Node name for S&R": "PromptInput"},
     "widgets_values": ["low, error, ugly, (extra hand), wrong hand, nsfw, nude, extra head"]},
    {"id": 5, "type": "PaddleSaveImage", "pos": [1478, 470], "size": {"0": 315, "1": 270},
     "flags": {}, "order": 10, "mode": 0,
     "inputs": [{"name": "images", "type": "IMAGE", "link": 3, "label": "images"}],
     "properties": {"Node name for S&R": "PaddleSaveImage"},
     "widgets_values": ["ComfyUI"]},
    {"id": 25, "type": "PaddleSDText2ImagePipe", "pos": [636, 537], "size": {"0": 315, "1": 266},
     "flags": {}, "order": 8, "mode": 0,
     "inputs": [
       {"name": "sd_pipe", "type": "PIPELINE", "link": 39, "label": "sd_pipe"},
       {"name": "prompt", "type": "PROMPT", "link": 40, "label": "prompt"},
       {"name": "negative_prompt", "type": "PROMPT", "link": 41, "label": "negative_prompt"}],
     "outputs": [{"name": "latent", "type": "LATENT", "links": [42], "shape": 3, "label": "latent", "slot_index": 0}],
     "properties": {"Node name for S&R": "PaddleSDText2ImagePipe"},
     "widgets_values": [20, 512, 768, 1, 61130596064161, "randomize", 7.5, "euler"]}
  ],
  "links": [
    [3, 4, 0, 5, 0, "IMAGE"],
    [4, 1, 0, 4, 1, "PIPELINE"],
    [39, 1, 0, 25, 0, "PIPELINE"],
    [40, 15, 0, 25, 1, "PROMPT"],
    [41, 12, 0, 25, 2, "PROMPT"],
    [42, 25, 0, 4, 0, "LATENT"]
  ],
  "groups": [],
  "config": {},
  "extra": {"ds": {"scale": 0.7627768444385535, "offset": [342.353878460601, -167.10478701820625]}},
  "version": 0.4
}
```
PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SDXL/workflow_SDXL_text2img.json
ADDED
@@ -0,0 +1,416 @@
```json
{
  "last_node_id": 28,
  "last_link_id": 51,
  "nodes": [
    {"id": 19, "type": "Note", "pos": [1406, 968], "size": {"0": 210, "1": 58},
     "flags": {}, "order": 0, "mode": 0, "properties": {"text": ""},
     "widgets_values": ["这里填负向画面提示 (不想要的内容)"], "color": "#432", "bgcolor": "#653"},
    {"id": 18, "type": "Note", "pos": [254, 1013], "size": {"0": 210, "1": 58},
     "flags": {}, "order": 1, "mode": 0, "properties": {"text": ""},
     "widgets_values": ["这里填正向画面提示 (想要的内容)"], "color": "#432", "bgcolor": "#653"},
    {"id": 21, "type": "Note", "pos": [990, 543], "size": {"0": 217.51138305664062, "1": 164.82931518554688},
     "flags": {}, "order": 2, "mode": 0, "properties": {"text": ""},
     "widgets_values": ["- denoise是重绘幅度,越高程度越大\n- steps是画笔绘制的步数\n- number是每次同时绘制的张数\n- cfg可以调整画面细节参数\n- scheduler是不同的去噪声方式"],
     "color": "#432", "bgcolor": "#653"},
    {"id": 22, "type": "Note", "pos": [1835, 498], "size": {"0": 210, "1": 58},
     "flags": {}, "order": 3, "mode": 0, "properties": {"text": ""},
     "widgets_values": ["这里是最终结果"], "color": "#432", "bgcolor": "#653"},
    {"id": 23, "type": "Note", "pos": [324, 227], "size": {"0": 210, "1": 58},
     "flags": {}, "order": 4, "mode": 0, "properties": {"text": ""},
     "widgets_values": ["这里选择喜欢的AIGC大模型"], "color": "#432", "bgcolor": "#653"},
    {"id": 5, "type": "PaddleSaveImage", "pos": [1478, 470], "size": {"0": 315, "1": 270},
     "flags": {}, "order": 10, "mode": 0,
     "inputs": [{"name": "images", "type": "IMAGE", "link": 51, "label": "images"}],
     "properties": {"Node name for S&R": "PaddleSaveImage"},
     "widgets_values": ["ComfyUI"]},
    {"id": 12, "type": "PromptInput", "pos": [965, 964], "size": {"0": 400, "1": 200},
     "flags": {}, "order": 5, "mode": 0,
     "outputs": [{"name": "prompt", "type": "PROMPT", "links": [48], "shape": 3, "label": "prompt", "slot_index": 0}],
     "properties": {"Node name for S&R": "PromptInput"},
     "widgets_values": ["low, error, ugly, (extra hand), wrong hand, nsfw, nude, extra head"]},
    {"id": 28, "type": "PaddleSDXLVaeDecoder", "pos": [1115.8165436384072, 359.29368984194616], "size": {"0": 210, "1": 46},
     "flags": {}, "order": 9, "mode": 0,
     "inputs": [
       {"name": "latent", "type": "LATENT", "link": 50, "label": "latent"},
       {"name": "sd_pipe", "type": "PIPELINE", "link": 49, "label": "sd_pipe"}],
     "outputs": [
       {"name": "image", "type": "IMAGE", "links": [51], "shape": 3, "label": "image", "slot_index": 0}],
     "properties": {"Node name for S&R": "PaddleSDXLVaeDecoder"}},
    {"id": 27, "type": "PaddleSDXLCheckpointLoader", "pos": [53, 413], "size": {"0": 315, "1": 58},
     "flags": {}, "order": 6, "mode": 0,
     "outputs": [{"name": "sd_pipe", "type": "PIPELINE", "links": [45, 49], "shape": 3, "label": "sd_pipe", "slot_index": 0}],
     "properties": {"Node name for S&R": "PaddleSDXLCheckpointLoader"},
     "widgets_values": ["sdxl/MJ5风格_SDXL_Dream.safetensors"]},
    {"id": 15, "type": "PromptInput", "pos": [479, 1004], "size": {"0": 400, "1": 200},
     "flags": {}, "order": 7, "mode": 0,
     "outputs": [{"name": "prompt", "type": "PROMPT", "links": [44], "shape": 3, "label": "prompt", "slot_index": 0}],
     "properties": {"Node name for S&R": "PromptInput"},
     "widgets_values": ["1girl, cool, blue hair, cute, sunset, niji anime style"]},
    {"id": 26, "type": "PaddleSDXLText2ImagePipe", "pos": [503, 573], "size": {"0": 315, "1": 266},
     "flags": {}, "order": 8, "mode": 0,
     "inputs": [
       {"name": "sd_pipe", "type": "PIPELINE", "link": 45, "label": "sd_pipe"},
       {"name": "prompt", "type": "PROMPT", "link": 44, "label": "prompt"},
       {"name": "negative_prompt", "type": "PROMPT", "link": 48, "label": "negative_prompt"}],
     "outputs": [{"name": "latent", "type": "LATENT", "links": [50], "shape": 3, "label": "latent", "slot_index": 0}],
     "properties": {"Node name for S&R": "PaddleSDXLText2ImagePipe"},
     "widgets_values": [20, 512, 768, 1, 351732349249869, "randomize", 5, "euler"]}
  ],
  "links": [
    [44, 15, 0, 26, 1, "PROMPT"],
    [45, 27, 0, 26, 0, "PIPELINE"],
    [48, 12, 0, 26, 2, "PROMPT"],
    [49, 27, 0, 28, 1, "PIPELINE"],
    [50, 26, 0, 28, 0, "LATENT"],
    [51, 28, 0, 5, 0, "IMAGE"]
  ],
  "groups": [],
  "config": {},
  "extra": {"ds": {"scale": 0.5730855330116872, "offset": [113.53226463291708, -145.5843663012114]}},
  "version": 0.4
}
```
vlmeval/VLMEvalKit_old/docs/en/.readthedocs.yaml
ADDED
@@ -0,0 +1,17 @@
```yaml
version: 2

# Set the version of Python and other tools you might need
build:
  os: ubuntu-22.04
  tools:
    python: "3.8"

formats:
  - epub

sphinx:
  configuration: docs/en/conf.py

python:
  install:
    - requirements: requirements/docs.txt
```
vlmeval/VLMEvalKit_old/docs/en/ConfigSystem.md
ADDED
@@ -0,0 +1,57 @@
# Config System

By default, VLMEvalKit launches the evaluation by setting the model name(s) (defined in `/vlmeval/config.py`) and dataset name(s) (defined in `vlmeval/dataset/__init__.py`) in the `run.py` script with the `--model` and `--data` arguments. This approach is simple and efficient in most scenarios; however, it may not be flexible enough when the user wants to evaluate multiple models / datasets with different settings.

To address this, VLMEvalKit provides a more flexible config system. The user can specify the model and dataset settings in a json file, and pass the path to the config file to the `run.py` script with the `--config` argument. Here is a sample config json:

```json
{
    "model": {
        "GPT4o_20240806_T00_HIGH": {
            "class": "GPT4V",
            "model": "gpt-4o-2024-08-06",
            "temperature": 0,
            "img_detail": "high"
        },
        "GPT4o_20240806_T10_Low": {
            "class": "GPT4V",
            "model": "gpt-4o-2024-08-06",
            "temperature": 1.0,
            "img_detail": "low"
        }
    },
    "data": {
        "MME-RealWorld-Lite": {
            "class": "MMERealWorld",
            "dataset": "MME-RealWorld-Lite"
        },
        "MMBench_DEV_EN_V11": {
            "class": "ImageMCQDataset",
            "dataset": "MMBench_DEV_EN_V11"
        }
    }
}
```

Explanation of the config json:

1. Now we support two fields: `model` and `data`, each of which is a dictionary. The key of the dictionary is the name of the model / dataset (set by the user), and the value is the setting of the model / dataset.
2. For items in `model`, the value is a dictionary containing the following keys:
    - `class`: The class name of the model, which should be a class name defined in `vlmeval/vlm/__init__.py` (open-source models) or `vlmeval/api/__init__.py` (API models).
    - Other kwargs: Other kwargs are model-specific parameters; please refer to the definition of the model class for detailed usage. For example, `model`, `temperature`, `img_detail` are arguments of the `GPT4V` class. It's noteworthy that the `model` argument is required by most model classes.
3. For the dictionary `data`, we suggest users use the official dataset name as the key (or part of the key), since we frequently determine the post-processing / judging settings based on the dataset name. For items in `data`, the value is a dictionary containing the following keys:
    - `class`: The class name of the dataset, which should be a class name defined in `vlmeval/dataset/__init__.py`.
    - Other kwargs: Other kwargs are dataset-specific parameters; please refer to the definition of the dataset class for detailed usage. Typically, the `dataset` argument is required by most dataset classes. (A construction sketch follows below.)

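As a hedged illustration only (this helper is not part of VLMEvalKit; the real class lookups live in `vlmeval/config.py` and `vlmeval/dataset/__init__.py`), the `class` field selects a class and the remaining keys become its constructor kwargs:

```python
# Illustrative sketch of how a config entry maps to object construction.
# It only prints what would be built instead of importing the real classes.
import json

with open("config.json") as f:
    cfg = json.load(f)

for section in ("model", "data"):
    for name, spec in cfg[section].items():
        kwargs = dict(spec)
        cls_name = kwargs.pop("class")   # "class" selects the model / dataset class
        print(f"[{section}] {name}: {cls_name}(**{kwargs})")
```
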
Saving the example config json to `config.json`, you can launch the evaluation by:

```bash
python run.py --config config.json
```

That will generate the following output files under the working directory `$WORK_DIR` (following the format `{$WORK_DIR}/{$MODEL_NAME}/{$MODEL_NAME}_{$DATASET_NAME}_*`):

- `$WORK_DIR/GPT4o_20240806_T00_HIGH/GPT4o_20240806_T00_HIGH_MME-RealWorld-Lite*`
- `$WORK_DIR/GPT4o_20240806_T10_Low/GPT4o_20240806_T10_Low_MME-RealWorld-Lite*`
- `$WORK_DIR/GPT4o_20240806_T00_HIGH/GPT4o_20240806_T00_HIGH_MMBench_DEV_EN_V11*`
- `$WORK_DIR/GPT4o_20240806_T10_Low/GPT4o_20240806_T10_Low_MMBench_DEV_EN_V11*`
vlmeval/VLMEvalKit_old/docs/en/Development.md
ADDED
@@ -0,0 +1,146 @@
# Develop new Benchmark / MLLM

> 🛠️ How to implement a new Benchmark / VLM in VLMEvalKit?

## Implement a new benchmark

Example PR: **Math-Vision Benchmark** ([#292](https://github.com/open-compass/VLMEvalKit/pull/292/files))

In VLMEvalKit, benchmarks are organized as dataset classes. When you implement a new benchmark, you can either reuse existing dataset classes (*e.g.*, you can reuse `ImageMCQDataset` when implementing a new multi-choice benchmark) or support a new dataset class. Each dataset must have the following two member functions (either reuse the one of the parent class or implement your own):

- `build_prompt(self, line)`: The function input `line` is an integer (the sample index) or a `pd.Series` object (the raw record of the sample). The function outputs a `multi-modal message`, serving as the input of an MLLM. The `multi-modal message` is an interleaved list of multi-modal messages adopting the following format (the example includes an image and a text message): `[dict(type='image', value=IMAGE_PTH), dict(type='text', value=prompt)]`.
- `evaluate(self, eval_file, **judge_kwargs)`: The function input `eval_file` is the MLLM prediction (typically in `.xlsx` format). If the benchmark requires an external LLM (typically GPT) for evaluation, then `judge_kwargs` can pass the arguments for the LLM. The function outputs the benchmark evaluation results (metrics) in the form of `dict` or `pd.DataFrame`.

We then brief the typical steps to implement a new benchmark under VLMEvalKit:

### 1. Prepare your benchmark tsv file

Currently, we organize a benchmark as one single TSV file. During inference, the data file will be automatically downloaded from the defined `DATASET_URL` link to `$LMUData` (the default path is `$HOME/LMUData`, if not set explicitly). You can upload the prepared TSV file to a downloadable address (e.g., Huggingface) or send it to us at <[email protected]>. We will assist in uploading the dataset to the server. You can also customize the `LMUData` path via the environment variable `LMUData=/path/to/your/data`.

The contents of the TSV file consist of:

| Dataset Name \ Fields | index | image | image_path | question | hint | multi-choice<br>options | answer | category | l2-category | split |
| --- | --- | --- | --- | --- | --- | --- | --- | --- | --- | --- |
| MMBench_DEV_[CN/EN] | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| MMBench_TEST_[CN/EN] | ✅ | ✅ | | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ |
| CCBench | ✅ | ✅ | | ✅ | | ✅ | ✅ | ✅ | | |
| SEEDBench_IMG | ✅ | ✅ | | ✅ | | ✅ | ✅ | ✅ | | |
| MME | ✅ | ✅ | | ✅ | | | ✅ | ✅ | | |
| CORE_MM | ✅ | ✅ | ✅ | ✅ | | | | ✅ | | |
| MMVet | ✅ | ✅ | | ✅ | | | ✅ | ✅ | | |
| MMMU_DEV_VAL | ✅ | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ |
| COCO_VAL | ✅ | ✅ | | | | | ✅ | | | |
| OCRVQA_[TEST/TESTCORE] | ✅ | ✅ | | ✅ | | | ✅ | | | |
| TextVQA_VAL | ✅ | ✅ | | ✅ | | | ✅ | | | |
| VCR_[EN/ZH]\_[EASY/HARD]\_[ALL/500/100] | ✅ | ✅ | | ✅ | | | ✅ | | | |
| MMMB_[en/cn/pt/ar/tr/ru] | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ |
| MMBench_dev_[en/cn/pt/ar/tr/ru] | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |

<div align="center"><b>Table 1. TSV fields of supported datasets.</b></div>

**Intro to mandatory fields in the `TSV` file:**

- **index:** Integer, unique for each line in the `tsv`
- **image:** The base64 of the image; you can use APIs implemented in `vlmeval/smp/vlm.py` for encoding and decoding:
    - Encoding: `encode_image_to_base64` (for PIL Image) / `encode_image_file_to_base64` (for image file path)
    - Decoding: `decode_base64_to_image` (for PIL Image) / `decode_base64_to_image_file` (for image file path)
- **question**: The question corresponding to the image, a string
- **answer**: The answer to the question, a string. The `test` split does not need this field

### 2. Customize your benchmark prompt

`ImageBaseDataset` defines the default prompt format. If you need to add prompts specific to the dataset or input data in the `Interleave` format to the model, you can implement this through the `build_prompt(line)` function. This function takes a line from a TSV file as input, containing fields such as index, image, question, etc. The function returns a dictionary list of multimodal messages `msg` in the format `[dict(type='image', value=IMAGE_PTH), dict(type='text', value=prompt)]`, including the image path and the text prompt to be input into VLMs. For interleave type inputs, you can directly place the dictionary of the image path at the image token position.

### 3. Customize your benchmark metrics

To add evaluation for a new benchmark, you need to customize a class object to implement the dataset's metrics calculation. Multimodal datasets inherit from the `ImageBaseDataset` object in `vlmeval/dataset/image_base.py`. `TYPE` defines the type of dataset, `DATASET_URL` is the download address of the dataset, and `DATASET_MD5` is the MD5 checksum used for consistency checking of the dataset file.

In this class, **you need to implement** the `evaluate(eval_file, **judge_kwargs)` class function to calculate metrics and output results for the custom dataset. The function input `eval_file` is the path to the model prediction results file `{model_name}_{dataset}.xlsx`. This file can be read as a pandas.DataFrame using the `load(eval_file)` method, containing fields such as index, question, answer, category, prediction, etc. The `judge_kwargs` will pass a dictionary related to evaluation, such as the name of the `judge model`, the number of API request threads, etc. **The return value** of the function is the calculated accuracy and other metrics, formatted as a dictionary composed of lists, organized into a pandas.DataFrame.

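To make that contract concrete, here is a hedged, minimal sketch of such a class; the `TYPE` value, URL, MD5, and the `load` import location are illustrative assumptions rather than actual VLMEvalKit definitions:

```python
# Hedged sketch of a custom benchmark dataset; "placeholder" values are assumptions.
from vlmeval.dataset.image_base import ImageBaseDataset  # module named in the doc above
from vlmeval.smp import load                              # assumed helper location


class MyBenchmark(ImageBaseDataset):
    TYPE = 'VQA'                                                           # placeholder type tag
    DATASET_URL = {'MyBenchmark': 'https://example.com/MyBenchmark.tsv'}   # placeholder
    DATASET_MD5 = {'MyBenchmark': '0123456789abcdef0123456789abcdef'}      # placeholder

    def evaluate(self, eval_file, **judge_kwargs):
        data = load(eval_file)                            # predictions as a pandas.DataFrame
        hit = (data['prediction'].astype(str).str.strip()
               == data['answer'].astype(str).str.strip())
        # Return a dict of lists, as described in the paragraph above.
        return {'Overall': [float(hit.mean()) * 100]}
```
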
## Implement a new model

Example PR: **Support LLaVA-Next-Interleave** ([#294](https://github.com/open-compass/VLMEvalKit/pull/294))

**1. Support `generate_inner` API (mandatory).**

All existing models are implemented in `vlmeval/vlm`. For a minimal model, your model class **must implement the method** `generate_inner(msgs, dataset=None)`. In this function, you feed a multi-modal message to your VLM and return the VLM prediction (which is a string). The optional argument `dataset` can be used as the flag for the model to switch among various inference strategies.

The multi-modal message `msgs` is a list of dictionaries; each dictionary has two keys, `type` and `value`:
- `type`: We currently support two types; choices are ["image", "text"].
- `value`: When type=='text', the value is the text message (a single string); when type=='image', the value can be the local path of an image file, or the image URL.

Currently a multi-modal message may contain arbitrarily interleaved images and texts. If your model does not support that, a practical approach is to take the first image and the concatenated text messages as the input. You can set `INTERLEAVE = False` in your model class and use `self.message_to_promptimg(message, dataset=dataset)` to build your prompt and the first image's path.

Here are some examples of multi-modal messages:

```python
IMAGE_PTH = 'assets/apple.jpg'
IMAGE_URL = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/assets/apple.jpg'
msg1 = [
    dict(type='image', value=IMAGE_PTH),
    dict(type='text', value='What is in this image?')
]
msg2 = [
    dict(type='image', value=IMAGE_URL),
    dict(type='image', value=IMAGE_URL),
    dict(type='text', value='How many apples are there in these images?')
]
response = model.generate(msg1)
```

For convenience's sake, we also support taking a list of strings as input. In that case, we will check whether a string is an image path or an image URL and automatically convert it to the list[dict] format:

```python
IMAGE_PTH = 'assets/apple.jpg'
IMAGE_URL = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/assets/apple.jpg'
msg1 = [IMAGE_PTH, 'What is in this image?']
msg2 = [IMAGE_URL, IMAGE_URL, 'How many apples are there in these images?']
response = model.generate(msg1)
```

**Support Custom Prompt (optional).**

Besides, your model can support **custom prompt building** by implementing two optional methods: `use_custom_prompt(dataset)` and `build_prompt(line, dataset=None)`.

Both functions take the dataset name as the input:

- `use_custom_prompt(dataset)` returns a boolean flag, indicating whether the model should use the custom prompt building strategy.
- If `use_custom_prompt(dataset)` returns True, `build_prompt(line, dataset)` should return a custom-built multimodal message for the corresponding `dataset`, given `line`, which is a dictionary that includes the necessary information of a data sample. If `use_custom_prompt(dataset)` returns False, the default prompt building strategy will be used. (A minimal sketch follows below.)

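As a hedged sketch of these two hooks (not taken from any model in the repository; the option columns and field names are assumptions), a multi-choice-oriented implementation could look like:

```python
# Hedged sketch: custom prompt hooks for a hypothetical model class.
# The option columns ('A'..'D') and the 'image_path' field are assumptions.
import string


class MyVLM:
    def use_custom_prompt(self, dataset):
        # Only customize prompts for MMBench-style multi-choice benchmarks.
        return dataset is not None and 'MMBench' in dataset

    def build_prompt(self, line, dataset=None):
        options = {c: line[c] for c in string.ascii_uppercase[:4] if c in line}
        prompt = line['question'] + '\n' + '\n'.join(
            f'{k}. {v}' for k, v in options.items())
        prompt += "\nAnswer with the option's letter directly."
        return [dict(type='image', value=line['image_path']),
                dict(type='text', value=prompt)]
```
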
**Support multi-turn chatting (optional).**

You can also support multi-turn chatting and evaluation with your VLM by supporting the `chat_inner(message, dataset)` function. The function outputs a single string response, and the `message` is a list of chat history, following the format below.

```python
# Assume msg1, msg2, msg3, ... are multi-modal messages following the previously described format
# `chat_inner` takes the following chat history list as input:
message = [
    dict(role='user', content=msg1),
    dict(role='assistant', content=msg2),
    dict(role='user', content=msg3),
    dict(role='assistant', content=msg4),
    ......
    dict(role='user', content=msgn),
]
# `message` should contain an odd number of chat utterances, the role of utterances should be interleaved "user" and "assistant", with the role of the last utterance to be "user".
# The chat function will call `chat_inner`
response = model.chat(message)
```

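A hedged sketch of a `chat_inner` implementation (the `backend_generate` call is a hypothetical stand-in for a model's own generation API) that flattens this history into a plain transcript:

```python
# Hedged sketch only: flatten VLMEvalKit chat history into a text transcript.
# A real model would map the history onto its own chat template instead.
def chat_inner(self, message, dataset=None):
    transcript = []
    for turn in message:
        texts = [seg['value'] for seg in turn['content'] if seg['type'] == 'text']
        images = [seg['value'] for seg in turn['content'] if seg['type'] == 'image']
        transcript.append(f"{turn['role']}: {' '.join(texts)} [{len(images)} image(s)]")
    prompt = '\n'.join(transcript) + '\nassistant:'
    return self.backend_generate(prompt)   # hypothetical backend call
```
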
### Example PRs:

- VLM that doesn't support interleaved images and texts, and does not use custom prompts: [[Model] Support glm-4v-9b](https://github.com/open-compass/VLMEvalKit/pull/221)
- VLM that supports interleaved images and texts and custom prompts: [Add MiniCPM-Llama3-V-2.5](https://github.com/open-compass/VLMEvalKit/pull/205)
- VLM API: [Feature add glmv](https://github.com/open-compass/VLMEvalKit/pull/201)

## Contribute to VLMEvalKit

If you want to contribute codes to **VLMEvalKit**, please do the pre-commit check before you submit a PR. That helps to keep the code tidy.

```bash
# Under the directory of VLMEvalKit, install the pre-commit hook:
pip install pre-commit
pre-commit install
pre-commit run --all-files
# Then you can commit your code.
```
vlmeval/VLMEvalKit_old/docs/en/Makefile
ADDED
@@ -0,0 +1,20 @@
```makefile
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
```
vlmeval/VLMEvalKit_old/docs/en/Quickstart.md
ADDED
@@ -0,0 +1,148 @@
# Quickstart

Before running the evaluation script, you need to **configure** the VLMs and set the model_paths properly.

After that, you can use a single script `run.py` to run inference and evaluation for multiple VLMs and benchmarks at the same time.

## Step 0. Installation & Setup essential keys

**Installation.**

```bash
git clone https://github.com/open-compass/VLMEvalKit.git
cd VLMEvalKit
pip install -e .
```

**Setup Keys.**

To infer with API models (GPT-4v, Gemini-Pro-V, etc.) or use LLM APIs as the **judge or choice extractor**, you need to set up API keys first. If a key is set, VLMEvalKit will use a judge **LLM** to extract the answer from the output; otherwise it uses the **exact matching** mode (finding "Yes", "No", "A", "B", "C"... in the output strings). **The exact matching mode can only be applied to the Yes-or-No tasks and the multi-choice tasks.**
- You can place the required keys in `$VLMEvalKit/.env` or directly set them as environment variables. If you choose to create a `.env` file, its content will look like:

```bash
# The .env file, place it under $VLMEvalKit
# API Keys of Proprietary VLMs
# QwenVL APIs
DASHSCOPE_API_KEY=
# Gemini w. Google Cloud Backends
GOOGLE_API_KEY=
# OpenAI API
OPENAI_API_KEY=
OPENAI_API_BASE=
# StepAI API
STEPAI_API_KEY=
# REKA API
REKA_API_KEY=
# GLMV API
GLMV_API_KEY=
# CongRong API
CW_API_BASE=
CW_API_KEY=
# SenseChat-V API
SENSECHAT_AK=
SENSECHAT_SK=
# Hunyuan-Vision API
HUNYUAN_SECRET_KEY=
HUNYUAN_SECRET_ID=
# You can also set a proxy for calling api models during the evaluation stage
EVAL_PROXY=
```

- Fill the blanks with your API keys (if necessary). Those API keys will be automatically loaded when doing the inference and evaluation; see the sketch right after this list.
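For reference, loading such a `.env` file boils down to something like the following sketch, assuming the `python-dotenv` package (VLMEvalKit performs this loading internally; the snippet is only illustrative):

```python
import os

from dotenv import load_dotenv  # pip install python-dotenv

# Load $VLMEvalKit/.env into the process environment (existing variables are kept).
load_dotenv('.env')
print('OpenAI key configured:', bool(os.environ.get('OPENAI_API_KEY')))
```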
## Step 1. Configuration

**VLM Configuration**: All VLMs are configured in `vlmeval/config.py`. A few legacy VLMs (like MiniGPT-4, LLaVA-v1-7B) require additional configuration (configuring the code / model_weight root in the config file). During evaluation, you should use the model name specified in `supported_VLM` in `vlmeval/config.py` to select the VLM. Before starting the evaluation, make sure you can successfully infer with the VLM using the command `vlmutil check {MODEL_NAME}`. A short sketch of inspecting `supported_VLM` is shown below.
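As a rough sketch (assuming the layout of `vlmeval/config.py`, where `supported_VLM` maps model names to constructors), you can list the registered names and instantiate one before launching a full run:

```python
from vlmeval.config import supported_VLM

# Print a few of the registered model names.
print(sorted(supported_VLM)[:10])

# Entries are constructors: calling one builds the VLM instance.
# `qwen_chat` is picked here only because it appears in the commands below.
model = supported_VLM['qwen_chat']()
```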
## Step 2. Evaluation

**New!!!** We integrated a new config system to enable more flexible evaluation settings. Check the [Document](/docs/en/ConfigSystem.md) or run `python run.py --help` for more details 🔥🔥🔥

We use `run.py` for evaluation. You can call `$VLMEvalKit/run.py` directly or create a soft link to the script (so that you can use it anywhere):

**Arguments**

- `--data (list[str])`: Set the dataset names that are supported in VLMEvalKit (names can be found in the codebase README).
- `--model (list[str])`: Set the VLM names that are supported in VLMEvalKit (defined in `supported_VLM` in `vlmeval/config.py`).
- `--mode (str, default to 'all', choices are ['all', 'infer'])`: When `mode` is set to "all", both inference and evaluation are performed; when set to "infer", only inference is performed.
- `--nproc (int, default to 4)`: The number of threads for OpenAI API calling.
- `--work-dir (str, default to '.')`: The directory to save evaluation results.
- `--nframe (int, default to 8)`: The number of frames to sample from a video, only applicable to the evaluation of video benchmarks.
- `--pack (bool, store_true)`: A video may be associated with multiple questions; if `pack==True`, all questions for a video are asked in a single query.

**Command for Evaluating Image Benchmarks**

You can run the script with `python` or `torchrun`:

```bash
# When running with `python`, only one VLM instance is instantiated, and it might use multiple GPUs (depending on its default behavior).
# That is recommended for evaluating very large VLMs (like IDEFICS-80B-Instruct).

# IDEFICS-80B-Instruct on MMBench_DEV_EN, MME, and SEEDBench_IMG, Inference and Evaluation
python run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model idefics_80b_instruct --verbose
# IDEFICS-80B-Instruct on MMBench_DEV_EN, MME, and SEEDBench_IMG, Inference only
python run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model idefics_80b_instruct --verbose --mode infer

# When running with `torchrun`, one VLM instance is instantiated on each GPU. It can speed up the inference.
# However, that is only suitable for VLMs that consume small amounts of GPU memory.

# IDEFICS-9B-Instruct, Qwen-VL-Chat, mPLUG-Owl2 on MMBench_DEV_EN, MME, and SEEDBench_IMG. On a node with 8 GPUs. Inference and Evaluation.
torchrun --nproc-per-node=8 run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model idefics_9b_instruct qwen_chat mPLUG-Owl2 --verbose
# Qwen-VL-Chat on MME. On a node with 2 GPUs. Inference and Evaluation.
torchrun --nproc-per-node=2 run.py --data MME --model qwen_chat --verbose
```

**Command for Evaluating Video Benchmarks**

```bash
# When running with `python`, only one VLM instance is instantiated, and it might use multiple GPUs (depending on its default behavior).
# That is recommended for evaluating very large VLMs (like IDEFICS-80B-Instruct).

# IDEFICS2-8B on MMBench-Video, with 8 frames as inputs and vanilla evaluation. On a node with 8 GPUs.
torchrun --nproc-per-node=8 run.py --data MMBench-Video --model idefics2_8b --nframe 8
# GPT-4o (API model) on MMBench-Video, with 16 frames as inputs and pack evaluation (all questions of a video in a single query).
python run.py --data MMBench-Video --model GPT4o --nframe 16 --pack
```

The evaluation results will be printed as logs. Besides, **Result Files** will also be generated in the directory `$YOUR_WORKING_DIRECTORY/{model_name}`. Files ending with `.csv` contain the evaluated metrics; a small sketch of loading them is shown below.
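For example, a minimal sketch of collecting those metric files with pandas (the directory name below is hypothetical and depends on your `--work-dir` and the model you evaluated; the `_acc.csv` suffix follows the convention used by the evaluators in this repo):

```python
import glob

import pandas as pd

# Hypothetical layout: <work-dir>/<model_name>/<model_name>_<dataset>_acc.csv
for path in glob.glob('./idefics_80b_instruct/*_acc.csv'):
    print(path)
    print(pd.read_csv(path))
```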
## Deploy a local language model as the judge / choice extractor
The default setting mentioned above uses OpenAI's GPT as the judge LLM. However, you can also deploy a local judge LLM with [LMDeploy](https://github.com/InternLM/lmdeploy).

First install:
```
pip install lmdeploy openai
```

Then deploy a local judge LLM with a single line of code. LMDeploy will automatically download the model from HuggingFace. Assuming we use internlm2-chat-1_8b as the judge, port 23333, and the key sk-123456 (the key must start with "sk-" and can be followed by any number you like):
```
lmdeploy serve api_server internlm/internlm2-chat-1_8b --server-port 23333
```

You need to get the model name registered by LMDeploy with the following Python code:
```
from openai import OpenAI
client = OpenAI(
    api_key='sk-123456',
    base_url="http://0.0.0.0:23333/v1"
)
model_name = client.models.list().data[0].id
```
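Optionally, you can sanity-check the deployed judge with a single chat completion before wiring it into VLMEvalKit. This sketch reuses the `client` and `model_name` from the snippet above and assumes the LMDeploy server is still running:

```python
# One test request against the local judge served on port 23333.
resp = client.chat.completions.create(
    model=model_name,
    messages=[{'role': 'user', 'content': 'Reply with the single word OK.'}],
    temperature=0,
)
print(resp.choices[0].message.content)
```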
Now set some environment variables to tell VLMEvalKit how to use the local judge LLM. As mentioned above, you can also set them in the `$VLMEvalKit/.env` file:
```
OPENAI_API_KEY=sk-123456
OPENAI_API_BASE=http://0.0.0.0:23333/v1/chat/completions
LOCAL_LLM=<model_name you get>
```

Finally, you can run the commands in Step 2 to evaluate your VLM with the local judge LLM.

Note that:

- If you hope to deploy the judge LLM on a single GPU and evaluate your VLM on other GPUs because of limited GPU memory, try `CUDA_VISIBLE_DEVICES=x` like
```
CUDA_VISIBLE_DEVICES=0 lmdeploy serve api_server internlm/internlm2-chat-1_8b --server-port 23333
CUDA_VISIBLE_DEVICES=1,2,3 torchrun --nproc-per-node=3 run.py --data HallusionBench --model qwen_chat --verbose
```
- If the local judge LLM is not good enough at following the instructions, the evaluation may fail. Please report such failures (e.g., by opening issues).
- It's possible to deploy the judge LLM in different ways, e.g., use a private LLM (not from HuggingFace) or use a quantized LLM. Please refer to the [LMDeploy doc](https://lmdeploy.readthedocs.io/en/latest/serving/api_server.html). You can use any other deployment framework as long as it supports the OpenAI API.
vlmeval/VLMEvalKit_old/docs/en/conf.py
ADDED
@@ -0,0 +1,234 @@
1 |
+
# flake8: noqa
|
2 |
+
# Configuration file for the Sphinx documentation builder.
|
3 |
+
#
|
4 |
+
# This file only contains a selection of the most common options. For a full
|
5 |
+
# list see the documentation:
|
6 |
+
# https://www.sphinx-doc.org/en/master/usage/configuration.html
|
7 |
+
|
8 |
+
# -- Path setup --------------------------------------------------------------
|
9 |
+
|
10 |
+
# If extensions (or modules to document with autodoc) are in another directory,
|
11 |
+
# add these directories to sys.path here. If the directory is relative to the
|
12 |
+
# documentation root, use os.path.abspath to make it absolute, like shown here.
|
13 |
+
#
|
14 |
+
import os
|
15 |
+
import ast
|
16 |
+
import subprocess
|
17 |
+
import sys
|
18 |
+
|
19 |
+
import pytorch_sphinx_theme
|
20 |
+
from sphinx.builders.html import StandaloneHTMLBuilder
|
21 |
+
|
22 |
+
sys.path.insert(0, os.path.abspath('../../'))
|
23 |
+
|
24 |
+
# -- Project information -----------------------------------------------------
|
25 |
+
|
26 |
+
project = 'VLMEvalKit'
|
27 |
+
copyright = '2023, VLMEvalKit'
|
28 |
+
author = 'VLMEvalKit Authors'
|
29 |
+
|
30 |
+
# The full version, including alpha/beta/rc tags
|
31 |
+
version_file = '../../vlmeval/__init__.py'
|
32 |
+
|
33 |
+
|
34 |
+
def get_version():
|
35 |
+
with open(version_file, 'r') as f:
|
36 |
+
file_content = f.read()
|
37 |
+
# Parse the file content into an abstract syntax tree (AST)
|
38 |
+
tree = ast.parse(file_content, filename=version_file)
|
39 |
+
|
40 |
+
# Iterate through the body of the AST, looking for an assignment to __version__
|
41 |
+
for node in tree.body:
|
42 |
+
if isinstance(node, ast.Assign):
|
43 |
+
for target in node.targets:
|
44 |
+
if isinstance(target, ast.Name) and target.id == '__version__':
|
45 |
+
return node.value.s
|
46 |
+
raise ValueError('__version__ not found')
|
47 |
+
|
48 |
+
|
49 |
+
release = get_version()
|
50 |
+
|
51 |
+
# -- General configuration ---------------------------------------------------
|
52 |
+
|
53 |
+
# Add any Sphinx extension module names here, as strings. They can be
|
54 |
+
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
|
55 |
+
# ones.
|
56 |
+
extensions = [
|
57 |
+
'sphinx.ext.autodoc',
|
58 |
+
'sphinx.ext.autosummary',
|
59 |
+
'sphinx.ext.intersphinx',
|
60 |
+
'sphinx.ext.napoleon',
|
61 |
+
'sphinx.ext.viewcode',
|
62 |
+
'myst_parser',
|
63 |
+
'sphinx_copybutton',
|
64 |
+
'sphinx_tabs.tabs',
|
65 |
+
'notfound.extension',
|
66 |
+
'sphinxcontrib.jquery',
|
67 |
+
'sphinx_design',
|
68 |
+
]
|
69 |
+
|
70 |
+
# Add any paths that contain templates here, relative to this directory.
|
71 |
+
templates_path = ['_templates']
|
72 |
+
|
73 |
+
# The suffix(es) of source filenames.
|
74 |
+
# You can specify multiple suffix as a list of string:
|
75 |
+
#
|
76 |
+
source_suffix = {
|
77 |
+
'.rst': 'restructuredtext',
|
78 |
+
'.md': 'markdown',
|
79 |
+
}
|
80 |
+
|
81 |
+
language = 'en'
|
82 |
+
|
83 |
+
# The master toctree document.
|
84 |
+
root_doc = 'index'
|
85 |
+
html_context = {
|
86 |
+
'github_version': 'latest',
|
87 |
+
}
|
88 |
+
# List of patterns, relative to source directory, that match files and
|
89 |
+
# directories to ignore when looking for source files.
|
90 |
+
# This pattern also affects html_static_path and html_extra_path.
|
91 |
+
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']
|
92 |
+
|
93 |
+
# -- Options for HTML output -------------------------------------------------
|
94 |
+
|
95 |
+
# The theme to use for HTML and HTML Help pages. See the documentation for
|
96 |
+
# a list of builtin themes.
|
97 |
+
#
|
98 |
+
html_theme = 'pytorch_sphinx_theme'
|
99 |
+
html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]
|
100 |
+
|
101 |
+
# Theme options are theme-specific and customize the look and feel of a theme
|
102 |
+
# further. For a list of options available for each theme, see the
|
103 |
+
# documentation.
|
104 |
+
# yapf: disable
|
105 |
+
html_theme_options = {
|
106 |
+
'menu': [
|
107 |
+
{
|
108 |
+
'name': 'GitHub',
|
109 |
+
'url': 'https://github.com/open-compass/VLMEvalKit'
|
110 |
+
},
|
111 |
+
],
|
112 |
+
# Specify the language of shared menu
|
113 |
+
'menu_lang': 'en',
|
114 |
+
# Disable the default edit on GitHub
|
115 |
+
'default_edit_on_github': False,
|
116 |
+
}
|
117 |
+
# yapf: enable
|
118 |
+
|
119 |
+
# Add any paths that contain custom static files (such as style sheets) here,
|
120 |
+
# relative to this directory. They are copied after the builtin static files,
|
121 |
+
# so a file named "default.css" will overwrite the builtin "default.css".
|
122 |
+
html_static_path = ['_static']
|
123 |
+
html_css_files = [
|
124 |
+
'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.css',
|
125 |
+
'css/readthedocs.css'
|
126 |
+
]
|
127 |
+
html_js_files = [
|
128 |
+
'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.js',
|
129 |
+
'js/custom.js'
|
130 |
+
]
|
131 |
+
|
132 |
+
# -- Options for HTMLHelp output ---------------------------------------------
|
133 |
+
|
134 |
+
# Output file base name for HTML help builder.
|
135 |
+
htmlhelp_basename = 'vlmevalkitdoc'
|
136 |
+
|
137 |
+
# -- Options for LaTeX output ------------------------------------------------
|
138 |
+
|
139 |
+
latex_elements = {
|
140 |
+
# The paper size ('letterpaper' or 'a4paper').
|
141 |
+
#
|
142 |
+
# 'papersize': 'letterpaper',
|
143 |
+
|
144 |
+
# The font size ('10pt', '11pt' or '12pt').
|
145 |
+
#
|
146 |
+
# 'pointsize': '10pt',
|
147 |
+
|
148 |
+
# Additional stuff for the LaTeX preamble.
|
149 |
+
#
|
150 |
+
# 'preamble': '',
|
151 |
+
}
|
152 |
+
|
153 |
+
# Grouping the document tree into LaTeX files. List of tuples
|
154 |
+
# (source start file, target name, title,
|
155 |
+
# author, documentclass [howto, manual, or own class]).
|
156 |
+
latex_documents = [
|
157 |
+
(root_doc, 'vlmevalkit.tex', 'VLMEvalKit Documentation', author,
|
158 |
+
'manual'),
|
159 |
+
]
|
160 |
+
|
161 |
+
# -- Options for manual page output ------------------------------------------
|
162 |
+
|
163 |
+
# One entry per manual page. List of tuples
|
164 |
+
# (source start file, name, description, authors, manual section).
|
165 |
+
man_pages = [(root_doc, 'vlmevalkit', 'VLMEvalKit Documentation', [author],
|
166 |
+
1)]
|
167 |
+
|
168 |
+
# -- Options for Texinfo output ----------------------------------------------
|
169 |
+
|
170 |
+
# Grouping the document tree into Texinfo files. List of tuples
|
171 |
+
# (source start file, target name, title, author,
|
172 |
+
# dir menu entry, description, category)
|
173 |
+
texinfo_documents = [
|
174 |
+
(root_doc, 'vlmevalkit', 'VLMEvalKit Documentation', author,
|
175 |
+
'VLMEvalKit Authors', 'AGI evaluation toolbox and benchmark.',
|
176 |
+
'Miscellaneous'),
|
177 |
+
]
|
178 |
+
|
179 |
+
# -- Options for Epub output -------------------------------------------------
|
180 |
+
|
181 |
+
# Bibliographic Dublin Core info.
|
182 |
+
epub_title = project
|
183 |
+
|
184 |
+
# The unique identifier of the text. This can be a ISBN number
|
185 |
+
# or the project homepage.
|
186 |
+
#
|
187 |
+
# epub_identifier = ''
|
188 |
+
|
189 |
+
# A unique identification for the text.
|
190 |
+
#
|
191 |
+
# epub_uid = ''
|
192 |
+
|
193 |
+
# A list of files that should not be packed into the epub file.
|
194 |
+
epub_exclude_files = ['search.html']
|
195 |
+
|
196 |
+
# set priority when building html
|
197 |
+
StandaloneHTMLBuilder.supported_image_types = [
|
198 |
+
'image/svg+xml', 'image/gif', 'image/png', 'image/jpeg'
|
199 |
+
]
|
200 |
+
|
201 |
+
# -- Extension configuration -------------------------------------------------
|
202 |
+
# Ignore >>> when copying code
|
203 |
+
copybutton_prompt_text = r'>>> |\.\.\. '
|
204 |
+
copybutton_prompt_is_regexp = True
|
205 |
+
|
206 |
+
# Auto-generated header anchors
|
207 |
+
myst_heading_anchors = 3
|
208 |
+
# Enable "colon_fence" extension of myst.
|
209 |
+
myst_enable_extensions = ['colon_fence', 'dollarmath']
|
210 |
+
|
211 |
+
# Configuration for intersphinx
|
212 |
+
intersphinx_mapping = {
|
213 |
+
'python': ('https://docs.python.org/3', None),
|
214 |
+
'numpy': ('https://numpy.org/doc/stable', None),
|
215 |
+
'torch': ('https://pytorch.org/docs/stable/', None),
|
216 |
+
'mmengine': ('https://mmengine.readthedocs.io/en/latest/', None),
|
217 |
+
'transformers':
|
218 |
+
('https://huggingface.co/docs/transformers/main/en/', None),
|
219 |
+
}
|
220 |
+
napoleon_custom_sections = [
|
221 |
+
# Custom sections for data elements.
|
222 |
+
('Meta fields', 'params_style'),
|
223 |
+
('Data fields', 'params_style'),
|
224 |
+
]
|
225 |
+
|
226 |
+
# Disable docstring inheritance
|
227 |
+
autodoc_inherit_docstrings = False
|
228 |
+
# Mock some imports during generate API docs.
|
229 |
+
autodoc_mock_imports = ['rich', 'attr', 'einops']
|
230 |
+
# Disable displaying type annotations, these can be very verbose
|
231 |
+
autodoc_typehints = 'none'
|
232 |
+
|
233 |
+
# The not found page
|
234 |
+
notfound_template = '404.html'
|
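The `get_version()` helper in the `conf.py` above reads `__version__` from `vlmeval/__init__.py` by walking the file's AST instead of importing the package, presumably to avoid importing heavy dependencies during a docs build. A standalone sketch of the same idea, using a made-up source string for illustration:

```python
import ast

sample_source = "__version__ = '0.1.0'\nfrom .config import supported_VLM\n"

version = None
for node in ast.parse(sample_source).body:
    if isinstance(node, ast.Assign):
        for target in node.targets:
            if isinstance(target, ast.Name) and target.id == '__version__':
                version = node.value.value  # the assigned string literal
print(version)  # -> 0.1.0
```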
vlmeval/VLMEvalKit_old/docs/en/index.rst
ADDED
@@ -0,0 +1,41 @@
Welcome to the VLMEvalKit Tutorial!
==========================================

VLMEvalKit Getting Started Guide
--------------------------------

To help users get started quickly, we recommend the following process:

- For users who want to use VLMEvalKit, we recommend reading the "Start Your First Step" section to set up the environment and start a mini-experiment to familiarize yourself with the process.

- If you want to customize more modules, such as adding datasets and models, we provide an "Advanced Tutorial."

We always welcome users' PRs (Pull Requests) and Issues to improve VLMEvalKit!

.. _Start Your First Step:
.. toctree::
   :maxdepth: 1
   :caption: Start Your First Step

   Quickstart.md

.. _Advanced Tutorial:
.. toctree::
   :maxdepth: 1
   :caption: Advanced Tutorial

   Development.md
   ConfigSystem.md

.. _Other Notes:
.. toctree::
   :maxdepth: 1
   :caption: Other Notes

   Contributors.md

Index and Tables
==================

* :ref:`genindex`
* :ref:`search`
vlmeval/VLMEvalKit_old/docs/zh-CN/_static/css/readthedocs.css
ADDED
@@ -0,0 +1,63 @@
.header-logo {
  background-image: url("../image/logo.svg");
  background-size: 275px 80px;
  height: 80px;
  width: 275px;
}


@media screen and (min-width: 1100px) {
  .header-logo {
    top: -25px;
  }
}

pre {
  white-space: pre;
}

@media screen and (min-width: 2000px) {
  .pytorch-content-left {
    width: 1200px;
    margin-left: 30px;
  }
  article.pytorch-article {
    max-width: 1200px;
  }
  .pytorch-breadcrumbs-wrapper {
    width: 1200px;
  }
  .pytorch-right-menu.scrolling-fixed {
    position: fixed;
    top: 45px;
    left: 1580px;
  }
}


article.pytorch-article section code {
  padding: .2em .4em;
  background-color: #f3f4f7;
  border-radius: 5px;
}

/* Disable the change in tables */
article.pytorch-article section table code {
  padding: unset;
  background-color: unset;
  border-radius: unset;
}

table.autosummary td {
  width: 50%
}

img.align-center {
  display: block;
  margin-left: auto;
  margin-right: auto;
}

article.pytorch-article p.rubric {
  font-weight: bold;
}
vlmeval/VLMEvalKit_old/docs/zh-CN/_static/image/logo.svg
ADDED
|
vlmeval/VLMEvalKit_old/docs/zh-CN/_templates/404.html
ADDED
@@ -0,0 +1,18 @@
{% extends "layout.html" %}

{% block body %}

<h1>Page Not Found</h1>
<p>
  The page you are looking for cannot be found.
</p>
<p>
  If you just switched documentation versions, it is likely that the page you were on was moved. You can look for it in
  the content table on the left, or go to <a href="{{ pathto(root_doc) }}">the homepage</a>.
</p>
<!-- <p>
  If you cannot find the documentation you want, please <a
  href="">open an issue</a> to tell us!
</p> -->

{% endblock %}
vlmeval/VLMEvalKit_old/vlmeval/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (566 Bytes).
vlmeval/VLMEvalKit_old/vlmeval/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (396 Bytes).
vlmeval/VLMEvalKit_old/vlmeval/__pycache__/config.cpython-310.pyc
ADDED
Binary file (17.2 kB).
vlmeval/VLMEvalKit_old/vlmeval/__pycache__/inference_mt.cpython-310.pyc
ADDED
Binary file (5.52 kB).
vlmeval/VLMEvalKit_old/vlmeval/__pycache__/inference_video.cpython-310.pyc
ADDED
Binary file (6.27 kB).
vlmeval/VLMEvalKit_old/vlmeval/api/bluelm_v_api.py
ADDED
@@ -0,0 +1,120 @@
1 |
+
from vlmeval.smp import *
|
2 |
+
from vlmeval.api.base import BaseAPI
|
3 |
+
import os
|
4 |
+
import json
|
5 |
+
|
6 |
+
|
7 |
+
def multimodal(images, text, url, key, temperature=0, max_tokens=1024, history=[]):
|
8 |
+
if images:
|
9 |
+
pics = []
|
10 |
+
for image in images:
|
11 |
+
with open(image, 'rb') as f:
|
12 |
+
pic = base64.b64encode(f.read()).decode('utf-8')
|
13 |
+
pics.append(pic)
|
14 |
+
data = {'images': pics, 'text': text, 'key': key, 'temperature': temperature, 'max_new_tokens': max_tokens}
|
15 |
+
else:
|
16 |
+
data = {'text': text, 'key': key, 'temperature': temperature, 'max_new_tokens': max_tokens}
|
17 |
+
response = requests.post(url, json=data, headers={'Content-Type': 'application/json'})
|
18 |
+
response = json.loads(response.text)
|
19 |
+
return response
|
20 |
+
|
21 |
+
|
22 |
+
class BlueLMWrapper(BaseAPI):
|
23 |
+
is_api: bool = True
|
24 |
+
|
25 |
+
def __init__(self,
|
26 |
+
model: str = 'BlueLM-V-v3.0',
|
27 |
+
retry: int = 5,
|
28 |
+
wait: int = 5,
|
29 |
+
verbose: bool = True,
|
30 |
+
temperature: float = 0.0,
|
31 |
+
system_prompt: str = None,
|
32 |
+
max_tokens: int = 1024,
|
33 |
+
key: str = None,
|
34 |
+
url: str = 'http://api-ai.vivo.com.cn/multimodal',
|
35 |
+
**kwargs):
|
36 |
+
|
37 |
+
self.model = model
|
38 |
+
self.fail_msg = 'Failed to obtain answer BlueLM-V API. '
|
39 |
+
self.max_tokens = max_tokens
|
40 |
+
self.temperature = temperature
|
41 |
+
self.url = url
|
42 |
+
self.key = key
|
43 |
+
|
44 |
+
if self.key is None:
|
45 |
+
self.key = os.environ.get('BLUELM_V_API_KEY', None)
|
46 |
+
assert self.key is not None, (
|
47 |
+
'Please set the API Key (obtain it here: '
|
48 |
+
'contact by email : [email protected]'
|
49 |
+
)
|
50 |
+
|
51 |
+
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
|
52 |
+
|
53 |
+
def message_to_promptimg(self, message, dataset=None):
|
54 |
+
|
55 |
+
num_images = len([x for x in message if x['type'] == 'image'])
|
56 |
+
if num_images == 0:
|
57 |
+
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
|
58 |
+
image = None
|
59 |
+
elif num_images == 1:
|
60 |
+
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
|
61 |
+
image = [x['value'] for x in message if x['type'] == 'image']
|
62 |
+
else:
|
63 |
+
prompt = '\n'.join([x['value'] if x['type'] == 'text' else '<image>' for x in message])
|
64 |
+
if dataset == 'BLINK':
|
65 |
+
image = concat_images_vlmeval(
|
66 |
+
[x['value'] for x in message if x['type'] == 'image'],
|
67 |
+
target_size=512)
|
68 |
+
else:
|
69 |
+
image = [x['value'] for x in message if x['type'] == 'image']
|
70 |
+
|
71 |
+
if dataset in ['MMBench_DEV_EN_V11', 'MMBench_DEV_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11',
|
72 |
+
'AI2D_TEST', 'AI2D_TEST_TO_MASK', 'MMMU_DEV_VAL']:
|
73 |
+
prompt = prompt.replace('Please select the correct answer from the options above.',
|
74 |
+
'Answer with the option’s letter from the given choices directly.')
|
75 |
+
elif dataset in ['ChartQA_TEST']:
|
76 |
+
prompt = prompt.replace('Answer the question using a single word or phrase.',
|
77 |
+
'Answer the question using a single number or phrase.')
|
78 |
+
elif dataset in ['DocVQA_VAL', 'DocVQA_TEST', ]:
|
79 |
+
prompt = prompt.replace('Answer the question using a single word or phrase.',
|
80 |
+
'Give the short answer directly.')
|
81 |
+
elif dataset in ['TextVQA_VAL']:
|
82 |
+
prompt = prompt.replace('Answer the question using a single word or phrase.',
|
83 |
+
'When the provided information is insufficient, respond with ’Unanswerable’.'
|
84 |
+
'Answer the question using a single word or phrase.')
|
85 |
+
elif dataset in ['MTVQA_TEST']:
|
86 |
+
prompt = prompt.replace('\nAnswer the question using a word or phrase in the language of the question.', '')
|
87 |
+
elif dataset in ['MathVista_MINI']:
|
88 |
+
if 'Choices:' in prompt:
|
89 |
+
prompt = prompt.replace('Choices:', 'Options:').replace('Hint:', 'Context:')
|
90 |
+
for i in range(1, 7): # replace A ~ F
|
91 |
+
prompt = prompt.replace(f'({chr(64 + i)})', f'{chr(64 + i)}.')
|
92 |
+
prompt += '\nAnswer with the option’s letter from the given choices directly.'
|
93 |
+
else:
|
94 |
+
prompt += '\nAnswer the question using a single word or phrase.'
|
95 |
+
|
96 |
+
return prompt, image
|
97 |
+
|
98 |
+
def generate_inner(self, inputs, **kwargs) -> str:
|
99 |
+
|
100 |
+
assert isinstance(inputs, str) or isinstance(inputs, list)
|
101 |
+
pure_text = np.all([x['type'] == 'text' for x in inputs])
|
102 |
+
assert not pure_text
|
103 |
+
|
104 |
+
prompt, image_path = self.message_to_promptimg(inputs, kwargs['dataset'])
|
105 |
+
|
106 |
+
try:
|
107 |
+
response = multimodal(image_path, prompt, self.url, self.key, self.temperature, self.max_tokens)
|
108 |
+
answer = response['result']
|
109 |
+
return 0, answer, 'Succeeded! '
|
110 |
+
except Exception as err:
|
111 |
+
if self.verbose:
|
112 |
+
self.logger.error(f'{type(err)}: {err}')
|
113 |
+
self.logger.error(f'The input messages are {inputs}.')
|
114 |
+
return -1, '', ''
|
115 |
+
|
116 |
+
|
117 |
+
class BlueLM_V_API(BlueLMWrapper):
|
118 |
+
|
119 |
+
def generate(self, message, dataset=None):
|
120 |
+
return super(BlueLM_V_API, self).generate(message, dataset=dataset)
|
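A minimal sketch of the request payload that the `multimodal` helper above assembles before POSTing to the BlueLM-V endpoint; the image path and key below are placeholders, and no request is actually sent:

```python
import base64


def build_bluelm_payload(image_paths, text, key, temperature=0, max_tokens=1024):
    """Assemble the JSON body the way the `multimodal` helper above does (illustrative only)."""
    data = {'text': text, 'key': key, 'temperature': temperature, 'max_new_tokens': max_tokens}
    pics = []
    for path in image_paths:
        with open(path, 'rb') as f:
            pics.append(base64.b64encode(f.read()).decode('utf-8'))
    if pics:
        data['images'] = pics
    return data


# Example with placeholder values:
# payload = build_bluelm_payload(['example.jpg'], 'Describe the image.', key='<BLUELM_V_API_KEY>')
# requests.post('http://api-ai.vivo.com.cn/multimodal', json=payload,
#               headers={'Content-Type': 'application/json'})
```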
vlmeval/VLMEvalKit_old/vlmeval/api/gpt.py
ADDED
@@ -0,0 +1,263 @@
1 |
+
from ..smp import *
|
2 |
+
import os
|
3 |
+
import sys
|
4 |
+
from .base import BaseAPI
|
5 |
+
|
6 |
+
APIBASES = {
|
7 |
+
'OFFICIAL': 'https://api.openai.com/v1/chat/completions',
|
8 |
+
}
|
9 |
+
|
10 |
+
|
11 |
+
def GPT_context_window(model):
|
12 |
+
length_map = {
|
13 |
+
'gpt-4': 8192,
|
14 |
+
'gpt-4-0613': 8192,
|
15 |
+
'gpt-4-turbo-preview': 128000,
|
16 |
+
'gpt-4-1106-preview': 128000,
|
17 |
+
'gpt-4-0125-preview': 128000,
|
18 |
+
'gpt-4-vision-preview': 128000,
|
19 |
+
'gpt-4-turbo': 128000,
|
20 |
+
'gpt-4-turbo-2024-04-09': 128000,
|
21 |
+
'gpt-3.5-turbo': 16385,
|
22 |
+
'gpt-3.5-turbo-0125': 16385,
|
23 |
+
'gpt-3.5-turbo-1106': 16385,
|
24 |
+
'gpt-3.5-turbo-instruct': 4096,
|
25 |
+
}
|
26 |
+
if model in length_map:
|
27 |
+
return length_map[model]
|
28 |
+
else:
|
29 |
+
return 128000
|
30 |
+
|
31 |
+
|
32 |
+
class OpenAIWrapper(BaseAPI):
|
33 |
+
|
34 |
+
is_api: bool = True
|
35 |
+
|
36 |
+
def __init__(self,
|
37 |
+
model: str = 'gpt-3.5-turbo-0613',
|
38 |
+
retry: int = 5,
|
39 |
+
wait: int = 5,
|
40 |
+
key: str = None,
|
41 |
+
verbose: bool = False,
|
42 |
+
system_prompt: str = None,
|
43 |
+
temperature: float = 0,
|
44 |
+
timeout: int = 60,
|
45 |
+
api_base: str = None,
|
46 |
+
max_tokens: int = 1024,
|
47 |
+
img_size: int = 512,
|
48 |
+
img_detail: str = 'low',
|
49 |
+
use_azure: bool = False,
|
50 |
+
**kwargs):
|
51 |
+
|
52 |
+
self.model = model
|
53 |
+
self.cur_idx = 0
|
54 |
+
self.fail_msg = 'Failed to obtain answer via API. '
|
55 |
+
self.max_tokens = max_tokens
|
56 |
+
self.temperature = temperature
|
57 |
+
self.use_azure = use_azure
|
58 |
+
|
59 |
+
if 'step-1v' in model:
|
60 |
+
env_key = os.environ.get('STEPAI_API_KEY', '')
|
61 |
+
if key is None:
|
62 |
+
key = env_key
|
63 |
+
elif 'yi-vision' in model:
|
64 |
+
env_key = os.environ.get('YI_API_KEY', '')
|
65 |
+
if key is None:
|
66 |
+
key = env_key
|
67 |
+
elif 'internvl2-pro' in model:
|
68 |
+
env_key = os.environ.get('InternVL2_PRO_KEY', '')
|
69 |
+
if key is None:
|
70 |
+
key = env_key
|
71 |
+
else:
|
72 |
+
if use_azure:
|
73 |
+
env_key = os.environ.get('AZURE_OPENAI_API_KEY', None)
|
74 |
+
assert env_key is not None, 'Please set the environment variable AZURE_OPENAI_API_KEY. '
|
75 |
+
|
76 |
+
if key is None:
|
77 |
+
key = env_key
|
78 |
+
assert isinstance(key, str), (
|
79 |
+
'Please set the environment variable AZURE_OPENAI_API_KEY to your openai key. '
|
80 |
+
)
|
81 |
+
else:
|
82 |
+
env_key = os.environ.get('OPENAI_API_KEY', '')
|
83 |
+
if key is None:
|
84 |
+
key = env_key
|
85 |
+
assert isinstance(key, str) and key.startswith('sk-'), (
|
86 |
+
f'Illegal openai_key {key}. '
|
87 |
+
'Please set the environment variable OPENAI_API_KEY to your openai key. '
|
88 |
+
)
|
89 |
+
|
90 |
+
self.key = key
|
91 |
+
assert img_size > 0 or img_size == -1
|
92 |
+
self.img_size = img_size
|
93 |
+
assert img_detail in ['high', 'low']
|
94 |
+
self.img_detail = img_detail
|
95 |
+
self.timeout = timeout
|
96 |
+
|
97 |
+
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
|
98 |
+
|
99 |
+
if use_azure:
|
100 |
+
api_base_template = (
|
101 |
+
'{endpoint}openai/deployments/{deployment_name}/chat/completions?api-version={api_version}'
|
102 |
+
)
|
103 |
+
endpoint = os.getenv('AZURE_OPENAI_ENDPOINT', None)
|
104 |
+
assert endpoint is not None, 'Please set the environment variable AZURE_OPENAI_ENDPOINT. '
|
105 |
+
deployment_name = os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME', None)
|
106 |
+
assert deployment_name is not None, 'Please set the environment variable AZURE_OPENAI_DEPLOYMENT_NAME. '
|
107 |
+
api_version = os.getenv('OPENAI_API_VERSION', None)
|
108 |
+
assert api_version is not None, 'Please set the environment variable OPENAI_API_VERSION. '
|
109 |
+
|
110 |
+
self.api_base = api_base_template.format(
|
111 |
+
endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
|
112 |
+
deployment_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME'),
|
113 |
+
api_version=os.getenv('OPENAI_API_VERSION')
|
114 |
+
)
|
115 |
+
else:
|
116 |
+
if api_base is None:
|
117 |
+
if 'OPENAI_API_BASE' in os.environ and os.environ['OPENAI_API_BASE'] != '':
|
118 |
+
self.logger.info('Environment variable OPENAI_API_BASE is set. Will use it as api_base. ')
|
119 |
+
api_base = os.environ['OPENAI_API_BASE']
|
120 |
+
else:
|
121 |
+
api_base = 'OFFICIAL'
|
122 |
+
|
123 |
+
assert api_base is not None
|
124 |
+
|
125 |
+
if api_base in APIBASES:
|
126 |
+
self.api_base = APIBASES[api_base]
|
127 |
+
elif api_base.startswith('http'):
|
128 |
+
self.api_base = api_base
|
129 |
+
else:
|
130 |
+
self.logger.error('Unknown API Base. ')
|
131 |
+
raise NotImplementedError
|
132 |
+
|
133 |
+
self.logger.info(f'Using API Base: {self.api_base}; API Key: {self.key}')
|
134 |
+
|
135 |
+
# inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
|
136 |
+
# content can be a string or a list of image & text
|
137 |
+
def prepare_itlist(self, inputs):
|
138 |
+
assert np.all([isinstance(x, dict) for x in inputs])
|
139 |
+
has_images = np.sum([x['type'] == 'image' for x in inputs])
|
140 |
+
if has_images:
|
141 |
+
content_list = []
|
142 |
+
for msg in inputs:
|
143 |
+
if msg['type'] == 'text':
|
144 |
+
content_list.append(dict(type='text', text=msg['value']))
|
145 |
+
elif msg['type'] == 'image':
|
146 |
+
from PIL import Image
|
147 |
+
img = Image.open(msg['value'])
|
148 |
+
b64 = encode_image_to_base64(img, target_size=self.img_size)
|
149 |
+
img_struct = dict(url=f'data:image/jpeg;base64,{b64}', detail=self.img_detail)
|
150 |
+
content_list.append(dict(type='image_url', image_url=img_struct))
|
151 |
+
else:
|
152 |
+
assert all([x['type'] == 'text' for x in inputs])
|
153 |
+
text = '\n'.join([x['value'] for x in inputs])
|
154 |
+
content_list = [dict(type='text', text=text)]
|
155 |
+
return content_list
|
156 |
+
|
157 |
+
def prepare_inputs(self, inputs):
|
158 |
+
input_msgs = []
|
159 |
+
if self.system_prompt is not None:
|
160 |
+
input_msgs.append(dict(role='system', content=self.system_prompt))
|
161 |
+
assert isinstance(inputs, list) and isinstance(inputs[0], dict)
|
162 |
+
assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
|
163 |
+
if 'role' in inputs[0]:
|
164 |
+
assert inputs[-1]['role'] == 'user', inputs[-1]
|
165 |
+
for item in inputs:
|
166 |
+
input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
|
167 |
+
else:
|
168 |
+
input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
|
169 |
+
return input_msgs
|
170 |
+
|
171 |
+
def generate_inner(self, inputs, **kwargs) -> str:
|
172 |
+
input_msgs = self.prepare_inputs(inputs)
|
173 |
+
temperature = kwargs.pop('temperature', self.temperature)
|
174 |
+
max_tokens = kwargs.pop('max_tokens', self.max_tokens)
|
175 |
+
|
176 |
+
context_window = GPT_context_window(self.model)
|
177 |
+
new_max_tokens = min(max_tokens, context_window - self.get_token_len(inputs))
|
178 |
+
if 0 < new_max_tokens <= 100 and new_max_tokens < max_tokens:
|
179 |
+
self.logger.warning(
|
180 |
+
'Less than 100 tokens left, '
|
181 |
+
'may exceed the context window with some additional meta symbols. '
|
182 |
+
)
|
183 |
+
if new_max_tokens <= 0:
|
184 |
+
return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. '
|
185 |
+
max_tokens = new_max_tokens
|
186 |
+
|
187 |
+
# Build headers for the raw HTTP request; Azure uses an 'api-key' header instead of a Bearer token
|
188 |
+
if self.use_azure:
|
189 |
+
headers = {'Content-Type': 'application/json', 'api-key': self.key}
|
190 |
+
elif 'internvl2-pro' in self.model:
|
191 |
+
headers = {'Content-Type': 'application/json', 'Authorization': self.key}
|
192 |
+
else:
|
193 |
+
headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'}
|
194 |
+
payload = dict(
|
195 |
+
model=self.model,
|
196 |
+
messages=input_msgs,
|
197 |
+
max_tokens=max_tokens,
|
198 |
+
n=1,
|
199 |
+
temperature=temperature,
|
200 |
+
**kwargs)
|
201 |
+
response = requests.post(
|
202 |
+
self.api_base,
|
203 |
+
headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
|
204 |
+
ret_code = response.status_code
|
205 |
+
ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
|
206 |
+
answer = self.fail_msg
|
207 |
+
try:
|
208 |
+
resp_struct = json.loads(response.text)
|
209 |
+
answer = resp_struct['choices'][0]['message']['content'].strip()
|
210 |
+
except Exception as err:
|
211 |
+
if self.verbose:
|
212 |
+
self.logger.error(f'{type(err)}: {err}')
|
213 |
+
self.logger.error(response.text if hasattr(response, 'text') else response)
|
214 |
+
|
215 |
+
return ret_code, answer, response
|
216 |
+
|
217 |
+
def get_image_token_len(self, img_path, detail='low'):
|
218 |
+
import math
|
219 |
+
if detail == 'low':
|
220 |
+
return 85
|
221 |
+
|
222 |
+
im = Image.open(img_path)
|
223 |
+
width, height = im.size
|
224 |
+
if width > 1024 or height > 1024:
|
225 |
+
if width > height:
|
226 |
+
height = int(height * 1024 / width)
|
227 |
+
width = 1024
|
228 |
+
else:
|
229 |
+
width = int(width * 1024 / height)
|
230 |
+
height = 1024
|
231 |
+
|
232 |
+
h = math.ceil(height / 512)
|
233 |
+
w = math.ceil(width / 512)
|
234 |
+
total = 85 + 170 * h * w
|
235 |
+
return total
|
236 |
+
|
237 |
+
def get_token_len(self, inputs) -> int:
|
238 |
+
import tiktoken
|
239 |
+
try:
|
240 |
+
enc = tiktoken.encoding_for_model(self.model)
|
241 |
+
except Exception as err:
|
242 |
+
if 'gpt' in self.model.lower():
|
243 |
+
if self.verbose:
|
244 |
+
self.logger.warning(f'{type(err)}: {err}')
|
245 |
+
enc = tiktoken.encoding_for_model('gpt-4')
|
246 |
+
else:
|
247 |
+
return 0
|
248 |
+
assert isinstance(inputs, list)
|
249 |
+
tot = 0
|
250 |
+
for item in inputs:
|
251 |
+
if 'role' in item:
|
252 |
+
tot += self.get_token_len(item['content'])
|
253 |
+
elif item['type'] == 'text':
|
254 |
+
tot += len(enc.encode(item['value']))
|
255 |
+
elif item['type'] == 'image':
|
256 |
+
tot += self.get_image_token_len(item['value'], detail=self.img_detail)
|
257 |
+
return tot
|
258 |
+
|
259 |
+
|
260 |
+
class GPT4V(OpenAIWrapper):
|
261 |
+
|
262 |
+
def generate(self, message, dataset=None):
|
263 |
+
return super(GPT4V, self).generate(message)
|
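`get_image_token_len` above estimates the OpenAI vision token cost as 85 base tokens plus 170 per 512x512 tile after capping the image's longer side at 1024. A standalone sketch of that accounting, taking the dimensions directly as integers:

```python
import math


def estimate_image_tokens(width: int, height: int, detail: str = 'low') -> int:
    # Same accounting as get_image_token_len above: a flat 85 tokens for low detail,
    # otherwise 85 + 170 per 512x512 tile after scaling the longer side down to 1024.
    if detail == 'low':
        return 85
    if width > 1024 or height > 1024:
        if width > height:
            height = int(height * 1024 / width)
            width = 1024
        else:
            width = int(width * 1024 / height)
            height = 1024
    tiles = math.ceil(height / 512) * math.ceil(width / 512)
    return 85 + 170 * tiles


print(estimate_image_tokens(2048, 1024, detail='high'))  # 85 + 170 * 2 = 425
```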
vlmeval/VLMEvalKit_old/vlmeval/api/sensechat_vision.py
ADDED
@@ -0,0 +1,257 @@
1 |
+
from vlmeval.smp import *
|
2 |
+
from vlmeval.api.base import BaseAPI
|
3 |
+
from vlmeval.dataset import img_root_map
|
4 |
+
from vlmeval.dataset import DATASET_TYPE
|
5 |
+
|
6 |
+
|
7 |
+
class SenseChatVisionWrapper(BaseAPI):
|
8 |
+
|
9 |
+
is_api: bool = True
|
10 |
+
|
11 |
+
def __init__(self,
|
12 |
+
model: str = 'SenseChat-5-Vision',
|
13 |
+
retry: int = 5,
|
14 |
+
wait: int = 5,
|
15 |
+
ak: str = None,
|
16 |
+
sk: str = None,
|
17 |
+
verbose: bool = True,
|
18 |
+
system_prompt: str = None,
|
19 |
+
max_tokens: int = 1024,
|
20 |
+
proxy: str = None,
|
21 |
+
**kwargs):
|
22 |
+
|
23 |
+
self.model = model
|
24 |
+
self.fail_msg = 'Failed to obtain answer via API. '
|
25 |
+
self.ak = os.environ.get('SENSECHAT_AK', None) if ak is None else ak
|
26 |
+
self.sk = os.environ.get('SENSECHAT_SK', None) if sk is None else sk
|
27 |
+
assert self.ak is not None and self.sk is not None
|
28 |
+
self.max_new_tokens = max_tokens
|
29 |
+
super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
|
30 |
+
|
31 |
+
def dump_image(self, line, dataset):
|
32 |
+
"""Dump the image(s) of the input line to the corresponding dataset folder.
|
33 |
+
|
34 |
+
Args:
|
35 |
+
line (line of pd.DataFrame): The raw input line.
|
36 |
+
dataset (str): The name of the dataset.
|
37 |
+
|
38 |
+
Returns:
|
39 |
+
str | list[str]: The paths of the dumped images.
|
40 |
+
"""
|
41 |
+
ROOT = LMUDataRoot()
|
42 |
+
assert isinstance(dataset, str)
|
43 |
+
img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset)
|
44 |
+
os.makedirs(img_root, exist_ok=True)
|
45 |
+
if 'image' in line:
|
46 |
+
if isinstance(line['image'], list):
|
47 |
+
tgt_path = []
|
48 |
+
assert 'image_path' in line
|
49 |
+
for img, im_name in zip(line['image'], line['image_path']):
|
50 |
+
path = osp.join(img_root, im_name)
|
51 |
+
if not read_ok(path):
|
52 |
+
decode_base64_to_image_file(img, path)
|
53 |
+
tgt_path.append(path)
|
54 |
+
else:
|
55 |
+
tgt_path = osp.join(img_root, f"{line['index']}.jpg")
|
56 |
+
if not read_ok(tgt_path):
|
57 |
+
decode_base64_to_image_file(line['image'], tgt_path)
|
58 |
+
tgt_path = [tgt_path]
|
59 |
+
else:
|
60 |
+
assert 'image_path' in line
|
61 |
+
tgt_path = toliststr(line['image_path'])
|
62 |
+
|
63 |
+
return tgt_path
|
64 |
+
|
65 |
+
def image_to_base64(self, image_path):
|
66 |
+
import base64
|
67 |
+
with open(image_path, 'rb') as image_file:
|
68 |
+
encoded_string = base64.b64encode(image_file.read())
|
69 |
+
return encoded_string.decode('utf-8')
|
70 |
+
|
71 |
+
def encode_jwt_token(self, ak, sk):
|
72 |
+
import jwt
|
73 |
+
headers = {'alg': 'HS256', 'typ': 'JWT'}
|
74 |
+
payload = {
|
75 |
+
'iss': ak,
|
76 |
+
'exp': int(time.time())
|
77 |
+
+ 1800,  # the expiration time you expect; this example uses current time + 30 minutes
|
78 |
+
'nbf': int(time.time()) - 5,  # the not-before time you expect; this example uses current time - 5 seconds
|
79 |
+
}
|
80 |
+
token = jwt.encode(payload, sk, headers=headers)
|
81 |
+
return token
|
82 |
+
|
83 |
+
def use_custom_prompt(self, dataset):
|
84 |
+
return True
|
85 |
+
|
86 |
+
def build_multi_choice_prompt(self, line, dataset=None):
|
87 |
+
question = line['question']
|
88 |
+
hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
|
89 |
+
if hint is not None:
|
90 |
+
question = hint + '\n' + question
|
91 |
+
|
92 |
+
options = {
|
93 |
+
cand: line[cand]
|
94 |
+
for cand in string.ascii_uppercase
|
95 |
+
if cand in line and not pd.isna(line[cand])
|
96 |
+
}
|
97 |
+
for key, item in options.items():
|
98 |
+
question += f'\n{key}. {item}'
|
99 |
+
prompt = question
|
100 |
+
|
101 |
+
if len(options):
|
102 |
+
prompt += '\n请直接回答选项字母。' if cn_string(
|
103 |
+
prompt) else "\nAnswer with the option's letter from the given choices directly."
|
104 |
+
else:
|
105 |
+
prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
|
106 |
+
|
107 |
+
return prompt
|
108 |
+
|
109 |
+
def build_prompt(self, line, dataset=None):
|
110 |
+
assert self.use_custom_prompt(dataset)
|
111 |
+
assert dataset is None or isinstance(dataset, str)
|
112 |
+
|
113 |
+
tgt_path = self.dump_image(line, dataset)
|
114 |
+
|
115 |
+
if dataset is not None and listinstr(['MME'], dataset):
|
116 |
+
question = line['question']
|
117 |
+
prompt = question + ' Answer the question using a single word or phrase.'
|
118 |
+
elif dataset is not None and listinstr(['HallusionBench'], dataset):
|
119 |
+
question = line['question']
|
120 |
+
prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
|
121 |
+
elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ' and 'MMMU' not in dataset:
|
122 |
+
prompt = self.build_multi_choice_prompt(line, dataset)
|
123 |
+
elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
|
124 |
+
if 'MathVista' in dataset:
|
125 |
+
prompt = line['question']
|
126 |
+
elif listinstr(['LLaVABench'], dataset):
|
127 |
+
question = line['question']
|
128 |
+
prompt = question + '\nAnswer this question in detail.'
|
129 |
+
elif listinstr(['MMVet'], dataset):
|
130 |
+
prompt = line['question']
|
131 |
+
else:
|
132 |
+
question = line['question']
|
133 |
+
prompt = question + '\nAnswer the question using a single word or phrase.'
|
134 |
+
elif dataset is not None and 'MMMU' in dataset:
|
135 |
+
question = line['question']
|
136 |
+
options = {
|
137 |
+
cand: line[cand]
|
138 |
+
for cand in string.ascii_uppercase
|
139 |
+
if cand in line and not pd.isna(line[cand])
|
140 |
+
}
|
141 |
+
for key, item in options.items():
|
142 |
+
question += f'\n{key}. {item}'
|
143 |
+
prompt = {
|
144 |
+
'multiple-choice': 'You are an expert in {}. Please solve the university-level {} examination question, which includes interleaved images and text. Your output should be divided into two parts: First, reason about the correct answer. Then write the answer in the following format where X is exactly one of the choices given by the problem: "ANSWER: X". If you are uncertain of the correct answer, guess the most likely one.', # noqa: E501
|
145 |
+
'open': 'You are an expert in {}. Please solve the university-level {} examination question, which includes interleaved images and text. Your output should be divided into two parts: First, reason about the correct answer. Then write the answer in the following format where X is only the answer and nothing else: "ANSWER: X"' # noqa: E501
|
146 |
+
}
|
147 |
+
subject = '_'.join(line['id'].split('_')[1:-1])
|
148 |
+
prompt = prompt[line['question_type']].format(subject, subject) + '\n' + question
|
149 |
+
else:
|
150 |
+
prompt = line['question']
|
151 |
+
|
152 |
+
message = [dict(type='text', value=prompt)]
|
153 |
+
message.extend([dict(type='image', value=s) for s in tgt_path])
|
154 |
+
|
155 |
+
return message
|
156 |
+
|
157 |
+
def message_to_promptimg(self, message, dataset=None):
|
158 |
+
if dataset is None or listinstr(['MMMU', 'BLINK'], dataset):
|
159 |
+
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
|
160 |
+
image = [[x['value'] for x in message if x['type'] == 'image'][0]]
|
161 |
+
else:
|
162 |
+
prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
|
163 |
+
image = [x['value'] for x in message if x['type'] == 'image']
|
164 |
+
return prompt, image
|
165 |
+
|
166 |
+
def generate_inner(self, inputs, **kwargs) -> str:
|
167 |
+
assert isinstance(inputs, str) or isinstance(inputs, list)
|
168 |
+
inputs = [inputs] if isinstance(inputs, str) else inputs
|
169 |
+
dataset = kwargs.get('dataset', None)
|
170 |
+
|
171 |
+
if dataset is not None and listinstr(['ChartQA_TEST'], dataset):
|
172 |
+
self.max_num = 12
|
173 |
+
elif dataset is not None and listinstr(['DocVQA_VAL', 'DocVQA_TEST'], dataset):
|
174 |
+
self.max_num = 18
|
175 |
+
elif dataset is not None and listinstr(['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench'], dataset):
|
176 |
+
self.max_num = 24
|
177 |
+
else:
|
178 |
+
self.max_num = 6
|
179 |
+
|
180 |
+
if dataset is None:
|
181 |
+
pass
|
182 |
+
elif listinstr(['AI2D_TEST'], dataset):
|
183 |
+
self.max_new_tokens = 10
|
184 |
+
elif 'MMMU' in dataset:
|
185 |
+
self.max_new_tokens = 1024
|
186 |
+
elif 'MMBench' in dataset:
|
187 |
+
self.max_new_tokens = 100
|
188 |
+
|
189 |
+
prompt, image = self.message_to_promptimg(message=inputs, dataset=dataset)
|
190 |
+
|
191 |
+
url = 'https://api.sensenova.cn/v1/llm/chat-completions'
|
192 |
+
api_secret_key = self.encode_jwt_token(self.ak, self.sk)
|
193 |
+
|
194 |
+
content = [{
|
195 |
+
'image_base64': self.image_to_base64(item),
|
196 |
+
'image_file_id': '',
|
197 |
+
'image_url': '',
|
198 |
+
'text': '',
|
200 |
+
'type': 'image_base64'
|
201 |
+
} for item in image]
|
202 |
+
|
203 |
+
content.append({
|
204 |
+
'image_base64': '',
|
205 |
+
'image_file_id': '',
|
206 |
+
'image_url': '',
|
207 |
+
'text': prompt,
|
208 |
+
'type': 'text'
|
209 |
+
})
|
210 |
+
|
211 |
+
message = [{'content': content, 'role': 'user'}]
|
212 |
+
|
213 |
+
data = {
|
214 |
+
'messages': message,
|
215 |
+
'max_new_tokens': self.max_new_tokens,
|
216 |
+
'model': self.model,
|
217 |
+
'stream': False,
|
218 |
+
}
|
219 |
+
headers = {
|
220 |
+
'Content-type': 'application/json',
|
221 |
+
'Authorization': 'Bearer ' + api_secret_key
|
222 |
+
}
|
223 |
+
|
224 |
+
response = requests.post(
|
225 |
+
url,
|
226 |
+
headers=headers,
|
227 |
+
json=data,
|
228 |
+
)
|
229 |
+
request_id = response.headers['x-request-id']
|
230 |
+
|
231 |
+
time.sleep(1)
|
232 |
+
try:
|
233 |
+
assert response.status_code == 200
|
234 |
+
response = response.json()['data']['choices'][0]['message'].strip()
|
235 |
+
if dataset is not None and 'MMMU' in dataset:
|
236 |
+
response = response.split('ANSWER: ')[-1].strip()
|
237 |
+
if self.verbose:
|
238 |
+
self.logger.info(f'inputs: {inputs}\nanswer: {response}')
|
239 |
+
return 0, response, 'Succeeded! '
|
240 |
+
except Exception as err:
|
241 |
+
if self.verbose:
|
242 |
+
self.logger.error('---------------------------ERROR---------------------------')
|
243 |
+
self.logger.error(response.json())
|
244 |
+
self.logger.error(f'{type(err)}: {err}')
|
245 |
+
self.logger.error('---------------------------request_id---------------------------' + request_id)
|
246 |
+
self.logger.error(
|
247 |
+
'api error' + response.json()['error']['message']
|
248 |
+
+ str([input['value'] if input['type'] == 'image' else None for input in inputs])
|
249 |
+
)
|
250 |
+
self.logger.error(f'The input messages are {inputs}.')
|
251 |
+
return -1, response.json()['error']['message'], ''
|
252 |
+
|
253 |
+
|
254 |
+
class SenseChatVisionAPI(SenseChatVisionWrapper):
|
255 |
+
|
256 |
+
def generate(self, message, dataset=None):
|
257 |
+
return super(SenseChatVisionAPI, self).generate(message, dataset=dataset)
|
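The SenseChat wrapper above authenticates with a short-lived JWT built from the AK/SK pair. A sketch of the same `encode_jwt_token` logic using the `PyJWT` package; the keys below are placeholders:

```python
import time

import jwt  # pip install PyJWT


def encode_jwt_token(ak: str, sk: str) -> str:
    # HS256-signed token valid for 30 minutes and effective 5 seconds in the past,
    # mirroring the encode_jwt_token method shown above.
    payload = {
        'iss': ak,
        'exp': int(time.time()) + 1800,
        'nbf': int(time.time()) - 5,
    }
    return jwt.encode(payload, sk, algorithm='HS256')


# token = encode_jwt_token('<SENSECHAT_AK>', '<SENSECHAT_SK>')
# headers = {'Authorization': 'Bearer ' + token}
```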
vlmeval/VLMEvalKit_old/vlmeval/dataset/__init__.py
ADDED
@@ -0,0 +1,228 @@
1 |
+
import warnings
|
2 |
+
|
3 |
+
from .image_base import img_root_map, ImageBaseDataset
|
4 |
+
from .image_caption import ImageCaptionDataset
|
5 |
+
from .image_yorn import ImageYORNDataset
|
6 |
+
from .image_mcq import (
|
7 |
+
ImageMCQDataset, MMMUDataset, CustomMCQDataset, MUIRDataset, GMAIMMBenchDataset, MMERealWorld, HRBenchDataset,
|
8 |
+
NaturalBenchDataset
|
9 |
+
)
|
10 |
+
from .image_mt import MMDUDataset
|
11 |
+
from .image_vqa import (
|
12 |
+
ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
|
13 |
+
CustomVQADataset, CRPE, MathVerse, OlympiadBench, QSpatial, VizWiz, MMNIAH
|
14 |
+
)
|
15 |
+
|
16 |
+
from .text_mcq import CustomTextMCQDataset, TextMCQDataset
|
17 |
+
|
18 |
+
from .vcr import VCRDataset
|
19 |
+
from .mmlongbench import MMLongBench
|
20 |
+
from .dude import DUDE
|
21 |
+
from .slidevqa import SlideVQA
|
22 |
+
|
23 |
+
from .mmbench_video import MMBenchVideo
|
24 |
+
from .videomme import VideoMME
|
25 |
+
from .mvbench import MVBench, MVBench_MP4
|
26 |
+
from .mlvu import MLVU, MLVU_MCQ, MLVU_OpenEnded
|
27 |
+
from .tempcompass import TempCompass, TempCompass_Captioning, TempCompass_MCQ, TempCompass_YorN
|
28 |
+
from .longvideobench import LongVideoBench
|
29 |
+
from .video_concat_dataset import ConcatVideoDataset
|
30 |
+
from .mmgenbench import MMGenBench
|
31 |
+
|
32 |
+
from .miabench import MIABench
|
33 |
+
from .wildvision import WildVision
|
34 |
+
from .mmmath import MMMath
|
35 |
+
from .dynamath import Dynamath
|
36 |
+
from .utils import *
|
37 |
+
from ..smp import *
|
38 |
+
|
39 |
+
|
40 |
+
class ConcatDataset(ImageBaseDataset):
|
41 |
+
# This dataset takes multiple dataset names as input and aggregate them into a single dataset.
|
42 |
+
# Each single dataset should not have a field named `SUB_DATASET`
|
43 |
+
|
44 |
+
DATASET_SETS = {
|
45 |
+
'MMMB': ['MMMB_ar', 'MMMB_cn', 'MMMB_en', 'MMMB_pt', 'MMMB_ru', 'MMMB_tr'],
|
46 |
+
'MTL_MMBench_DEV': [
|
47 |
+
'MMBench_dev_ar', 'MMBench_dev_cn', 'MMBench_dev_en',
|
48 |
+
'MMBench_dev_pt', 'MMBench_dev_ru', 'MMBench_dev_tr'
|
49 |
+
]
|
50 |
+
}
|
51 |
+
|
52 |
+
def __init__(self, dataset):
|
53 |
+
datasets = self.DATASET_SETS[dataset]
|
54 |
+
self.dataset_map = {}
|
55 |
+
# The name of the compilation
|
56 |
+
self.dataset_name = dataset
|
57 |
+
self.datasets = datasets
|
58 |
+
for dname in datasets:
|
59 |
+
dataset = build_dataset(dname)
|
60 |
+
assert dataset is not None, dataset
|
61 |
+
self.dataset_map[dname] = dataset
|
62 |
+
TYPES = [x.TYPE for x in self.dataset_map.values()]
|
63 |
+
MODALITIES = [x.MODALITY for x in self.dataset_map.values()]
|
64 |
+
assert np.all([x == TYPES[0] for x in TYPES]), (datasets, TYPES)
|
65 |
+
assert np.all([x == MODALITIES[0] for x in MODALITIES]), (datasets, MODALITIES)
|
66 |
+
self.TYPE = TYPES[0]
|
67 |
+
self.MODALITY = MODALITIES[0]
|
68 |
+
data_all = []
|
69 |
+
for dname in datasets:
|
70 |
+
data = self.dataset_map[dname].data
|
71 |
+
data['SUB_DATASET'] = [dname] * len(data)
|
72 |
+
data_new = localize_df(data, dname, nproc=16)
|
73 |
+
data_all.append(data_new)
|
74 |
+
|
75 |
+
data = pd.concat(data_all)
|
76 |
+
data['original_index'] = data.pop('index')
|
77 |
+
data['index'] = np.arange(len(data))
|
78 |
+
self.data = data
|
79 |
+
|
80 |
+
def build_prompt(self, line):
|
81 |
+
if isinstance(line, int):
|
82 |
+
line = self.data.iloc[line]
|
83 |
+
idx = line['original_index']
|
84 |
+
dname = line['SUB_DATASET']
|
85 |
+
org_data = self.dataset_map[dname].data
|
86 |
+
org_line = cp.deepcopy(org_data[org_data['index'] == idx]).iloc[0]
|
87 |
+
return self.dataset_map[dname].build_prompt(org_line)
|
88 |
+
|
89 |
+
def dump_image(self, line):
|
90 |
+
# Assert all images are pre-dumped
|
91 |
+
assert 'image' not in line
|
92 |
+
assert 'image_path' in line
|
93 |
+
tgt_path = toliststr(line['image_path'])
|
94 |
+
return tgt_path
|
95 |
+
|
96 |
+
@classmethod
|
97 |
+
def supported_datasets(cls):
|
98 |
+
return list(cls.DATASET_SETS)
|
99 |
+
|
100 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
101 |
+
suffix = eval_file.split('.')[-1]
|
102 |
+
# First, split the eval_file by dataset
|
103 |
+
data_all = load(eval_file)
|
104 |
+
for dname in self.datasets:
|
105 |
+
tgt = eval_file.replace(self.dataset_name, dname)
|
106 |
+
data_sub = data_all[data_all['SUB_DATASET'] == dname]
|
107 |
+
data_sub.pop('index')
|
108 |
+
data_sub['index'] = data_sub.pop('original_index')
|
109 |
+
data_sub.pop('SUB_DATASET')
|
110 |
+
dump(data_sub, tgt)
|
111 |
+
# Then, evaluate each dataset separately
|
112 |
+
results_all = []
|
113 |
+
for dname in self.datasets:
|
114 |
+
tgt = eval_file.replace(self.dataset_name, dname)
|
115 |
+
res = self.dataset_map[dname].evaluate(tgt, **judge_kwargs)
|
116 |
+
assert isinstance(res, pd.DataFrame)
|
117 |
+
res['DATASET'] = [dname] * len(res)
|
118 |
+
results_all.append(res)
|
119 |
+
result = pd.concat(results_all)
|
120 |
+
score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
|
121 |
+
dump(result, score_file)
|
122 |
+
return result
|
123 |
+
|
124 |
+
|
125 |
+
# Add new supported dataset class here
|
126 |
+
IMAGE_DATASET = [
|
127 |
+
ImageCaptionDataset, ImageYORNDataset, ImageMCQDataset, ImageVQADataset, MathVision,
|
128 |
+
MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
|
129 |
+
MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset,
|
130 |
+
GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, CRPE, MathVerse, NaturalBenchDataset,
|
131 |
+
MIABench, OlympiadBench, WildVision, MMMath, QSpatial, Dynamath, MMGenBench, VizWiz, MMNIAH
|
132 |
+
]
|
133 |
+
|
134 |
+
VIDEO_DATASET = [
|
135 |
+
MMBenchVideo, VideoMME, MVBench, MVBench_MP4, LongVideoBench,
|
136 |
+
MLVU, MLVU_MCQ, MLVU_OpenEnded,
|
137 |
+
TempCompass, TempCompass_MCQ, TempCompass_Captioning, TempCompass_YorN
|
138 |
+
]
|
139 |
+
|
140 |
+
TEXT_DATASET = [
|
141 |
+
TextMCQDataset
|
142 |
+
]
|
143 |
+
|
144 |
+
CUSTOM_DATASET = [
|
145 |
+
CustomMCQDataset, CustomVQADataset, CustomTextMCQDataset
|
146 |
+
]
|
147 |
+
|
148 |
+
DATASET_COLLECTION = [ConcatDataset, ConcatVideoDataset]
|
149 |
+
|
150 |
+
DATASET_CLASSES = IMAGE_DATASET + VIDEO_DATASET + TEXT_DATASET + CUSTOM_DATASET + DATASET_COLLECTION
|
151 |
+
SUPPORTED_DATASETS = []
|
152 |
+
for DATASET_CLS in DATASET_CLASSES:
|
153 |
+
SUPPORTED_DATASETS.extend(DATASET_CLS.supported_datasets())
|
154 |
+
|
155 |
+
|
156 |
+
def DATASET_TYPE(dataset, *, default: str = 'MCQ') -> str:
|
157 |
+
for cls in DATASET_CLASSES:
|
158 |
+
if dataset in cls.supported_datasets():
|
159 |
+
if hasattr(cls, 'TYPE'):
|
160 |
+
return cls.TYPE
|
161 |
+
# Have to add specific routine to handle ConcatDataset
|
162 |
+
if dataset in ConcatDataset.DATASET_SETS:
|
163 |
+
dataset_list = ConcatDataset.DATASET_SETS[dataset]
|
164 |
+
TYPES = [DATASET_TYPE(dname) for dname in dataset_list]
|
165 |
+
assert np.all([x == TYPES[0] for x in TYPES]), (dataset_list, TYPES)
|
166 |
+
return TYPES[0]
|
167 |
+
|
168 |
+
if 'openended' in dataset.lower():
|
169 |
+
return 'VQA'
|
170 |
+
warnings.warn(f'Dataset {dataset} is a custom one and not annotated as `openended`, will treat as {default}. ')
|
171 |
+
return default
|
172 |
+
|
173 |
+
|
174 |
+
def DATASET_MODALITY(dataset, *, default: str = 'IMAGE') -> str:
|
175 |
+
if dataset is None:
|
176 |
+
warnings.warn(f'Dataset is not specified, will treat modality as {default}. ')
|
177 |
+
return default
|
178 |
+
for cls in DATASET_CLASSES:
|
179 |
+
if dataset in cls.supported_datasets():
|
180 |
+
if hasattr(cls, 'MODALITY'):
|
181 |
+
return cls.MODALITY
|
182 |
+
# Have to add specific routine to handle ConcatDataset
|
183 |
+
if dataset in ConcatDataset.DATASET_SETS:
|
184 |
+
dataset_list = ConcatDataset.DATASET_SETS[dataset]
|
185 |
+
MODALITIES = [DATASET_MODALITY(dname) for dname in dataset_list]
|
186 |
+
assert np.all([x == MODALITIES[0] for x in MODALITIES]), (dataset_list, MODALITIES)
|
187 |
+
return MODALITIES[0]
|
188 |
+
|
189 |
+
if 'VIDEO' in dataset.lower():
|
190 |
+
return 'VIDEO'
|
191 |
+
elif 'IMAGE' in dataset.lower():
|
192 |
+
return 'IMAGE'
|
193 |
+
warnings.warn(f'Dataset {dataset} is a custom one, will treat modality as {default}. ')
|
194 |
+
return default
|
195 |
+
|
196 |
+
|
197 |
+
def build_dataset(dataset_name, **kwargs):
|
198 |
+
for cls in DATASET_CLASSES:
|
199 |
+
if dataset_name in cls.supported_datasets():
|
200 |
+
return cls(dataset=dataset_name, **kwargs)
|
201 |
+
|
202 |
+
warnings.warn(f'Dataset {dataset_name} is not officially supported. ')
|
203 |
+
|
204 |
+
data_file = osp.join(LMUDataRoot(), f'{dataset_name}.tsv')
|
205 |
+
if not osp.exists(data_file):
|
206 |
+
warnings.warn(f'Data file {data_file} does not exist. Dataset building failed. ')
|
207 |
+
return None
|
208 |
+
|
209 |
+
data = load(data_file)
|
210 |
+
if 'question' not in [x.lower() for x in data.columns]:
|
211 |
+
warnings.warn(f'Data file {data_file} does not have a `question` column. Dataset building failed. ')
|
212 |
+
return None
|
213 |
+
|
214 |
+
if 'A' in data and 'B' in data:
|
215 |
+
if 'image' in data or 'image_path' in data:
|
216 |
+
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom MCQ dataset. ')
|
217 |
+
return CustomMCQDataset(dataset=dataset_name, **kwargs)
|
218 |
+
else:
|
219 |
+
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom Text MCQ dataset. ')
|
220 |
+
return CustomTextMCQDataset(dataset=dataset_name, **kwargs)
|
221 |
+
else:
|
222 |
+
warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom VQA dataset. ')
|
223 |
+
return CustomVQADataset(dataset=dataset_name, **kwargs)
|
224 |
+
|
225 |
+
|
226 |
+
__all__ = [
|
227 |
+
'build_dataset', 'img_root_map', 'build_judge', 'extract_answer_from_item', 'prefetch_answer', 'DEBUG_MESSAGE'
|
228 |
+
] + [cls.__name__ for cls in DATASET_CLASSES]
|
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (8.18 kB). View file
|
|
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/dude.cpython-310.pyc
ADDED
Binary file (7.03 kB). View file
|
|
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/image_caption.cpython-310.pyc
ADDED
Binary file (3.12 kB). View file
|
|
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/image_caption.cpython-38.pyc
ADDED
Binary file (3.02 kB). View file
|
|
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/longvideobench.cpython-38.pyc
ADDED
Binary file (10.6 kB). View file
|
|
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/mmbench_video.cpython-310.pyc
ADDED
Binary file (10.2 kB). View file
|
|
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/slidevqa.cpython-310.pyc
ADDED
Binary file (6.76 kB). View file
|
|
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/slidevqa.cpython-38.pyc
ADDED
Binary file (6.88 kB). View file
|
|
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/text_base.cpython-38.pyc
ADDED
Binary file (3.46 kB). View file
|
|
vlmeval/VLMEvalKit_old/vlmeval/dataset/image_caption.py
ADDED
@@ -0,0 +1,89 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
from .image_base import ImageBaseDataset
|
2 |
+
from ..smp import *
|
3 |
+
|
4 |
+
MY_PROMPT = '''
|
5 |
+
Hãy mô tả chi tiết người bức ảnh. Hãy sử dụng tiếng Việt.
|
6 |
+
Hãy miêu tả về áo, quần, đầu/mặt, giày/dép, ba lô/túi xách, điện thoại, phương tiện di chuyển,...
|
7 |
+
'''
|
8 |
+
|
9 |
+
|
10 |
+
class COCO_Caption_Scorer:
|
11 |
+
def __init__(self, ref, gt):
|
12 |
+
from pycocoevalcap.bleu.bleu import Bleu
|
13 |
+
from pycocoevalcap.rouge.rouge import Rouge
|
14 |
+
from pycocoevalcap.cider.cider import Cider
|
15 |
+
|
16 |
+
self.ref = ref
|
17 |
+
self.gt = gt
|
18 |
+
print("setting up scorers...")
|
19 |
+
self.scorers = [
|
20 |
+
(Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
|
21 |
+
(Rouge(), "ROUGE_L"),
|
22 |
+
(Cider(), "CIDEr"),
|
23 |
+
]
|
24 |
+
|
25 |
+
def compute_scores(self):
|
26 |
+
total_scores = {}
|
27 |
+
for scorer, method in self.scorers:
|
28 |
+
print("computing %s score..." % (scorer.method()))
|
29 |
+
score, scores = scorer.compute_score(self.gt, self.ref)
|
30 |
+
if isinstance(method, list):
|
31 |
+
for sc, scs, m in zip(score, scores, method):
|
32 |
+
print("%s: %0.3f" % (m, sc * 100))
|
33 |
+
total_scores["Bleu"] = [x * 100 for x in score]
|
34 |
+
else:
|
35 |
+
print("%s: %0.3f" % (method, score * 100))
|
36 |
+
total_scores[method] = score * 100
|
37 |
+
|
38 |
+
print("*****DONE*****")
|
39 |
+
for key, value in total_scores.items():
|
40 |
+
print("{}:{}".format(key, value))
|
41 |
+
return total_scores
|
42 |
+
|
43 |
+
|
44 |
+
class ImageCaptionDataset(ImageBaseDataset):
|
45 |
+
|
46 |
+
TYPE = "Caption"
|
47 |
+
|
48 |
+
DATASET_URL = {
|
49 |
+
"COCO_VAL": "https://opencompass.openxlab.space/utils/VLMEval/COCO_VAL.tsv",
|
50 |
+
}
|
51 |
+
|
52 |
+
DATASET_MD5 = {
|
53 |
+
"COCO_VAL": "72a5079dead060269ac222c5aa5128af",
|
54 |
+
}
|
55 |
+
|
56 |
+
def load_data(self, dataset):
|
57 |
+
global MY_PROMPT
|
58 |
+
data = super().load_data(dataset)
|
59 |
+
if "question" not in data:
|
60 |
+
data["question"] = [MY_PROMPT] * len(data)
|
61 |
+
return data
|
62 |
+
|
63 |
+
# def load_data(self, dataset):
|
64 |
+
# data = super().load_data(dataset)
|
65 |
+
# if "question" not in data:
|
66 |
+
# data["question"] = [
|
67 |
+
# (
|
68 |
+
# "Please describe this image in general. Directly provide the description, "
|
69 |
+
# 'do not include prefix like "This image depicts". '
|
70 |
+
# )
|
71 |
+
# ] * len(data)
|
72 |
+
# return data
|
73 |
+
|
74 |
+
# It returns a dictionary of scores
|
75 |
+
@classmethod
|
76 |
+
def evaluate(self, eval_file, **kwargs):
|
77 |
+
data = load(eval_file)
|
78 |
+
lt = len(data)
|
79 |
+
lines = [data.iloc[i] for i in range(lt)]
|
80 |
+
ref, gt = {}, {}
|
81 |
+
for i, line in enumerate(lines):
|
82 |
+
ref[str(i)] = [str(line["prediction"])]
|
83 |
+
gt[str(i)] = eval(line["answer"])
|
84 |
+
|
85 |
+
scorer = COCO_Caption_Scorer(ref, gt)
|
86 |
+
coco_caption_score_dict = scorer.compute_scores()
|
87 |
+
score_pth = eval_file.replace(".xlsx", "_score.json")
|
88 |
+
dump(coco_caption_score_dict, score_pth)
|
89 |
+
return coco_caption_score_dict
|
vlmeval/VLMEvalKit_old/vlmeval/dataset/image_vqa.py
ADDED
@@ -0,0 +1,1333 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import os
|
2 |
+
import re
|
3 |
+
import tempfile
|
4 |
+
from functools import partial
|
5 |
+
from jinja2.sandbox import SandboxedEnvironment
|
6 |
+
from jinja2 import Template
|
7 |
+
|
8 |
+
import pandas as pd
|
9 |
+
|
10 |
+
from .image_base import ImageBaseDataset
|
11 |
+
from .utils import build_judge, DEBUG_MESSAGE
|
12 |
+
from ..smp import *
|
13 |
+
from ..utils import track_progress_rich
|
14 |
+
import ipdb
|
15 |
+
|
16 |
+
|
17 |
+
class ImageVQADataset(ImageBaseDataset):
|
18 |
+
TYPE = 'VQA'
|
19 |
+
|
20 |
+
DATASET_URL = {
|
21 |
+
'OCRVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TEST.tsv',
|
22 |
+
'OCRVQA_TESTCORE': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TESTCORE.tsv',
|
23 |
+
'TextVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/TextVQA_VAL.tsv',
|
24 |
+
'DocVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv',
|
25 |
+
'DocVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_TEST.tsv',
|
26 |
+
'InfoVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_VAL.tsv',
|
27 |
+
'InfoVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_TEST.tsv',
|
28 |
+
'ChartQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ChartQA_TEST.tsv',
|
29 |
+
'GQA_TestDev_Balanced': 'https://opencompass.openxlab.space/utils/VLMEval/GQA_TestDev_Balanced.tsv',
|
30 |
+
}
|
31 |
+
|
32 |
+
DATASET_MD5 = {
|
33 |
+
'OCRVQA_TEST': 'ca46a6d74b403e9d6c0b670f6fc00db9',
|
34 |
+
'OCRVQA_TESTCORE': 'c5239fe77db8bdc1f2ad8e55e0d1fe97',
|
35 |
+
'TextVQA_VAL': 'b233b31f551bbf4056f2f955da3a92cd',
|
36 |
+
'DocVQA_VAL': 'd5ee77e1926ff10690d469c56b73eabf',
|
37 |
+
'DocVQA_TEST': '6a2f28cac26ef2d3447374e8c6f6c8e9',
|
38 |
+
'InfoVQA_VAL': '2342e9c225222f0ef4dec545ebb126fe',
|
39 |
+
'InfoVQA_TEST': 'df535bf51b88dc9718252c34131a6227',
|
40 |
+
'ChartQA_TEST': 'c902e0aa9be5582a7aad6dcf52734b42',
|
41 |
+
'GQA_TestDev_Balanced': 'fead7df22befc1ed3ca2b62ea26fa17b',
|
42 |
+
}
|
43 |
+
|
44 |
+
def build_prompt(self, line):
|
45 |
+
msgs = super().build_prompt(line)
|
46 |
+
assert msgs[-1]['type'] == 'text'
|
47 |
+
msgs[-1]['value'] += '\nAnswer the question using a single word or phrase.'
|
48 |
+
return msgs
|
49 |
+
|
50 |
+
# It returns a DataFrame
|
51 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
52 |
+
from .utils.vqa_eval import hit_calculate, process_line
|
53 |
+
|
54 |
+
data = load(eval_file)
|
55 |
+
dataset = self.dataset_name
|
56 |
+
assert 'answer' in data and 'prediction' in data
|
57 |
+
data['prediction'] = [str(x) for x in data['prediction']]
|
58 |
+
data['answer'] = [str(x) for x in data['answer']]
|
59 |
+
lt = len(data)
|
60 |
+
pool = mp.Pool(16)
|
61 |
+
lines = [data.iloc[i] for i in range(lt)]
|
62 |
+
if listinstr(['TextVQA'], dataset):
|
63 |
+
res = pool.map(partial(process_line, method='vqa_score'), lines)
|
64 |
+
elif listinstr(['ChartQA'], dataset):
|
65 |
+
res = pool.map(partial(process_line, method='relaxed_accuracy'), lines)
|
66 |
+
elif listinstr(['OCRVQA', 'GQA'], dataset):
|
67 |
+
res = pool.map(partial(process_line, method='accuracy'), lines)
|
68 |
+
elif listinstr(['DocVQA', 'InfoVQA'], dataset):
|
69 |
+
res = pool.map(partial(process_line, method='anls'), lines)
|
70 |
+
else: # default using vqa_score to calculate score
|
71 |
+
res = pool.map(process_line, lines)
|
72 |
+
hit = hit_calculate(res, dataset)
|
73 |
+
ret = dict()
|
74 |
+
if 'split' in data:
|
75 |
+
splits = set(data['split'])
|
76 |
+
for sp in splits:
|
77 |
+
sub = [r for l, r in zip(lines, res) if l['split'] == sp]
|
78 |
+
# [np.mean(x['match']) >= full_score_weight for x in sub]
|
79 |
+
hit = hit_calculate(sub, dataset)
|
80 |
+
ret[sp] = np.mean(hit) * 100
|
81 |
+
sub = [r for l, r in zip(lines, res)]
|
82 |
+
hit = hit_calculate(sub, dataset)
|
83 |
+
ret['Overall'] = np.mean(hit) * 100
|
84 |
+
else:
|
85 |
+
ret['Overall'] = np.mean(hit) * 100
|
86 |
+
if 'category' in data:
|
87 |
+
cates = list(set(data['category']))
|
88 |
+
cates.sort()
|
89 |
+
for c in cates:
|
90 |
+
sub = [r for l, r in zip(lines, res) if l['category'] == c]
|
91 |
+
# [np.mean(x['match']) >= full_score_weight for x in sub]
|
92 |
+
hit = hit_calculate(sub, dataset)
|
93 |
+
ret[c] = np.mean(hit) * 100
|
94 |
+
ret = d2df(ret)
|
95 |
+
ret.round(2)
|
96 |
+
|
97 |
+
suffix = eval_file.split('.')[-1]
|
98 |
+
result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
|
99 |
+
dump(ret, result_file)
|
100 |
+
return ret
|
101 |
+
|
102 |
+
|
103 |
+
class VizWiz(ImageBaseDataset):
|
104 |
+
TYPE = 'VQA'
|
105 |
+
DATASET_URL = {
|
106 |
+
'VizWiz': 'https://opencompass.openxlab.space/utils/VLMEval/VizWiz.tsv'
|
107 |
+
}
|
108 |
+
DATASET_MD5 = {
|
109 |
+
'VizWiz': 'fa4ac4164467563ed2fac6eac6631bd0'
|
110 |
+
}
|
111 |
+
|
112 |
+
@classmethod
|
113 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
114 |
+
from .utils.vqa_eval import hit_calculate, process_line
|
115 |
+
|
116 |
+
suffix = eval_file.split('.')[-1]
|
117 |
+
result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
|
118 |
+
|
119 |
+
if not osp.exists(result_file):
|
120 |
+
data = load(eval_file)
|
121 |
+
assert 'answers' in data and 'prediction' in data
|
122 |
+
data['prediction'] = [str(x) for x in data['prediction']]
|
123 |
+
data['answer'] = [str(x) for x in data['answers']]
|
124 |
+
|
125 |
+
lt = len(data)
|
126 |
+
pool = mp.Pool(16)
|
127 |
+
lines = [data.iloc[i] for i in range(lt)]
|
128 |
+
res = pool.map(process_line, lines)
|
129 |
+
|
130 |
+
hit = hit_calculate(res, 'VizWiz')
|
131 |
+
ret = dict()
|
132 |
+
|
133 |
+
ret['Overall'] = np.mean(hit) * 100
|
134 |
+
ret = d2df(ret)
|
135 |
+
ret.round(2)
|
136 |
+
|
137 |
+
dump(ret, result_file)
|
138 |
+
|
139 |
+
retz = pd.read_csv(result_file)
|
140 |
+
return retz
|
141 |
+
|
142 |
+
|
143 |
+
class OCRBench(ImageBaseDataset):
|
144 |
+
TYPE = 'VQA'
|
145 |
+
DATASET_URL = {
|
146 |
+
'OCRBench': 'https://opencompass.openxlab.space/utils/VLMEval/OCRBench.tsv'
|
147 |
+
}
|
148 |
+
DATASET_MD5 = {'OCRBench': 'e953d98a987cc6e26ef717b61260b778'}
|
149 |
+
|
150 |
+
# It returns a dictionary
|
151 |
+
@classmethod
|
152 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
153 |
+
OCRBench_score = {
|
154 |
+
'Regular Text Recognition': 0,
|
155 |
+
'Irregular Text Recognition': 0,
|
156 |
+
'Artistic Text Recognition': 0,
|
157 |
+
'Handwriting Recognition': 0,
|
158 |
+
'Digit String Recognition': 0,
|
159 |
+
'Non-Semantic Text Recognition': 0,
|
160 |
+
'Scene Text-centric VQA': 0,
|
161 |
+
'Doc-oriented VQA': 0,
|
162 |
+
'Key Information Extraction': 0,
|
163 |
+
'Handwritten Mathematical Expression Recognition': 0,
|
164 |
+
}
|
165 |
+
|
166 |
+
data = load(eval_file)
|
167 |
+
lt = len(data)
|
168 |
+
lines = [data.iloc[i] for i in range(lt)]
|
169 |
+
for i in tqdm(range(len(lines))):
|
170 |
+
line = lines[i]
|
171 |
+
predict = str(line['prediction'])
|
172 |
+
answers = eval(line['answer'])
|
173 |
+
category = line['category']
|
174 |
+
if category == 'Handwritten Mathematical Expression Recognition':
|
175 |
+
for j in range(len(answers)):
|
176 |
+
answer = answers[j].strip().replace('\n', ' ').replace(' ', '')
|
177 |
+
predict = predict.strip().replace('\n', ' ').replace(' ', '')
|
178 |
+
if answer in predict:
|
179 |
+
OCRBench_score[category] += 1
|
180 |
+
break
|
181 |
+
else:
|
182 |
+
for j in range(len(answers)):
|
183 |
+
answer = answers[j].lower().strip().replace('\n', ' ')
|
184 |
+
predict = predict.lower().strip().replace('\n', ' ')
|
185 |
+
if answer in predict:
|
186 |
+
OCRBench_score[category] += 1
|
187 |
+
break
|
188 |
+
|
189 |
+
final_score_dict = {}
|
190 |
+
final_score_dict['Text Recognition'] = \
|
191 |
+
(OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition']
|
192 |
+
+ OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition']
|
193 |
+
+ OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'])
|
194 |
+
final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA']
|
195 |
+
final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA']
|
196 |
+
final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction']
|
197 |
+
final_score_dict['Handwritten Mathematical Expression Recognition'] = \
|
198 |
+
(OCRBench_score['Handwritten Mathematical Expression Recognition'])
|
199 |
+
final_score_dict['Final Score'] = \
|
200 |
+
(final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA']
|
201 |
+
+ final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction']
|
202 |
+
+ final_score_dict['Handwritten Mathematical Expression Recognition'])
|
203 |
+
final_score_dict['Final Score Norm'] = (float(final_score_dict['Final Score']) / 10)
|
204 |
+
score_pth = eval_file.replace('.xlsx', '_score.json')
|
205 |
+
dump(final_score_dict, score_pth)
|
206 |
+
return final_score_dict
|
207 |
+
|
208 |
+
|
209 |
+
class MathVista(ImageBaseDataset):
|
210 |
+
TYPE = 'VQA'
|
211 |
+
DATASET_URL = {
|
212 |
+
'MathVista_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVista_MINI.tsv'
|
213 |
+
}
|
214 |
+
DATASET_MD5 = {'MathVista_MINI': 'f199b98e178e5a2a20e7048f5dcb0464'}
|
215 |
+
|
216 |
+
# It returns a DataFrame
|
217 |
+
@classmethod
|
218 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
219 |
+
from .utils.mathvista import MathVista_auxeval, MathVista_acc
|
220 |
+
|
221 |
+
model = judge_kwargs['model']
|
222 |
+
suffix = eval_file.split('.')[-1]
|
223 |
+
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
|
224 |
+
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
|
225 |
+
nproc = judge_kwargs.pop('nproc', 4)
|
226 |
+
|
227 |
+
if not osp.exists(storage):
|
228 |
+
data = load(eval_file)
|
229 |
+
model = build_judge(max_tokens=128, **judge_kwargs)
|
230 |
+
assert model.working(), ('MathVista evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
|
231 |
+
lt = len(data)
|
232 |
+
lines = [data.iloc[i] for i in range(lt)]
|
233 |
+
tups = [(model, line) for line in lines]
|
234 |
+
indices = [line['index'] for line in lines]
|
235 |
+
|
236 |
+
ans = {}
|
237 |
+
if osp.exists(tmp_file):
|
238 |
+
ans = load(tmp_file)
|
239 |
+
tups = [x for x, i in zip(tups, indices) if i not in ans]
|
240 |
+
indices = [i for i in indices if i not in ans]
|
241 |
+
|
242 |
+
if len(indices):
|
243 |
+
new_results = track_progress_rich(
|
244 |
+
MathVista_auxeval,
|
245 |
+
tups,
|
246 |
+
nproc=nproc,
|
247 |
+
chunksize=nproc,
|
248 |
+
keys=indices,
|
249 |
+
save=tmp_file,
|
250 |
+
)
|
251 |
+
ans = load(tmp_file)
|
252 |
+
for k, v in zip(indices, new_results):
|
253 |
+
assert k in ans
|
254 |
+
assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
|
255 |
+
|
256 |
+
data['res'] = [ans[idx]['res'] for idx in data['index']]
|
257 |
+
data['log'] = [ans[idx]['log'] for idx in data['index']]
|
258 |
+
dump(data, storage)
|
259 |
+
|
260 |
+
score = MathVista_acc(storage)
|
261 |
+
score_pth = storage.replace('.xlsx', '_score.csv')
|
262 |
+
dump(score, score_pth)
|
263 |
+
return score
|
264 |
+
|
265 |
+
|
266 |
+
class MathVerse(ImageBaseDataset):
|
267 |
+
TYPE = 'VQA'
|
268 |
+
DATASET_URL = {
|
269 |
+
'MathVerse_MINI': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIV.tsv', # noqa
|
270 |
+
'MathVerse_MINI_Vision_Only': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVOnly.tsv', # noqa
|
271 |
+
'MathVerse_MINI_Vision_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVDom.tsv', # noqa
|
272 |
+
'MathVerse_MINI_Vision_Intensive': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVInt.tsv', # noqa
|
273 |
+
'MathVerse_MINI_Text_Lite': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITLite.tsv', # noqa
|
274 |
+
'MathVerse_MINI_Text_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITDom.tsv', # noqa
|
275 |
+
}
|
276 |
+
DATASET_MD5 = {
|
277 |
+
'MathVerse_MINI': '5017caca32b7fa110c350a1bea861b65',
|
278 |
+
'MathVerse_MINI_Vision_Only': '68a11d4680014ac881fa37adeadea3a4',
|
279 |
+
'MathVerse_MINI_Vision_Dominant': 'b8fb63852d261ab2aaefba29cc2414d3',
|
280 |
+
'MathVerse_MINI_Vision_Intensive': '01cbd35be202bb0c4873a4186a63bc19',
|
281 |
+
'MathVerse_MINI_Text_Lite': '19e4b13bdd30b89a03b2e358bcfefa04',
|
282 |
+
'MathVerse_MINI_Text_Dominant': '4f5cd2fa6630ea00bb11d6fde1f6fe6a',
|
283 |
+
}
|
284 |
+
|
285 |
+
# It returns a DataFrame
|
286 |
+
@classmethod
|
287 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
288 |
+
from .utils.mathverse import MathVerse_auxeval_extract, MathVerse_auxeval_score, MathVerse_acc
|
289 |
+
|
290 |
+
model = judge_kwargs['model']
|
291 |
+
suffix = eval_file.split('.')[-1]
|
292 |
+
storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx')
|
293 |
+
tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl')
|
294 |
+
storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
|
295 |
+
tmp_file_score = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl')
|
296 |
+
nproc = judge_kwargs.pop('nproc', 4)
|
297 |
+
# stage1: extract the answer
|
298 |
+
if not osp.exists(storage_extract):
|
299 |
+
data = load(eval_file)
|
300 |
+
model = build_judge(max_tokens=128, **judge_kwargs)
|
301 |
+
assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
|
302 |
+
lt = len(data)
|
303 |
+
lines = [data.iloc[i] for i in range(lt)]
|
304 |
+
tups = [(model, line) for line in lines]
|
305 |
+
indices = [line['index'] for line in lines]
|
306 |
+
|
307 |
+
ans = {}
|
308 |
+
if osp.exists(tmp_file_extract):
|
309 |
+
ans = load(tmp_file_extract)
|
310 |
+
tups = [x for x, i in zip(tups, indices) if i not in ans]
|
311 |
+
indices = [i for i in indices if i not in ans]
|
312 |
+
|
313 |
+
if len(indices):
|
314 |
+
new_results = track_progress_rich(
|
315 |
+
MathVerse_auxeval_extract,
|
316 |
+
tups,
|
317 |
+
nproc=nproc,
|
318 |
+
chunksize=nproc,
|
319 |
+
keys=indices,
|
320 |
+
save=tmp_file_extract,
|
321 |
+
)
|
322 |
+
ans = load(tmp_file_extract)
|
323 |
+
for k, v in zip(indices, new_results):
|
324 |
+
assert k in ans
|
325 |
+
assert ans[k]['log_extract'] == v['log_extract'] and ans[k]['extract'] == v['extract']
|
326 |
+
|
327 |
+
data['extract'] = [ans[idx]['extract'] for idx in data['index']]
|
328 |
+
data['log_extract'] = [ans[idx]['log_extract'] for idx in data['index']]
|
329 |
+
dump(data, storage_extract)
|
330 |
+
|
331 |
+
# stage2: score the answer
|
332 |
+
if not osp.exists(storage_score):
|
333 |
+
data = load(storage_extract)
|
334 |
+
model = build_judge(max_tokens=128, **judge_kwargs)
|
335 |
+
assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
|
336 |
+
lt = len(data)
|
337 |
+
lines = [data.iloc[i] for i in range(lt)]
|
338 |
+
tups = [(model, line) for line in lines]
|
339 |
+
indices = [line['index'] for line in lines]
|
340 |
+
|
341 |
+
ans = {}
|
342 |
+
if osp.exists(tmp_file_score):
|
343 |
+
ans = load(tmp_file_score)
|
344 |
+
tups = [x for x, i in zip(tups, indices) if i not in ans]
|
345 |
+
indices = [i for i in indices if i not in ans]
|
346 |
+
|
347 |
+
if len(indices):
|
348 |
+
new_results = track_progress_rich(
|
349 |
+
MathVerse_auxeval_score,
|
350 |
+
tups,
|
351 |
+
nproc=nproc,
|
352 |
+
chunksize=nproc,
|
353 |
+
keys=indices,
|
354 |
+
save=tmp_file_score,
|
355 |
+
)
|
356 |
+
ans = load(tmp_file_score)
|
357 |
+
for k, v in zip(indices, new_results):
|
358 |
+
assert k in ans
|
359 |
+
assert ans[k]['log_score'] == v['log_score'] and ans[k]['score'] == v['score']
|
360 |
+
|
361 |
+
data['score'] = [ans[idx]['score'] for idx in data['index']]
|
362 |
+
data['log_score'] = [ans[idx]['log_score'] for idx in data['index']]
|
363 |
+
dump(data, storage_score)
|
364 |
+
|
365 |
+
score = MathVerse_acc(storage_score)
|
366 |
+
score_pth = storage_score.replace('.xlsx', '.csv')
|
367 |
+
dump(score, score_pth)
|
368 |
+
return score
|
369 |
+
|
370 |
+
|
371 |
+
class MathVision(ImageBaseDataset):
|
372 |
+
TYPE = 'VQA'
|
373 |
+
DATASET_URL = {
|
374 |
+
'MathVision': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv',
|
375 |
+
'MathVision_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision_MINI.tsv'
|
376 |
+
}
|
377 |
+
DATASET_MD5 = {
|
378 |
+
'MathVision': '93f6de14f7916e598aa1b7165589831e',
|
379 |
+
'MathVision_MINI': '060fe4fa5d868987ce179307bd5f8a33'
|
380 |
+
}
|
381 |
+
|
382 |
+
# It returns a DataFrame
|
383 |
+
@classmethod
|
384 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
385 |
+
from .utils.mathv import MATH_V_auxeval, MATH_V_acc
|
386 |
+
|
387 |
+
if 'model' in judge_kwargs:
|
388 |
+
model = judge_kwargs['model']
|
389 |
+
else:
|
390 |
+
model = os.path.basename(os.environ.get('LOCAL_LLM'))
|
391 |
+
suffix = eval_file.split('.')[-1]
|
392 |
+
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
|
393 |
+
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
|
394 |
+
nproc = judge_kwargs.pop('nproc', 4)
|
395 |
+
|
396 |
+
if not osp.exists(storage):
|
397 |
+
data = load(eval_file)
|
398 |
+
model = build_judge(max_tokens=128, **judge_kwargs)
|
399 |
+
assert model.working(), ('MATH-Vision evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
|
400 |
+
lt = len(data)
|
401 |
+
lines = [data.iloc[i] for i in range(lt)]
|
402 |
+
tups = [(model, line) for line in lines]
|
403 |
+
indices = [line['index'] for line in lines]
|
404 |
+
|
405 |
+
ans = {}
|
406 |
+
if osp.exists(tmp_file):
|
407 |
+
ans = load(tmp_file)
|
408 |
+
tups = [x for x, i in zip(tups, indices) if i not in ans]
|
409 |
+
indices = [i for i in indices if i not in ans]
|
410 |
+
|
411 |
+
if len(indices):
|
412 |
+
new_results = track_progress_rich(
|
413 |
+
MATH_V_auxeval,
|
414 |
+
tups,
|
415 |
+
nproc=nproc,
|
416 |
+
chunksize=nproc,
|
417 |
+
keys=indices,
|
418 |
+
save=tmp_file,
|
419 |
+
)
|
420 |
+
ans = load(tmp_file)
|
421 |
+
for k, v in zip(indices, new_results):
|
422 |
+
assert k in ans
|
423 |
+
assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
|
424 |
+
|
425 |
+
data['res'] = [ans[idx]['res'] for idx in data['index']]
|
426 |
+
data['log'] = [ans[idx]['log'] for idx in data['index']]
|
427 |
+
dump(data, storage)
|
428 |
+
|
429 |
+
score = MATH_V_acc(storage)
|
430 |
+
score_pth = storage.replace('.xlsx', '_score.csv')
|
431 |
+
dump(score, score_pth)
|
432 |
+
return score
|
433 |
+
|
434 |
+
|
435 |
+
class OlympiadBench(ImageBaseDataset):
|
436 |
+
TYPE = 'VQA_ex_prompt'
|
437 |
+
DATASET_URL = {
|
438 |
+
'OlympiadBench': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench.tsv',
|
439 |
+
'OlympiadBench_EN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_EN.tsv',
|
440 |
+
'OlympiadBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_CN.tsv'
|
441 |
+
}
|
442 |
+
DATASET_MD5 = {
|
443 |
+
'OlympiadBench': '9735ae0f0299eae1e7d07f5a7feab914',
|
444 |
+
'OlympiadBench_EN': '5c68e100d394351fc7049f29d4d4efed',
|
445 |
+
'OlympiadBench_CN': 'ea01b16788955702c79650c701e5b623'
|
446 |
+
}
|
447 |
+
|
448 |
+
def dump_image(self, line):
|
449 |
+
os.makedirs(self.img_root, exist_ok=True)
|
450 |
+
|
451 |
+
tgt_path_z = []
|
452 |
+
if isinstance(line['image'], list):
|
453 |
+
for i in range(len(line['image'])):
|
454 |
+
tgt_path = osp.join(self.img_root, f"{line['index']}--{i+1}.jpg")
|
455 |
+
if not read_ok(tgt_path):
|
456 |
+
decode_base64_to_image_file(line['image'][i], tgt_path)
|
457 |
+
tgt_path_z.append(tgt_path)
|
458 |
+
else:
|
459 |
+
tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
|
460 |
+
if not read_ok(tgt_path):
|
461 |
+
decode_base64_to_image_file(line['image'], tgt_path)
|
462 |
+
tgt_path_z.append(tgt_path)
|
463 |
+
return tgt_path_z
|
464 |
+
|
465 |
+
def build_prompt(self, line):
|
466 |
+
|
467 |
+
from .utils.olympiadbench import get_answer_type_text, make_input
|
468 |
+
|
469 |
+
self.is_chinese = 'zh' in line['source']
|
470 |
+
self.is_math = 'maths' in line['source']
|
471 |
+
self.is_theorem_proving = 'TP' in line['source']
|
472 |
+
|
473 |
+
if self.is_chinese:
|
474 |
+
subject_content = '数学' if self.is_math else '物理'
|
475 |
+
if self.is_theorem_proving:
|
476 |
+
prompt = (
|
477 |
+
f"以下是中国{subject_content}竞赛中的证明题。请根据题目的要求,运用逻辑推理及常用定理证明题目中的命题。"
|
478 |
+
"证明过程中使用的变量和公式请使用LaTeX格式表示。"
|
479 |
+
)
|
480 |
+
else:
|
481 |
+
answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=True,
|
482 |
+
multiple_answer=line['is_multiple_answer'])
|
483 |
+
if line['is_multiple_answer']:
|
484 |
+
multiple_answer_text = '\\boxed{用英文逗号连接的多个答案}'
|
485 |
+
else:
|
486 |
+
multiple_answer_text = '\\boxed{答案}'
|
487 |
+
unit_text = ''
|
488 |
+
if line['unit']:
|
489 |
+
multiple_answer_text += '(单位)'
|
490 |
+
unit_text = ',注意答案的单位不要放在\\boxed{}中'
|
491 |
+
prompt = (
|
492 |
+
f'以下是中国{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。'
|
493 |
+
f'解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以“所以最终答案是{multiple_answer_text}。”'
|
494 |
+
f'显式给出结果{unit_text}。'
|
495 |
+
)
|
496 |
+
else:
|
497 |
+
subject_content = 'Math' if self.is_math else 'Physics'
|
498 |
+
if self.is_theorem_proving:
|
499 |
+
prompt = (
|
500 |
+
f'The following is a theorem proving problem from an International {subject_content} competition. '
|
501 |
+
'Please use logical reasoning and common theorems to prove the proposition in the problem '
|
502 |
+
'according to the given requirements. '
|
503 |
+
'Please use LaTeX format to represent the variables and formulas used in the proof.'
|
504 |
+
)
|
505 |
+
else:
|
506 |
+
if line['is_multiple_answer']:
|
507 |
+
multiple_answer_text = '\\boxed{multiple answers connected with commas}'
|
508 |
+
else:
|
509 |
+
multiple_answer_text = '\\boxed{answer}'
|
510 |
+
unit_text = ''
|
511 |
+
if line['unit']:
|
512 |
+
multiple_answer_text += '(unit)'
|
513 |
+
unit_text = ', note that the unit of the answer should not be included in \\boxed{}'
|
514 |
+
answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=False,
|
515 |
+
multiple_answer=line['is_multiple_answer'])
|
516 |
+
prompt = (
|
517 |
+
f'The following is an open-ended problem from an International {subject_content} competition. '
|
518 |
+
f'{answer_type_text}Please calculate the answer according to the given requirements and '
|
519 |
+
'the information provided. Please use LaTeX format to represent the variables and formulas '
|
520 |
+
'used in the solution process and results. Please end your solution with "So the final answer '
|
521 |
+
f'is {multiple_answer_text}." and give the result explicitly{unit_text}.'
|
522 |
+
)
|
523 |
+
|
524 |
+
if self.is_math:
|
525 |
+
input = make_input(prompt, line['question'])
|
526 |
+
else:
|
527 |
+
if 'context' in line.keys() and str(line['context']) != 'nan': # cannot be null
|
528 |
+
input = make_input(prompt, line['context'] + '\n' + line['question'])
|
529 |
+
else:
|
530 |
+
input = make_input(prompt, line['question'])
|
531 |
+
|
532 |
+
ret = [dict(type='text', value=input)]
|
533 |
+
tgt_path = self.dump_image(line)
|
534 |
+
|
535 |
+
ret.extend([dict(type='image', value=s) for s in tgt_path])
|
536 |
+
|
537 |
+
return ret
|
538 |
+
|
539 |
+
@classmethod
|
540 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
541 |
+
from .utils.olympiadbench import MathJudger, extract_answer
|
542 |
+
judger = MathJudger()
|
543 |
+
|
544 |
+
suffix = eval_file.split('.')[-1]
|
545 |
+
name_str1 = 'judge'
|
546 |
+
name_str2 = 'score'
|
547 |
+
result_file = eval_file.replace(f'.{suffix}', f'_{name_str1}_result.xlsx')
|
548 |
+
score_file = eval_file.replace(f'.{suffix}', f'_{name_str2}_result.csv')
|
549 |
+
|
550 |
+
if not osp.exists(result_file):
|
551 |
+
data = load(eval_file)
|
552 |
+
scorez = []
|
553 |
+
|
554 |
+
for i in tqdm(data.iterrows()):
|
555 |
+
line = i[1]
|
556 |
+
model_answer = line['prediction']
|
557 |
+
is_chinese = 'zh' in line['source']
|
558 |
+
model_answer = extract_answer(is_chinese, model_answer, is_deepseek=False)
|
559 |
+
answer_type = line['answer_type']
|
560 |
+
|
561 |
+
final_answer = line['final_answer'][2:-2]
|
562 |
+
|
563 |
+
if str(answer_type) != 'nan' and 'Tuple' in answer_type:
|
564 |
+
judge_result = judger.judge(model_answer, final_answer)
|
565 |
+
else:
|
566 |
+
if str(line['error']) != 'nan':
|
567 |
+
if ',' in line['error']:
|
568 |
+
precisions = line['error'].split(',')
|
569 |
+
precisions = [float(p) if p else 1e-8 for p in precisions]
|
570 |
+
judge_result = judger.judge(model_answer, final_answer, precisions)
|
571 |
+
else:
|
572 |
+
precision = float(line['error'])
|
573 |
+
judge_result = judger.judge(model_answer, final_answer, precision)
|
574 |
+
else:
|
575 |
+
judge_result = judger.judge(model_answer, final_answer)
|
576 |
+
scorez.append(judge_result)
|
577 |
+
|
578 |
+
data['score'] = scorez
|
579 |
+
dump(data, result_file)
|
580 |
+
|
581 |
+
judge_file = load(result_file)
|
582 |
+
|
583 |
+
if not osp.exists(score_file):
|
584 |
+
name_list = ['OE_MM_maths_en_COMP', 'OE_MM_maths_zh_CEE', 'OE_MM_maths_zh_COMP', 'OE_MM_physics_en_COMP',
|
585 |
+
'OE_MM_physics_zh_CEE','OE_TO_maths_en_COMP', 'OE_TO_maths_zh_CEE', 'OE_TO_maths_zh_COMP',
|
586 |
+
'OE_TO_physics_en_COMP', 'OE_TO_physics_zh_CEE']
|
587 |
+
|
588 |
+
sample_list = [[] for _ in range(len(name_list))]
|
589 |
+
for i in judge_file.iterrows():
|
590 |
+
line = i[1]
|
591 |
+
for j in range(len(name_list)):
|
592 |
+
if line['source'] == name_list[j]:
|
593 |
+
sample_list[j].append(line['score'])
|
594 |
+
|
595 |
+
acc_dict = {}
|
596 |
+
correct_list = []
|
597 |
+
|
598 |
+
# fine-grained
|
599 |
+
for i in range(len(name_list)):
|
600 |
+
correct_num = 0
|
601 |
+
for j in sample_list[i]:
|
602 |
+
if j:
|
603 |
+
correct_num += 1
|
604 |
+
correct_list.append(correct_num)
|
605 |
+
acc = 100 * correct_num / len(sample_list[i])
|
606 |
+
acc_dict[name_list[i]] = [acc]
|
607 |
+
|
608 |
+
# 4 grained
|
609 |
+
labela = ['zh', 'en']
|
610 |
+
labelb = ['maths', 'physics']
|
611 |
+
|
612 |
+
grain_list = [[x,y] for x in labela for y in labelb]
|
613 |
+
for j in grain_list:
|
614 |
+
dict_name = j[0] + "_" + j[1]
|
615 |
+
correct_num = 0
|
616 |
+
full_num = 0
|
617 |
+
for i in range(len(name_list)):
|
618 |
+
if all(k in name_list[i] for k in j):
|
619 |
+
correct_num += correct_list[i]
|
620 |
+
full_num += len(sample_list[i])
|
621 |
+
acc = 100 * correct_num / full_num
|
622 |
+
acc_dict[dict_name] = [acc]
|
623 |
+
|
624 |
+
# 2 grained
|
625 |
+
grain_list = ['maths', 'physics']
|
626 |
+
for j in grain_list:
|
627 |
+
dict_name = j
|
628 |
+
correct_num = 0
|
629 |
+
full_num = 0
|
630 |
+
for i in range(len(name_list)):
|
631 |
+
if j in name_list[i]:
|
632 |
+
correct_num += correct_list[i]
|
633 |
+
full_num += len(sample_list[i])
|
634 |
+
acc = 100 * correct_num / full_num
|
635 |
+
acc_dict[dict_name] = [acc]
|
636 |
+
|
637 |
+
# AVG
|
638 |
+
correct_num = sum(correct_list)
|
639 |
+
acc = 100 * correct_num / len(judge_file)
|
640 |
+
acc_dict['AVG'] = [acc]
|
641 |
+
|
642 |
+
acc_pd = pd.DataFrame(acc_dict)
|
643 |
+
acc_pd.to_csv(score_file, index=False, encoding='gbk')
|
644 |
+
|
645 |
+
accdz = pd.read_csv(score_file)
|
646 |
+
return accdz
|
647 |
+
|
648 |
+
|
649 |
+
class LLaVABench(ImageBaseDataset):
|
650 |
+
TYPE = 'VQA'
|
651 |
+
DATASET_URL = {'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv'}
|
652 |
+
DATASET_MD5 = {'LLaVABench': 'd382a093f749a697820d3dadd61c8428'}
|
653 |
+
|
654 |
+
# It returns a DataFrame
|
655 |
+
@classmethod
|
656 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
657 |
+
from .utils.llavabench import (
|
658 |
+
build_prompt,
|
659 |
+
LLaVABench_atomeval,
|
660 |
+
LLaVABench_score,
|
661 |
+
)
|
662 |
+
|
663 |
+
suffix = '.' + eval_file.split('.')[-1]
|
664 |
+
record_file = eval_file.replace(suffix, '_openai_result' + suffix)
|
665 |
+
score_file = eval_file.replace(suffix, '_score.csv')
|
666 |
+
nproc = judge_kwargs.pop('nproc', 4)
|
667 |
+
system_prompt = 'You are a helpful and precise assistant for checking the quality of the answer.'
|
668 |
+
|
669 |
+
if not osp.exists(record_file):
|
670 |
+
data = load(eval_file)
|
671 |
+
lines = [data.iloc[i] for i in range(len(data))]
|
672 |
+
model = build_judge(temperature=0.2, system_prompt=system_prompt, **judge_kwargs)
|
673 |
+
assert model.working(), ('LLaVABench evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
|
674 |
+
|
675 |
+
prompts = [build_prompt(line) for line in lines]
|
676 |
+
tups = [(model, prompt) for prompt in prompts]
|
677 |
+
scores = track_progress_rich(LLaVABench_atomeval, tups, nproc=nproc, chunksize=nproc)
|
678 |
+
data['gpt4_score'] = [x[0] for x in scores]
|
679 |
+
data['score'] = [x[1] for x in scores]
|
680 |
+
dump(data, record_file)
|
681 |
+
|
682 |
+
data = load(record_file)
|
683 |
+
ret = LLaVABench_score(data).round(1)
|
684 |
+
dump(ret, score_file)
|
685 |
+
return ret
|
686 |
+
|
687 |
+
|
688 |
+
class MMVet(ImageBaseDataset):
|
689 |
+
TYPE = 'VQA'
|
690 |
+
DATASET_URL = {
|
691 |
+
'MMVet': 'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv'
|
692 |
+
}
|
693 |
+
DATASET_MD5 = {'MMVet': '748aa6d4aa9d4de798306a63718455e3'}
|
694 |
+
|
695 |
+
# It returns a DataFrame
|
696 |
+
@classmethod
|
697 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
698 |
+
from .utils.mmvet import MMVet_auxeval, MMVet_acc
|
699 |
+
|
700 |
+
suffix = eval_file.split('.')[-1]
|
701 |
+
model = judge_kwargs['model']
|
702 |
+
storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
|
703 |
+
tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
|
704 |
+
nproc = judge_kwargs.pop('nproc', 4)
|
705 |
+
if not osp.exists(storage):
|
706 |
+
data = load(eval_file)
|
707 |
+
model = build_judge(max_tokens=3, **judge_kwargs)
|
708 |
+
assert model.working(), ('MMVet evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
|
709 |
+
|
710 |
+
lt = len(data)
|
711 |
+
lines = [data.iloc[i] for i in range(lt)]
|
712 |
+
tups = [(model, line) for line in lines]
|
713 |
+
indices = [line['index'] for line in lines]
|
714 |
+
|
715 |
+
ans = load(tmp_file) if osp.exists(tmp_file) else {}
|
716 |
+
tups = [x for x, i in zip(tups, indices) if i not in ans]
|
717 |
+
indices = [i for i in indices if i not in ans]
|
718 |
+
|
719 |
+
if len(indices):
|
720 |
+
new_results = track_progress_rich(
|
721 |
+
MMVet_auxeval,
|
722 |
+
tups,
|
723 |
+
nproc=nproc,
|
724 |
+
chunksize=nproc,
|
725 |
+
keys=indices,
|
726 |
+
save=tmp_file,
|
727 |
+
)
|
728 |
+
ans = load(tmp_file)
|
729 |
+
for k, v in zip(indices, new_results):
|
730 |
+
assert k in ans
|
731 |
+
assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score']
|
732 |
+
data['score'] = [ans[idx]['score'] for idx in data['index']]
|
733 |
+
data['log'] = [ans[idx]['log'] for idx in data['index']]
|
734 |
+
dump(data, storage)
|
735 |
+
|
736 |
+
score, score_fine = MMVet_acc(storage)
|
737 |
+
score_pth = storage.replace('.xlsx', '_score.csv')
|
738 |
+
score_fine_pth = storage.replace('.xlsx', '_score_fine.csv')
|
739 |
+
dump(score, score_pth)
|
740 |
+
dump(score_fine, score_fine_pth)
|
741 |
+
return score
|
742 |
+
|
743 |
+
|
744 |
+
class MTVQADataset(ImageBaseDataset):
|
745 |
+
TYPE = 'VQA'
|
746 |
+
DATASET_URL = {'MTVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MTVQA_TEST.tsv'}
|
747 |
+
DATASET_MD5 = {'MTVQA_TEST': 'd87c17dbab934b7cd89c0a3c1c5657f4'}
|
748 |
+
|
749 |
+
@classmethod
|
750 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
751 |
+
data = load(eval_file)
|
752 |
+
assert 'answer' in data and 'prediction' in data and 'category' in data
|
753 |
+
data['prediction'] = [str(x) for x in data['prediction']]
|
754 |
+
data['answer'] = [str(x) for x in data['answer']]
|
755 |
+
if 'split' in data:
|
756 |
+
assert np.all([x.lower() == 'test' for x in data['split']]), 'We only support MTVQA_TEST for now. '
|
757 |
+
lt = len(data)
|
758 |
+
category_scores = defaultdict(list)
|
759 |
+
for i in range(lt):
|
760 |
+
line = data.iloc[i]
|
761 |
+
ans = line['answer'].strip().lower().replace('.', '')
|
762 |
+
pred = line['prediction'].strip().lower().replace('.', '')
|
763 |
+
cate = line['category']
|
764 |
+
score = 1.0 if ans in pred else 0.0
|
765 |
+
category_scores[cate].append(score)
|
766 |
+
category_scores['Average'].append(score)
|
767 |
+
# Calculate the average score for each category, the score is normalized to [0, 100]
|
768 |
+
category_averages = {category: np.mean(scores) * 100 for category, scores in category_scores.items()}
|
769 |
+
|
770 |
+
suffix = eval_file.split('.')[-1]
|
771 |
+
result_file = eval_file.replace(f'.{suffix}', '_acc.json')
|
772 |
+
dump(category_averages, result_file)
|
773 |
+
|
774 |
+
return category_averages
|
775 |
+
|
776 |
+
# MT-VQA adopts a custom prompt
|
777 |
+
def build_prompt(self, line):
|
778 |
+
msgs = super().build_prompt(line)
|
779 |
+
assert sum([x['type'] == 'text' for x in msgs]) == 1
|
780 |
+
for item in msgs:
|
781 |
+
if item['type'] == 'text':
|
782 |
+
item['value'] += '\nAnswer the question using a word or phrase in the language of the question.'
|
783 |
+
return msgs
|
784 |
+
|
785 |
+
|
786 |
+
class TableVQABench(ImageBaseDataset):
|
787 |
+
TYPE = 'VQA'
|
788 |
+
DATASET_URL = {
|
789 |
+
'TableVQABench': 'https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/mentor-vil/datasets/tablevqa-bench.tsv'
|
790 |
+
}
|
791 |
+
DATASET_MD5 = {'TableVQABench': '2550adc61bdc82d8e62f3b003de7c62d'}
|
792 |
+
|
793 |
+
from .utils.tablevqabench import FINTABNETQA_PROMPT, VTABFACT_PROMPT, VWTQ_PROMPT
|
794 |
+
|
795 |
+
# It returns a DataFrame
|
796 |
+
@classmethod
|
797 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
798 |
+
import pandas as pd
|
799 |
+
from .utils.tablevqabench import evaluate_fintabnet, evaluate_tabfact, evaluate_wtq
|
800 |
+
|
801 |
+
data = load(eval_file)
|
802 |
+
assert 'answer' in data and 'prediction' in data
|
803 |
+
|
804 |
+
data['prediction'] = data['prediction'].str.replace('^Answer: ', '', regex=True)
|
805 |
+
data_group = dict(tuple(data.groupby('split')))
|
806 |
+
eval_result = {'split': [], 'average_scores': []}
|
807 |
+
for split in ['fintabnetqa', 'vtabfact', 'vwtq', 'vwtq_syn']:
|
808 |
+
data_split = data_group[split].to_dict(orient='records')
|
809 |
+
if split == 'fintabnetqa':
|
810 |
+
split_eval_meta = evaluate_fintabnet(data_split, ['accuracy'])
|
811 |
+
elif split == 'vtabfact':
|
812 |
+
split_eval_meta = evaluate_tabfact(data_split, ['accuracy'])
|
813 |
+
elif split == 'vwtq' or split == 'vwtq_syn':
|
814 |
+
split_eval_meta = evaluate_wtq(data_split, ['accuracy'])
|
815 |
+
eval_result['split'].append(split)
|
816 |
+
eval_result['average_scores'].append(split_eval_meta['average_scores'])
|
817 |
+
|
818 |
+
suffix = eval_file.split('.')[-1]
|
819 |
+
result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
|
820 |
+
eval_result = pd.DataFrame(eval_result)
|
821 |
+
dump(eval_result, result_file)
|
822 |
+
|
823 |
+
return eval_result
|
824 |
+
|
825 |
+
# TableVQABench adopts a custom prompt
|
826 |
+
def build_prompt(self, line):
|
827 |
+
msgs = super().build_prompt(line)
|
828 |
+
assert sum([x['type'] == 'text' for x in msgs]) == 1
|
829 |
+
for item in msgs:
|
830 |
+
if item['type'] == 'text':
|
831 |
+
if line['split'] == 'fintabnetqa':
|
832 |
+
item['value'] = self.FINTABNETQA_PROMPT.format_map({'question': item['value']})
|
833 |
+
elif line['split'] == 'vtabfact':
|
834 |
+
item['value'] = self.VTABFACT_PROMPT.format_map({'question': item['value']})
|
835 |
+
elif line['split'] == 'vwtq_syn' or line['split'] == 'vwtq':
|
836 |
+
item['value'] = self.VWTQ_PROMPT.format_map({'question': item['value']})
|
837 |
+
return msgs
|
838 |
+
|
839 |
+
|
840 |
+
class CustomVQADataset(ImageBaseDataset):
|
841 |
+
TYPE = 'VQA'
|
842 |
+
|
843 |
+
def load_data(self, dataset):
|
844 |
+
data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
|
845 |
+
|
846 |
+
if file_size(data_path, 'GB') > 1:
|
847 |
+
local_path = data_path.replace('.tsv', '_local.tsv')
|
848 |
+
if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
|
849 |
+
from ..tools import LOCALIZE
|
850 |
+
|
851 |
+
LOCALIZE(data_path, local_path)
|
852 |
+
data_path = local_path
|
853 |
+
return load(data_path)
|
854 |
+
|
855 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
856 |
+
raise NotImplementedError
|
857 |
+
|
858 |
+
|
859 |
+
class CRPE(ImageBaseDataset):
|
860 |
+
TYPE = 'VQA'
|
861 |
+
DATASET_URL = {
|
862 |
+
'CRPE_EXIST': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_EXIST.tsv',
|
863 |
+
        'CRPE_RELATION': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_RELATION.tsv'
    }
    DATASET_MD5 = {
        'CRPE_EXIST': '315584e23ac1ff7f8719ed3b7ad90f08',
        'CRPE_RELATION': 'bad7094cde0b572288f4b119c2d0c656'}

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.crpe import is_correct
        # find-image, count-text, find-text,
        # infer-choose, count-image, visual-reasoning
        score = {
            'exist': 0,
            'subject': 0,
            'predicate': 0,
            'object': 0,
            'total': 0,
        }
        num = {
            'exist': 0,
            'subject': 0,
            'predicate': 0,
            'object': 0,
            'total': 0,
        }
        final_score_dict = {
            'exist': 0,
            'subject': 0,
            'predicate': 0,
            'object': 0,
            'total': 0,
        }
        data = load(eval_file)
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        for i in tqdm(range(len(lines))):
            line = lines[i]
            predict = str(line['prediction'])
            answers = str(line['answer'])
            # print("predict =", predict)
            # print("answers =", answers)
            category = line['category']
            if is_correct(answers, predict):
                score[category] += 1
                score['total'] += 1
            num[category] += 1
            num['total'] += 1

        for category in ['exist', 'subject', 'predicate', 'object', 'total']:
            if num[category] != 0:
                final_score_dict[category] = score[category] / num[category]
            else:
                final_score_dict[category] = None

        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict

    def build_prompt(self, line):
        ROOT = LMUDataRoot()
        msgs = super().build_prompt(line)
        for msg in msgs:
            if msg['type'] == 'image':
                msg['value'] = osp.join(osp.join(ROOT, 'images', self.dataset_name), msg['value'])
        return msgs


class QSpatial(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {
        'QSpatial_plus': '',
        'QSpatial_scannet': ''
    }

    # NOTE: To evaluate Q-Spatial-ScanNet, you need to get the permission from ScanNet website
    # Once you get the permission, you can use the helper code here to download and extract necessary images:
    # https://github.com/andrewliao11/Q-Spatial-Bench-code?tab=readme-ov-file#for-qspatial_scannet
    qspatial_root = "TO_BE_REPLACED_WITH_THE_PATH_TO_QSPATIAL_DATASET"
    url = "https://raw.githubusercontent.com/andrewliao11/Q-Spatial-Bench-code/refs/heads/main/prompt_templates/"

    def post_build(self, dataset):
        # Download the prompt templates from github

        links = [
            self.url + "system_prompt.txt",
            self.url + "spatial_prompt_single.txt",
            self.url + "spatial_prompt_steps.txt",
            self.url + "standard_prompt.txt",
            self.url + "zero_shot_prompt.txt"
        ]
        with tempfile.TemporaryDirectory() as temp_dir:
            for link in links:
                tgt_path = os.path.join(temp_dir, link.split("/")[-1])
                os.system(f"wget {link} -O {tgt_path}")

            self.system_prompt = open(os.path.join(temp_dir, "system_prompt.txt")).read()
            self._prompt_templates = dict(
                spatial_prompt_single=open(os.path.join(temp_dir, "spatial_prompt_single.txt")).read(),
                spatial_prompt_steps=open(os.path.join(temp_dir, "spatial_prompt_steps.txt")).read(),
                standard_prompt=open(os.path.join(temp_dir, "standard_prompt.txt")).read(),
                zero_shot_prompt=open(os.path.join(temp_dir, "zero_shot_prompt.txt")).read(),
            )

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):

        text_prompt_template = self._prompt_templates["spatial_prompt_single"]
        env = SandboxedEnvironment()
        text_prompt = env.from_string(text_prompt_template).render(question=line["question"])
        tgt_path = self.dump_image(line)

        msgs = []
        if isinstance(tgt_path, list):
            msgs.extend([dict(type='image', value=p) for p in tgt_path])
        else:
            msgs = [dict(type='image', value=tgt_path)]

        msgs.append(dict(type='text', value=f"{self.system_prompt}\n{text_prompt}"))
        return msgs

    # Given the dataset name, return the dataset as a pandas dataframe, can override
    def load_data(self, dataset):
        import io
        import pandas as pd
        from datasets import load_dataset

        hf_dataset = load_dataset("andrewliao11/Q-Spatial-Bench", split=dataset)
        df = hf_dataset.to_pandas()

        df.reset_index(drop=True, inplace=True)
        df['index'] = df.index
        df['answer'] = list(zip(df['answer_value'], df['answer_unit']))
        df = df[['index'] + [col for col in df.columns if col != 'index']]

        if dataset == "QSpatial_scannet":
            df = df.drop(columns=["image"])
            df["image"] = [Image.open(os.path.join(self.qspatial_root, image_path)) for image_path in df["image_path"]]
        else:
            df["image"] = [Image.open(io.BytesIO(image_dict["bytes"])) for image_dict in df["image"]]

        df["image"] = [encode_image_to_base64(image) for image in df["image"]]
        return df

    @classmethod
    def get_multiplier(self, unit):

        unit = unit.lower()
        if unit in ["meters", "meter", "m", "metre", "metres"]:
            multiplier = 100
        elif unit in ["centimeters", "centimeter", "cm"]:
            multiplier = 1
        elif unit in ["feet", "foot", "ft"]:
            multiplier = 30.48
        elif unit in ["inch", "inches", "in"]:
            multiplier = 2.54
        elif unit in ["mm"]:
            multiplier = 0.1
        else:
            print(f"Unknown unit: {unit}")
            multiplier = 0.

        return multiplier

    @classmethod
    def parse_string(self, input_str):
        # Regular expression to match the pattern (number or range, text)
        match = re.match(r'\(([\d.-]+), (.+)\)', input_str)
        if match:
            number_part = match.group(1)
            text = match.group(2)

            if '-' in number_part:
                start, end = map(float, number_part.split('-'))
                number = (start + end) / 2
            else:
                number = float(number_part)

            return number * self.get_multiplier(text)
        else:
            print(f"Unable to parse the input string {input_str}")
            return 0

    @classmethod
    def parse_prediction(self, vlm_response):
        # Value
        pattern = r'scalar{([^}]*)}'
        str_inside_scalar_boxes = re.findall(pattern, vlm_response)[-1]
        scalar_list = re.findall(r'\d+\.?\d*', str_inside_scalar_boxes)
        parsed_scalar = np.array(scalar_list).astype(float).mean()

        # Unit
        pattern = r'distance_unit{([^}]*)}'
        str_inside_unit_boxes = re.findall(pattern, vlm_response)
        parsed_unit = str_inside_unit_boxes[-1]

        pred_value_in_cms = parsed_scalar * self.get_multiplier(parsed_unit)
        return pred_value_in_cms

    # It returns a dictionary
    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):

        data = load(eval_file)
        if "model" in judge_kwargs:
            from .utils.qspatial import QSpatial_auxeval

            # extract using model
            model = judge_kwargs['model']
            suffix = eval_file.split('.')[-1]
            storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
            tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
            nproc = judge_kwargs.pop('nproc', 4)

            if not osp.exists(storage):
                model = build_judge(max_tokens=128, **judge_kwargs)

                assert model.working(), ('Evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
                lt = len(data)
                lines = [data.iloc[i] for i in range(lt)]
                tups = [(model, line) for line in lines]
                indices = [line['index'] for line in lines]

                ans = {}
                if osp.exists(tmp_file):
                    ans = load(tmp_file)
                    tups = [x for x, i in zip(tups, indices) if i not in ans]
                    indices = [i for i in indices if i not in ans]

                if len(indices):
                    new_results = track_progress_rich(
                        QSpatial_auxeval,
                        tups,
                        nproc=nproc,
                        chunksize=nproc,
                        keys=indices,
                        save=tmp_file,
                    )
                    ans = load(tmp_file)
                    for k, v in zip(indices, new_results):
                        assert k in ans
                        assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']

                data['res'] = [ans[idx]['res'] for idx in data['index']]
                data['log'] = [ans[idx]['log'] for idx in data['index']]
                dump(data, storage)

            data = load(storage)

            pred_value_in_cms = []
            for res in data["res"]:
                try:
                    pred_value_in_cms.append(self.parse_string(res))
                except ValueError:
                    pred_value_in_cms.append(0.)

            pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8
        else:
            # regex parsing
            pred_value_in_cms = []
            n_errors_in_parsing = 0
            for pred in data["prediction"]:
                try:
                    parsed_value = self.parse_prediction(pred)
                except IndexError:
                    n_errors_in_parsing += 1
                    parsed_value = 1e-8

                pred_value_in_cms.append(parsed_value)

            print(f"Encountered {n_errors_in_parsing} errors in parsing")
            pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8

        # Ground truth
        ground_truth_value_in_cms = []
        for answer in data["answer"]:
            value, unit = eval(answer)
            ground_truth_value_in_cms.append(value * self.get_multiplier(unit))
        ground_truth_value_in_cms = np.array(ground_truth_value_in_cms) + 1e-8

        # Calculate the score
        pred_gt = pred_value_in_cms / ground_truth_value_in_cms
        gt_pred = ground_truth_value_in_cms / pred_value_in_cms
        delta_2 = np.stack([pred_gt, gt_pred]).max(0) < 2.
        delta_1_point_5 = np.stack([pred_gt, gt_pred]).max(0) < 1.5

        data["eval_score_delta_2"] = delta_2
        data["eval_score_delta_1_point_5"] = delta_1_point_5

        final_score_dict = {
            "delta_2": delta_2.mean(),
            "delta_1_point_5": delta_1_point_5.mean()
        }
        for question_type in set(data["question_type"]):
            filtered_data = data[data["question_type"] == question_type]
            delta_2_per_question_type = filtered_data["eval_score_delta_2"].mean()
            delta_1_point_5_per_question_type = filtered_data["eval_score_delta_1_point_5"].mean()
            final_score_dict.update({f"{question_type}_delta_2": delta_2_per_question_type})
            final_score_dict.update({f"{question_type}_delta_1_point_5": delta_1_point_5_per_question_type})

        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict


class MMNIAH(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {
        'MM_NIAH_VAL':
            'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/MM_NIAH_VAL.tsv',
        'MM_NIAH_TEST':
            ['https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-aa',
             'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ab',
             'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ac',
             'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ad',
             'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ae']}
    DATASET_MD5 = {'MM_NIAH_VAL': '27e5a8c3cef7746cb38f89cd86c474c5',
                   'MM_NIAH_TEST': 'f490eb2a43096307465fe9e7ef13497c'}

    def prepare_tsv(self, url, file_md5=None):
        import os
        data_root = LMUDataRoot()
        os.makedirs(data_root, exist_ok=True)
        update_flag = False
        file_name = 'MM_NIAH_VAL.tsv' if 'MM_NIAH_VAL' in url else 'MM_NIAH_TEST.tsv'
        data_path = osp.join(data_root, file_name)
        if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
            pass
        elif file_name == 'MM_NIAH_TEST.tsv':
            warnings.warn('The dataset tsv is not downloaded')
            for i in range(len(url)):
                if osp.exists(osp.join(data_root, 'part-a' + chr(ord('a') + i))):
                    print('part-a' + chr(ord('a') + i) + ' already exists')
                    continue
                download_file(url[i], data_path)
            file_prefix = 'part-'
            output_file = data_path
            split_files = sorted([f for f in os.listdir(data_root) if f.startswith(file_prefix)])
            with open(output_file, 'wb') as outfile:
                # Read each split file in turn and append its bytes to the output file
                for filename in split_files:
                    with open(osp.join(data_root, filename), 'rb') as infile:
                        outfile.write(infile.read())
            update_flag = True
        else:
            warnings.warn('The dataset tsv is not downloaded')
            download_file(url, data_path)
            update_flag = True

        if file_size(data_path, 'GB') > 1:
            local_path = data_path.replace('.tsv', '_local.tsv')
            if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
                from ..tools import LOCALIZE
                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.mmniah import is_correct
        # find-image, count-text, find-text,
        # infer-choose, count-image, visual-reasoning
        MMNIAH_score = {
            'count-text': 0,
            'find-image': 0,
            'find-text': 0,
            'infer-choose': 0,
            'count-image': 0,
            'visual-reasoning': 0,
            'total': 0,
        }
        MMNIAH_num = {
            'count-text': 0,
            'find-image': 0,
            'find-text': 0,
            'infer-choose': 0,
            'count-image': 0,
            'visual-reasoning': 0,
            'total': 0,
        }
        final_score_dict = {
            'count-text': 0,
            'find-image': 0,
            'find-text': 0,
            'infer-choose': 0,
            'count-image': 0,
            'visual-reasoning': 0,
            'total': 0,
        }
        data = load(eval_file)
        lt = len(data)
        lines = [data.iloc[i] for i in range(lt)]
        for i in tqdm(range(len(lines))):
            line = lines[i]
            predict = line['prediction']
            answers = line['answer']
            category = line['category']
            if category in ['visual-reasoning', 'find-image']:
                answers = int(answers)
            if is_correct(answers, predict):
                MMNIAH_score[category] += 1
                MMNIAH_score['total'] += 1
            MMNIAH_num[category] += 1
            MMNIAH_num['total'] += 1

        for category in ['find-image', 'count-text', 'find-text',
                         'infer-choose', 'count-image', 'visual-reasoning', 'total']:
            if MMNIAH_num[category] != 0:
                final_score_dict[category] = MMNIAH_score[category] / MMNIAH_num[category]
            else:
                final_score_dict[category] = None

        score_pth = eval_file.replace('.xlsx', '_score.json')
        dump(final_score_dict, score_pth)
        return final_score_dict

    def build_prompt(self, line):
        msgs = super().build_prompt(line)
        if isinstance(line, int):
            line = self.data.iloc[line]
        totalchoice = line['multi-choice options']
        totalchoice = eval(totalchoice)
        # find-image, count-text, find-text,
        # infer-choose, count-image, visual-reasoning
        context = msgs[-1]['value']
        context = eval(context)
        question = context[0] + '\n' + context[1]
        # tgt_path is the list of all image paths
        tgt_path = []
        for i in range(len(msgs) - 1):
            tgt_path.append(msgs[i]['value'])
        choices = totalchoice[0]
        choices_image = totalchoice[1]
        if choices:
            for c_idx, c in enumerate(choices):
                question = f"{question}\n{chr(c_idx + ord('A'))}. {c}"
            question += "\nAnswer with the option's letter from the given choices directly."
        elif choices_image:
            for c_idx in range(len(choices_image)):
                question = f"{question}\n{chr(c_idx + ord('A'))}. <image>"
            question += "\nAnswer with the option's letter from the given choices directly."
        else:
            question += '\nAnswer the question using a single word or phrase.'
        question = '<start>' + question + '<end>'
        question = question.split('<image>')
        if choices_image:
            for i in range(len(question) - 5):
                question[i] = question[i] + '\n<image>'
            for i in range(len(question) - 5, len(question) - 1):
                question[i] = question[i] + '<image>'
        else:
            for i in range(len(question) - 1):
                question[i] = question[i] + '\n<image>'
        assert len(tgt_path) + 1 == len(question)
        context = []
        for i in range(len(tgt_path)):
            context.append(question[i])
            context.append(tgt_path[i])
        context.append(question[-1])
        context[0] = context[0][7:]
        context[-1] = context[-1][:-5]
        msgs = []
        for i in range(len(context)):
            if i % 2 == 0:
                msgs.append(dict(type='text', value=context[i]))
            else:
                ROOT = LMUDataRoot()
                msgs.append(dict(type='image', value=osp.join(osp.join(ROOT, 'images', self.dataset_name), context[i])))
        for element in msgs:
            if element['value'] == '':
                msgs.remove(element)
        return msgs
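QSpatial.evaluate above reduces each sample to a predicted and a ground-truth length in centimeters and counts a hit when the larger of pred/gt and gt/pred stays under a threshold. A minimal sketch of that success-ratio metric on made-up numbers (no file I/O, no judge model), for orientation only:

```python
import numpy as np

# Hypothetical parsed predictions and ground truths, both already converted to cm.
pred_cm = np.array([45.0, 200.0, 12.0]) + 1e-8
gt_cm = np.array([50.0, 90.0, 10.0]) + 1e-8

# Same rule as in QSpatial.evaluate: a sample counts as correct when
# max(pred/gt, gt/pred) falls below the threshold.
ratio = np.stack([pred_cm / gt_cm, gt_cm / pred_cm]).max(0)
print((ratio < 2.0).mean())   # delta_2   -> 0.666...
print((ratio < 1.5).mean())   # delta_1.5 -> 0.666...
```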
vlmeval/VLMEvalKit_old/vlmeval/dataset/image_yorn.py
ADDED
@@ -0,0 +1,95 @@
from ..smp import *
from ..utils import *
from .image_base import ImageBaseDataset
from .utils import build_judge, DEBUG_MESSAGE


class ImageYORNDataset(ImageBaseDataset):

    TYPE = 'Y/N'

    DATASET_URL = {
        'MME': 'https://opencompass.openxlab.space/utils/VLMEval/MME.tsv',
        'HallusionBench': 'https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv',
        'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv',
        'AMBER': 'https://huggingface.co/datasets/yifanzhang114/AMBER_base64/resolve/main/AMBER.tsv',
    }

    DATASET_MD5 = {
        'MME': 'b36b43c3f09801f5d368627fb92187c3',
        'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c',
        'POPE': 'c12f5acb142f2ef1f85a26ba2fbe41d5',
        'AMBER': '970d94c0410916166e0a76ba75da7934',
    }

    # It returns a dataframe
    def evaluate(self, eval_file, **judge_kwargs):
        from .utils.yorn import YOrN_Extraction, YOrN_auxeval
        from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating, AMBER_rating

        dataset = self.dataset_name
        data = load(eval_file)
        data['prediction'] = [str(x) for x in data['prediction']]
        storage = eval_file.replace('.xlsx', '_auxmatch.xlsx')
        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        nproc = judge_kwargs.pop('nproc', 4)

        if not osp.exists(storage):
            ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])}
            if osp.exists(tmp_file):
                tmp = load(tmp_file)
                for k in tmp:
                    if ans_map[k] == 'Unknown' and tmp[k] != 'Unknown':
                        ans_map[k] = tmp[k]

            data['extracted'] = [ans_map[x] for x in data['index']]
            unknown = data[data['extracted'] == 'Unknown']

            model = judge_kwargs.get('model', 'exact_matching')
            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                model = None
                warnings.warn('OPENAI_API_KEY is not working properly, will use exact matching for evaluation')

            if model is not None:
                lt = len(unknown)
                lines = [unknown.iloc[i] for i in range(lt)]
                tups = [(model, line) for line in lines]
                indices = list(unknown['index'])
                if len(tups):
                    res = track_progress_rich(
                        YOrN_auxeval, tups, nproc=nproc, chunksize=nproc, keys=indices, save=tmp_file)
                    for k, v in zip(indices, res):
                        ans_map[k] = v

            data['extracted'] = [ans_map[x] for x in data['index']]
            dump(data, storage)

        data = load(storage)
        if listinstr(['AMBER'], dataset):
            data['score'] = (data['answer'].str.lower() == data['extracted'].str.lower())
        else:
            data['score'] = (data['answer'] == data['extracted'])
        dump(data, storage)

        if dataset is not None and listinstr(['MME'], dataset):
            score = MME_rating(storage)
        elif dataset is not None and listinstr(['Hallusion'], dataset):
            score = Hallusion_rating(storage)
        elif dataset is not None and listinstr(['POPE'], dataset):
            score = POPE_rating(storage)
        elif dataset is not None and listinstr(['AMBER'], dataset):
            score = AMBER_rating(storage)
        else:
            score = default_rating(storage)

        score_tgt = eval_file.replace('.xlsx', '_score.csv')
        dump(score, score_tgt)
        return score
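The Y/N evaluator above first applies rule-based extraction (YOrN_Extraction, defined in dataset/utils/yorn.py and not shown in this diff) and only sends rows still marked 'Unknown' to a judge model. The helper below is a rough, simplified stand-in for that extraction step, written purely to illustrate the exact-matching path; it is not the function the evaluator imports:

```python
def naive_yorn_extract(prediction: str) -> str:
    """Crude yes/no extraction; ambiguous answers stay 'Unknown' for the judge model."""
    words = prediction.lower().replace('.', ' ').replace(',', ' ').split()
    has_yes, has_no = 'yes' in words, 'no' in words
    if has_yes and not has_no:
        return 'Yes'
    if has_no and not has_yes:
        return 'No'
    return 'Unknown'


print(naive_yorn_extract('Yes, there is a dog in the image.'))   # -> Yes
print(naive_yorn_extract('It is hard to tell from this view.'))  # -> Unknown
```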
vlmeval/VLMEvalKit_old/vlmeval/dataset/miabench.py
ADDED
@@ -0,0 +1,167 @@
import json
import os

import pandas as pd

from .image_base import ImageBaseDataset
from ..smp import *
from .utils import build_judge, DEBUG_MESSAGE
from ..utils import track_progress_rich


def generate_prompt(d):
    question = d['question']
    weights = eval(d['component_weight'])
    components = eval(d['components'])
    num_of_component = int(d['num_of_component'])
    response = d['prediction']

    if num_of_component == 1:
        components = f"The first component is: '{components[0]}'. "
        score = f"The first component is worth: {weights[0]} scores. "
    elif num_of_component == 2:
        components = f"The first component is: '{components[0]}', and the second component is '{components[1]}'. "
        score = f"The first and second component is each worth {weights[0]} and {weights[1]} scores. "
    elif num_of_component == 3:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}'. "
        )
        score = (
            "The first, second, and third component is each worth "
            f"{weights[0]}, {weights[1]}, and {weights[2]} scores."
        )
    elif num_of_component == 4:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}', and the fourth component is '{components[3]}'. "
        )
        score = (
            "The first, second, third, and fourth component is each worth "
            f"{weights[0]}, {weights[1]}, {weights[2]}, and {weights[3]} scores."
        )
    elif num_of_component == 5:
        components = (
            f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
            f"and the third component is '{components[2]}', and the fourth component is '{components[3]}', "
            f"and the fifth component is '{components[4]}'. "
        )
        score = (
            "The first, second, third, fourth, and fifth component is each worth "
            f"{weights[0]}, {weights[1]}, {weights[2]}, {weights[3]}, and {weights[4]} scores."
        )

    return (
        "Here is an instruction for a multimodal LLM: '"
        f"{question}"
        "'. You need to grade if the response from the model follows each component of the instruction. "
        f"{components}"
        "The response is: '"
        f"{response}"
        "'. You need to score the response and be strict. The total score ranges from 0 to 10, "
        "depending on if the response follows the instruction. "
        f"{score}"
        "List scores of each component, and the total score in one sentence in this format: "
        "score of component 1: x/2, score of component 2: y/8, total score: z/10. Then explain your reasons."
    )


def process_rawscore(component_type, raw_score):
    first_sentence = raw_score.split('.')[0].split(',')
    score_dict = {}
    for i in range(len(first_sentence) - 1):
        score_ = first_sentence[i].split(':')[1][1:].split('/')
        score = int(score_[0]) / int(score_[1])
        score_dict[component_type[i]] = score
    total_score_ = first_sentence[i + 1].split(':')[1][1:].split('/')
    total_score = int(total_score_[0]) / int(total_score_[1])
    score_dict['total_score'] = total_score
    return score_dict


def get_score_dict(data, score_raw):
    cat_score_dict = {}
    for i in range(len(data)):
        try:
            cmp = data['component_type'][i][2:-2]
            cmp_list = cmp.split('\', \'')
            score_dict = process_rawscore(cmp_list, score_raw[i])
            for key, val in score_dict.items():
                if key not in cat_score_dict.keys():
                    cat_score_dict[key] = [val]
                else:
                    cat_score_dict[key].append(val)
        except:
            pass
    cat_score_dict_average = {}
    for key, val in cat_score_dict.items():
        cat_score_dict_average[key] = sum(val) / len(val)
    return cat_score_dict_average


class MIABench(ImageBaseDataset):
    TYPE = 'VQA'

    DATASET_URL = {
        'MIA-Bench': 'https://opencompass.openxlab.space/utils/VLMEval/Mia-Bench.tsv',
    }
    DATASET_MD5 = {
        'MIA-Bench': '0b9de595f4dd40af18a69b94d89aba82',
    }

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):
        judge_name = judge_kwargs.pop('model', 'gpt-4o')

        model = build_judge(model=judge_name, **judge_kwargs)
        suffix = eval_file.split('.')[-1]

        storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx')  # noqa: F841
        tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl')  # noqa: F841
        nproc = judge_kwargs.pop('nproc', 4)  # noqa: F841

        if not osp.exists(storage):
            data = load(eval_file)
            num_samples = len(data)
            lines = [data.loc[i] for i in range(num_samples)]
            prompts = [generate_prompt(line) for line in lines]
            org_data = MIABench('MIA-Bench').data
            img_map = {x: y for x, y in zip(org_data['index'], org_data['image'])}
            image_b64 = [img_map[idx] for idx in data['index']]
            indices = list(data['index'])
            mm_messages = [
                dict(message=[
                    dict(type='text', value=prompt),
                    dict(type='image', value=f'data:image/jpeg;base64,{b64}')
                ])
                for prompt, b64 in zip(prompts, image_b64)
            ]

            res = {}
            if osp.exists(tmp_file):
                res = load(tmp_file)

            jobs = {k: v for k, v in zip(indices, mm_messages) if k not in res}
            job_keys = list(jobs.keys())
            job_vals = [jobs[k] for k in job_keys]

            resps = track_progress_rich(
                model.generate,
                job_vals,
                nproc=nproc,
                chunksize=nproc,
                keys=job_keys,
                save=tmp_file,
            )
            for k, resp in zip(job_keys, resps):
                res[k] = resp
            data['score_raw'] = [res[idx] for idx in indices]
            dump(data, storage)

        goresult = load(storage)
        results = get_score_dict(goresult, goresult['score_raw'])
        result_pth = storage.replace('.xlsx', '_score.csv')
        results_pd = pd.DataFrame.from_dict(list(results.items()))
        dump(results_pd, result_pth)

        return results
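process_rawscore above relies on the judge replying with exactly the first-sentence format requested in generate_prompt. A toy illustration of that format and the per-component fractions it yields (the judge text below is fabricated for the example):

```python
# Fabricated judge reply following the requested format.
raw = ("score of component 1: 2/2, score of component 2: 6/8, total score: 8/10. "
       "The response satisfies the first component but only part of the second.")

first_sentence = raw.split('.')[0].split(',')
for part in first_sentence:
    label, frac = part.split(':')
    num, den = frac.strip().split('/')
    print(label.strip(), int(num) / int(den))
# score of component 1 1.0
# score of component 2 0.75
# total score 0.8
```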
vlmeval/VLMEvalKit_old/vlmeval/dataset/mmmath.py
ADDED
@@ -0,0 +1,446 @@
1 |
+
import re
|
2 |
+
import json
|
3 |
+
import sympy as sp
|
4 |
+
import numpy as np
|
5 |
+
from sympy import simplify, Eq, sympify, Pow, pi
|
6 |
+
from sympy.parsing.latex import parse_latex
|
7 |
+
import sys
|
8 |
+
import math
|
9 |
+
import os
|
10 |
+
import argparse
|
11 |
+
|
12 |
+
from .image_base import ImageBaseDataset
|
13 |
+
from ..utils import track_progress_rich
|
14 |
+
from ..smp import load, dump
|
15 |
+
|
16 |
+
|
17 |
+
class AutoScoringJudge:
|
18 |
+
def __init__(self):
|
19 |
+
# Map of special symbols to their replacements
|
20 |
+
self.special_signal_map = {
|
21 |
+
"\\left": "",
|
22 |
+
"\\right": "",
|
23 |
+
"厘米":"",
|
24 |
+
# "∶": ":",
|
25 |
+
",": ",",
|
26 |
+
"$": "",
|
27 |
+
"(":"(",
|
28 |
+
")":")",
|
29 |
+
"\\infty":"oo",
|
30 |
+
"\\colon ":":",
|
31 |
+
# "\\approx": "=",
|
32 |
+
# "\\simeq": "=",
|
33 |
+
# "\\sim": "=",
|
34 |
+
# "^\\prime": "'",
|
35 |
+
# "^{\\prime}": "'",
|
36 |
+
"+":"+",
|
37 |
+
"\\, ": "",
|
38 |
+
"\\,":"",
|
39 |
+
"^\\circ": "",
|
40 |
+
"^{\\circ}": "",
|
41 |
+
# "%": "",
|
42 |
+
}
|
43 |
+
self.pi = parse_latex("\\pi")
|
44 |
+
# MM-Math default precision
|
45 |
+
self.precision = 1e-2
|
46 |
+
|
47 |
+
def trans_greater_sign_to_interval(self, expr:str):
|
48 |
+
expr_tmp = expr.split("<")
|
49 |
+
return "(" + expr_tmp[0] + ", " + expr_tmp[-1] + ")"
|
50 |
+
|
51 |
+
def split_by_comma(self, expr: str):
|
52 |
+
# Splits expressions by commas outside of brackets
|
53 |
+
in_bracket_num = 0
|
54 |
+
splitted_expr = []
|
55 |
+
start_idx = 0
|
56 |
+
for i, char in enumerate(expr):
|
57 |
+
if char in ["(", "["]:
|
58 |
+
in_bracket_num += 1
|
59 |
+
elif char in [")", "]"]:
|
60 |
+
in_bracket_num -= 1
|
61 |
+
elif char == "," and in_bracket_num == 0:
|
62 |
+
splitted_expr.append(expr[start_idx:i].strip())
|
63 |
+
start_idx = i + 1
|
64 |
+
|
65 |
+
if start_idx < len(expr):
|
66 |
+
splitted_expr.append(expr[start_idx:].strip())
|
67 |
+
|
68 |
+
return splitted_expr
|
69 |
+
|
70 |
+
def trans_plus_minus_sign(self, expr_list: list):
|
71 |
+
# Translates plus-minus signs into separate expressions
|
72 |
+
new_expr_list = []
|
73 |
+
for expr in expr_list:
|
74 |
+
if "\\pm" in expr:
|
75 |
+
new_expr_list.append(expr.replace("\\pm", "+"))
|
76 |
+
new_expr_list.append(expr.replace("\\pm", "-"))
|
77 |
+
else:
|
78 |
+
new_expr_list.append(expr)
|
79 |
+
|
80 |
+
return new_expr_list
|
81 |
+
|
82 |
+
def judge(self, expression1, expression2, precision=1e-2):
|
83 |
+
# Judge if two expressions are equal (expression1 is considered as the Ground Truth)
|
84 |
+
# Default precision is a list for supporting multiple expressions
|
85 |
+
precision = precision if isinstance(precision, list) else [precision]
|
86 |
+
|
87 |
+
try:
|
88 |
+
expression1, expression2 = self.preprocess(expression1, expression2)
|
89 |
+
except:
|
90 |
+
return False
|
91 |
+
if expression1 == expression2:
|
92 |
+
# print("Exactly equal")
|
93 |
+
return True
|
94 |
+
|
95 |
+
# Remove Chinese characters from the string, as answers like "yes" or "no" in Chinese have been considered
|
96 |
+
expression1 = expression1 if re.fullmatch(r"[\u4e00-\u9fff]+", expression1) else re.sub(r'[\u4e00-\u9fff]+', '', expression1) # noqa: E501
|
97 |
+
expression2 = expression2 if re.fullmatch(r'[\u4e00-\u9fff]+', expression2) else re.sub(r'[\u4e00-\u9fff]+', '', expression2) # noqa: E501
|
98 |
+
# Check if two < or > in expression
|
99 |
+
if self.is_two_greater_sign(expression1):
|
100 |
+
expression1 = self.trans_greater_sign_to_interval(expression1)
|
101 |
+
|
102 |
+
if self.is_two_greater_sign(expression2):
|
103 |
+
expression2 = self.trans_greater_sign_to_interval(expression2)
|
104 |
+
|
105 |
+
expression1 = self.split_by_comma(expression1)
|
106 |
+
expression2 = self.split_by_comma(expression2)
|
107 |
+
|
108 |
+
temp_list1 = self.trans_plus_minus_sign(expression1)
|
109 |
+
temp_list2 = self.trans_plus_minus_sign(expression2)
|
110 |
+
|
111 |
+
# Set up a list for allowed errors
|
112 |
+
if len(precision) <= 1:
|
113 |
+
precision = precision * len(temp_list1)
|
114 |
+
|
115 |
+
if len(temp_list1) != len(temp_list2):
|
116 |
+
return False
|
117 |
+
|
118 |
+
# Check if elements in both lists can be paired and are equal
|
119 |
+
idx = -1
|
120 |
+
while len(temp_list1) != 0:
|
121 |
+
idx = (idx + 1) % len(temp_list1)
|
122 |
+
|
123 |
+
item1 = temp_list1[idx]
|
124 |
+
self.precision = precision[idx]
|
125 |
+
|
126 |
+
for item2 in temp_list2:
|
127 |
+
if self.is_equal(item1, item2):
|
128 |
+
temp_list1.remove(item1)
|
129 |
+
temp_list2.remove(item2)
|
130 |
+
precision.remove(self.precision)
|
131 |
+
break
|
132 |
+
else:
|
133 |
+
# If no match was found, return False
|
134 |
+
return False
|
135 |
+
|
136 |
+
# If all elements are matched, return True
|
137 |
+
return True
|
138 |
+
|
139 |
+
def is_interval(self, expr):
|
140 |
+
# Checks if an expression is an interval
|
141 |
+
return expr.startswith(("(", "[")) and expr.endswith((")", "]"))
|
142 |
+
|
143 |
+
def is_two_greater_sign(self, expr):
|
144 |
+
match = re.findall(r'<', expr)
|
145 |
+
return len(match) == 2
|
146 |
+
|
147 |
+
def sympy_sub_pi(self, expression_sympy):
|
148 |
+
# Replaces the symbol for pi in sympy expressions with its numerical value
|
149 |
+
return expression_sympy.subs(self.pi, math.pi)
|
150 |
+
|
151 |
+
def is_equal(self, expression1, expression2):
|
152 |
+
# Default first expression is ground truth. Check if expressions are equal in different aspects
|
153 |
+
if expression1 == expression2 and expression1 != "" and expression2 != "":
|
154 |
+
# print("Equivalent natively")
|
155 |
+
return True
|
156 |
+
|
157 |
+
# First check if both are intervals
|
158 |
+
if self.is_interval(expression1) and self.is_interval(expression2):
|
159 |
+
try:
|
160 |
+
if self.interval_equal(expression1, expression2):
|
161 |
+
# print("Interval equivalent")
|
162 |
+
return True
|
163 |
+
except:
|
164 |
+
return False
|
165 |
+
|
166 |
+
# Then check for numerical equality
|
167 |
+
try:
|
168 |
+
if self.numerical_equal(expression1, expression2):
|
169 |
+
# print("Numerically equivalent")
|
170 |
+
return True
|
171 |
+
except:
|
172 |
+
pass
|
173 |
+
# Then check if expressions are mathematically equal
|
174 |
+
try:
|
175 |
+
if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2):
|
176 |
+
# print("Expression equivalent")
|
177 |
+
return True
|
178 |
+
except:
|
179 |
+
pass
|
180 |
+
|
181 |
+
# Lastly, check for equation equality
|
182 |
+
try:
|
183 |
+
if self.equation_equal(expression1, expression2):
|
184 |
+
# print("Equation equivalent")
|
185 |
+
return True
|
186 |
+
except:
|
187 |
+
pass
|
188 |
+
|
189 |
+
return False
|
190 |
+
|
191 |
+
def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True):
|
192 |
+
# Check if two numerical values are equal within an allowed error range
|
193 |
+
# Includes possible percentage cases
|
194 |
+
reference = float(expression1)
|
195 |
+
prediction = float(expression2)
|
196 |
+
|
197 |
+
if include_percentage:
|
198 |
+
gt_result = [reference / 100, reference, reference * 100]
|
199 |
+
else:
|
200 |
+
gt_result = [reference]
|
201 |
+
|
202 |
+
for item in gt_result:
|
203 |
+
if abs(item - prediction) <= self.precision * 1.01:
|
204 |
+
return True
|
205 |
+
return False
|
206 |
+
|
207 |
+
def expression_equal(self, exp1, exp2):
|
208 |
+
# Check if two expressions are mathematically equivalent
|
209 |
+
# Extract expression and use sympy for equivalence checking
|
210 |
+
def extract_expression(expression):
|
211 |
+
if "=" in expression:
|
212 |
+
expression = expression.split("=")[1]
|
213 |
+
return expression.strip()
|
214 |
+
|
215 |
+
exp1 = extract_expression(exp1)
|
216 |
+
exp2 = extract_expression(exp2)
|
217 |
+
|
218 |
+
exp_too_long = len(exp1) > 300 or len(exp2) > 300
|
219 |
+
|
220 |
+
expr1_sym = sympify(parse_latex(exp1))
|
221 |
+
expr2_sym = sympify(parse_latex(exp2))
|
222 |
+
if expr1_sym == expr2_sym:
|
223 |
+
return True
|
224 |
+
else:
|
225 |
+
expr1_sym = self.sympy_sub_pi(expr1_sym)
|
226 |
+
expr2_sym = self.sympy_sub_pi(expr2_sym)
|
227 |
+
|
228 |
+
if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or \
|
229 |
+
(not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)):
|
230 |
+
return False
|
231 |
+
elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol):
|
232 |
+
try:
|
233 |
+
if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)):
|
234 |
+
print("These two numbers cannot be calculated by the current computer for: "
|
235 |
+
f"\"{str(expr1_sym)}\" and \"{str(expr2_sym)}\"")
|
236 |
+
return False
|
237 |
+
if exp_too_long:
|
238 |
+
print(f'Expression {exp1} or {exp2} is too long to compute. ')
|
239 |
+
return False
|
240 |
+
if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01:
|
241 |
+
return True
|
242 |
+
else:
|
243 |
+
return False
|
244 |
+
except:
|
245 |
+
return False
|
246 |
+
elif exp_too_long:
|
247 |
+
print(f'Expression {exp1} or {exp2} is too long to compute. ')
|
248 |
+
return False
|
249 |
+
else:
|
250 |
+
try:
|
251 |
+
simplified_expr = simplify(expr1_sym - expr2_sym)
|
252 |
+
num_value = simplified_expr.evalf()
|
253 |
+
return abs(num_value) < 1e-3
|
254 |
+
except:
|
255 |
+
return False
|
256 |
+
|
257 |
+
def equation_equal(self, expression1, expression2):
|
258 |
+
# Check if two equations are mathematically equivalent
|
259 |
+
# Simplify equations and use sympy for equivalence checking
|
260 |
+
def simplify_equation(latex_eq):
|
261 |
+
lhs, rhs = latex_eq.split('=')
|
262 |
+
|
263 |
+
lhs_expr = parse_latex(lhs)
|
264 |
+
rhs_expr = parse_latex(rhs)
|
265 |
+
|
266 |
+
equation = Eq(lhs_expr, rhs_expr)
|
267 |
+
|
268 |
+
simplified_eq = simplify(equation.lhs - equation.rhs)
|
269 |
+
|
270 |
+
return simplified_eq
|
271 |
+
|
272 |
+
expr1_sym = simplify_equation(expression1)
|
273 |
+
expr2_sym = simplify_equation(expression2)
|
274 |
+
|
275 |
+
division_result_1 = simplify(expr1_sym / expr2_sym)
|
276 |
+
division_result_2 = simplify(expr2_sym / expr1_sym)
|
277 |
+
|
278 |
+
if ((division_result_1.is_Integer and division_result_1 != 0) or # noqa: W504
|
279 |
+
(division_result_2.is_Integer and division_result_2 != 0)):
|
280 |
+
return True
|
281 |
+
else:
|
282 |
+
return False
|
283 |
+
|
284 |
+
def interval_equal(self, expression1, expression2):
|
285 |
+
# Check if two intervals are mathematically equivalent
|
286 |
+
def compare_two_interval(inter1, inter2):
|
287 |
+
if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]:
|
288 |
+
return False
|
289 |
+
|
290 |
+
inter1 = inter1.strip('[]()')
|
291 |
+
inter2 = inter2.strip('[]()')
|
292 |
+
|
293 |
+
items_1 = inter1.split(',')
|
294 |
+
items_2 = inter2.split(',')
|
295 |
+
|
296 |
+
for item_1, item_2 in zip(items_1, items_2):
|
297 |
+
if not self.expression_equal(item_1, item_2):
|
298 |
+
return False
|
299 |
+
return True
|
300 |
+
|
301 |
+
interval1 = expression1
|
302 |
+
interval2 = expression2
|
303 |
+
|
304 |
+
if interval1 == interval2:
|
305 |
+
return True
|
306 |
+
else:
|
307 |
+
inter_list1 = interval1.split("\\cup")
|
308 |
+
inter_list2 = interval2.split("\\cup")
|
309 |
+
|
310 |
+
if len(inter_list1) != len(inter_list2):
|
311 |
+
return False
|
312 |
+
else:
|
313 |
+
for inter1, inter2 in zip(inter_list1, inter_list2):
|
314 |
+
if not compare_two_interval(inter1, inter2):
|
315 |
+
return False
|
316 |
+
return True
|
317 |
+
|
318 |
+
def preprocess(self, expression1, expression2):
|
319 |
+
# Preprocess expressions to extract and replace special symbols
|
320 |
+
def extract_boxed_content(latex_str):
|
321 |
+
boxed_matches = re.finditer(r'\\boxed{', latex_str)
|
322 |
+
results = ""
|
323 |
+
|
324 |
+
for match in boxed_matches:
|
325 |
+
start_index = match.end()
|
326 |
+
end_index = start_index
|
327 |
+
stack = 1
|
328 |
+
|
329 |
+
while stack > 0 and end_index < len(latex_str):
|
330 |
+
if latex_str[end_index] == '{':
|
331 |
+
stack += 1
|
332 |
+
elif latex_str[end_index] == '}':
|
333 |
+
stack -= 1
|
334 |
+
end_index += 1
|
335 |
+
|
336 |
+
if stack == 0:
|
337 |
+
content = latex_str[start_index:end_index - 1]
|
338 |
+
results += content + ","
|
339 |
+
else:
|
340 |
+
raise ValueError("Mismatched braces in LaTeX string.")
|
341 |
+
|
342 |
+
if results == "":
|
343 |
+
last_line_ans = latex_str.strip().split("\n")[-1]
|
344 |
+
dollar_pattern = r"\$(.*?)\$"
|
345 |
+
answers = re.findall(dollar_pattern, last_line_ans)
|
346 |
+
|
347 |
+
if answers:
|
348 |
+
for ans in answers:
|
349 |
+
results += ans + ","
|
350 |
+
else:
|
351 |
+
results = latex_str
|
352 |
+
|
353 |
+
return results
|
354 |
+
|
355 |
+
def special_symbol_replace(expression):
|
356 |
+
|
357 |
+
expression = expression.replace("\\text{cm}^2", '').replace("\\text{cm}", "").replace("\\,cm", '').replace("\\text{ cm}", '').replace("cm", '').replace("\\text{分米}^2", '').replace("cm^{2}", '').replace("60 \\text{ cm}^2",'').replace("\\ \\text{m}", "").replace("\\text{米}","").strip() # noqa: E501
|
358 |
+
|
359 |
+
expression = re.sub(r"(.+)m$", r"\1", expression)
|
360 |
+
|
361 |
+
if "\\in " in expression:
|
362 |
+
expression = expression.split("\\in ")[1]
|
363 |
+
|
364 |
+
for signal in self.special_signal_map:
|
365 |
+
expression = expression.replace(signal, self.special_signal_map[signal])
|
366 |
+
|
367 |
+
expression = re.sub(r'(\\sin|\\cos|\\tan)(\d+)', r'\1((\2/180)\\pi)', expression)
|
368 |
+
|
369 |
+
expression = expression.strip("\n,.:;^_=+`!@#%^&*~,。")
|
370 |
+
|
371 |
+
pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}'
|
372 |
+
expression = re.sub(pattern, r'\1', expression)
|
373 |
+
|
374 |
+
return expression
|
375 |
+
|
376 |
+
exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2)
|
377 |
+
|
378 |
+
exp1, exp2 = special_symbol_replace(exp1), special_symbol_replace(exp2)
|
379 |
+
|
380 |
+
return exp1, exp2
|
381 |
+
|
382 |
+
def can_compute_power(self, expr):
|
383 |
+
# Checks if a power expression can be computed
|
384 |
+
if isinstance(expr, Pow):
|
385 |
+
base, exp = expr.as_base_exp()
|
386 |
+
if base.is_number and exp.is_number:
|
387 |
+
MAX_EXP = 1000 # Adjust based on computing environment
|
388 |
+
if abs(exp.evalf()) > MAX_EXP:
|
389 |
+
return False
|
390 |
+
else:
|
391 |
+
return True
|
392 |
+
else:
|
393 |
+
return False
|
394 |
+
else:
|
395 |
+
return True # Not a power expression, can compute
|
396 |
+
|
397 |
+
|
398 |
+
class MMMath(ImageBaseDataset):
|
399 |
+
|
400 |
+
TYPE = 'VQA'
|
401 |
+
|
402 |
+
DATASET_URL = {
|
403 |
+
'MM-Math': 'https://opencompass.openxlab.space/utils/VLMEval/MM-Math.tsv',
|
404 |
+
}
|
405 |
+
DATASET_MD5 = {
|
406 |
+
'MM-Math': '1f064ed7c4e0e8926a3fa65849419ca5',
|
407 |
+
}
|
408 |
+
|
409 |
+
@classmethod
|
410 |
+
def evaluate(self, eval_file, **kwargs):
|
411 |
+
|
412 |
+
data = load(eval_file)
|
413 |
+
judger = AutoScoringJudge()
|
414 |
+
func = judger.judge
|
415 |
+
|
416 |
+
tups = [dict(expression1=x, expression2=y) for x, y in zip(data['answer'], data['prediction'])]
|
417 |
+
|
418 |
+
res = track_progress_rich(func, tups, nproc=16)
|
419 |
+
data['hit'] = res
|
420 |
+
dump(data, eval_file)
|
421 |
+
|
422 |
+
score_file = eval_file.replace('.xlsx', '_score.json')
|
423 |
+
score = {}
|
424 |
+
score['overall'] = np.mean(data['hit'])
|
425 |
+
# Results by Difficulty
|
426 |
+
difficulties = set(data['difficulty'])
|
427 |
+
for d in difficulties:
|
428 |
+
score[f'Difficulty-{d}'] = np.mean(data[data['difficulty'] == d]['hit'])
|
429 |
+
|
430 |
+
# Results by Year
|
431 |
+
years = set(data['year'])
|
432 |
+
for y in years:
|
433 |
+
score[f'Year-{y}'] = np.mean(data[data['year'] == y]['hit'])
|
434 |
+
|
435 |
+
# Results by Knowledge-L1
|
436 |
+
points = set(data['knowledge_l1'])
|
437 |
+
for p in points:
|
438 |
+
score[f'Knowledge-L1-{p}'] = np.mean(data[data['knowledge_l1'] == p]['hit'])
|
439 |
+
|
440 |
+
# Results by Knowledge-L2
|
441 |
+
points = set(data['knowledge_l2'])
|
442 |
+
for p in points:
|
443 |
+
score[f'Knowledge-L2-{p}'] = np.mean(data[data['knowledge_l2'] == p]['hit'])
|
444 |
+
|
445 |
+
dump(score, score_file)
|
446 |
+
return score
|
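Assuming the package is importable as vlmeval.dataset.mmmath (an assumption; this upload nests it under VLMEvalKit_old) and sympy's LaTeX parser is available (it requires antlr4-python3-runtime), the AutoScoringJudge defined above can be exercised directly on answer/prediction strings:

```python
from vlmeval.dataset.mmmath import AutoScoringJudge  # assumed import path

judge = AutoScoringJudge()
print(judge.judge("\\frac{1}{2}", "0.5"))  # numerically equivalent -> True
print(judge.judge("x=3", "x=6"))           # different equations   -> False
```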
vlmeval/VLMEvalKit_old/vlmeval/dataset/mvbench.py
ADDED
@@ -0,0 +1,668 @@
1 |
+
import huggingface_hub
|
2 |
+
from huggingface_hub import snapshot_download
|
3 |
+
from ..smp import *
|
4 |
+
from .video_base import VideoBaseDataset
|
5 |
+
from .utils import build_judge, DEBUG_MESSAGE
|
6 |
+
from ..utils import track_progress_rich
|
7 |
+
import torchvision.transforms as T
|
8 |
+
from torchvision import transforms
|
9 |
+
from torchvision.transforms.functional import InterpolationMode
|
10 |
+
from decord import VideoReader, cpu
|
11 |
+
import imageio
|
12 |
+
import cv2
|
13 |
+
import zipfile
|
14 |
+
import os
|
15 |
+
import glob
|
16 |
+
from .utils.mvbench import *
|
17 |
+
|
18 |
+
FAIL_MSG = 'Failed to obtain answer via API.'
|
19 |
+
|
20 |
+
|
21 |
+
class MVBench(VideoBaseDataset):
|
22 |
+
|
23 |
+
MD5 = 'fd21d36522cdedd46d84dc46715ad832'
|
24 |
+
SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \
|
25 |
+
the detail and movement of objects, and the action and pose of persons. \
|
26 |
+
Based on your observations, select the best option that accurately addresses the question.
|
27 |
+
"""
|
28 |
+
|
29 |
+
TYPE = 'Video-MCQ'
|
30 |
+
|
31 |
+
def __init__(self, dataset='MVBench', pack=False):
|
32 |
+
self.type_data_list = {
|
33 |
+
'Action Sequence': ('action_sequence.json',
|
34 |
+
'your_data_path/star/Charades_v1_480/', 'video', True), # has start & end
|
35 |
+
'Action Prediction': ('action_prediction.json',
|
36 |
+
'your_data_path/star/Charades_v1_480/', 'video', True), # has start & end
|
37 |
+
'Action Antonym': ('action_antonym.json',
|
38 |
+
'your_data_path/ssv2_video/', 'video', False),
|
39 |
+
'Fine-grained Action': ('fine_grained_action.json',
|
40 |
+
'your_data_path/Moments_in_Time_Raw/videos/', 'video', False),
|
41 |
+
'Unexpected Action': ('unexpected_action.json',
|
42 |
+
'your_data_path/FunQA_test/test/', 'video', False),
|
43 |
+
'Object Existence': ('object_existence.json',
|
44 |
+
'your_data_path/clevrer/video_validation/', 'video', False),
|
45 |
+
'Object Interaction': ('object_interaction.json',
|
46 |
+
'your_data_path/star/Charades_v1_480/', 'video', True), # has start & end
|
47 |
+
'Object Shuffle': ('object_shuffle.json',
|
48 |
+
'your_data_path/perception/videos/', 'video', False),
|
49 |
+
'Moving Direction': ('moving_direction.json',
|
50 |
+
'your_data_path/clevrer/video_validation/', 'video', False),
|
51 |
+
'Action Localization': ('action_localization.json',
|
52 |
+
'your_data_path/sta/sta_video/', 'video', True), # has start & end
|
53 |
+
'Scene Transition': ('scene_transition.json',
|
54 |
+
'your_data_path/scene_qa/video/', 'video', False),
|
55 |
+
'Action Count': ('action_count.json',
|
56 |
+
'your_data_path/perception/videos/', 'video', False),
|
57 |
+
'Moving Count': ('moving_count.json',
|
58 |
+
'your_data_path/clevrer/video_validation/', 'video', False),
|
59 |
+
'Moving Attribute': ('moving_attribute.json',
|
60 |
+
'your_data_path/clevrer/video_validation/', 'video', False),
|
61 |
+
'State Change': ('state_change.json',
|
62 |
+
'your_data_path/perception/videos/', 'video', False),
|
63 |
+
'Fine-grained Pose': ('fine_grained_pose.json',
|
64 |
+
'your_data_path/nturgbd/', 'video', False),
|
65 |
+
'Character Order': ('character_order.json',
|
66 |
+
'your_data_path/perception/videos/', 'video', False),
|
67 |
+
'Egocentric Navigation': ('egocentric_navigation.json',
|
68 |
+
'your_data_path/vlnqa/', 'video', False),
|
69 |
+
'Episodic Reasoning': ('episodic_reasoning.json',
|
70 |
+
'your_data_path/tvqa/frames_fps3_hq/', 'frame', True), # has start & end, read frame
|
71 |
+
'Counterfactual Inference': ('counterfactual_inference.json',
|
72 |
+
'your_data_path/clevrer/video_validation/', 'video', False),
|
73 |
+
}
|
74 |
+
super().__init__(dataset=dataset, pack=pack)
|
75 |
+
|
76 |
+
@classmethod
|
77 |
+
def supported_datasets(cls):
|
78 |
+
return ['MVBench']
|
79 |
+
|
80 |
+
def prepare_dataset(self, dataset_name='MVBench', repo_id='OpenGVLab/MVBench'):
|
81 |
+
def check_integrity(pth):
|
82 |
+
data_file = osp.join(pth, f'{dataset_name}.tsv')
|
83 |
+
|
84 |
+
if not os.path.exists(data_file):
|
85 |
+
return False
|
86 |
+
|
87 |
+
if md5(data_file) != self.MD5:
|
88 |
+
return False
|
89 |
+
|
90 |
+
data = load(data_file)
|
91 |
+
for idx, item in data.iterrows():
|
92 |
+
if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
|
93 |
+
return False
|
94 |
+
return True
|
95 |
+
|
96 |
+
if modelscope_flag_set():
|
97 |
+
repo_id = 'modelscope/MVBench'
|
98 |
+
|
99 |
+
cache_path = get_cache_path(repo_id, branch='main')
|
100 |
+
if cache_path is not None and check_integrity(cache_path):
|
101 |
+
dataset_path = cache_path
|
102 |
+
else:
|
103 |
+
def unzip_hf_zip(pth):
|
104 |
+
pth = os.path.join(pth, 'video/')
|
105 |
+
for filename in os.listdir(pth):
|
106 |
+
if filename.endswith('.zip'):
|
107 |
+
# Build the full path of the zip file
|
108 |
+
zip_path = os.path.join(pth, filename)
|
109 |
+
|
110 |
+
# Extract the ZIP file
|
111 |
+
with zipfile.ZipFile(zip_path, 'r') as zip_ref:
|
112 |
+
zip_ref.extractall(pth)
|
113 |
+
|
114 |
+
def generate_tsv(pth):
|
115 |
+
data_file = osp.join(pth, f'{dataset_name}.tsv')
|
116 |
+
if os.path.exists(data_file) and md5(data_file) == self.MD5:
|
117 |
+
return
|
118 |
+
json_data_dir = os.path.join(pth, 'json')
|
119 |
+
self.data_list = []
|
120 |
+
for k, v in self.type_data_list.items():
|
121 |
+
with open(os.path.join(json_data_dir, v[0]), 'r') as f:
|
122 |
+
json_data = json.load(f)
|
123 |
+
for data in json_data:
|
124 |
+
if os.path.exists(os.path.join(pth, v[1].replace('your_data_path', 'video'), data['video'])):
|
125 |
+
self.data_list.append({
|
126 |
+
'task_type': k,
|
127 |
+
'prefix': v[1].replace('your_data_path', 'video'),
|
128 |
+
'data_type': v[2],
|
129 |
+
'bound': v[3],
|
130 |
+
'start': data['start'] if 'start' in data.keys() else None,
|
131 |
+
'end': data['end'] if 'end' in data.keys() else None,
|
132 |
+
'video': data['video'],
|
133 |
+
'question': data['question'],
|
134 |
+
'answer': data['answer'],
|
135 |
+
'candidates': data['candidates']
|
136 |
+
})
|
137 |
+
else:
|
138 |
+
print(
|
139 |
+
'NTURGB-D zip file is removed according to MVBench, you can view it at '
|
140 |
+
'https://huggingface.co/datasets/OpenGVLab/MVBench for detailed reason.'
|
141 |
+
)
|
142 |
+
raise Exception(
|
143 |
+
f"{os.path.join(v[1].replace('your_data_path', 'video'), data['video'])} does not exist"
|
144 |
+
)
|
145 |
+
|
146 |
+
data_df = pd.DataFrame(self.data_list)
|
147 |
+
data_df = data_df.assign(index=range(len(data_df)))
|
148 |
+
data_df.to_csv(data_file, sep='\t', index=False)
|
149 |
+
|
150 |
+
def move_files(pth):
|
151 |
+
src_folder = os.path.join(pth, 'video/data0613')
|
152 |
+
if not os.path.exists(src_folder):
|
153 |
+
return
|
154 |
+
for subdir in os.listdir(src_folder):
|
155 |
+
subdir_path = os.path.join(src_folder, subdir)
|
156 |
+
if os.path.isdir(subdir_path):
|
157 |
+
for subsubdir in os.listdir(subdir_path):
|
158 |
+
subsubdir_path = os.path.join(subdir_path, subsubdir)
|
159 |
+
if os.path.isdir(subsubdir_path):
|
160 |
+
for item in os.listdir(subsubdir_path):
|
161 |
+
item_path = os.path.join(subsubdir_path, item)
|
162 |
+
target_folder = os.path.join(pth, 'video', subdir, subsubdir)
|
163 |
+
if not os.path.exists(target_folder):
|
164 |
+
os.makedirs(target_folder)
|
165 |
+
target_path = os.path.join(target_folder, item)
|
166 |
+
try:
|
167 |
+
shutil.move(item_path, target_path)
|
168 |
+
except Exception as e:
|
169 |
+
print(f"Error moving {item_path} to {target_path}: {e}")
|
170 |
+
|
171 |
+
if modelscope_flag_set():
|
172 |
+
from modelscope import dataset_snapshot_download
|
173 |
+
dataset_path = dataset_snapshot_download(dataset_id=repo_id, revision='master')
|
174 |
+
else:
|
175 |
+
hf_token = os.environ.get('HUGGINGFACE_TOKEN')
|
176 |
+
huggingface_hub.login(hf_token)
|
177 |
+
dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
|
178 |
+
unzip_hf_zip(dataset_path)
|
179 |
+
move_files(dataset_path)
|
180 |
+
generate_tsv(dataset_path)
|
181 |
+
|
182 |
+
data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
|
183 |
+
|
184 |
+
self.decord_method = {
|
185 |
+
'video': self.read_video,
|
186 |
+
'gif': self.read_gif,
|
187 |
+
'frame': self.read_frame,
|
188 |
+
}
|
189 |
+
|
190 |
+
self.nframe = 8
|
191 |
+
self.frame_fps = 3
|
192 |
+
|
193 |
+
# transform
|
194 |
+
self.transform = T.Compose([
|
195 |
+
Stack(),
|
196 |
+
ToTorchFormatTensor()
|
197 |
+
])
|
198 |
+
|
199 |
+
return dict(root=dataset_path, data_file=data_file)
|
200 |
+
|
201 |
+
def get_index(self, bound, fps, max_frame, first_idx=0):
|
202 |
+
if bound:
|
203 |
+
start, end = bound[0], bound[1]
|
204 |
+
else:
|
205 |
+
start, end = -100000, 100000
|
206 |
+
start_idx = max(first_idx, round(start * fps))
|
207 |
+
end_idx = min(round(end * fps), max_frame)
|
208 |
+
seg_size = float(end_idx - start_idx) / self.num_segments
|
209 |
+
frame_indices = np.array([
|
210 |
+
int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
|
211 |
+
for idx in range(self.num_segments)
|
212 |
+
])
|
213 |
+
return frame_indices
|
214 |
+
|
215 |
+
def read_video(self, video_path, bound=None):
|
216 |
+
vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
|
217 |
+
max_frame = len(vr) - 1
|
218 |
+
fps = float(vr.get_avg_fps())
|
219 |
+
|
220 |
+
images_group = list()
|
221 |
+
frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
|
222 |
+
for frame_index in frame_indices:
|
223 |
+
img = Image.fromarray(vr[frame_index].asnumpy())
|
224 |
+
images_group.append(img)
|
225 |
+
torch_imgs = self.transform(images_group)
|
226 |
+
return torch_imgs
|
227 |
+
|
228 |
+
def read_gif(self, video_path, bound=None, fps=25):
|
229 |
+
gif = imageio.get_reader(video_path)
|
230 |
+
max_frame = len(gif) - 1
|
231 |
+
|
232 |
+
images_group = list()
|
233 |
+
frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
|
234 |
+
for index, frame in enumerate(gif):
|
235 |
+
if index in frame_indices:
|
236 |
+
img = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)
|
237 |
+
img = Image.fromarray(img)
|
238 |
+
images_group.append(img)
|
239 |
+
torch_imgs = self.transform(images_group)
|
240 |
+
return torch_imgs
|
241 |
+
|
242 |
+
def read_frame(self, video_path, bound=None, fps=3):
|
243 |
+
max_frame = len(os.listdir(video_path))
|
244 |
+
images_group = list()
|
245 |
+
frame_indices = self.get_index(bound, fps, max_frame, first_idx=1) # frame_idx starts from 1
|
246 |
+
for frame_index in frame_indices:
|
247 |
+
img = Image.open(os.path.join(video_path, f'{frame_index:05d}.jpg'))
|
248 |
+
images_group.append(img)
|
249 |
+
torch_imgs = self.transform(images_group)
|
250 |
+
return torch_imgs
|
251 |
+
|
252 |
+
def save_video_frames(self, imgs, video_name, frames):
|
253 |
+
|
254 |
+
frame_paths = self.frame_paths(video_name, frames)
|
255 |
+
flag = np.all([osp.exists(p) for p in frame_paths])
|
256 |
+
|
257 |
+
if not flag:
|
258 |
+
block_size = imgs.size(0) // frames
|
259 |
+
split_tensors = torch.split(imgs, block_size)
|
260 |
+
to_pil = transforms.ToPILImage()
|
261 |
+
images = [to_pil(arr) for arr in split_tensors]
|
262 |
+
for im, pth in zip(images, frame_paths):
|
263 |
+
if not osp.exists(pth):
|
264 |
+
im.save(pth)
|
265 |
+
|
266 |
+
return frame_paths
|
267 |
+
|
268 |
+
def qa_template(self, data):
|
269 |
+
question = f"Question: {data['question']}\n"
|
270 |
+
question += 'Options:\n'
|
271 |
+
answer = data['answer']
|
272 |
+
answer_idx = -1
|
273 |
+
for idx, c in enumerate(eval(data['candidates'])):
|
274 |
+
question += f"({chr(ord('A') + idx)}) {c}\n"
|
275 |
+
if c == answer:
|
276 |
+
answer_idx = idx
|
277 |
+
question = question.rstrip()
|
278 |
+
answer = f"({chr(ord('A') + answer_idx)}) {answer}"
|
279 |
+
return question, answer
|
280 |
+
|
281 |
+
def load_into_video_and_process(self, line):
|
282 |
+
try:
|
283 |
+
from moviepy.editor import VideoFileClip, ImageSequenceClip
|
284 |
+
except:
|
285 |
+
raise ImportError(
|
286 |
+
'MoviePy is not installed, please install it by running "pip install moviepy==1.0.3"'
|
287 |
+
)
|
288 |
+
video_path = os.path.join(self.data_root, line['prefix'], line['video'])
|
289 |
+
|
290 |
+
if line['data_type'] in ['gif'] or os.path.splitext(video_path)[1] in ['.webm']:
|
291 |
+
processed_video_path = video_path.replace(os.path.splitext(video_path)[1], '.mp4')
|
292 |
+
if not os.path.exists(processed_video_path):
|
293 |
+
# using MoviePy to transform GIF, webm into mp4 format
|
294 |
+
gif_clip = VideoFileClip(video_path)
|
295 |
+
gif_clip.write_videofile(processed_video_path, codec='libx264')
|
296 |
+
gif_clip.close()
|
297 |
+
elif line['data_type'] in ['frame']:
|
298 |
+
input_images = os.path.join(video_path, '*.jpg')
|
299 |
+
processed_video_path = f'{video_path}.mp4'
|
300 |
+
if not os.path.exists(processed_video_path):
|
301 |
+
# using MoviePy to transform images into mp4
|
302 |
+
image_files = sorted(glob.glob(input_images))
|
303 |
+
image_clip = ImageSequenceClip(image_files, fps=self.frame_fps)
|
304 |
+
image_clip.write_videofile(processed_video_path, codec='libx264')
|
305 |
+
image_clip.close()
|
306 |
+
else:
|
307 |
+
processed_video_path = video_path
|
308 |
+
|
309 |
+
if line['bound']:
|
310 |
+
base_name, suffix = os.path.splitext(processed_video_path)
|
311 |
+
output_video_path = f'{base_name}_processed{suffix}'
|
312 |
+
if not os.path.exists(output_video_path):
|
313 |
+
video_clip = VideoFileClip(processed_video_path)
|
314 |
+
clip = video_clip.subclip(line['start'], min(line['end'], video_clip.duration))
|
315 |
+
clip.write_videofile(output_video_path)
|
316 |
+
clip.close()
|
317 |
+
else:
|
318 |
+
output_video_path = processed_video_path
|
319 |
+
|
320 |
+
return output_video_path
|
321 |
+
|
322 |
+
def save_video_into_images(self, line, num_frames):
|
323 |
+
bound = None
|
324 |
+
if line['bound']:
|
325 |
+
bound = (
|
326 |
+
line['start'],
|
327 |
+
line['end'],
|
328 |
+
)
|
329 |
+
video_path = os.path.join(self.data_root, line['prefix'], line['video'])
|
330 |
+
decord_method = self.decord_method[line['data_type']]
|
331 |
+
self.num_segments = num_frames if num_frames > 0 else self.nframe
|
332 |
+
torch_imgs = decord_method(video_path, bound)
|
333 |
+
img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments)
|
334 |
+
return img_frame_paths
|
335 |
+
|
336 |
+
def build_prompt(self, line, num_frames, video_llm, fps):
|
337 |
+
if fps > 0:
|
338 |
+
raise ValueError('MVBench does not support fps setting, please transfer to MVBench_MP4!')
|
339 |
+
if isinstance(line, int):
|
340 |
+
assert line < len(self)
|
341 |
+
line = self.data.iloc[line]
|
342 |
+
|
343 |
+
question, answer = self.qa_template(line)
|
344 |
+
message = [dict(type='text', value=self.SYS, role='system')]
|
345 |
+
message.append(dict(type='text', value=question))
|
346 |
+
if video_llm:
|
347 |
+
new_video_path = self.load_into_video_and_process(line)
|
348 |
+
message.append(dict(type='video', value=new_video_path))
|
349 |
+
else:
|
350 |
+
img_frame_paths = self.save_video_into_images(line, num_frames)
|
351 |
+
for im in img_frame_paths:
|
352 |
+
message.append(dict(type='image', value=im))
|
353 |
+
message.append(dict(type='text', value='\nOnly give the best option.'))
|
354 |
+
message.append(dict(type='text', value='Best option:(', role='assistant'))
|
355 |
+
return message
|
356 |
+
|
357 |
+
@classmethod
|
358 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
359 |
+
|
360 |
+
assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
|
361 |
+
|
362 |
+
tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
|
363 |
+
tgt_file = eval_file.replace('.xlsx', '_rating.json')
|
364 |
+
score_file = eval_file.replace('.xlsx', '_score.xlsx')
|
365 |
+
|
366 |
+
if not osp.exists(score_file):
|
367 |
+
model = judge_kwargs.setdefault('model', 'chatgpt-0125')
|
368 |
+
assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
|
369 |
+
|
370 |
+
if model == 'exact_matching':
|
371 |
+
model = None
|
372 |
+
elif gpt_key_set():
|
373 |
+
model = build_judge(**judge_kwargs)
|
374 |
+
if not model.working():
|
375 |
+
warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
|
376 |
+
warnings.warn(DEBUG_MESSAGE)
|
377 |
+
model = None
|
378 |
+
else:
|
379 |
+
warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
|
380 |
+
model = None
|
381 |
+
res = {} if not osp.exists(tmp_file) else load(tmp_file)
|
382 |
+
res = {k: v for k, v in res.items() if FAIL_MSG not in v}
|
383 |
+
|
384 |
+
data = load(eval_file)
|
385 |
+
data_un = data[~pd.isna(data['prediction'])]
|
386 |
+
|
387 |
+
for idx in data_un['index']:
|
388 |
+
ans = data.loc[data['index'] == idx, 'answer'].values[0]
|
389 |
+
pred = data.loc[data['index'] == idx, 'prediction'].values[0]
|
390 |
+
options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
|
391 |
+
answer_idx = -1
|
392 |
+
for id, c in enumerate(options):
|
393 |
+
if c == ans:
|
394 |
+
answer_idx = id
|
395 |
+
ans = f"({chr(ord('A') + answer_idx)}) {ans}"
|
396 |
+
input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
|
397 |
+
for id, option_content in enumerate(eval(input_item['candidates'])):
|
398 |
+
input_item[chr(ord('A') + id)] = option_content
|
399 |
+
if option_content == input_item['answer']:
|
400 |
+
input_item['answer'] = chr(ord('A') + id)
|
401 |
+
|
402 |
+
if FAIL_MSG in pred:
|
403 |
+
data.loc[idx, 'score'] = -1
|
404 |
+
else:
|
405 |
+
data.loc[idx, 'score'] = int(check_ans_with_model(
|
406 |
+
pred, ans, model,
|
407 |
+
input_item,
|
408 |
+
'MVBench'
|
409 |
+
))
|
410 |
+
|
411 |
+
rejected = [x for x in data['score'] if x == -1]
|
412 |
+
|
413 |
+
print(
|
414 |
+
f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
|
415 |
+
f'failed to obtain the score for another {len(rejected)} questions. '
|
416 |
+
f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
|
417 |
+
)
|
418 |
+
|
419 |
+
dump(data, score_file)
|
420 |
+
|
421 |
+
rating = get_dimension_rating(score_file)
|
422 |
+
dump(rating, tgt_file)
|
423 |
+
return rating
|
424 |
+
|
425 |
+
|
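The `get_index` method above samples one frame from the middle of each of `num_segments` equal spans between `start_idx` and `end_idx`. A minimal standalone sketch of that sampling rule in plain NumPy (the helper name and example numbers are made up for illustration; this is not repository code):

import numpy as np

def uniform_segment_indices(start_idx, end_idx, num_segments):
    # One index per segment, taken near the segment center, mirroring MVBench.get_index.
    seg_size = float(end_idx - start_idx) / num_segments
    return np.array([
        int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
        for idx in range(num_segments)
    ])

# Hypothetical example: a 10-second clip decoded at 30 fps, reduced to 8 frames.
print(uniform_segment_indices(0, 10 * 30, 8))  # eight indices spread evenly across 300 frames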
class MVBench_MP4(VideoBaseDataset):

    MP4_MD5 = '5c8c6f8b7972c2de65a629590f7c42f5'
    SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \
the detail and movement of objects, and the action and pose of persons. \
Based on your observations, select the best option that accurately addresses the question.
"""
    TYPE = 'Video-MCQ'

    def __init__(self, dataset='MVBench_MP4', pack=False):
        super().__init__(dataset=dataset, pack=pack)

    @classmethod
    def supported_datasets(cls):
        return ['MVBench_MP4']

    def prepare_dataset(self, dataset_name='MVBench_MP4', repo_id='OpenGVLab/MVBench'):
        def check_integrity(pth):
            data_file = osp.join(pth, f'{dataset_name}.tsv')

            if not os.path.exists(data_file):
                return False

            if md5(data_file) != self.MP4_MD5:
                return False

            data = load(data_file)
            for idx, item in data.iterrows():
                if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
                    return False
            return True

        if modelscope_flag_set():
            repo_id = 'modelscope/MVBench'

        cache_path = get_cache_path(repo_id, branch='video')
        if cache_path is not None and check_integrity(cache_path):
            dataset_path = cache_path
        else:
            def generate_tsv(pth):
                data_file = osp.join(pth, f'{dataset_name}.tsv')
                if os.path.exists(data_file) and md5(data_file) == self.MP4_MD5:
                    return
                json_data_path = os.path.join(dataset_path, 'test.json')
                json_data = load(json_data_path)
                root_data_dict = json_data['root']
                self.data_list = []
                for k, v in json_data['meta'].items():
                    for item in v:
                        self.data_list.append({
                            'task_type': k,
                            'prefix': root_data_dict[k],
                            'video': item['video'],
                            'question': item['question'],
                            'answer': item['answer'],
                            'candidates': item['candidates']
                        })
                data_df = pd.DataFrame(self.data_list)
                data_df = data_df.assign(index=range(len(data_df)))
                data_df.to_csv(data_file, sep='\t', index=False)

            if modelscope_flag_set():
                from modelscope import dataset_snapshot_download
                dataset_path = dataset_snapshot_download(dataset_id=repo_id, revision='video')
            else:
                hf_token = os.environ.get('HUGGINGFACE_TOKEN')
                huggingface_hub.login(hf_token)
                dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset', revision='video')
            generate_tsv(dataset_path)

        data_file = osp.join(dataset_path, f'{dataset_name}.tsv')

        self.nframe = 8

        # transform
        self.transform = T.Compose([
            Stack(),
            ToTorchFormatTensor()
        ])

        return dict(root=dataset_path, data_file=data_file)

    def qa_template(self, data):
        question = f"Question: {data['question']}\n"
        question += 'Options:\n'
        answer = data['answer']
        answer_idx = -1
        for idx, c in enumerate(eval(data['candidates'])):
            question += f"({chr(ord('A') + idx)}) {c}\n"
            if c == answer:
                answer_idx = idx
        question = question.rstrip()
        answer = f"({chr(ord('A') + answer_idx)}) {answer}"
        return question, answer

    def get_index_by_frame(self, max_frame):
        seg_size = float(max_frame) / self.num_segments
        frame_indices = np.array([
            int((seg_size / 2) + np.round(seg_size * idx))
            for idx in range(self.num_segments)
        ])
        return frame_indices

    def get_index_by_fps(self, vid, fps):
        total_frames = len(vid)
        video_fps = vid.get_avg_fps()
        total_duration = total_frames / video_fps
        required_frames = int(total_duration * fps)
        step_size = video_fps / fps
        frame_indices = np.array([int(i * step_size) for i in range(required_frames)])
        self.num_segments = len(frame_indices)
        return frame_indices

    def read_video(self, video_path, fps=-1):
        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
        max_frame = len(vr) - 1

        images_group = list()
        if fps < 0:
            frame_indices = self.get_index_by_frame(max_frame)
        else:
            frame_indices = self.get_index_by_fps(vr, fps)

        for frame_index in frame_indices:
            img = Image.fromarray(vr[frame_index].asnumpy())
            images_group.append(img)
        torch_imgs = self.transform(images_group)
        return torch_imgs

    def save_video_frames(self, imgs, video_name, frames, fps):
        if fps > 0:
            frame_paths = self.frame_paths_fps(video_name, frames, fps)
        else:
            frame_paths = self.frame_paths(video_name, frames)
        flag = np.all([osp.exists(p) for p in frame_paths])

        if not flag:
            block_size = imgs.size(0) // frames
            split_tensors = torch.split(imgs, block_size)
            to_pil = transforms.ToPILImage()
            images = [to_pil(arr) for arr in split_tensors]
            for im, pth in zip(images, frame_paths):
                if not osp.exists(pth):
                    im.save(pth)

        return frame_paths

    def save_video_into_images(self, line, num_frames, fps=-1):
        video_path = os.path.join(self.data_root, line['prefix'], line['video'])
        if fps <= 0:
            self.num_segments = num_frames if num_frames > 0 else self.nframe
        else:
            self.num_segments = 0
        torch_imgs = self.read_video(video_path, fps)
        img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments, fps)
        return img_frame_paths

    def build_prompt(self, line, num_frames, video_llm, fps):
        if isinstance(line, int):
            assert line < len(self)
            line = self.data.iloc[line]

        question, answer = self.qa_template(line)
        message = [dict(type='text', value=self.SYS, role='system')]
        message.append(dict(type='text', value=question))
        video_path = os.path.join(self.data_root, line['prefix'], line['video'])
        if video_llm:
            message.append(dict(type='video', value=video_path))
        else:
            img_frame_paths = self.save_video_into_images(line, num_frames, fps)
            for im in img_frame_paths:
                message.append(dict(type='image', value=im))
        message.append(dict(type='text', value='\nOnly give the best option.'))
        message.append(dict(type='text', value='Best option:(', role='assistant'))
        return message

    @classmethod
    def evaluate(self, eval_file, **judge_kwargs):

        assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'

        tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
        tgt_file = eval_file.replace('.xlsx', '_rating.json')
        score_file = eval_file.replace('.xlsx', '_score.xlsx')

        if not osp.exists(score_file):
            model = judge_kwargs.setdefault('model', 'chatgpt-0125')
            assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']

            if model == 'exact_matching':
                model = None
            elif gpt_key_set():
                model = build_judge(**judge_kwargs)
                if not model.working():
                    warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
                    warnings.warn(DEBUG_MESSAGE)
                    model = None
            else:
                warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
                model = None
            res = {} if not osp.exists(tmp_file) else load(tmp_file)
            res = {k: v for k, v in res.items() if FAIL_MSG not in v}

            data = load(eval_file)
            data_un = data[~pd.isna(data['prediction'])]

            for idx in data_un['index']:
                ans = data.loc[data['index'] == idx, 'answer'].values[0]
                pred = data.loc[data['index'] == idx, 'prediction'].values[0]
                options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
                answer_idx = -1
                for id, c in enumerate(options):
                    if c == ans:
                        answer_idx = id
                ans = f"({chr(ord('A') + answer_idx)}) {ans}"
                input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
                for id, option_content in enumerate(eval(input_item['candidates'])):
                    input_item[chr(ord('A') + id)] = option_content
                    if option_content == input_item['answer']:
                        input_item['answer'] = chr(ord('A') + id)

                if FAIL_MSG in pred:
                    data.loc[idx, 'score'] = -1
                else:
                    data.loc[idx, 'score'] = int(check_ans_with_model(
                        pred, ans, model,
                        input_item,
                        'MVBench_MP4'
                    ))

            rejected = [x for x in data['score'] if x == -1]

            print(
                f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
                f'failed to obtain the score for another {len(rejected)} questions. '
                f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
            )

            dump(data, score_file)

        rating = get_dimension_rating(score_file)
        dump(rating, tgt_file)
        return rating
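MVBench_MP4 adds an fps-driven sampling mode: `get_index_by_fps` walks through the video at a fixed wall-clock rate instead of a fixed number of segments. A standalone sketch of that rule (hypothetical helper name and example numbers, not repository code):

import numpy as np

def fps_sample_indices(total_frames, video_fps, target_fps):
    # Mirrors MVBench_MP4.get_index_by_fps: one frame every (video_fps / target_fps) frames.
    total_duration = total_frames / video_fps
    required_frames = int(total_duration * target_fps)
    step_size = video_fps / target_fps
    return np.array([int(i * step_size) for i in range(required_frames)])

# Hypothetical example: a 24 fps video with 240 frames sampled at 1 frame per second.
print(fps_sample_indices(240, 24.0, 1))  # indices 0, 24, 48, ..., 216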
vlmeval/VLMEvalKit_old/vlmeval/dataset/text_base.py
ADDED
@@ -0,0 +1,88 @@
from abc import abstractmethod
from ..smp import *


class TextBaseDataset:
    MODALITY = 'TEXT'
    DATASET_URL = {}
    DATASET_MD5 = {}

    def __init__(self, dataset='MMBench', **kwargs):
        self.dataset_name = dataset

        data = self.load_data(dataset)

        data['index'] = [str(x) for x in data['index']]

        if np.all([istype(x, int) for x in data['index']]):
            data['index'] = [int(x) for x in data['index']]

        self.data = data
        self.post_build(dataset)

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return dict(self.data.iloc[idx])

    def prepare_tsv(self, url, file_md5=None):
        data_root = LMUDataRoot()
        os.makedirs(data_root, exist_ok=True)
        update_flag = False
        file_name = url.split('/')[-1]
        data_path = osp.join(data_root, file_name)
        if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
            pass
        else:
            warnings.warn('The dataset tsv is not downloaded')
            download_file(url, data_path)
            update_flag = True

        if file_size(data_path, 'GB') > 1:
            local_path = data_path.replace('.tsv', '_local.tsv')
            if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
                from ..tools import LOCALIZE
                LOCALIZE(data_path, local_path)
            data_path = local_path
        return load(data_path)

    def dump_image(self, line):
        return []

    def display(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]
        assert isinstance(line, pd.Series) or isinstance(line, dict)
        mmqa_display(line)

    # Return a list of dataset names that are supported by this class, can override
    @classmethod
    def supported_datasets(cls):
        return list(cls.DATASET_URL)

    # Given the dataset name, return the dataset as a pandas dataframe, can override
    def load_data(self, dataset):
        url = self.DATASET_URL[dataset]
        file_md5 = self.DATASET_MD5[dataset]
        return self.prepare_tsv(url, file_md5)

    # Post built hook, will be called after the dataset is built, can override
    def post_build(self, dataset):
        pass

    # Given one data record, return the built prompt (a multi-modal message), can override
    def build_prompt(self, line):
        if isinstance(line, int):
            line = self.data.iloc[line]

        question = line['question']

        msgs = []
        msgs.append(dict(type='text', value=question))
        return msgs

    # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
    @abstractmethod
    def evaluate(self, eval_file, **judge_kwargs):
        pass
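`prepare_tsv` above re-downloads the dataset TSV only when the cached copy is missing or its MD5 no longer matches the expected value. A minimal standard-library sketch of that cache-or-download decision (the helper names are hypothetical and the download handling is simplified; VLMEvalKit's own `download_file` and `LMUDataRoot` helpers are not reproduced here):

import hashlib
import os
import urllib.request

def file_md5(path, chunk_size=1 << 20):
    # Stream the file so large TSVs do not need to fit in memory.
    h = hashlib.md5()
    with open(path, 'rb') as f:
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)
    return h.hexdigest()

def fetch_if_stale(url, data_root, expected_md5=None):
    # Keep the cached file when it exists and (optionally) matches the expected MD5,
    # otherwise download it again -- the same decision prepare_tsv makes.
    os.makedirs(data_root, exist_ok=True)
    data_path = os.path.join(data_root, url.split('/')[-1])
    if os.path.exists(data_path) and (expected_md5 is None or file_md5(data_path) == expected_md5):
        return data_path
    urllib.request.urlretrieve(url, data_path)
    return data_path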
vlmeval/VLMEvalKit_old/vlmeval/dataset/vcr.py
ADDED
@@ -0,0 +1,335 @@
import uuid
from functools import partial
from .image_base import ImageBaseDataset
from ..smp import *

rouge = None
nlp_en = None
nlp_zh = None
nlp = None


def initialize():
    import evaluate
    import spacy

    global rouge, nlp_en, nlp_zh, nlp

    try:
        rouge = evaluate.load('rouge', experiment_id=str(uuid.uuid4()))
    except Exception as e:
        logging.critical(f'{type(e)}: {e}')
        logging.critical('Please first `pip install rouge_score`.')

    try:
        nlp_en = spacy.load('en_core_web_sm')
    except Exception as e:
        logging.warning(f'{type(e)}: {e}')
        logging.warning('Will automatically download en_core_web_sm via spacy.')
        spacy.cli.download('en_core_web_sm')
        nlp_en = spacy.load('en_core_web_sm')

    try:
        nlp_zh = spacy.load('zh_core_web_sm')
    except Exception as e:
        logging.warning(f'{type(e)}: {e}')
        logging.warning('Will automatically download zh_core_web_sm via spacy.')
        spacy.cli.download('zh_core_web_sm')
        nlp_zh = spacy.load('zh_core_web_sm')

    nlp = {'en': nlp_en, 'zh': nlp_zh}


def rough_filter(answer_text):
    if "I can't" in answer_text:
        return False
    elif 'I cannot' in answer_text:
        return False
    elif 'sorry' in answer_text.lower():
        return False
    if '无法' in answer_text:
        return False
    elif '抱歉' in answer_text:
        return False
    else:
        return True


def zero_template(crossed_text):
    return {
        'crossed_text': crossed_text,
        'max_sim_val': 0,
        'max_sim_string': '',
        'precision': 0,
        'recall': 0,
        'f1': 0,
        'jaccard': 0,
        'rouge1': 0,
        'exact_match': 0,
    }


def tokenize(text, language):
    """
    Tokenize the text and return the tokens.

    Parameters:
    text (str): The text to tokenize.
    language (str): The language of the text.

    Returns:
    list: The list of tokens.
    """
    assert language in ['en', 'zh']
    nlp_language = nlp[language]
    processed_text = nlp_language(text)
    return [token.text for token in processed_text]


def find_best_match(needle, hay, language, rouge):
    """
    Finds the best matching n-gram in the haystack for the given needle.

    Parameters:
    needle (str): The string to find.
    hay (str): The text to search within.

    Returns:
    tuple: The highest similarity value and the best matching string.
    """
    assert language in ['en', 'zh']
    from nltk.util import ngrams
    from difflib import SequenceMatcher as SM

    tokens_hay = tokenize(hay, language)
    tokens_needle = tokenize(needle, language)

    splitter = '' if language == 'zh' else ' '
    ngrams_ = ngrams(tokens_hay, len(tokens_needle))
    max_sim_val = 0
    max_sim_string = ''
    max_sim_ngram = []
    tokens_needle_set = set(tokens_needle)
    ngrams_hasjoint = [
        ngram
        for ngram in ngrams_
        if not set(ngram).isdisjoint(tokens_needle_set)
    ]

    for ngram in ngrams_hasjoint:
        hay_ngram = splitter.join(ngram)
        similarity = SM(None, hay_ngram, needle).ratio()
        if similarity > max_sim_val:
            max_sim_val = similarity
            max_sim_string = hay_ngram
            max_sim_ngram = ngram

    # Evaluate
    if len(max_sim_ngram) == 0:
        return {
            'crossed_text': needle,
            'max_sim_val': 0,
            'max_sim_string': '',
            'precision': 0,
            'recall': 0,
            'f1': 0,
            'jaccard': 0,
            'rouge1': 0,
            'exact_match': 0,
        }
    pred_set = set(max_sim_ngram)
    ref_set = set(tokens_needle)
    correct_tokens = pred_set.intersection(ref_set)
    len_correct_tokens = len(correct_tokens)

    precision = len_correct_tokens / len(pred_set)
    recall = len_correct_tokens / len(ref_set)
    if (precision + recall) == 0:
        f1 = 0
    else:
        f1 = 2 * precision * recall / (precision + recall)
    union = pred_set.union(ref_set)
    jaccard = len_correct_tokens / len(union) if len(union) > 0 else 0
    rouge_1 = rouge.compute(
        predictions=[max_sim_string],
        references=[needle],
        tokenizer=partial(tokenize, language=language),
        rouge_types=['rouge1'],
    )['rouge1']
    exact_match = float(list(max_sim_ngram) == list(tokens_needle))
    out = {
        'crossed_text': needle,
        'max_sim_string': max_sim_string,
        'max_sim_val': max_sim_val,
        'precision': precision,
        'recall': recall,
        'f1': f1,
        'jaccard': jaccard,
        'rouge1': rouge_1,
        'exact_match': exact_match,
    }
    return out
+
def process_match_single_new(
|
175 |
+
image_id, prediction, answer, language, progress
|
176 |
+
):
|
177 |
+
"""
|
178 |
+
process the inference results for a single image and calculate the metrics
|
179 |
+
|
180 |
+
Parameters:
|
181 |
+
image_id (int): The image id (question id).
|
182 |
+
prediction (str): The prediction text.
|
183 |
+
answer (Union[str, List[str]]): The answer text, or a list of answer texts. The masked n-grams in the image.
|
184 |
+
language (str): The language of the text. Can be "en" or "zh".
|
185 |
+
rouge (rouge): The rouge metric object.
|
186 |
+
progress (multiprocessing.Queue): The progress queue.
|
187 |
+
|
188 |
+
Returns:
|
189 |
+
tuple: The image id (question_id, int) and the result per id (dict of dict of dict).
|
190 |
+
"""
|
191 |
+
result_per_id = {image_id: {}}
|
192 |
+
if isinstance(answer, str):
|
193 |
+
answer = eval(answer)
|
194 |
+
assert isinstance(answer, list)
|
195 |
+
result = prediction.split('Assistant: ')[-1]
|
196 |
+
for i, crossed_text in enumerate(answer):
|
197 |
+
if rough_filter(result):
|
198 |
+
find_best_match_result = find_best_match(
|
199 |
+
crossed_text, result, language, rouge
|
200 |
+
)
|
201 |
+
if i == 0:
|
202 |
+
result_per_id[image_id] = {str(i): find_best_match_result}
|
203 |
+
else:
|
204 |
+
result_per_id[image_id][str(i)] = find_best_match_result
|
205 |
+
else:
|
206 |
+
if i == 0:
|
207 |
+
result_per_id[image_id] = {str(i): zero_template(crossed_text)}
|
208 |
+
else:
|
209 |
+
result_per_id[image_id][str(i)] = zero_template(crossed_text)
|
210 |
+
progress.put(1)
|
211 |
+
return image_id, result_per_id
|
212 |
+
|
213 |
+
|
214 |
+
class VCRDataset(ImageBaseDataset):
|
215 |
+
TYPE = 'VQA'
|
216 |
+
|
217 |
+
URL_PREFIX = 'https://huggingface.co/datasets/vcr-org'
|
218 |
+
|
219 |
+
DATASET_URL = {
|
220 |
+
'VCR_EN_EASY_500': f'{URL_PREFIX}/VCR-wiki-en-easy-test-500/resolve/main/VCR-wiki-en-easy-test-500.tsv',
|
221 |
+
'VCR_EN_EASY_100': f'{URL_PREFIX}/VCR-wiki-en-easy-test-100/resolve/main/VCR-wiki-en-easy-test-100.tsv',
|
222 |
+
'VCR_EN_EASY_ALL': f'{URL_PREFIX}/VCR-wiki-en-easy-test/resolve/main/VCR-wiki-en-easy-test.tsv',
|
223 |
+
'VCR_EN_HARD_500': f'{URL_PREFIX}/VCR-wiki-en-hard-test-500/resolve/main/VCR-wiki-en-hard-test-500.tsv',
|
224 |
+
'VCR_EN_HARD_100': f'{URL_PREFIX}/VCR-wiki-en-hard-test-100/resolve/main/VCR-wiki-en-hard-test-100.tsv',
|
225 |
+
'VCR_EN_HARD_ALL': f'{URL_PREFIX}/VCR-wiki-en-hard-test/resolve/main/VCR-wiki-en-hard-test.tsv',
|
226 |
+
'VCR_ZH_EASY_500': f'{URL_PREFIX}/VCR-wiki-zh-easy-test-500/resolve/main/VCR-wiki-zh-easy-test-500.tsv',
|
227 |
+
'VCR_ZH_EASY_100': f'{URL_PREFIX}/VCR-wiki-zh-easy-test-100/resolve/main/VCR-wiki-zh-easy-test-100.tsv',
|
228 |
+
'VCR_ZH_EASY_ALL': f'{URL_PREFIX}/VCR-wiki-zh-easy-test/resolve/main/VCR-wiki-zh-easy-test.tsv',
|
229 |
+
'VCR_ZH_HARD_500': f'{URL_PREFIX}/VCR-wiki-zh-hard-test-500/resolve/main/VCR-wiki-zh-hard-test-500.tsv',
|
230 |
+
'VCR_ZH_HARD_100': f'{URL_PREFIX}/VCR-wiki-zh-hard-test-100/resolve/main/VCR-wiki-zh-hard-test-100.tsv',
|
231 |
+
'VCR_ZH_HARD_ALL': f'{URL_PREFIX}/VCR-wiki-zh-hard-test/resolve/main/VCR-wiki-zh-hard-test.tsv',
|
232 |
+
}
|
233 |
+
|
234 |
+
DATASET_MD5 = {
|
235 |
+
'VCR_EN_EASY_500': 'fd9258db52f8685dc710619a0ea0a261',
|
236 |
+
'VCR_EN_EASY_100': '9df5d7266683458621ecbe122beb72f0',
|
237 |
+
'VCR_EN_EASY_ALL': '8a9b96885f251d1c85f42f84073327f1',
|
238 |
+
'VCR_EN_HARD_500': '0a22a85080b6a1f52b1f95e302d43df4',
|
239 |
+
'VCR_EN_HARD_100': '1b20f5cbcbeae0b0bec77f7a36143958',
|
240 |
+
'VCR_EN_HARD_ALL': '2d8b8b1ee0eba0e0b618fd3aa7d9710e',
|
241 |
+
'VCR_ZH_EASY_500': 'beca5fd54176adf44cf94bd9b50cf048',
|
242 |
+
'VCR_ZH_EASY_100': '4a86a5678a79844d6d22ab0629c51cd5',
|
243 |
+
'VCR_ZH_EASY_ALL': '5050fe7f0027ad2068fd4c7f220edaea',
|
244 |
+
'VCR_ZH_HARD_500': '617e3360f75c54455625cb0a8da5c1e7',
|
245 |
+
'VCR_ZH_HARD_100': 'b0e38c85f5d5e63894a3b881c372a62b',
|
246 |
+
'VCR_ZH_HARD_ALL': '54bbfef448206518b03127ef8b61404c',
|
247 |
+
}
|
248 |
+
|
249 |
+
def __init__(self, dataset='VCR_EN_EASY_500', skip_noimg=True):
|
250 |
+
super().__init__(dataset, skip_noimg)
|
251 |
+
|
252 |
+
initialize()
|
253 |
+
self.language = 'en' if 'EN' in dataset else 'zh'
|
254 |
+
self.difficulty = 'easy' if 'EASY' in dataset else 'hard'
|
255 |
+
|
256 |
+
# def build_prompt(self, line):
|
257 |
+
# msgs = super().build_prompt(line)
|
258 |
+
# assert msgs[-1]['type'] == 'text'
|
259 |
+
# if self.language == 'zh':
|
260 |
+
# msgs[-1]['value'] += '图像中被覆盖的文本是什么?请在不输出解释的情况下还原被覆盖的文本。'
|
261 |
+
# else:
|
262 |
+
# msgs[-1]['value'] += ('What is the covered texts in the image? '
|
263 |
+
# 'Please restore the covered texts without outputting the explanations.')
|
264 |
+
# return msgs
|
265 |
+
|
266 |
+
def evaluate(self, eval_file, **judge_kwargs):
|
267 |
+
import multiprocessing
|
268 |
+
|
269 |
+
vcr_score_list = {'Exact_Match': [], 'Jaccard': []}
|
270 |
+
vcr_score = {'Exact_Match': 0, 'Jaccard': 0}
|
271 |
+
logger = get_logger('Evaluation')
|
272 |
+
data = load(eval_file)
|
273 |
+
|
274 |
+
lt = len(data)
|
275 |
+
lines = [data.iloc[i] for i in range(lt)]
|
276 |
+
|
277 |
+
pool = multiprocessing.Pool()
|
278 |
+
manager = multiprocessing.Manager()
|
279 |
+
progress_queue = manager.Queue()
|
280 |
+
results = []
|
281 |
+
|
282 |
+
overall_results = {str(image_id): {} for image_id in range(len(lines))}
|
283 |
+
|
284 |
+
for instance_id, instance in enumerate(lines):
|
285 |
+
results.append(
|
286 |
+
pool.apply_async(
|
287 |
+
process_match_single_new,
|
288 |
+
args=(
|
289 |
+
str(instance_id),
|
290 |
+
instance['prediction'],
|
291 |
+
instance['answer'],
|
292 |
+
self.language,
|
293 |
+
progress_queue,
|
294 |
+
),
|
295 |
+
)
|
296 |
+
)
|
297 |
+
pool.close()
|
298 |
+
|
299 |
+
# Display progress bar
|
300 |
+
for _ in tqdm(range(len(results))):
|
301 |
+
progress_queue.get()
|
302 |
+
|
303 |
+
pool.join()
|
304 |
+
|
305 |
+
# Merging results into overall_result
|
306 |
+
for result in results:
|
307 |
+
image_id, result_per_id = result.get()
|
308 |
+
overall_results[str(image_id)].update(result_per_id[image_id])
|
309 |
+
for blank_id_str in result_per_id[image_id].keys():
|
310 |
+
vcr_score_list['Exact_Match'].append(
|
311 |
+
result_per_id[image_id][blank_id_str]['exact_match']
|
312 |
+
)
|
313 |
+
vcr_score_list['Jaccard'].append(
|
314 |
+
result_per_id[image_id][blank_id_str]['jaccard']
|
315 |
+
)
|
316 |
+
vcr_score['Exact_Match'] = np.mean(vcr_score_list['Exact_Match'])
|
317 |
+
vcr_score['Jaccard'] = np.mean(vcr_score_list['Jaccard'])
|
318 |
+
results_out = {
|
319 |
+
k: v for i in range(len(results)) for k, v in results[i].get()[1].items()
|
320 |
+
}
|
321 |
+
results_with_metrics = {
|
322 |
+
'Exact_Match': vcr_score['Exact_Match'],
|
323 |
+
'Jaccard': vcr_score['Jaccard'],
|
324 |
+
'Predictions': results_out,
|
325 |
+
}
|
326 |
+
score_pth = eval_file.replace(
|
327 |
+
'.xlsx', f'{self.language}_{self.difficulty}_score.json'
|
328 |
+
)
|
329 |
+
dump(results_with_metrics, score_pth)
|
330 |
+
logger.info(
|
331 |
+
f'VCR successfully finished evaluating {eval_file}, results saved in {score_pth}'
|
332 |
+
)
|
333 |
+
logger.info('Score: ')
|
334 |
+
for key, value in vcr_score.items():
|
335 |
+
logger.info('{}:{}'.format(key, value))
|
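`VCRDataset.evaluate` above fans the per-image matching out to a `multiprocessing.Pool` and lets the workers report completion through a Manager queue, which is what drives the progress loop before the async results are merged. A minimal standalone sketch of that pattern with a stand-in worker (not the real scorer):

import multiprocessing

def score_one(item, progress):
    # Stand-in for process_match_single_new: do some work, then report progress.
    progress.put(1)
    return item, item * item

def main():
    items = list(range(8))
    pool = multiprocessing.Pool()
    manager = multiprocessing.Manager()
    progress = manager.Queue()
    async_results = [pool.apply_async(score_one, args=(x, progress)) for x in items]
    pool.close()
    for _ in range(len(async_results)):  # one token per finished task drives the progress display
        progress.get()
    pool.join()
    merged = dict(r.get() for r in async_results)
    print(merged)

if __name__ == '__main__':
    main()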
vlmeval/VLMEvalKit_old/vlmeval/dataset/video_concat_dataset.py
ADDED
@@ -0,0 +1,83 @@
from ..smp import *
from .video_base import VideoBaseDataset


class ConcatVideoDataset(VideoBaseDataset):
    # This dataset takes multiple dataset names as input and aggregates them into a single dataset.
    # Each single dataset should not have a field named `SUB_DATASET`

    DATASET_SETS = {}

    def __init__(self, dataset):
        from . import build_dataset
        datasets = self.DATASET_SETS[dataset]
        self.dataset_map = {}
        # The name of the compilation
        self.dataset_name = dataset
        self.datasets = datasets
        for dname in datasets:
            dataset = build_dataset(dname)
            assert dataset is not None, dataset
            self.dataset_map[dname] = dataset
        TYPES = [x.TYPE for x in self.dataset_map.values()]
        MODALITIES = [x.MODALITY for x in self.dataset_map.values()]
        # assert np.all([x == TYPES[0] for x in TYPES]), (datasets, TYPES)
        assert np.all([x == MODALITIES[0] for x in MODALITIES]), (datasets, MODALITIES)
        self.TYPE = TYPES
        self.MODALITY = MODALITIES[0]
        data_all = []
        for dname in datasets:
            data = self.dataset_map[dname].data
            data['SUB_DATASET'] = [dname] * len(data)
            data_all.append(data)

        data = pd.concat(data_all)
        data['original_index'] = data.pop('index')
        data['index'] = np.arange(len(data))
        self.data = data

    def build_prompt(self, line, num_frames, video_llm, fps):
        if isinstance(line, int):
            line = self.data.iloc[line]
        idx = line['original_index']
        dname = line['SUB_DATASET']
        org_data = self.dataset_map[dname].data
        org_line = cp.deepcopy(org_data[org_data['index'] == idx]).iloc[0]
        return self.dataset_map[dname].build_prompt(org_line, num_frames, video_llm, fps)

    def dump_image(self, line):
        # Assert all images are pre-dumped
        assert 'image' not in line
        assert 'image_path' in line
        tgt_path = toliststr(line['image_path'])
        return tgt_path

    @classmethod
    def supported_datasets(cls):
        return []  # list(cls.DATASET_SETS)

    def evaluate(self, eval_file, **judge_kwargs):
        suffix = eval_file.split('.')[-1]
        # First, split the eval_file by dataset
        data_all = load(eval_file)
        for dname in self.datasets:
            tgt = eval_file.replace(self.dataset_name, dname)
            data_sub = data_all[data_all['SUB_DATASET'] == dname]
            data_sub.pop('index')
            data_sub['index'] = data_sub.pop('original_index')
            data_sub.pop('SUB_DATASET')
            dump(data_sub, tgt)
        # Then, evaluate each dataset separately
        results_all = {}
        for dname in self.datasets:
            tgt = eval_file.replace(self.dataset_name, dname)
            res = self.dataset_map[dname].evaluate(tgt, **judge_kwargs)
            results_all.update(res)

        result = pd.DataFrame(results_all, index=['success', 'overall'])
        result = result.T
        for idx, item in result.iterrows():
            result.loc[idx, 'acc'] = round(item['success'] / item['overall'] * 100, 1)
        score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
        dump(result, score_file)
        return result
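ConcatVideoDataset stitches its member datasets together by moving each record's per-dataset index into `original_index`, assigning a fresh global `index`, and tagging rows with `SUB_DATASET` so `evaluate` can split the predictions back out later. A small pandas sketch of that round trip with toy frames (not the real TSVs):

import numpy as np
import pandas as pd

# Two toy member datasets, each with its own 'index' column.
sub_a = pd.DataFrame({'index': [0, 1], 'question': ['a0', 'a1']})
sub_b = pd.DataFrame({'index': [0, 1, 2], 'question': ['b0', 'b1', 'b2']})

parts = []
for name, df in [('DatasetA', sub_a), ('DatasetB', sub_b)]:
    df = df.copy()
    df['SUB_DATASET'] = name
    parts.append(df)

merged = pd.concat(parts)
merged['original_index'] = merged.pop('index')   # keep the per-dataset index around
merged['index'] = np.arange(len(merged))         # fresh global index

# Later, split back per sub-dataset and restore the original index, as evaluate() does.
only_b = merged[merged['SUB_DATASET'] == 'DatasetB'].copy()
only_b.pop('index')
only_b['index'] = only_b.pop('original_index')
only_b.pop('SUB_DATASET')
print(only_b)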
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-310.pyc
ADDED
Binary file (220 Bytes)
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-311.pyc
ADDED
Binary file (274 Bytes)
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-38.pyc
ADDED
Binary file (218 Bytes)
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-310.pyc
ADDED
Binary file (12.7 kB)
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-311.pyc
ADDED
Binary file (25.5 kB)
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-38.pyc
ADDED
Binary file (12.8 kB)
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/log.cpython-310.pyc
ADDED
Binary file (1.23 kB)
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/log.cpython-38.pyc
ADDED
Binary file (1.23 kB)
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/misc.cpython-310.pyc
ADDED
Binary file (10.3 kB)