tuandunghcmut committed
Commit 385fd77 · verified · 1 Parent(s): ca0b425

Add files using upload-large-folder tool

This view is limited to 50 files because it contains too many changes. See raw diff.
Files changed (50)
  1. PaddleMIX/comfyui/ComfyUI_ppdiffusers/utils/callbacks.py +20 -0
  2. PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SD15/workflow_SD1.5_inpaint.json +515 -0
  3. PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SD15/workflow_SD1.5_text2img.json +416 -0
  4. PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SDXL/workflow_SDXL_text2img.json +416 -0
  5. vlmeval/VLMEvalKit_old/docs/en/.readthedocs.yaml +17 -0
  6. vlmeval/VLMEvalKit_old/docs/en/ConfigSystem.md +57 -0
  7. vlmeval/VLMEvalKit_old/docs/en/Development.md +146 -0
  8. vlmeval/VLMEvalKit_old/docs/en/Makefile +20 -0
  9. vlmeval/VLMEvalKit_old/docs/en/Quickstart.md +148 -0
  10. vlmeval/VLMEvalKit_old/docs/en/conf.py +234 -0
  11. vlmeval/VLMEvalKit_old/docs/en/index.rst +41 -0
  12. vlmeval/VLMEvalKit_old/docs/zh-CN/_static/css/readthedocs.css +63 -0
  13. vlmeval/VLMEvalKit_old/docs/zh-CN/_static/image/logo.svg +24 -0
  14. vlmeval/VLMEvalKit_old/docs/zh-CN/_templates/404.html +18 -0
  15. vlmeval/VLMEvalKit_old/vlmeval/__pycache__/__init__.cpython-311.pyc +0 -0
  16. vlmeval/VLMEvalKit_old/vlmeval/__pycache__/__init__.cpython-38.pyc +0 -0
  17. vlmeval/VLMEvalKit_old/vlmeval/__pycache__/config.cpython-310.pyc +0 -0
  18. vlmeval/VLMEvalKit_old/vlmeval/__pycache__/inference_mt.cpython-310.pyc +0 -0
  19. vlmeval/VLMEvalKit_old/vlmeval/__pycache__/inference_video.cpython-310.pyc +0 -0
  20. vlmeval/VLMEvalKit_old/vlmeval/api/bluelm_v_api.py +120 -0
  21. vlmeval/VLMEvalKit_old/vlmeval/api/gpt.py +263 -0
  22. vlmeval/VLMEvalKit_old/vlmeval/api/sensechat_vision.py +257 -0
  23. vlmeval/VLMEvalKit_old/vlmeval/dataset/__init__.py +228 -0
  24. vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/__init__.cpython-38.pyc +0 -0
  25. vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/dude.cpython-310.pyc +0 -0
  26. vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/image_caption.cpython-310.pyc +0 -0
  27. vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/image_caption.cpython-38.pyc +0 -0
  28. vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/longvideobench.cpython-38.pyc +0 -0
  29. vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/mmbench_video.cpython-310.pyc +0 -0
  30. vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/slidevqa.cpython-310.pyc +0 -0
  31. vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/slidevqa.cpython-38.pyc +0 -0
  32. vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/text_base.cpython-38.pyc +0 -0
  33. vlmeval/VLMEvalKit_old/vlmeval/dataset/image_caption.py +89 -0
  34. vlmeval/VLMEvalKit_old/vlmeval/dataset/image_vqa.py +1333 -0
  35. vlmeval/VLMEvalKit_old/vlmeval/dataset/image_yorn.py +95 -0
  36. vlmeval/VLMEvalKit_old/vlmeval/dataset/miabench.py +167 -0
  37. vlmeval/VLMEvalKit_old/vlmeval/dataset/mmmath.py +446 -0
  38. vlmeval/VLMEvalKit_old/vlmeval/dataset/mvbench.py +668 -0
  39. vlmeval/VLMEvalKit_old/vlmeval/dataset/text_base.py +88 -0
  40. vlmeval/VLMEvalKit_old/vlmeval/dataset/vcr.py +335 -0
  41. vlmeval/VLMEvalKit_old/vlmeval/dataset/video_concat_dataset.py +83 -0
  42. vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-310.pyc +0 -0
  43. vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-311.pyc +0 -0
  44. vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-38.pyc +0 -0
  45. vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-310.pyc +0 -0
  46. vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-311.pyc +0 -0
  47. vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-38.pyc +0 -0
  48. vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/log.cpython-310.pyc +0 -0
  49. vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/log.cpython-38.pyc +0 -0
  50. vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/misc.cpython-310.pyc +0 -0
PaddleMIX/comfyui/ComfyUI_ppdiffusers/utils/callbacks.py ADDED
@@ -0,0 +1,20 @@
# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

from typing import Dict


def progress_callback(pbar, cls, step, timestep, kwargs) -> Dict:
    pbar.update(1)
    return {}
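
For context, `progress_callback` matches a diffusers-style per-step callback once a progress bar is bound as its first argument. The sketch below shows one way it might be wired up; the `callback_on_step_end` keyword and the `tqdm` bar are assumptions for illustration, not usage confirmed by this commit.

```python
# Hypothetical usage sketch (not part of this commit).
from functools import partial
from tqdm import tqdm

def run_with_progress(pipe, prompt, steps=20):
    with tqdm(total=steps) as pbar:
        # Bind the bar so the remaining arguments line up with a
        # (pipe, step, timestep, kwargs) per-step callback signature.
        cb = partial(progress_callback, pbar)
        return pipe(prompt, num_inference_steps=steps, callback_on_step_end=cb)
```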
PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SD15/workflow_SD1.5_inpaint.json ADDED
@@ -0,0 +1,515 @@
{
  "last_node_id": 23,
  "last_link_id": 33,
  "nodes": [
    {"id": 4, "type": "PaddleSDVaeDecoder", "pos": [1011, 398], "size": {"0": 210, "1": 46}, "flags": {}, "order": 11, "mode": 0, "inputs": [{"name": "latent", "type": "LATENT", "link": 33, "label": "latent"}, {"name": "sd_pipe", "type": "PIPELINE", "link": 4, "label": "sd_pipe"}], "outputs": [{"name": "image", "type": "IMAGE", "links": [3], "shape": 3, "label": "image", "slot_index": 0}], "properties": {"Node name for S&R": "PaddleSDVaeDecoder"}},
    {"id": 5, "type": "PaddleSaveImage", "pos": [1478, 470], "size": {"0": 315, "1": 270}, "flags": {}, "order": 12, "mode": 0, "inputs": [{"name": "images", "type": "IMAGE", "link": 3, "label": "images"}], "properties": {"Node name for S&R": "PaddleSaveImage"}, "widgets_values": ["ComfyUI"]},
    {"id": 7, "type": "LoadImage", "pos": [50, 588], "size": {"0": 315, "1": 314}, "flags": {}, "order": 0, "mode": 0, "outputs": [{"name": "IMAGE", "type": "IMAGE", "links": [30], "shape": 3, "label": "IMAGE", "slot_index": 0}, {"name": "MASK", "type": "MASK", "links": [28], "shape": 3, "label": "MASK", "slot_index": 1}], "properties": {"Node name for S&R": "LoadImage"}, "widgets_values": ["clipspace/clipspace-mask-572957.png [input]", "image"], "color": "#322", "bgcolor": "#533"},
    {"id": 15, "type": "PromptInput", "pos": [479, 1004], "size": {"0": 400, "1": 200}, "flags": {}, "order": 1, "mode": 0, "outputs": [{"name": "prompt", "type": "PROMPT", "links": [31], "shape": 3, "label": "prompt", "slot_index": 0}], "properties": {"Node name for S&R": "PromptInput"}, "widgets_values": ["1girl, blue hair"]},
    {"id": 12, "type": "PromptInput", "pos": [965, 954], "size": {"0": 400, "1": 200}, "flags": {}, "order": 2, "mode": 0, "outputs": [{"name": "prompt", "type": "PROMPT", "links": [32], "shape": 3, "label": "prompt", "slot_index": 0}], "properties": {"Node name for S&R": "PromptInput"}, "widgets_values": ["low, error, ugly"]},
    {"id": 19, "type": "Note", "pos": [1406, 968], "size": {"0": 210, "1": 58}, "flags": {}, "order": 3, "mode": 0, "properties": {"text": ""}, "widgets_values": ["这里填负向画面提示 (不想要的内容)"], "color": "#432", "bgcolor": "#653"},
    {"id": 18, "type": "Note", "pos": [254, 1013], "size": {"0": 210, "1": 58}, "flags": {}, "order": 4, "mode": 0, "properties": {"text": ""}, "widgets_values": ["这里填正向画面提示 (想要的内容)"], "color": "#432", "bgcolor": "#653"},
    {"id": 21, "type": "Note", "pos": [990, 543], "size": {"0": 217.51138305664062, "1": 164.82931518554688}, "flags": {}, "order": 5, "mode": 0, "properties": {"text": ""}, "widgets_values": ["- denoise是重绘幅度,越高程度越大\n- steps是画笔绘制的步数\n- number是每次同时绘制的张数\n- cfg可以调整画面细节参数\n- scheduler是不同的去噪声方式"], "color": "#432", "bgcolor": "#653"},
    {"id": 22, "type": "Note", "pos": [1835, 498], "size": {"0": 210, "1": 58}, "flags": {}, "order": 6, "mode": 0, "properties": {"text": ""}, "widgets_values": ["这里是最终结果"], "color": "#432", "bgcolor": "#653"},
    {"id": 23, "type": "Note", "pos": [324, 227], "size": {"0": 210, "1": 58}, "flags": {}, "order": 7, "mode": 0, "properties": {"text": ""}, "widgets_values": ["这里选择喜欢的AIGC大模型"], "color": "#432", "bgcolor": "#653"},
    {"id": 17, "type": "PaddleSDInpaintPipe", "pos": [628, 526], "size": {"0": 315, "1": 282}, "flags": {}, "order": 10, "mode": 0, "inputs": [{"name": "sd_pipe", "type": "PIPELINE", "link": 29, "label": "sd_pipe", "slot_index": 0}, {"name": "image", "type": "IMAGE", "link": 30, "label": "image"}, {"name": "mask", "type": "MASK", "link": 28, "label": "mask", "slot_index": 2}, {"name": "prompt", "type": "PROMPT", "link": 31, "label": "prompt", "slot_index": 3}, {"name": "negative_prompt", "type": "PROMPT", "link": 32, "label": "negative_prompt"}], "outputs": [{"name": "latent", "type": "LATENT", "links": [33], "shape": 3, "label": "latent", "slot_index": 0}], "properties": {"Node name for S&R": "PaddleSDInpaintPipe"}, "widgets_values": [0.7000000000000001, 20, 1, 1064456556884681, "randomize", 7.5, "euler"]},
    {"id": 1, "type": "PaddleSDCheckpointLoader", "pos": [-36, 291], "size": {"0": 315, "1": 58}, "flags": {}, "order": 8, "mode": 0, "outputs": [{"name": "sd_pipe", "type": "PIPELINE", "links": [4, 29], "shape": 3, "label": "sd_pipe", "slot_index": 0}], "properties": {"Node name for S&R": "PaddleSDCheckpointLoader"}, "widgets_values": ["sd15/人物写真_majicmixRealistic_v7.safetensors"]},
    {"id": 20, "type": "Note", "pos": [-204, 673], "size": {"0": 210, "1": 58}, "flags": {}, "order": 9, "mode": 0, "properties": {"text": ""}, "widgets_values": ["这里上传原图像,右键可以打开MaskEditor进行mask绘制。"], "color": "#432", "bgcolor": "#653"}
  ],
  "links": [
    [3, 4, 0, 5, 0, "IMAGE"],
    [4, 1, 0, 4, 1, "PIPELINE"],
    [28, 7, 1, 17, 2, "MASK"],
    [29, 1, 0, 17, 0, "PIPELINE"],
    [30, 7, 0, 17, 1, "IMAGE"],
    [31, 15, 0, 17, 3, "PROMPT"],
    [32, 12, 0, 17, 4, "PROMPT"],
    [33, 17, 0, 4, 0, "LATENT"]
  ],
  "groups": [],
  "config": {},
  "extra": {"ds": {"scale": 0.6303940863128514, "offset": [628.0768100805229, 63.29978438298349]}},
  "version": 0.4
}
PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SD15/workflow_SD1.5_text2img.json ADDED
@@ -0,0 +1,416 @@
{
  "last_node_id": 25,
  "last_link_id": 42,
  "nodes": [
    {"id": 4, "type": "PaddleSDVaeDecoder", "pos": [1011, 398], "size": {"0": 210, "1": 46}, "flags": {}, "order": 9, "mode": 0, "inputs": [{"name": "latent", "type": "LATENT", "link": 42, "label": "latent"}, {"name": "sd_pipe", "type": "PIPELINE", "link": 4, "label": "sd_pipe"}], "outputs": [{"name": "image", "type": "IMAGE", "links": [3], "shape": 3, "label": "image", "slot_index": 0}], "properties": {"Node name for S&R": "PaddleSDVaeDecoder"}},
    {"id": 19, "type": "Note", "pos": [1406, 968], "size": {"0": 210, "1": 58}, "flags": {}, "order": 0, "mode": 0, "properties": {"text": ""}, "widgets_values": ["这里填负向画面提示 (不想要的内容)"], "color": "#432", "bgcolor": "#653"},
    {"id": 18, "type": "Note", "pos": [254, 1013], "size": {"0": 210, "1": 58}, "flags": {}, "order": 1, "mode": 0, "properties": {"text": ""}, "widgets_values": ["这里填正向画面提示 (想要的内容)"], "color": "#432", "bgcolor": "#653"},
    {"id": 21, "type": "Note", "pos": [990, 543], "size": {"0": 217.51138305664062, "1": 164.82931518554688}, "flags": {}, "order": 2, "mode": 0, "properties": {"text": ""}, "widgets_values": ["- denoise是重绘幅度,越高程度越大\n- steps是画笔绘制的步数\n- number是每次同时绘制的张数\n- cfg可以调整画面细节参数\n- scheduler是不同的去噪声方式"], "color": "#432", "bgcolor": "#653"},
    {"id": 22, "type": "Note", "pos": [1835, 498], "size": {"0": 210, "1": 58}, "flags": {}, "order": 3, "mode": 0, "properties": {"text": ""}, "widgets_values": ["这里是最终结果"], "color": "#432", "bgcolor": "#653"},
    {"id": 23, "type": "Note", "pos": [324, 227], "size": {"0": 210, "1": 58}, "flags": {}, "order": 4, "mode": 0, "properties": {"text": ""}, "widgets_values": ["这里选择喜欢的AIGC大模型"], "color": "#432", "bgcolor": "#653"},
    {"id": 1, "type": "PaddleSDCheckpointLoader", "pos": [-36, 291], "size": {"0": 315, "1": 58}, "flags": {}, "order": 5, "mode": 0, "outputs": [{"name": "sd_pipe", "type": "PIPELINE", "links": [4, 39], "shape": 3, "label": "sd_pipe", "slot_index": 0}], "properties": {"Node name for S&R": "PaddleSDCheckpointLoader"}, "widgets_values": ["sd15/25D风_revAnimated_v122.safetensors"]},
    {"id": 15, "type": "PromptInput", "pos": [479, 1004], "size": {"0": 400, "1": 200}, "flags": {}, "order": 6, "mode": 0, "outputs": [{"name": "prompt", "type": "PROMPT", "links": [40], "shape": 3, "label": "prompt", "slot_index": 0}], "properties": {"Node name for S&R": "PromptInput"}, "widgets_values": ["1boy, blue hair, cute, anime style"]},
    {"id": 12, "type": "PromptInput", "pos": [965, 964], "size": {"0": 400, "1": 200}, "flags": {}, "order": 7, "mode": 0, "outputs": [{"name": "prompt", "type": "PROMPT", "links": [41], "shape": 3, "label": "prompt", "slot_index": 0}], "properties": {"Node name for S&R": "PromptInput"}, "widgets_values": ["low, error, ugly, (extra hand), wrong hand, nsfw, nude, extra head"]},
    {"id": 5, "type": "PaddleSaveImage", "pos": [1478, 470], "size": {"0": 315, "1": 270}, "flags": {}, "order": 10, "mode": 0, "inputs": [{"name": "images", "type": "IMAGE", "link": 3, "label": "images"}], "properties": {"Node name for S&R": "PaddleSaveImage"}, "widgets_values": ["ComfyUI"]},
    {"id": 25, "type": "PaddleSDText2ImagePipe", "pos": [636, 537], "size": {"0": 315, "1": 266}, "flags": {}, "order": 8, "mode": 0, "inputs": [{"name": "sd_pipe", "type": "PIPELINE", "link": 39, "label": "sd_pipe"}, {"name": "prompt", "type": "PROMPT", "link": 40, "label": "prompt"}, {"name": "negative_prompt", "type": "PROMPT", "link": 41, "label": "negative_prompt"}], "outputs": [{"name": "latent", "type": "LATENT", "links": [42], "shape": 3, "label": "latent", "slot_index": 0}], "properties": {"Node name for S&R": "PaddleSDText2ImagePipe"}, "widgets_values": [20, 512, 768, 1, 61130596064161, "randomize", 7.5, "euler"]}
  ],
  "links": [
    [3, 4, 0, 5, 0, "IMAGE"],
    [4, 1, 0, 4, 1, "PIPELINE"],
    [39, 1, 0, 25, 0, "PIPELINE"],
    [40, 15, 0, 25, 1, "PROMPT"],
    [41, 12, 0, 25, 2, "PROMPT"],
    [42, 25, 0, 4, 0, "LATENT"]
  ],
  "groups": [],
  "config": {},
  "extra": {"ds": {"scale": 0.7627768444385535, "offset": [342.353878460601, -167.10478701820625]}},
  "version": 0.4
}
PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SDXL/workflow_SDXL_text2img.json ADDED
@@ -0,0 +1,416 @@
{
  "last_node_id": 28,
  "last_link_id": 51,
  "nodes": [
    {"id": 19, "type": "Note", "pos": [1406, 968], "size": {"0": 210, "1": 58}, "flags": {}, "order": 0, "mode": 0, "properties": {"text": ""}, "widgets_values": ["这里填负向画面提示 (不想要的内容)"], "color": "#432", "bgcolor": "#653"},
    {"id": 18, "type": "Note", "pos": [254, 1013], "size": {"0": 210, "1": 58}, "flags": {}, "order": 1, "mode": 0, "properties": {"text": ""}, "widgets_values": ["这里填正向画面提示 (想要的内容)"], "color": "#432", "bgcolor": "#653"},
    {"id": 21, "type": "Note", "pos": [990, 543], "size": {"0": 217.51138305664062, "1": 164.82931518554688}, "flags": {}, "order": 2, "mode": 0, "properties": {"text": ""}, "widgets_values": ["- denoise是重绘幅度,越高程度越大\n- steps是画笔绘制的步数\n- number是每次同时绘制的张数\n- cfg可以调整画面细节参数\n- scheduler是不同的去噪声方式"], "color": "#432", "bgcolor": "#653"},
    {"id": 22, "type": "Note", "pos": [1835, 498], "size": {"0": 210, "1": 58}, "flags": {}, "order": 3, "mode": 0, "properties": {"text": ""}, "widgets_values": ["这里是最终结果"], "color": "#432", "bgcolor": "#653"},
    {"id": 23, "type": "Note", "pos": [324, 227], "size": {"0": 210, "1": 58}, "flags": {}, "order": 4, "mode": 0, "properties": {"text": ""}, "widgets_values": ["这里选择喜欢的AIGC大模型"], "color": "#432", "bgcolor": "#653"},
    {"id": 5, "type": "PaddleSaveImage", "pos": [1478, 470], "size": {"0": 315, "1": 270}, "flags": {}, "order": 10, "mode": 0, "inputs": [{"name": "images", "type": "IMAGE", "link": 51, "label": "images"}], "properties": {"Node name for S&R": "PaddleSaveImage"}, "widgets_values": ["ComfyUI"]},
    {"id": 12, "type": "PromptInput", "pos": [965, 964], "size": {"0": 400, "1": 200}, "flags": {}, "order": 5, "mode": 0, "outputs": [{"name": "prompt", "type": "PROMPT", "links": [48], "shape": 3, "label": "prompt", "slot_index": 0}], "properties": {"Node name for S&R": "PromptInput"}, "widgets_values": ["low, error, ugly, (extra hand), wrong hand, nsfw, nude, extra head"]},
    {"id": 28, "type": "PaddleSDXLVaeDecoder", "pos": [1115.8165436384072, 359.29368984194616], "size": {"0": 210, "1": 46}, "flags": {}, "order": 9, "mode": 0, "inputs": [{"name": "latent", "type": "LATENT", "link": 50, "label": "latent"}, {"name": "sd_pipe", "type": "PIPELINE", "link": 49, "label": "sd_pipe"}], "outputs": [{"name": "image", "type": "IMAGE", "links": [51], "shape": 3, "label": "image", "slot_index": 0}], "properties": {"Node name for S&R": "PaddleSDXLVaeDecoder"}},
    {"id": 27, "type": "PaddleSDXLCheckpointLoader", "pos": [53, 413], "size": {"0": 315, "1": 58}, "flags": {}, "order": 6, "mode": 0, "outputs": [{"name": "sd_pipe", "type": "PIPELINE", "links": [45, 49], "shape": 3, "label": "sd_pipe", "slot_index": 0}], "properties": {"Node name for S&R": "PaddleSDXLCheckpointLoader"}, "widgets_values": ["sdxl/MJ5风格_SDXL_Dream.safetensors"]},
    {"id": 15, "type": "PromptInput", "pos": [479, 1004], "size": {"0": 400, "1": 200}, "flags": {}, "order": 7, "mode": 0, "outputs": [{"name": "prompt", "type": "PROMPT", "links": [44], "shape": 3, "label": "prompt", "slot_index": 0}], "properties": {"Node name for S&R": "PromptInput"}, "widgets_values": ["1girl, cool, blue hair, cute, sunset, niji anime style"]},
    {"id": 26, "type": "PaddleSDXLText2ImagePipe", "pos": [503, 573], "size": {"0": 315, "1": 266}, "flags": {}, "order": 8, "mode": 0, "inputs": [{"name": "sd_pipe", "type": "PIPELINE", "link": 45, "label": "sd_pipe"}, {"name": "prompt", "type": "PROMPT", "link": 44, "label": "prompt"}, {"name": "negative_prompt", "type": "PROMPT", "link": 48, "label": "negative_prompt"}], "outputs": [{"name": "latent", "type": "LATENT", "links": [50], "shape": 3, "label": "latent", "slot_index": 0}], "properties": {"Node name for S&R": "PaddleSDXLText2ImagePipe"}, "widgets_values": [20, 512, 768, 1, 351732349249869, "randomize", 5, "euler"]}
  ],
  "links": [
    [44, 15, 0, 26, 1, "PROMPT"],
    [45, 27, 0, 26, 0, "PIPELINE"],
    [48, 12, 0, 26, 2, "PROMPT"],
    [49, 27, 0, 28, 1, "PIPELINE"],
    [50, 26, 0, 28, 0, "LATENT"],
    [51, 28, 0, 5, 0, "IMAGE"]
  ],
  "groups": [],
  "config": {},
  "extra": {"ds": {"scale": 0.5730855330116872, "offset": [113.53226463291708, -145.5843663012114]}},
  "version": 0.4
}
vlmeval/VLMEvalKit_old/docs/en/.readthedocs.yaml ADDED
@@ -0,0 +1,17 @@
version: 2

# Set the version of Python and other tools you might need
build:
  os: ubuntu-22.04
  tools:
    python: "3.8"

formats:
  - epub

sphinx:
  configuration: docs/en/conf.py

python:
  install:
    - requirements: requirements/docs.txt
vlmeval/VLMEvalKit_old/docs/en/ConfigSystem.md ADDED
@@ -0,0 +1,57 @@
# Config System

By default, VLMEvalKit launches the evaluation by setting the model name(s) (defined in `/vlmeval/config.py`) and dataset name(s) (defined in `vlmeval/dataset/__init__.py`) in the `run.py` script with the `--model` and `--data` arguments. Such an approach is simple and efficient in most scenarios; however, it may not be flexible enough when the user wants to evaluate multiple models / datasets with different settings.

To address this, VLMEvalKit provides a more flexible config system. The user can specify the model and dataset settings in a JSON file and pass the path of the config file to the `run.py` script via the `--config` argument. Here is a sample config JSON:

```json
{
    "model": {
        "GPT4o_20240806_T00_HIGH": {
            "class": "GPT4V",
            "model": "gpt-4o-2024-08-06",
            "temperature": 0,
            "img_detail": "high"
        },
        "GPT4o_20240806_T10_Low": {
            "class": "GPT4V",
            "model": "gpt-4o-2024-08-06",
            "temperature": 1.0,
            "img_detail": "low"
        }
    },
    "data": {
        "MME-RealWorld-Lite": {
            "class": "MMERealWorld",
            "dataset": "MME-RealWorld-Lite"
        },
        "MMBench_DEV_EN_V11": {
            "class": "ImageMCQDataset",
            "dataset": "MMBench_DEV_EN_V11"
        }
    }
}
```

Explanation of the config JSON:

1. We currently support two fields: `model` and `data`, each of which is a dictionary. The key of the dictionary is the name of the model / dataset (set by the user), and the value is the setting of the model / dataset.
2. For items in `model`, the value is a dictionary containing the following keys:
  - `class`: The class name of the model, which should be a class name defined in `vlmeval/vlm/__init__.py` (open-source models) or `vlmeval/api/__init__.py` (API models).
  - Other kwargs: Other kwargs are model-specific parameters; please refer to the definition of the model class for detailed usage. For example, `model`, `temperature`, and `img_detail` are arguments of the `GPT4V` class. Note that the `model` argument is required by most model classes.
3. For the dictionary `data`, we suggest that users use the official dataset name as the key (or part of the key), since we frequently determine the post-processing / judging settings based on the dataset name. For items in `data`, the value is a dictionary containing the following keys:
  - `class`: The class name of the dataset, which should be a class name defined in `vlmeval/dataset/__init__.py`.
  - Other kwargs: Other kwargs are dataset-specific parameters; please refer to the definition of the dataset class for detailed usage. Typically, the `dataset` argument is required by most dataset classes.
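
For larger sweeps, the config file can also be generated with a short script. Below is a minimal sketch using only the Python standard library; it reuses the class names from the sample above, and the temperature grid is purely illustrative:

```python
# Sketch: programmatically build a config.json for a small sweep (illustrative values).
import json

model = {
    f"GPT4o_20240806_T{int(t * 10):02d}": {
        "class": "GPT4V",
        "model": "gpt-4o-2024-08-06",
        "temperature": t,
        "img_detail": "high",
    }
    for t in (0.0, 0.5, 1.0)
}
data = {
    "MME-RealWorld-Lite": {"class": "MMERealWorld", "dataset": "MME-RealWorld-Lite"},
    "MMBench_DEV_EN_V11": {"class": "ImageMCQDataset", "dataset": "MMBench_DEV_EN_V11"},
}
with open("config.json", "w") as f:
    json.dump({"model": model, "data": data}, f, indent=4)
```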

After saving the example config JSON to `config.json`, you can launch the evaluation with:

```bash
python run.py --config config.json
```

That will generate the following output files under the working directory `$WORK_DIR` (following the format `{$WORK_DIR}/{$MODEL_NAME}/{$MODEL_NAME}_{$DATASET_NAME}_*`):

- `$WORK_DIR/GPT4o_20240806_T00_HIGH/GPT4o_20240806_T00_HIGH_MME-RealWorld-Lite*`
- `$WORK_DIR/GPT4o_20240806_T10_Low/GPT4o_20240806_T10_Low_MME-RealWorld-Lite*`
- `$WORK_DIR/GPT4o_20240806_T00_HIGH/GPT4o_20240806_T00_HIGH_MMBench_DEV_EN_V11*`
- `$WORK_DIR/GPT4o_20240806_T10_Low/GPT4o_20240806_T10_Low_MMBench_DEV_EN_V11*`
vlmeval/VLMEvalKit_old/docs/en/Development.md ADDED
@@ -0,0 +1,146 @@
# Develop new Benchmark / MLLM

> 🛠️ How to implement a new Benchmark / VLM in VLMEvalKit?

## Implement a new benchmark

Example PR: **Math-Vision Benchmark** ([#292](https://github.com/open-compass/VLMEvalKit/pull/292/files))

In VLMEvalKit, benchmarks are organized as dataset classes. When you implement a new benchmark, you can either reuse an existing dataset class (*e.g.*, you can reuse `ImageMCQDataset` when implementing a new multi-choice benchmark) or support a new dataset class. Each dataset must have the following two member functions (either reuse the one of the parent class or implement your own):

- `build_prompt(self, line)`: The function input `line` is an integer (the sample index) or a `pd.Series` object (the raw record of the sample). The function outputs a `multi-modal message`, serving as the input of an MLLM. The `multi-modal message` is an interleaved list of multi-modal messages adopting the following format (the example includes an image and a text message): `[dict(type='image', value=IMAGE_PTH), dict(type='text', value=prompt)]`.
- `evaluate(self, eval_file, **judge_kwargs)`: The function input `eval_file` is the MLLM prediction (typically in `.xlsx` format). If the benchmark requires an external LLM (typically GPT) for evaluation, then `judge_kwargs` can pass the arguments for the LLM. The function outputs the benchmark evaluation results (metrics) in the form of `dict` or `pd.DataFrame`.

Below we outline the typical steps to implement a new benchmark under VLMEvalKit:

### 1. Prepare your benchmark TSV file

Currently, we organize each benchmark as a single TSV file. During inference, the data file will be automatically downloaded from the defined `DATASET_URL` link into the `$LMUData` directory (the default path is `$HOME/LMUData` if not set explicitly). You can upload the prepared TSV file to a downloadable address (e.g., Hugging Face) or send it to us at <[email protected]>, and we will assist in uploading the dataset to the server. You can also customize the `LMUData` path via the environment variable `LMUData=/path/to/your/data`.

The contents of the TSV file consist of:

| Dataset Name \ Fields | index | image | image_path | question | hint | multi-choice<br>options | answer | category | l2-category | split |
| --------------------------------------- | ----- | ----- | ---------- | -------- | ---- | ----------------------- | ------ | -------- | ----------- | ----- |
| MMBench_DEV_[CN/EN] | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |
| MMBench_TEST_[CN/EN] | ✅ | ✅ | | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ |
| CCBench | ✅ | ✅ | | ✅ | | ✅ | ✅ | ✅ | | |
| SEEDBench_IMG | ✅ | ✅ | | ✅ | | ✅ | ✅ | ✅ | | |
| MME | ✅ | ✅ | | ✅ | | | ✅ | ✅ | | |
| CORE_MM | ✅ | ✅ | ✅ | ✅ | | | | ✅ | | |
| MMVet | ✅ | ✅ | | ✅ | | | ✅ | ✅ | | |
| MMMU_DEV_VAL | ✅ | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ |
| COCO_VAL | ✅ | ✅ | | | | | ✅ | | | |
| OCRVQA_[TEST/TESTCORE] | ✅ | ✅ | | ✅ | | | ✅ | | | |
| TextVQA_VAL | ✅ | ✅ | | ✅ | | | ✅ | | | |
| VCR_[EN/ZH]\_[EASY/HARD]\_[ALL/500/100] | ✅ | ✅ | | ✅ | | | ✅ | | | |
| MMMB_[en/cn/pt/ar/tr/ru] | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | | ✅ |
| MMBench_dev_[en/cn/pt/ar/tr/ru] | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |

<div align="center"><b>Table 1. TSV fields of supported datasets.</b></div>

**Intro to mandatory fields in the `TSV` file:**

- **index:** Integer, unique for each line in the `tsv`
- **image:** The base64-encoded image; you can use the APIs implemented in `vlmeval/smp/vlm.py` for encoding and decoding:
  - Encoding: `encode_image_to_base64` (for a PIL Image) / `encode_image_file_to_base64` (for an image file path)
  - Decoding: `decode_base64_to_image` (for a PIL Image) / `decode_base64_to_image_file` (for an image file path)
- **question**: The question corresponding to the image, a string
- **answer**: The answer to the question, a string. The `test` split does not need this field

### 2. Customize your benchmark prompt

`ImageBaseDataset` defines the default prompt format. If you need to add prompts specific to the dataset or input data in the `Interleave` format to the model, you can implement this through the `build_prompt(line)` function. This function takes a line from a TSV file as input, containing fields such as index, image, question, etc. The function returns a dictionary list of multimodal messages `msg` in the format `[dict(type='image', value=IMAGE_PTH), dict(type='text', value=prompt)]`, including the image path and the text prompt to be input into VLMs. For interleave-type inputs, you can directly place the dictionary of the image path at the image token position.

### 3. Customize your benchmark metrics

To add evaluation for a new benchmark, you need to customize a class object that implements the dataset's metric calculation. Multimodal datasets inherit from the `ImageBaseDataset` object in `vlmeval/dataset/image_base.py`. `TYPE` defines the type of the dataset, `DATASET_URL` is the download address of the dataset, and `DATASET_MD5` is the MD5 checksum used for consistency checking of the dataset file.

In this class, **you need to implement** the `evaluate(eval_file, **judge_kwargs)` class function to calculate metrics and output results for the custom dataset. The function input `eval_file` is the path to the model prediction results file `{model_name}_{dataset}.xlsx`. This file can be read as a `pandas.DataFrame` using the `load(eval_file)` method, containing fields such as index, question, answer, category, prediction, etc. The `judge_kwargs` argument passes a dictionary related to evaluation, such as the name of the judge model, the number of API request threads, etc. **The return value** of the function is the calculated accuracy and other metrics, formatted as a dictionary composed of lists and organized into a `pandas.DataFrame`. A minimal sketch of such a class is shown below.
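
The following is a minimal, hypothetical sketch of such a dataset class, assuming an exact-match metric. The class name, URL, and MD5 are placeholders, and the exact attribute format (e.g., whether `DATASET_URL` is a string or a dict) should be checked against the dataset classes in the codebase:

```python
# Hypothetical sketch of a custom benchmark class (names, URL, and MD5 are placeholders).
from vlmeval.dataset.image_base import ImageBaseDataset
from vlmeval.smp import load  # reads the prediction .xlsx into a pandas.DataFrame


class MyBenchmark(ImageBaseDataset):
    TYPE = 'VQA'
    DATASET_URL = {'MyBenchmark': 'https://example.com/MyBenchmark.tsv'}
    DATASET_MD5 = {'MyBenchmark': '<md5-of-the-tsv>'}

    def evaluate(self, eval_file, **judge_kwargs):
        data = load(eval_file)  # fields: index, question, answer, prediction, ...
        hit = data['prediction'].astype(str).str.strip() == data['answer'].astype(str).str.strip()
        # Return a dict of lists, which can be organized into a pandas.DataFrame.
        return {'Overall': [100.0 * hit.mean()]}
```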

## Implement a new model

Example PR: **Support LLaVA-Next-Interleave** ([#294](https://github.com/open-compass/VLMEvalKit/pull/294))

**1. Support `generate_inner` API (mandatory).**

All existing models are implemented in `vlmeval/vlm`. For a minimal model, your model class **must implement the method** `generate_inner(msgs, dataset=None)`. In this function, you feed a multi-modal message to your VLM and return the VLM prediction (a string). The optional argument `dataset` can be used as a flag for the model to switch among various inference strategies.

The multi-modal message `msgs` is a list of dictionaries; each dictionary has two keys, `type` and `value`:
- `type`: We currently support two types; choices are ["image", "text"].
- `value`: When `type=='text'`, the value is the text message (a single string); when `type=='image'`, the value can be the local path of an image file or an image URL.

Currently, a multi-modal message may contain arbitrarily interleaved images and texts. If your model does not support that, a common practice is to take the first image and the concatenated text messages as the input. You can set `INTERLEAVE = False` in your model class and use `self.message_to_promptimg(message, dataset=dataset)` to build your prompt and the first image's path. A minimal model-class sketch following this convention is given after the message examples below.

Here are some examples of multi-modal messages:

```python
IMAGE_PTH = 'assets/apple.jpg'
IMAGE_URL = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/assets/apple.jpg'
msg1 = [
    dict(type='image', value=IMAGE_PTH),
    dict(type='text', value='What is in this image?')
]
msg2 = [
    dict(type='image', value=IMAGE_URL),
    dict(type='image', value=IMAGE_URL),
    dict(type='text', value='How many apples are there in these images?')
]
response = model.generate(msg1)
```

For convenience's sake, we also support taking a list of strings as input. In that case, we will check whether a string is an image path or an image URL and automatically convert it to the `list[dict]` format:

```python
IMAGE_PTH = 'assets/apple.jpg'
IMAGE_URL = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/assets/apple.jpg'
msg1 = [IMAGE_PTH, 'What is in this image?']
msg2 = [IMAGE_URL, IMAGE_URL, 'How many apples are there in these images?']
response = model.generate(msg1)
```
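
Below is a minimal, hypothetical sketch of such a model class. The `BaseModel` import path is an assumption based on the layout described above, and the `_fake_vlm_answer` helper stands in for your real inference call:

```python
# Hypothetical minimal VLM wrapper (the inference call is a stand-in, not a real API).
from vlmeval.vlm.base import BaseModel


def _fake_vlm_answer(image_path, prompt):
    # Replace with the real inference call of your VLM.
    return f'[answer for {image_path}: {prompt[:20]}...]'


class MyVLM(BaseModel):
    INTERLEAVE = False  # fall back to "first image + concatenated text"

    def generate_inner(self, msgs, dataset=None):
        prompt, image_path = self.message_to_promptimg(msgs, dataset=dataset)
        return _fake_vlm_answer(image_path, prompt)
```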

**Support Custom Prompt (optional).**

Besides, your model can support **custom prompt building** by implementing two optional methods: `use_custom_prompt(dataset)` and `build_prompt(line, dataset=None)`.

Both functions take the dataset name as input:

- `use_custom_prompt(dataset)` returns a boolean flag, indicating whether the model should use the custom prompt building strategy.
- If `use_custom_prompt(dataset)` returns True, `build_prompt(line, dataset)` should return a custom-built multimodal message for the corresponding `dataset`, given `line`, which is a dictionary that includes the necessary information of a data sample. If `use_custom_prompt(dataset)` returns False, the default prompt building strategy will be used (a brief sketch of both hooks follows this list).
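
A brief, hypothetical sketch of the two hooks, continuing the `MyVLM` example above; the dataset check and the field names taken from `line` are illustrative only:

```python
# Hypothetical custom-prompt hooks (dataset check and line fields are illustrative).
class MyVLMWithCustomPrompt(MyVLM):

    def use_custom_prompt(self, dataset):
        # Example policy: only customize prompts for MMBench-style MCQ benchmarks.
        return dataset is not None and 'MMBench' in dataset

    def build_prompt(self, line, dataset=None):
        prompt = line['question'] + '\nAnswer with the option letter directly.'
        return [dict(type='image', value=line['image_path']),
                dict(type='text', value=prompt)]
```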

**Support multi-turn chatting (optional).**

You can also support multi-turn chatting and evaluation with your VLM by implementing the `chat_inner(message, dataset)` function. The function outputs a single string response, and `message` is a list of chat history entries, following the format below.

```python
# Assume msg1, msg2, msg3, ... are multi-modal messages following the previously described format
# `chat_inner` takes the following chat history list as input:
message = [
    dict(role='user', content=msg1),
    dict(role='assistant', content=msg2),
    dict(role='user', content=msg3),
    dict(role='assistant', content=msg4),
    # ...
    dict(role='user', content=msgn),
]
# `message` should contain an odd number of chat utterances; the roles should alternate
# between "user" and "assistant", and the last utterance should come from "user".
# The chat function will call `chat_inner`
response = model.chat(message)
```

### Example PRs:

- VLM that doesn't support interleaved images and texts, and does not use custom prompts: [[Model] Support glm-4v-9b](https://github.com/open-compass/VLMEvalKit/pull/221)
- VLM that supports interleaved images and texts and custom prompts: [Add MiniCPM-Llama3-V-2.5](https://github.com/open-compass/VLMEvalKit/pull/205)
- VLM API: [Feature add glmv](https://github.com/open-compass/VLMEvalKit/pull/201)

## Contribute to VLMEvalKit

If you want to contribute code to **VLMEvalKit**, please run the pre-commit check before you submit a PR. That helps to keep the code tidy.

```bash
# Under the directory of VLMEvalKit, install the pre-commit hook:
pip install pre-commit
pre-commit install
pre-commit run --all-files
# Then you can commit your code.
```
vlmeval/VLMEvalKit_old/docs/en/Makefile ADDED
@@ -0,0 +1,20 @@
# Minimal makefile for Sphinx documentation
#

# You can set these variables from the command line, and also
# from the environment for the first two.
SPHINXOPTS    ?=
SPHINXBUILD   ?= sphinx-build
SOURCEDIR     = .
BUILDDIR      = _build

# Put it first so that "make" without argument is like "make help".
help:
	@$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)

.PHONY: help Makefile

# Catch-all target: route all unknown targets to Sphinx using the new
# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS).
%: Makefile
	@$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O)
vlmeval/VLMEvalKit_old/docs/en/Quickstart.md ADDED
@@ -0,0 +1,148 @@
# Quickstart

Before running the evaluation script, you need to **configure** the VLMs and set the model_paths properly.

After that, you can use a single script, `run.py`, to run inference and evaluation for multiple VLMs and benchmarks at the same time.

## Step 0. Installation & Setup essential keys

**Installation.**

```bash
git clone https://github.com/open-compass/VLMEvalKit.git
cd VLMEvalKit
pip install -e .
```

**Setup Keys.**

To infer with API models (GPT-4v, Gemini-Pro-V, etc.) or use LLM APIs as the **judge or choice extractor**, you need to set up API keys first. If you set the key, VLMEvalKit will use a judge **LLM** to extract answers from the output; otherwise, it uses the **exact matching** mode (finding "Yes", "No", "A", "B", "C", ... in the output strings). **Exact matching can only be applied to the Yes-or-No tasks and the multi-choice tasks.**
- You can place the required keys in `$VLMEvalKit/.env` or directly set them as environment variables. If you choose to create a `.env` file, its content will look like:

```bash
# The .env file, place it under $VLMEvalKit
# API Keys of Proprietary VLMs
# QwenVL APIs
DASHSCOPE_API_KEY=
# Gemini w. Google Cloud Backends
GOOGLE_API_KEY=
# OpenAI API
OPENAI_API_KEY=
OPENAI_API_BASE=
# StepAI API
STEPAI_API_KEY=
# REKA API
REKA_API_KEY=
# GLMV API
GLMV_API_KEY=
# CongRong API
CW_API_BASE=
CW_API_KEY=
# SenseChat-V API
SENSECHAT_AK=
SENSECHAT_SK=
# Hunyuan-Vision API
HUNYUAN_SECRET_KEY=
HUNYUAN_SECRET_ID=
# You can also set a proxy for calling api models during the evaluation stage
EVAL_PROXY=
```

- Fill the blanks with your API keys (if necessary). Those API keys will be automatically loaded when doing the inference and evaluation.

## Step 1. Configuration

**VLM Configuration**: All VLMs are configured in `vlmeval/config.py`. A few legacy VLMs (like MiniGPT-4, LLaVA-v1-7B) require additional configuration (configuring the code / model_weight root in the config file). During evaluation, you should use the model name specified in `supported_VLM` in `vlmeval/config.py` to select the VLM. Make sure you can successfully infer with the VLM before starting the evaluation, using the following command: `vlmutil check {MODEL_NAME}`.

## Step 2. Evaluation

**New!!!** We integrated a new config system to enable more flexible evaluation settings. Check the [Document](/docs/en/ConfigSystem.md) or run `python run.py --help` for more details 🔥🔥🔥

We use `run.py` for evaluation. You can use `$VLMEvalKit/run.py` or create a soft link to the script (so that you can use the script anywhere):

**Arguments**

- `--data (list[str])`: Set the dataset names that are supported in VLMEvalKit (names can be found in the codebase README).
- `--model (list[str])`: Set the VLM names that are supported in VLMEvalKit (defined in `supported_VLM` in `vlmeval/config.py`).
- `--mode (str, default to 'all', choices are ['all', 'infer'])`: When `mode` is set to "all", both inference and evaluation will be performed; when set to "infer", only inference will be performed.
- `--nproc (int, default to 4)`: The number of threads for OpenAI API calling.
- `--work-dir (str, default to '.')`: The directory to save evaluation results.
- `--nframe (int, default to 8)`: The number of frames to sample from a video, only applicable to the evaluation of video benchmarks.
- `--pack (bool, store_true)`: A video may be associated with multiple questions; if `pack==True`, all questions for a video will be asked in a single query.

**Command for Evaluating Image Benchmarks**

You can run the script with `python` or `torchrun`:

```bash
# When running with `python`, only one VLM instance is instantiated, and it might use multiple GPUs (depending on its default behavior).
# That is recommended for evaluating very large VLMs (like IDEFICS-80B-Instruct).

# IDEFICS-80B-Instruct on MMBench_DEV_EN, MME, and SEEDBench_IMG, Inference and Evaluation
python run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model idefics_80b_instruct --verbose
# IDEFICS-80B-Instruct on MMBench_DEV_EN, MME, and SEEDBench_IMG, Inference only
python run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model idefics_80b_instruct --verbose --mode infer

# When running with `torchrun`, one VLM instance is instantiated on each GPU. It can speed up the inference.
# However, that is only suitable for VLMs that consume small amounts of GPU memory.

# IDEFICS-9B-Instruct, Qwen-VL-Chat, mPLUG-Owl2 on MMBench_DEV_EN, MME, and SEEDBench_IMG. On a node with 8 GPUs. Inference and Evaluation.
torchrun --nproc-per-node=8 run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model idefics_80b_instruct qwen_chat mPLUG-Owl2 --verbose
# Qwen-VL-Chat on MME. On a node with 2 GPUs. Inference and Evaluation.
torchrun --nproc-per-node=2 run.py --data MME --model qwen_chat --verbose
```

**Command for Evaluating Video Benchmarks**

```bash
# When running with `python`, only one VLM instance is instantiated, and it might use multiple GPUs (depending on its default behavior).
# That is recommended for evaluating very large VLMs (like IDEFICS-80B-Instruct).

# IDEFICS2-8B on MMBench-Video, with 8 frames as inputs and vanilla evaluation. On a node with 8 GPUs.
torchrun --nproc-per-node=8 run.py --data MMBench-Video --model idefics2_8b --nframe 8
# GPT-4o (API model) on MMBench-Video, with 16 frames as inputs and pack evaluation (all questions of a video in a single query).
python run.py --data MMBench-Video --model GPT4o --nframe 16 --pack
```

The evaluation results will be printed as logs. Besides, **result files** will also be generated in the directory `$YOUR_WORKING_DIRECTORY/{model_name}`. Files ending with `.csv` contain the evaluated metrics.
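
For instance, a quick way to inspect such a metrics file (the file name below is illustrative; actual names follow the `{model_name}_{dataset}_*` pattern of your run):

```python
# Sketch: inspect a metrics CSV produced by an evaluation run (path is illustrative).
import pandas as pd

df = pd.read_csv('./qwen_chat/qwen_chat_MME_score.csv')
print(df)
```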

## Deploy a local language model as the judge / choice extractor

The default setting mentioned above uses OpenAI's GPT as the judge LLM. However, you can also deploy a local judge LLM with [LMDeploy](https://github.com/InternLM/lmdeploy).

First install:
```bash
pip install lmdeploy openai
```

Then deploy a local judge LLM with a single line of code. LMDeploy will automatically download the model from Hugging Face. Assume we use internlm2-chat-1_8b as the judge, port 23333, and the key sk-123456 (the key must start with "sk-" and can be followed by any number you like):
```bash
lmdeploy serve api_server internlm/internlm2-chat-1_8b --server-port 23333
```

You need to get the model name registered by LMDeploy with the following Python code:
```python
from openai import OpenAI
client = OpenAI(
    api_key='sk-123456',
    base_url="http://0.0.0.0:23333/v1"
)
model_name = client.models.list().data[0].id
```

Now set some environment variables to tell VLMEvalKit how to use the local judge LLM. As mentioned above, you can also set them in the `$VLMEvalKit/.env` file:
```bash
OPENAI_API_KEY=sk-123456
OPENAI_API_BASE=http://0.0.0.0:23333/v1/chat/completions
LOCAL_LLM=<model_name you get>
```

Finally, you can run the commands in Step 2 to evaluate your VLM with the local judge LLM.

Note that

- If you hope to deploy the judge LLM on a single GPU and evaluate your VLM on other GPUs because of limited GPU memory, try `CUDA_VISIBLE_DEVICES=x` like
  ```bash
  CUDA_VISIBLE_DEVICES=0 lmdeploy serve api_server internlm/internlm2-chat-1_8b --server-port 23333
  CUDA_VISIBLE_DEVICES=1,2,3 torchrun --nproc-per-node=3 run.py --data HallusionBench --model qwen_chat --verbose
  ```
- If the local judge LLM is not good enough at following the instructions, the evaluation may fail. Please report such failures (e.g., by raising issues).
- It's possible to deploy the judge LLM in different ways, e.g., using a private LLM (not from Hugging Face) or a quantized LLM. Please refer to the [LMDeploy doc](https://lmdeploy.readthedocs.io/en/latest/serving/api_server.html). You can use any other deployment framework as long as it supports the OpenAI API.
vlmeval/VLMEvalKit_old/docs/en/conf.py ADDED
@@ -0,0 +1,234 @@
# flake8: noqa
# Configuration file for the Sphinx documentation builder.
#
# This file only contains a selection of the most common options. For a full
# list see the documentation:
# https://www.sphinx-doc.org/en/master/usage/configuration.html

# -- Path setup --------------------------------------------------------------

# If extensions (or modules to document with autodoc) are in another directory,
# add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here.
#
import os
import ast
import subprocess
import sys

import pytorch_sphinx_theme
from sphinx.builders.html import StandaloneHTMLBuilder

sys.path.insert(0, os.path.abspath('../../'))

# -- Project information -----------------------------------------------------

project = 'VLMEvalKit'
copyright = '2023, VLMEvalKit'
author = 'VLMEvalKit Authors'

# The full version, including alpha/beta/rc tags
version_file = '../../vlmeval/__init__.py'


def get_version():
    with open(version_file, 'r') as f:
        file_content = f.read()
    # Parse the file content into an abstract syntax tree (AST)
    tree = ast.parse(file_content, filename=version_file)

    # Iterate through the body of the AST, looking for an assignment to __version__
    for node in tree.body:
        if isinstance(node, ast.Assign):
            for target in node.targets:
                if isinstance(target, ast.Name) and target.id == '__version__':
                    return node.value.s
    raise ValueError('__version__ not found')


release = get_version()

# -- General configuration ---------------------------------------------------

# Add any Sphinx extension module names here, as strings. They can be
# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom
# ones.
extensions = [
    'sphinx.ext.autodoc',
    'sphinx.ext.autosummary',
    'sphinx.ext.intersphinx',
    'sphinx.ext.napoleon',
    'sphinx.ext.viewcode',
    'myst_parser',
    'sphinx_copybutton',
    'sphinx_tabs.tabs',
    'notfound.extension',
    'sphinxcontrib.jquery',
    'sphinx_design',
]

# Add any paths that contain templates here, relative to this directory.
templates_path = ['_templates']

# The suffix(es) of source filenames.
# You can specify multiple suffix as a list of string:
#
source_suffix = {
    '.rst': 'restructuredtext',
    '.md': 'markdown',
}

language = 'en'

# The master toctree document.
root_doc = 'index'
html_context = {
    'github_version': 'latest',
}
# List of patterns, relative to source directory, that match files and
# directories to ignore when looking for source files.
# This pattern also affects html_static_path and html_extra_path.
exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store']

# -- Options for HTML output -------------------------------------------------

# The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes.
#
html_theme = 'pytorch_sphinx_theme'
html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()]

# Theme options are theme-specific and customize the look and feel of a theme
# further. For a list of options available for each theme, see the
# documentation.
# yapf: disable
html_theme_options = {
    'menu': [
        {
            'name': 'GitHub',
            'url': 'https://github.com/open-compass/VLMEvalKit'
        },
    ],
    # Specify the language of shared menu
    'menu_lang': 'en',
    # Disable the default edit on GitHub
    'default_edit_on_github': False,
}
# yapf: enable

# Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static']
html_css_files = [
    'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.css',
    'css/readthedocs.css'
]
html_js_files = [
    'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.js',
    'js/custom.js'
]

# -- Options for HTMLHelp output ---------------------------------------------

# Output file base name for HTML help builder.
htmlhelp_basename = 'vlmevalkitdoc'

# -- Options for LaTeX output ------------------------------------------------

latex_elements = {
    # The paper size ('letterpaper' or 'a4paper').
    #
    # 'papersize': 'letterpaper',

    # The font size ('10pt', '11pt' or '12pt').
    #
    # 'pointsize': '10pt',

    # Additional stuff for the LaTeX preamble.
    #
    # 'preamble': '',
}

# Grouping the document tree into LaTeX files. List of tuples
# (source start file, target name, title,
#  author, documentclass [howto, manual, or own class]).
latex_documents = [
    (root_doc, 'vlmevalkit.tex', 'VLMEvalKit Documentation', author,
     'manual'),
]
161
+ # -- Options for manual page output ------------------------------------------
162
+
163
+ # One entry per manual page. List of tuples
164
+ # (source start file, name, description, authors, manual section).
165
+ man_pages = [(root_doc, 'vlmevalkit', 'VLMEvalKit Documentation', [author],
166
+ 1)]
167
+
168
+ # -- Options for Texinfo output ----------------------------------------------
169
+
170
+ # Grouping the document tree into Texinfo files. List of tuples
171
+ # (source start file, target name, title, author,
172
+ # dir menu entry, description, category)
173
+ texinfo_documents = [
174
+ (root_doc, 'vlmevalkit', 'VLMEvalKit Documentation', author,
175
+ 'VLMEvalKit Authors', 'AGI evaluation toolbox and benchmark.',
176
+ 'Miscellaneous'),
177
+ ]
178
+
179
+ # -- Options for Epub output -------------------------------------------------
180
+
181
+ # Bibliographic Dublin Core info.
182
+ epub_title = project
183
+
184
+ # The unique identifier of the text. This can be a ISBN number
185
+ # or the project homepage.
186
+ #
187
+ # epub_identifier = ''
188
+
189
+ # A unique identification for the text.
190
+ #
191
+ # epub_uid = ''
192
+
193
+ # A list of files that should not be packed into the epub file.
194
+ epub_exclude_files = ['search.html']
195
+
196
+ # set priority when building html
197
+ StandaloneHTMLBuilder.supported_image_types = [
198
+ 'image/svg+xml', 'image/gif', 'image/png', 'image/jpeg'
199
+ ]
200
+
201
+ # -- Extension configuration -------------------------------------------------
202
+ # Ignore >>> when copying code
203
+ copybutton_prompt_text = r'>>> |\.\.\. '
204
+ copybutton_prompt_is_regexp = True
205
+
206
+ # Auto-generated header anchors
207
+ myst_heading_anchors = 3
208
+ # Enable "colon_fence" extension of myst.
209
+ myst_enable_extensions = ['colon_fence', 'dollarmath']
210
+
211
+ # Configuration for intersphinx
212
+ intersphinx_mapping = {
213
+ 'python': ('https://docs.python.org/3', None),
214
+ 'numpy': ('https://numpy.org/doc/stable', None),
215
+ 'torch': ('https://pytorch.org/docs/stable/', None),
216
+ 'mmengine': ('https://mmengine.readthedocs.io/en/latest/', None),
217
+ 'transformers':
218
+ ('https://huggingface.co/docs/transformers/main/en/', None),
219
+ }
220
+ napoleon_custom_sections = [
221
+ # Custom sections for data elements.
222
+ ('Meta fields', 'params_style'),
223
+ ('Data fields', 'params_style'),
224
+ ]
225
+
226
+ # Disable docstring inheritance
227
+ autodoc_inherit_docstrings = False
228
+ # Mock some imports during generate API docs.
229
+ autodoc_mock_imports = ['rich', 'attr', 'einops']
230
+ # Disable displaying type annotations, these can be very verbose
231
+ autodoc_typehints = 'none'
232
+
233
+ # The not found page
234
+ notfound_template = '404.html'
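The `get_version` helper above reads the package version without importing `vlmeval`, by walking the AST of its `__init__.py`. A self-contained sketch of the same technique (the module source below is made up for illustration; newer Python exposes the literal via `node.value.value`, while conf.py uses the older `.s` alias):

```python
import ast

# Hypothetical module source; conf.py parses vlmeval/__init__.py instead.
source = "__author__ = 'someone'\n__version__ = '0.2.0'\n"

tree = ast.parse(source)
version = None
for node in tree.body:
    if isinstance(node, ast.Assign):
        for target in node.targets:
            if isinstance(target, ast.Name) and target.id == '__version__':
                version = node.value.value  # string literal assigned to __version__

print(version)  # -> 0.2.0
```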
vlmeval/VLMEvalKit_old/docs/en/index.rst ADDED
@@ -0,0 +1,41 @@
1
+ Welcome to the VLMEvalKit Tutorial!
2
+ ==========================================
3
+
4
+ VLMEvalKit Getting Started Guide
5
+ -------------------------------
6
+
7
+ To help users get started quickly, we recommend the following process:
8
+
9
+ - For users who want to use VLMEvalKit, we recommend reading the "Start Your First Step" section to set up the environment and start a mini-experiment to familiarize yourself with the process.
10
+
11
+ - If you want to customize more modules, such as adding datasets and models, we provide an "Advanced Tutorial."
12
+
13
+ We always welcome users' PRs (Pull Requests) and Issues to improve VLMEvalKit!
14
+
15
+ .. _Start Your First Step:
16
+ .. toctree::
17
+ :maxdepth: 1
18
+ :caption: Start Your First Step
19
+
20
+ Quickstart.md
21
+
22
+ .. _Advanced Tutorial:
23
+ .. toctree::
24
+ :maxdepth: 1
25
+ :caption: Advanced Tutorial
26
+
27
+ Development.md
28
+ ConfigSystem.md
29
+
30
+ .. _Other Notes:
31
+ .. toctree::
32
+ :maxdepth: 1
33
+ :caption: Other Notes
34
+
35
+ Contributors.md
36
+
37
+ Index and Tables
38
+ ==================
39
+
40
+ * :ref:`genindex`
41
+ * :ref:`search`
vlmeval/VLMEvalKit_old/docs/zh-CN/_static/css/readthedocs.css ADDED
@@ -0,0 +1,63 @@
1
+ .header-logo {
2
+ background-image: url("../image/logo.svg");
3
+ background-size: 275px 80px;
4
+ height: 80px;
5
+ width: 275px;
6
+ }
7
+
8
+
9
+ @media screen and (min-width: 1100px) {
10
+ .header-logo {
11
+ top: -25px;
12
+ }
13
+ }
14
+
15
+ pre {
16
+ white-space: pre;
17
+ }
18
+
19
+ @media screen and (min-width: 2000px) {
20
+ .pytorch-content-left {
21
+ width: 1200px;
22
+ margin-left: 30px;
23
+ }
24
+ article.pytorch-article {
25
+ max-width: 1200px;
26
+ }
27
+ .pytorch-breadcrumbs-wrapper {
28
+ width: 1200px;
29
+ }
30
+ .pytorch-right-menu.scrolling-fixed {
31
+ position: fixed;
32
+ top: 45px;
33
+ left: 1580px;
34
+ }
35
+ }
36
+
37
+
38
+ article.pytorch-article section code {
39
+ padding: .2em .4em;
40
+ background-color: #f3f4f7;
41
+ border-radius: 5px;
42
+ }
43
+
44
+ /* Disable the change in tables */
45
+ article.pytorch-article section table code {
46
+ padding: unset;
47
+ background-color: unset;
48
+ border-radius: unset;
49
+ }
50
+
51
+ table.autosummary td {
52
+ width: 50%
53
+ }
54
+
55
+ img.align-center {
56
+ display: block;
57
+ margin-left: auto;
58
+ margin-right: auto;
59
+ }
60
+
61
+ article.pytorch-article p.rubric {
62
+ font-weight: bold;
63
+ }
vlmeval/VLMEvalKit_old/docs/zh-CN/_static/image/logo.svg ADDED
vlmeval/VLMEvalKit_old/docs/zh-CN/_templates/404.html ADDED
@@ -0,0 +1,18 @@
1
+ {% extends "layout.html" %}
2
+
3
+ {% block body %}
4
+
5
+ <h1>Page Not Found</h1>
6
+ <p>
7
+ The page you are looking for cannot be found.
8
+ </p>
9
+ <p>
10
+ If you just switched documentation versions, it is likely that the page you were on has been moved. You can look for it in
11
+ the table of contents on the left, or go to <a href="{{ pathto(root_doc) }}">the homepage</a>.
12
+ </p>
13
+ <!-- <p>
14
+ If you cannot find documentation you want, please <a
15
+ href="">open an issue</a> to tell us!
16
+ </p> -->
17
+
18
+ {% endblock %}
vlmeval/VLMEvalKit_old/vlmeval/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (566 Bytes). View file
 
vlmeval/VLMEvalKit_old/vlmeval/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (396 Bytes). View file
 
vlmeval/VLMEvalKit_old/vlmeval/__pycache__/config.cpython-310.pyc ADDED
Binary file (17.2 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/__pycache__/inference_mt.cpython-310.pyc ADDED
Binary file (5.52 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/__pycache__/inference_video.cpython-310.pyc ADDED
Binary file (6.27 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/api/bluelm_v_api.py ADDED
@@ -0,0 +1,120 @@
1
+ from vlmeval.smp import *
2
+ from vlmeval.api.base import BaseAPI
3
+ import os
4
+ import json
5
+
6
+
7
+ def multimodal(images, text, url, key, temperature=0, max_tokens=1024, history=[]):
8
+ if images:
9
+ pics = []
10
+ for image in images:
11
+ with open(image, 'rb') as f:
12
+ pic = base64.b64encode(f.read()).decode('utf-8')
13
+ pics.append(pic)
14
+ data = {'images': pics, 'text': text, 'key': key, 'temperature': temperature, 'max_new_tokens': max_tokens}
15
+ else:
16
+ data = {'text': text, 'key': key, 'temperature': temperature, 'max_new_tokens': max_tokens}
17
+ response = requests.post(url, json=data, headers={'Content-Type': 'application/json'})
18
+ response = json.loads(response.text)
19
+ return response
20
+
21
+
22
+ class BlueLMWrapper(BaseAPI):
23
+ is_api: bool = True
24
+
25
+ def __init__(self,
26
+ model: str = 'BlueLM-V-v3.0',
27
+ retry: int = 5,
28
+ wait: int = 5,
29
+ verbose: bool = True,
30
+ temperature: float = 0.0,
31
+ system_prompt: str = None,
32
+ max_tokens: int = 1024,
33
+ key: str = None,
34
+ url: str = 'http://api-ai.vivo.com.cn/multimodal',
35
+ **kwargs):
36
+
37
+ self.model = model
38
+ self.fail_msg = 'Failed to obtain answer via BlueLM-V API. '
39
+ self.max_tokens = max_tokens
40
+ self.temperature = temperature
41
+ self.url = url
42
+ self.key = key
43
+
44
+ if self.key is None:
45
+ self.key = os.environ.get('BLUELM_V_API_KEY', None)
46
+ assert self.key is not None, (
47
+ 'Please set the API Key (to obtain one, '
48
+ 'contact by email: [email protected]).'
49
+ )
50
+
51
+ super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
52
+
53
+ def message_to_promptimg(self, message, dataset=None):
54
+
55
+ num_images = len([x for x in message if x['type'] == 'image'])
56
+ if num_images == 0:
57
+ prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
58
+ image = None
59
+ elif num_images == 1:
60
+ prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
61
+ image = [x['value'] for x in message if x['type'] == 'image']
62
+ else:
63
+ prompt = '\n'.join([x['value'] if x['type'] == 'text' else '<image>' for x in message])
64
+ if dataset == 'BLINK':
65
+ image = concat_images_vlmeval(
66
+ [x['value'] for x in message if x['type'] == 'image'],
67
+ target_size=512)
68
+ else:
69
+ image = [x['value'] for x in message if x['type'] == 'image']
70
+
71
+ if dataset in ['MMBench_DEV_EN_V11', 'MMBench_DEV_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11',
72
+ 'AI2D_TEST', 'AI2D_TEST_TO_MASK', 'MMMU_DEV_VAL']:
73
+ prompt = prompt.replace('Please select the correct answer from the options above.',
74
+ 'Answer with the option’s letter from the given choices directly.')
75
+ elif dataset in ['ChartQA_TEST']:
76
+ prompt = prompt.replace('Answer the question using a single word or phrase.',
77
+ 'Answer the question using a single number or phrase.')
78
+ elif dataset in ['DocVQA_VAL', 'DocVQA_TEST', ]:
79
+ prompt = prompt.replace('Answer the question using a single word or phrase.',
80
+ 'Give the short answer directly.')
81
+ elif dataset in ['TextVQA_VAL']:
82
+ prompt = prompt.replace('Answer the question using a single word or phrase.',
83
+ 'When the provided information is insufficient, respond with ’Unanswerable’.'
84
+ 'Answer the question using a single word or phrase.')
85
+ elif dataset in ['MTVQA_TEST']:
86
+ prompt = prompt.replace('\nAnswer the question using a word or phrase in the language of the question.', '')
87
+ elif dataset in ['MathVista_MINI']:
88
+ if 'Choices:' in prompt:
89
+ prompt = prompt.replace('Choices:', 'Options:').replace('Hint:', 'Context:')
90
+ for i in range(1, 7): # replace A ~ F
91
+ prompt = prompt.replace(f'({chr(64 + i)})', f'{chr(64 + i)}.')
92
+ prompt += '\nAnswer with the option’s letter from the given choices directly.'
93
+ else:
94
+ prompt += '\nAnswer the question using a single word or phrase.'
95
+
96
+ return prompt, image
97
+
98
+ def generate_inner(self, inputs, **kwargs) -> str:
99
+
100
+ assert isinstance(inputs, str) or isinstance(inputs, list)
101
+ pure_text = np.all([x['type'] == 'text' for x in inputs])
102
+ assert not pure_text
103
+
104
+ prompt, image_path = self.message_to_promptimg(inputs, kwargs['dataset'])
105
+
106
+ try:
107
+ response = multimodal(image_path, prompt, self.url, self.key, self.temperature, self.max_tokens)
108
+ answer = response['result']
109
+ return 0, answer, 'Succeeded! '
110
+ except Exception as err:
111
+ if self.verbose:
112
+ self.logger.error(f'{type(err)}: {err}')
113
+ self.logger.error(f'The input messages are {inputs}.')
114
+ return -1, '', ''
115
+
116
+
117
+ class BlueLM_V_API(BlueLMWrapper):
118
+
119
+ def generate(self, message, dataset=None):
120
+ return super(BlueLM_V_API, self).generate(message, dataset=dataset)
vlmeval/VLMEvalKit_old/vlmeval/api/gpt.py ADDED
@@ -0,0 +1,263 @@
1
+ from ..smp import *
2
+ import os
3
+ import sys
4
+ from .base import BaseAPI
5
+
6
+ APIBASES = {
7
+ 'OFFICIAL': 'https://api.openai.com/v1/chat/completions',
8
+ }
9
+
10
+
11
+ def GPT_context_window(model):
12
+ length_map = {
13
+ 'gpt-4': 8192,
14
+ 'gpt-4-0613': 8192,
15
+ 'gpt-4-turbo-preview': 128000,
16
+ 'gpt-4-1106-preview': 128000,
17
+ 'gpt-4-0125-preview': 128000,
18
+ 'gpt-4-vision-preview': 128000,
19
+ 'gpt-4-turbo': 128000,
20
+ 'gpt-4-turbo-2024-04-09': 128000,
21
+ 'gpt-3.5-turbo': 16385,
22
+ 'gpt-3.5-turbo-0125': 16385,
23
+ 'gpt-3.5-turbo-1106': 16385,
24
+ 'gpt-3.5-turbo-instruct': 4096,
25
+ }
26
+ if model in length_map:
27
+ return length_map[model]
28
+ else:
29
+ return 128000
30
+
31
+
32
+ class OpenAIWrapper(BaseAPI):
33
+
34
+ is_api: bool = True
35
+
36
+ def __init__(self,
37
+ model: str = 'gpt-3.5-turbo-0613',
38
+ retry: int = 5,
39
+ wait: int = 5,
40
+ key: str = None,
41
+ verbose: bool = False,
42
+ system_prompt: str = None,
43
+ temperature: float = 0,
44
+ timeout: int = 60,
45
+ api_base: str = None,
46
+ max_tokens: int = 1024,
47
+ img_size: int = 512,
48
+ img_detail: str = 'low',
49
+ use_azure: bool = False,
50
+ **kwargs):
51
+
52
+ self.model = model
53
+ self.cur_idx = 0
54
+ self.fail_msg = 'Failed to obtain answer via API. '
55
+ self.max_tokens = max_tokens
56
+ self.temperature = temperature
57
+ self.use_azure = use_azure
58
+
59
+ if 'step-1v' in model:
60
+ env_key = os.environ.get('STEPAI_API_KEY', '')
61
+ if key is None:
62
+ key = env_key
63
+ elif 'yi-vision' in model:
64
+ env_key = os.environ.get('YI_API_KEY', '')
65
+ if key is None:
66
+ key = env_key
67
+ elif 'internvl2-pro' in model:
68
+ env_key = os.environ.get('InternVL2_PRO_KEY', '')
69
+ if key is None:
70
+ key = env_key
71
+ else:
72
+ if use_azure:
73
+ env_key = os.environ.get('AZURE_OPENAI_API_KEY', None)
74
+ assert env_key is not None, 'Please set the environment variable AZURE_OPENAI_API_KEY. '
75
+
76
+ if key is None:
77
+ key = env_key
78
+ assert isinstance(key, str), (
79
+ 'Please set the environment variable AZURE_OPENAI_API_KEY to your openai key. '
80
+ )
81
+ else:
82
+ env_key = os.environ.get('OPENAI_API_KEY', '')
83
+ if key is None:
84
+ key = env_key
85
+ assert isinstance(key, str) and key.startswith('sk-'), (
86
+ f'Illegal openai_key {key}. '
87
+ 'Please set the environment variable OPENAI_API_KEY to your openai key. '
88
+ )
89
+
90
+ self.key = key
91
+ assert img_size > 0 or img_size == -1
92
+ self.img_size = img_size
93
+ assert img_detail in ['high', 'low']
94
+ self.img_detail = img_detail
95
+ self.timeout = timeout
96
+
97
+ super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
98
+
99
+ if use_azure:
100
+ api_base_template = (
101
+ '{endpoint}openai/deployments/{deployment_name}/chat/completions?api-version={api_version}'
102
+ )
103
+ endpoint = os.getenv('AZURE_OPENAI_ENDPOINT', None)
104
+ assert endpoint is not None, 'Please set the environment variable AZURE_OPENAI_ENDPOINT. '
105
+ deployment_name = os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME', None)
106
+ assert deployment_name is not None, 'Please set the environment variable AZURE_OPENAI_DEPLOYMENT_NAME. '
107
+ api_version = os.getenv('OPENAI_API_VERSION', None)
108
+ assert api_version is not None, 'Please set the environment variable OPENAI_API_VERSION. '
109
+
110
+ self.api_base = api_base_template.format(
111
+ endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'),
112
+ deployment_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME'),
113
+ api_version=os.getenv('OPENAI_API_VERSION')
114
+ )
115
+ else:
116
+ if api_base is None:
117
+ if 'OPENAI_API_BASE' in os.environ and os.environ['OPENAI_API_BASE'] != '':
118
+ self.logger.info('Environment variable OPENAI_API_BASE is set. Will use it as api_base. ')
119
+ api_base = os.environ['OPENAI_API_BASE']
120
+ else:
121
+ api_base = 'OFFICIAL'
122
+
123
+ assert api_base is not None
124
+
125
+ if api_base in APIBASES:
126
+ self.api_base = APIBASES[api_base]
127
+ elif api_base.startswith('http'):
128
+ self.api_base = api_base
129
+ else:
130
+ self.logger.error('Unknown API Base. ')
131
+ raise NotImplementedError
132
+
133
+ self.logger.info(f'Using API Base: {self.api_base}; API Key: {self.key}')
134
+
135
+ # inputs can be a lvl-2 nested list: [content1, content2, content3, ...]
136
+ # content can be a string or a list of image & text
137
+ def prepare_itlist(self, inputs):
138
+ assert np.all([isinstance(x, dict) for x in inputs])
139
+ has_images = np.sum([x['type'] == 'image' for x in inputs])
140
+ if has_images:
141
+ content_list = []
142
+ for msg in inputs:
143
+ if msg['type'] == 'text':
144
+ content_list.append(dict(type='text', text=msg['value']))
145
+ elif msg['type'] == 'image':
146
+ from PIL import Image
147
+ img = Image.open(msg['value'])
148
+ b64 = encode_image_to_base64(img, target_size=self.img_size)
149
+ img_struct = dict(url=f'data:image/jpeg;base64,{b64}', detail=self.img_detail)
150
+ content_list.append(dict(type='image_url', image_url=img_struct))
151
+ else:
152
+ assert all([x['type'] == 'text' for x in inputs])
153
+ text = '\n'.join([x['value'] for x in inputs])
154
+ content_list = [dict(type='text', text=text)]
155
+ return content_list
156
+
157
+ def prepare_inputs(self, inputs):
158
+ input_msgs = []
159
+ if self.system_prompt is not None:
160
+ input_msgs.append(dict(role='system', content=self.system_prompt))
161
+ assert isinstance(inputs, list) and isinstance(inputs[0], dict)
162
+ assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs
163
+ if 'role' in inputs[0]:
164
+ assert inputs[-1]['role'] == 'user', inputs[-1]
165
+ for item in inputs:
166
+ input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content'])))
167
+ else:
168
+ input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs)))
169
+ return input_msgs
170
+
171
+ def generate_inner(self, inputs, **kwargs) -> str:
172
+ input_msgs = self.prepare_inputs(inputs)
173
+ temperature = kwargs.pop('temperature', self.temperature)
174
+ max_tokens = kwargs.pop('max_tokens', self.max_tokens)
175
+
176
+ context_window = GPT_context_window(self.model)
177
+ new_max_tokens = min(max_tokens, context_window - self.get_token_len(inputs))
178
+ if 0 < new_max_tokens <= 100 and new_max_tokens < max_tokens:
179
+ self.logger.warning(
180
+ 'Less than 100 tokens left, '
181
+ 'may exceed the context window with some additional meta symbols. '
182
+ )
183
+ if new_max_tokens <= 0:
184
+ return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. '
185
+ max_tokens = new_max_tokens
186
+
187
+ # For Azure, send the request manually via requests (the openai client is not used for this path)
188
+ if self.use_azure:
189
+ headers = {'Content-Type': 'application/json', 'api-key': self.key}
190
+ elif 'internvl2-pro' in self.model:
191
+ headers = {'Content-Type': 'application/json', 'Authorization': self.key}
192
+ else:
193
+ headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'}
194
+ payload = dict(
195
+ model=self.model,
196
+ messages=input_msgs,
197
+ max_tokens=max_tokens,
198
+ n=1,
199
+ temperature=temperature,
200
+ **kwargs)
201
+ response = requests.post(
202
+ self.api_base,
203
+ headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1)
204
+ ret_code = response.status_code
205
+ ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code
206
+ answer = self.fail_msg
207
+ try:
208
+ resp_struct = json.loads(response.text)
209
+ answer = resp_struct['choices'][0]['message']['content'].strip()
210
+ except Exception as err:
211
+ if self.verbose:
212
+ self.logger.error(f'{type(err)}: {err}')
213
+ self.logger.error(response.text if hasattr(response, 'text') else response)
214
+
215
+ return ret_code, answer, response
216
+
217
+ def get_image_token_len(self, img_path, detail='low'):
218
+ import math
219
+ if detail == 'low':
220
+ return 85
221
+
222
+ im = Image.open(img_path)
223
+ height, width = im.size
224
+ if width > 1024 or height > 1024:
225
+ if width > height:
226
+ height = int(height * 1024 / width)
227
+ width = 1024
228
+ else:
229
+ width = int(width * 1024 / height)
230
+ height = 1024
231
+
232
+ h = math.ceil(height / 512)
233
+ w = math.ceil(width / 512)
234
+ total = 85 + 170 * h * w
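+ # e.g., a 1024x768 image in 'high' detail tiles into 2x2 512px patches: 85 + 170 * 4 = 765 tokens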
235
+ return total
236
+
237
+ def get_token_len(self, inputs) -> int:
238
+ import tiktoken
239
+ try:
240
+ enc = tiktoken.encoding_for_model(self.model)
241
+ except Exception as err:
242
+ if 'gpt' in self.model.lower():
243
+ if self.verbose:
244
+ self.logger.warning(f'{type(err)}: {err}')
245
+ enc = tiktoken.encoding_for_model('gpt-4')
246
+ else:
247
+ return 0
248
+ assert isinstance(inputs, list)
249
+ tot = 0
250
+ for item in inputs:
251
+ if 'role' in item:
252
+ tot += self.get_token_len(item['content'])
253
+ elif item['type'] == 'text':
254
+ tot += len(enc.encode(item['value']))
255
+ elif item['type'] == 'image':
256
+ tot += self.get_image_token_len(item['value'], detail=self.img_detail)
257
+ return tot
258
+
259
+
260
+ class GPT4V(OpenAIWrapper):
261
+
262
+ def generate(self, message, dataset=None):
263
+ return super(GPT4V, self).generate(message)
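For reference, a hedged usage sketch of the wrapper above. The import path follows the package layout in this diff, `BaseAPI.generate` (defined elsewhere in the repo) is assumed to retry and forward to `generate_inner`, and the image path is a placeholder:

```python
import os
from vlmeval.api.gpt import GPT4V

# A real key is required; 'sk-...' is only a placeholder that satisfies the
# startswith('sk-') check in the constructor.
os.environ.setdefault('OPENAI_API_KEY', 'sk-...')

model = GPT4V(model='gpt-4-turbo', img_size=512, img_detail='low', verbose=True)

# Messages are lists of {'type': 'text' | 'image', 'value': ...} dicts,
# exactly what prepare_inputs / prepare_itlist expect.
message = [
    dict(type='image', value='demo.jpg'),  # local image path (placeholder)
    dict(type='text', value='Describe this image in one sentence.'),
]
print(model.generate(message))
```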
vlmeval/VLMEvalKit_old/vlmeval/api/sensechat_vision.py ADDED
@@ -0,0 +1,257 @@
1
+ from vlmeval.smp import *
2
+ from vlmeval.api.base import BaseAPI
3
+ from vlmeval.dataset import img_root_map
4
+ from vlmeval.dataset import DATASET_TYPE
5
+
6
+
7
+ class SenseChatVisionWrapper(BaseAPI):
8
+
9
+ is_api: bool = True
10
+
11
+ def __init__(self,
12
+ model: str = 'SenseChat-5-Vision',
13
+ retry: int = 5,
14
+ wait: int = 5,
15
+ ak: str = None,
16
+ sk: str = None,
17
+ verbose: bool = True,
18
+ system_prompt: str = None,
19
+ max_tokens: int = 1024,
20
+ proxy: str = None,
21
+ **kwargs):
22
+
23
+ self.model = model
24
+ self.fail_msg = 'Failed to obtain answer via API. '
25
+ self.ak = os.environ.get('SENSECHAT_AK', None) if ak is None else ak
26
+ self.sk = os.environ.get('SENSECHAT_SK', None) if sk is None else sk
27
+ assert self.ak is not None and self.sk is not None
28
+ self.max_new_tokens = max_tokens
29
+ super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs)
30
+
31
+ def dump_image(self, line, dataset):
32
+ """Dump the image(s) of the input line to the corresponding dataset folder.
33
+
34
+ Args:
35
+ line (line of pd.DataFrame): The raw input line.
36
+ dataset (str): The name of the dataset.
37
+
38
+ Returns:
39
+ str | list[str]: The paths of the dumped images.
40
+ """
41
+ ROOT = LMUDataRoot()
42
+ assert isinstance(dataset, str)
43
+ img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset)
44
+ os.makedirs(img_root, exist_ok=True)
45
+ if 'image' in line:
46
+ if isinstance(line['image'], list):
47
+ tgt_path = []
48
+ assert 'image_path' in line
49
+ for img, im_name in zip(line['image'], line['image_path']):
50
+ path = osp.join(img_root, im_name)
51
+ if not read_ok(path):
52
+ decode_base64_to_image_file(img, path)
53
+ tgt_path.append(path)
54
+ else:
55
+ tgt_path = osp.join(img_root, f"{line['index']}.jpg")
56
+ if not read_ok(tgt_path):
57
+ decode_base64_to_image_file(line['image'], tgt_path)
58
+ tgt_path = [tgt_path]
59
+ else:
60
+ assert 'image_path' in line
61
+ tgt_path = toliststr(line['image_path'])
62
+
63
+ return tgt_path
64
+
65
+ def image_to_base64(self, image_path):
66
+ import base64
67
+ with open(image_path, 'rb') as image_file:
68
+ encoded_string = base64.b64encode(image_file.read())
69
+ return encoded_string.decode('utf-8')
70
+
71
+ def encode_jwt_token(self, ak, sk):
72
+ import jwt
73
+ headers = {'alg': 'HS256', 'typ': 'JWT'}
74
+ payload = {
75
+ 'iss': ak,
76
+ 'exp': int(time.time())
77
+ + 1800,  # set the desired expiration time; this example uses current time + 30 minutes
78
+ 'nbf': int(time.time()) - 5,  # set the desired not-before time; this example uses current time - 5 seconds
79
+ }
80
+ token = jwt.encode(payload, sk, headers=headers)
81
+ return token
82
+
83
+ def use_custom_prompt(self, dataset):
84
+ return True
85
+
86
+ def build_multi_choice_prompt(self, line, dataset=None):
87
+ question = line['question']
88
+ hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
89
+ if hint is not None:
90
+ question = hint + '\n' + question
91
+
92
+ options = {
93
+ cand: line[cand]
94
+ for cand in string.ascii_uppercase
95
+ if cand in line and not pd.isna(line[cand])
96
+ }
97
+ for key, item in options.items():
98
+ question += f'\n{key}. {item}'
99
+ prompt = question
100
+
101
+ if len(options):
102
+ prompt += '\n请直接回答选项字母。' if cn_string(
103
+ prompt) else "\nAnswer with the option's letter from the given choices directly."
104
+ else:
105
+ prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
106
+
107
+ return prompt
108
+
109
+ def build_prompt(self, line, dataset=None):
110
+ assert self.use_custom_prompt(dataset)
111
+ assert dataset is None or isinstance(dataset, str)
112
+
113
+ tgt_path = self.dump_image(line, dataset)
114
+
115
+ if dataset is not None and listinstr(['MME'], dataset):
116
+ question = line['question']
117
+ prompt = question + ' Answer the question using a single word or phrase.'
118
+ elif dataset is not None and listinstr(['HallusionBench'], dataset):
119
+ question = line['question']
120
+ prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
121
+ elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ' and 'MMMU' not in dataset:
122
+ prompt = self.build_multi_choice_prompt(line, dataset)
123
+ elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
124
+ if 'MathVista' in dataset:
125
+ prompt = line['question']
126
+ elif listinstr(['LLaVABench'], dataset):
127
+ question = line['question']
128
+ prompt = question + '\nAnswer this question in detail.'
129
+ elif listinstr(['MMVet'], dataset):
130
+ prompt = line['question']
131
+ else:
132
+ question = line['question']
133
+ prompt = question + '\nAnswer the question using a single word or phrase.'
134
+ elif dataset is not None and 'MMMU' in dataset:
135
+ question = line['question']
136
+ options = {
137
+ cand: line[cand]
138
+ for cand in string.ascii_uppercase
139
+ if cand in line and not pd.isna(line[cand])
140
+ }
141
+ for key, item in options.items():
142
+ question += f'\n{key}. {item}'
143
+ prompt = {
144
+ 'multiple-choice': 'You are an expert in {}. Please solve the university-level {} examination question, which includes interleaved images and text. Your output should be divided into two parts: First, reason about the correct answer. Then write the answer in the following format where X is exactly one of the choices given by the problem: "ANSWER: X". If you are uncertain of the correct answer, guess the most likely one.', # noqa: E501
145
+ 'open': 'You are an expert in {}. Please solve the university-level {} examination question, which includes interleaved images and text. Your output should be divided into two parts: First, reason about the correct answer. Then write the answer in the following format where X is only the answer and nothing else: "ANSWER: X"' # noqa: E501
146
+ }
147
+ subject = '_'.join(line['id'].split('_')[1:-1])
148
+ prompt = prompt[line['question_type']].format(subject, subject) + '\n' + question
149
+ else:
150
+ prompt = line['question']
151
+
152
+ message = [dict(type='text', value=prompt)]
153
+ message.extend([dict(type='image', value=s) for s in tgt_path])
154
+
155
+ return message
156
+
157
+ def message_to_promptimg(self, message, dataset=None):
158
+ if dataset is None or listinstr(['MMMU', 'BLINK'], dataset):
159
+ prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
160
+ image = [[x['value'] for x in message if x['type'] == 'image'][0]]
161
+ else:
162
+ prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
163
+ image = [x['value'] for x in message if x['type'] == 'image']
164
+ return prompt, image
165
+
166
+ def generate_inner(self, inputs, **kwargs) -> str:
167
+ assert isinstance(inputs, str) or isinstance(inputs, list)
168
+ inputs = [inputs] if isinstance(inputs, str) else inputs
169
+ dataset = kwargs.get('dataset', None)
170
+
171
+ if dataset is not None and listinstr(['ChartQA_TEST'], dataset):
172
+ self.max_num = 12
173
+ elif dataset is not None and listinstr(['DocVQA_VAL', 'DocVQA_TEST'], dataset):
174
+ self.max_num = 18
175
+ elif dataset is not None and listinstr(['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench'], dataset):
176
+ self.max_num = 24
177
+ else:
178
+ self.max_num = 6
179
+
180
+ if dataset is None:
181
+ pass
182
+ elif listinstr(['AI2D_TEST'], dataset):
183
+ self.max_new_tokens = 10
184
+ elif 'MMMU' in dataset:
185
+ self.max_new_tokens = 1024
186
+ elif 'MMBench' in dataset:
187
+ self.max_new_tokens = 100
188
+
189
+ prompt, image = self.message_to_promptimg(message=inputs, dataset=dataset)
190
+
191
+ url = 'https://api.sensenova.cn/v1/llm/chat-completions'
192
+ api_secret_key = self.encode_jwt_token(self.ak, self.sk)
193
+
194
+ content = [{
195
+ 'image_base64': self.image_to_base64(item),
196
+ 'image_file_id': '',
197
+ 'image_url': '',
198
+ 'text': '',
200
+ 'type': 'image_base64'
201
+ } for item in image]
202
+
203
+ content.append({
204
+ 'image_base64': '',
205
+ 'image_file_id': '',
206
+ 'image_url': '',
207
+ 'text': prompt,
208
+ 'type': 'text'
209
+ })
210
+
211
+ message = [{'content': content, 'role': 'user'}]
212
+
213
+ data = {
214
+ 'messages': message,
215
+ 'max_new_tokens': self.max_new_tokens,
216
+ 'model': self.model,
217
+ 'stream': False,
218
+ }
219
+ headers = {
220
+ 'Content-type': 'application/json',
221
+ 'Authorization': 'Bearer ' + api_secret_key
222
+ }
223
+
224
+ response = requests.post(
225
+ url,
226
+ headers=headers,
227
+ json=data,
228
+ )
229
+ request_id = response.headers['x-request-id']
230
+
231
+ time.sleep(1)
232
+ try:
233
+ assert response.status_code == 200
234
+ response = response.json()['data']['choices'][0]['message'].strip()
235
+ if dataset is not None and 'MMMU' in dataset:
236
+ response = response.split('ANSWER: ')[-1].strip()
237
+ if self.verbose:
238
+ self.logger.info(f'inputs: {inputs}\nanswer: {response}')
239
+ return 0, response, 'Succeeded! '
240
+ except Exception as err:
241
+ if self.verbose:
242
+ self.logger.error('---------------------------ERROR---------------------------')
243
+ self.logger.error(response.json())
244
+ self.logger.error(f'{type(err)}: {err}')
245
+ self.logger.error('---------------------------request_id---------------------------' + request_id)
246
+ self.logger.error(
247
+ 'api error' + response.json()['error']['message']
248
+ + str([input['value'] if input['type'] == 'image' else None for input in inputs])
249
+ )
250
+ self.logger.error(f'The input messages are {inputs}.')
251
+ return -1, response.json()['error']['message'], ''
252
+
253
+
254
+ class SenseChatVisionAPI(SenseChatVisionWrapper):
255
+
256
+ def generate(self, message, dataset=None):
257
+ return super(SenseChatVisionAPI, self).generate(message, dataset=dataset)
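The SenseNova authentication used above is a plain HS256 JWT carrying `iss`/`exp`/`nbf` claims, built with PyJWT. A minimal round-trip sketch with made-up credentials:

```python
import time
import jwt  # PyJWT, the same dependency encode_jwt_token relies on

ak, sk = 'demo-access-key', 'demo-secret-key'  # placeholders, not real credentials

payload = {
    'iss': ak,                       # access key as the issuer
    'exp': int(time.time()) + 1800,  # valid for 30 minutes
    'nbf': int(time.time()) - 5,     # effective 5 seconds ago to absorb clock skew
}
token = jwt.encode(payload, sk, headers={'alg': 'HS256', 'typ': 'JWT'})

# Decoding with the same secret recovers the claims.
claims = jwt.decode(token, sk, algorithms=['HS256'])
assert claims['iss'] == ak
```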
vlmeval/VLMEvalKit_old/vlmeval/dataset/__init__.py ADDED
@@ -0,0 +1,228 @@
1
+ import warnings
2
+
3
+ from .image_base import img_root_map, ImageBaseDataset
4
+ from .image_caption import ImageCaptionDataset
5
+ from .image_yorn import ImageYORNDataset
6
+ from .image_mcq import (
7
+ ImageMCQDataset, MMMUDataset, CustomMCQDataset, MUIRDataset, GMAIMMBenchDataset, MMERealWorld, HRBenchDataset,
8
+ NaturalBenchDataset
9
+ )
10
+ from .image_mt import MMDUDataset
11
+ from .image_vqa import (
12
+ ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
13
+ CustomVQADataset, CRPE, MathVerse, OlympiadBench, QSpatial, VizWiz, MMNIAH
14
+ )
15
+
16
+ from .text_mcq import CustomTextMCQDataset, TextMCQDataset
17
+
18
+ from .vcr import VCRDataset
19
+ from .mmlongbench import MMLongBench
20
+ from .dude import DUDE
21
+ from .slidevqa import SlideVQA
22
+
23
+ from .mmbench_video import MMBenchVideo
24
+ from .videomme import VideoMME
25
+ from .mvbench import MVBench, MVBench_MP4
26
+ from .mlvu import MLVU, MLVU_MCQ, MLVU_OpenEnded
27
+ from .tempcompass import TempCompass, TempCompass_Captioning, TempCompass_MCQ, TempCompass_YorN
28
+ from .longvideobench import LongVideoBench
29
+ from .video_concat_dataset import ConcatVideoDataset
30
+ from .mmgenbench import MMGenBench
31
+
32
+ from .miabench import MIABench
33
+ from .wildvision import WildVision
34
+ from .mmmath import MMMath
35
+ from .dynamath import Dynamath
36
+ from .utils import *
37
+ from ..smp import *
38
+
39
+
40
+ class ConcatDataset(ImageBaseDataset):
41
+ # This dataset takes multiple dataset names as input and aggregate them into a single dataset.
42
+ # Each single dataset should not have a field named `SUB_DATASET`
43
+
44
+ DATASET_SETS = {
45
+ 'MMMB': ['MMMB_ar', 'MMMB_cn', 'MMMB_en', 'MMMB_pt', 'MMMB_ru', 'MMMB_tr'],
46
+ 'MTL_MMBench_DEV': [
47
+ 'MMBench_dev_ar', 'MMBench_dev_cn', 'MMBench_dev_en',
48
+ 'MMBench_dev_pt', 'MMBench_dev_ru', 'MMBench_dev_tr'
49
+ ]
50
+ }
51
+
52
+ def __init__(self, dataset):
53
+ datasets = self.DATASET_SETS[dataset]
54
+ self.dataset_map = {}
55
+ # The name of the compliation
56
+ self.dataset_name = dataset
57
+ self.datasets = datasets
58
+ for dname in datasets:
59
+ dataset = build_dataset(dname)
60
+ assert dataset is not None, dataset
61
+ self.dataset_map[dname] = dataset
62
+ TYPES = [x.TYPE for x in self.dataset_map.values()]
63
+ MODALITIES = [x.MODALITY for x in self.dataset_map.values()]
64
+ assert np.all([x == TYPES[0] for x in TYPES]), (datasets, TYPES)
65
+ assert np.all([x == MODALITIES[0] for x in MODALITIES]), (datasets, MODALITIES)
66
+ self.TYPE = TYPES[0]
67
+ self.MODALITY = MODALITIES[0]
68
+ data_all = []
69
+ for dname in datasets:
70
+ data = self.dataset_map[dname].data
71
+ data['SUB_DATASET'] = [dname] * len(data)
72
+ data_new = localize_df(data, dname, nproc=16)
73
+ data_all.append(data_new)
74
+
75
+ data = pd.concat(data_all)
76
+ data['original_index'] = data.pop('index')
77
+ data['index'] = np.arange(len(data))
78
+ self.data = data
79
+
80
+ def build_prompt(self, line):
81
+ if isinstance(line, int):
82
+ line = self.data.iloc[line]
83
+ idx = line['original_index']
84
+ dname = line['SUB_DATASET']
85
+ org_data = self.dataset_map[dname].data
86
+ org_line = cp.deepcopy(org_data[org_data['index'] == idx]).iloc[0]
87
+ return self.dataset_map[dname].build_prompt(org_line)
88
+
89
+ def dump_image(self, line):
90
+ # Assert all images are pre-dumped
91
+ assert 'image' not in line
92
+ assert 'image_path' in line
93
+ tgt_path = toliststr(line['image_path'])
94
+ return tgt_path
95
+
96
+ @classmethod
97
+ def supported_datasets(cls):
98
+ return list(cls.DATASET_SETS)
99
+
100
+ def evaluate(self, eval_file, **judge_kwargs):
101
+ suffix = eval_file.split('.')[-1]
102
+ # First, split the eval_file by dataset
103
+ data_all = load(eval_file)
104
+ for dname in self.datasets:
105
+ tgt = eval_file.replace(self.dataset_name, dname)
106
+ data_sub = data_all[data_all['SUB_DATASET'] == dname]
107
+ data_sub.pop('index')
108
+ data_sub['index'] = data_sub.pop('original_index')
109
+ data_sub.pop('SUB_DATASET')
110
+ dump(data_sub, tgt)
111
+ # Then, evaluate each dataset separately
112
+ results_all = []
113
+ for dname in self.datasets:
114
+ tgt = eval_file.replace(self.dataset_name, dname)
115
+ res = self.dataset_map[dname].evaluate(tgt, **judge_kwargs)
116
+ assert isinstance(res, pd.DataFrame)
117
+ res['DATASET'] = [dname] * len(res)
118
+ results_all.append(res)
119
+ result = pd.concat(results_all)
120
+ score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
121
+ dump(result, score_file)
122
+ return result
123
+
124
+
125
+ # Add new supported dataset class here
126
+ IMAGE_DATASET = [
127
+ ImageCaptionDataset, ImageYORNDataset, ImageMCQDataset, ImageVQADataset, MathVision,
128
+ MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench,
129
+ MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset,
130
+ GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, CRPE, MathVerse, NaturalBenchDataset,
131
+ MIABench, OlympiadBench, WildVision, MMMath, QSpatial, Dynamath, MMGenBench, VizWiz, MMNIAH
132
+ ]
133
+
134
+ VIDEO_DATASET = [
135
+ MMBenchVideo, VideoMME, MVBench, MVBench_MP4, LongVideoBench,
136
+ MLVU, MLVU_MCQ, MLVU_OpenEnded,
137
+ TempCompass, TempCompass_MCQ, TempCompass_Captioning, TempCompass_YorN
138
+ ]
139
+
140
+ TEXT_DATASET = [
141
+ TextMCQDataset
142
+ ]
143
+
144
+ CUSTOM_DATASET = [
145
+ CustomMCQDataset, CustomVQADataset, CustomTextMCQDataset
146
+ ]
147
+
148
+ DATASET_COLLECTION = [ConcatDataset, ConcatVideoDataset]
149
+
150
+ DATASET_CLASSES = IMAGE_DATASET + VIDEO_DATASET + TEXT_DATASET + CUSTOM_DATASET + DATASET_COLLECTION
151
+ SUPPORTED_DATASETS = []
152
+ for DATASET_CLS in DATASET_CLASSES:
153
+ SUPPORTED_DATASETS.extend(DATASET_CLS.supported_datasets())
154
+
155
+
156
+ def DATASET_TYPE(dataset, *, default: str = 'MCQ') -> str:
157
+ for cls in DATASET_CLASSES:
158
+ if dataset in cls.supported_datasets():
159
+ if hasattr(cls, 'TYPE'):
160
+ return cls.TYPE
161
+ # Have to add specific routine to handle ConcatDataset
162
+ if dataset in ConcatDataset.DATASET_SETS:
163
+ dataset_list = ConcatDataset.DATASET_SETS[dataset]
164
+ TYPES = [DATASET_TYPE(dname) for dname in dataset_list]
165
+ assert np.all([x == TYPES[0] for x in TYPES]), (dataset_list, TYPES)
166
+ return TYPES[0]
167
+
168
+ if 'openended' in dataset.lower():
169
+ return 'VQA'
170
+ warnings.warn(f'Dataset {dataset} is a custom one and not annotated as `openended`, will treat as {default}. ')
171
+ return default
172
+
173
+
174
+ def DATASET_MODALITY(dataset, *, default: str = 'IMAGE') -> str:
175
+ if dataset is None:
176
+ warnings.warn(f'Dataset is not specified, will treat modality as {default}. ')
177
+ return default
178
+ for cls in DATASET_CLASSES:
179
+ if dataset in cls.supported_datasets():
180
+ if hasattr(cls, 'MODALITY'):
181
+ return cls.MODALITY
182
+ # Have to add specific routine to handle ConcatDataset
183
+ if dataset in ConcatDataset.DATASET_SETS:
184
+ dataset_list = ConcatDataset.DATASET_SETS[dataset]
185
+ MODALITIES = [DATASET_MODALITY(dname) for dname in dataset_list]
186
+ assert np.all([x == MODALITIES[0] for x in MODALITIES]), (dataset_list, MODALITIES)
187
+ return MODALITIES[0]
188
+
189
+ if 'VIDEO' in dataset.lower():
190
+ return 'VIDEO'
191
+ elif 'IMAGE' in dataset.lower():
192
+ return 'IMAGE'
193
+ warnings.warn(f'Dataset {dataset} is a custom one, will treat modality as {default}. ')
194
+ return default
195
+
196
+
197
+ def build_dataset(dataset_name, **kwargs):
198
+ for cls in DATASET_CLASSES:
199
+ if dataset_name in cls.supported_datasets():
200
+ return cls(dataset=dataset_name, **kwargs)
201
+
202
+ warnings.warn(f'Dataset {dataset_name} is not officially supported. ')
203
+
204
+ data_file = osp.join(LMUDataRoot(), f'{dataset_name}.tsv')
205
+ if not osp.exists(data_file):
206
+ warnings.warn(f'Data file {data_file} does not exist. Dataset building failed. ')
207
+ return None
208
+
209
+ data = load(data_file)
210
+ if 'question' not in [x.lower() for x in data.columns]:
211
+ warnings.warn(f'Data file {data_file} does not have a `question` column. Dataset building failed. ')
212
+ return None
213
+
214
+ if 'A' in data and 'B' in data:
215
+ if 'image' in data or 'image_path' in data:
216
+ warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom MCQ dataset. ')
217
+ return CustomMCQDataset(dataset=dataset_name, **kwargs)
218
+ else:
219
+ warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom Text MCQ dataset. ')
220
+ return CustomTextMCQDataset(dataset=dataset_name, **kwargs)
221
+ else:
222
+ warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom VQA dataset. ')
223
+ return CustomVQADataset(dataset=dataset_name, **kwargs)
224
+
225
+
226
+ __all__ = [
227
+ 'build_dataset', 'img_root_map', 'build_judge', 'extract_answer_from_item', 'prefetch_answer', 'DEBUG_MESSAGE'
228
+ ] + [cls.__name__ for cls in DATASET_CLASSES]
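A hedged sketch of how the factory defined above is typically driven. `OCRBench` is one of the officially supported datasets registered elsewhere in this diff; its TSV is downloaded under `LMUDataRoot()` on first use:

```python
from vlmeval.dataset import build_dataset, DATASET_TYPE, DATASET_MODALITY

name = 'OCRBench'
print(DATASET_TYPE(name))       # -> 'VQA'
print(DATASET_MODALITY(name))   # -> 'IMAGE'

ds = build_dataset(name)        # returns None only when a custom TSV cannot be found
if ds is not None:
    print(len(ds.data))                       # number of samples in the TSV
    msgs = ds.build_prompt(ds.data.iloc[0])   # interleaved text/image message list
```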
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (8.18 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/dude.cpython-310.pyc ADDED
Binary file (7.03 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/image_caption.cpython-310.pyc ADDED
Binary file (3.12 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/image_caption.cpython-38.pyc ADDED
Binary file (3.02 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/longvideobench.cpython-38.pyc ADDED
Binary file (10.6 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/mmbench_video.cpython-310.pyc ADDED
Binary file (10.2 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/slidevqa.cpython-310.pyc ADDED
Binary file (6.76 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/slidevqa.cpython-38.pyc ADDED
Binary file (6.88 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/text_base.cpython-38.pyc ADDED
Binary file (3.46 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/dataset/image_caption.py ADDED
@@ -0,0 +1,89 @@
1
+ from .image_base import ImageBaseDataset
2
+ from ..smp import *
3
+
4
+ MY_PROMPT = '''
5
+ Hãy mô tả chi tiết người bức ảnh. Hãy sử dụng tiếng Việt.
6
+ Hãy miêu tả về áo, quần, đầu/mặt, giày/dép, ba lô/túi xách, điện thoại, phương tiện di chuyển,...
7
+ '''
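+ # Translation of MY_PROMPT (Vietnamese): "Describe the person in the image in detail, in Vietnamese.
+ # Describe the shirt, trousers, head/face, shoes/sandals, backpack/handbag, phone, means of transport, ..."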
8
+
9
+
10
+ class COCO_Caption_Scorer:
11
+ def __init__(self, ref, gt):
12
+ from pycocoevalcap.bleu.bleu import Bleu
13
+ from pycocoevalcap.rouge.rouge import Rouge
14
+ from pycocoevalcap.cider.cider import Cider
15
+
16
+ self.ref = ref
17
+ self.gt = gt
18
+ print("setting up scorers...")
19
+ self.scorers = [
20
+ (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]),
21
+ (Rouge(), "ROUGE_L"),
22
+ (Cider(), "CIDEr"),
23
+ ]
24
+
25
+ def compute_scores(self):
26
+ total_scores = {}
27
+ for scorer, method in self.scorers:
28
+ print("computing %s score..." % (scorer.method()))
29
+ score, scores = scorer.compute_score(self.gt, self.ref)
30
+ if isinstance(method, list):
31
+ for sc, scs, m in zip(score, scores, method):
32
+ print("%s: %0.3f" % (m, sc * 100))
33
+ total_scores["Bleu"] = [x * 100 for x in score]
34
+ else:
35
+ print("%s: %0.3f" % (method, score * 100))
36
+ total_scores[method] = score * 100
37
+
38
+ print("*****DONE*****")
39
+ for key, value in total_scores.items():
40
+ print("{}:{}".format(key, value))
41
+ return total_scores
42
+
43
+
44
+ class ImageCaptionDataset(ImageBaseDataset):
45
+
46
+ TYPE = "Caption"
47
+
48
+ DATASET_URL = {
49
+ "COCO_VAL": "https://opencompass.openxlab.space/utils/VLMEval/COCO_VAL.tsv",
50
+ }
51
+
52
+ DATASET_MD5 = {
53
+ "COCO_VAL": "72a5079dead060269ac222c5aa5128af",
54
+ }
55
+
56
+ def load_data(self, dataset):
57
+ global MY_PROMPT
58
+ data = super().load_data(dataset)
59
+ if "question" not in data:
60
+ data["question"] = [MY_PROMPT] * len(data)
61
+ return data
62
+
63
+ # def load_data(self, dataset):
64
+ # data = super().load_data(dataset)
65
+ # if "question" not in data:
66
+ # data["question"] = [
67
+ # (
68
+ # "Please describe this image in general. Directly provide the description, "
69
+ # 'do not include prefix like "This image depicts". '
70
+ # )
71
+ # ] * len(data)
72
+ # return data
73
+
74
+ # It returns a dictionary of scores
75
+ @classmethod
76
+ def evaluate(self, eval_file, **kwargs):
77
+ data = load(eval_file)
78
+ lt = len(data)
79
+ lines = [data.iloc[i] for i in range(lt)]
80
+ ref, gt = {}, {}
81
+ for i, line in enumerate(lines):
82
+ ref[str(i)] = [str(line["prediction"])]
83
+ gt[str(i)] = eval(line["answer"])
84
+
85
+ scorer = COCO_Caption_Scorer(ref, gt)
86
+ coco_caption_score_dict = scorer.compute_scores()
87
+ score_pth = eval_file.replace(".xlsx", "_score.json")
88
+ dump(coco_caption_score_dict, score_pth)
89
+ return coco_caption_score_dict
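The scorer above wraps pycocoevalcap. A toy run with invented captions (requires the pycocoevalcap package): the keys of `ref` and `gt` only need to match, with `ref` holding single-element prediction lists and `gt` holding lists of reference captions.

```python
from vlmeval.dataset.image_caption import COCO_Caption_Scorer

ref = {
    '0': ['a man rides a red motorbike down the street'],
    '1': ['two dogs play with a ball on the grass'],
}
gt = {
    '0': ['a man riding a red motorcycle on a road', 'a person on a red motorbike'],
    '1': ['two dogs playing fetch in a park', 'dogs chasing a ball on grass'],
}

scorer = COCO_Caption_Scorer(ref, gt)
scores = scorer.compute_scores()   # {'Bleu': [...], 'ROUGE_L': ..., 'CIDEr': ...}
print(scores)
```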
vlmeval/VLMEvalKit_old/vlmeval/dataset/image_vqa.py ADDED
@@ -0,0 +1,1333 @@
1
+ import os
2
+ import re
3
+ import tempfile
4
+ from functools import partial
5
+ from jinja2.sandbox import SandboxedEnvironment
6
+ from jinja2 import Template
7
+
8
+ import pandas as pd
9
+
10
+ from .image_base import ImageBaseDataset
11
+ from .utils import build_judge, DEBUG_MESSAGE
12
+ from ..smp import *
13
+ from ..utils import track_progress_rich
14
+ import ipdb
15
+
16
+
17
+ class ImageVQADataset(ImageBaseDataset):
18
+ TYPE = 'VQA'
19
+
20
+ DATASET_URL = {
21
+ 'OCRVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TEST.tsv',
22
+ 'OCRVQA_TESTCORE': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TESTCORE.tsv',
23
+ 'TextVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/TextVQA_VAL.tsv',
24
+ 'DocVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv',
25
+ 'DocVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_TEST.tsv',
26
+ 'InfoVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_VAL.tsv',
27
+ 'InfoVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_TEST.tsv',
28
+ 'ChartQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ChartQA_TEST.tsv',
29
+ 'GQA_TestDev_Balanced': 'https://opencompass.openxlab.space/utils/VLMEval/GQA_TestDev_Balanced.tsv',
30
+ }
31
+
32
+ DATASET_MD5 = {
33
+ 'OCRVQA_TEST': 'ca46a6d74b403e9d6c0b670f6fc00db9',
34
+ 'OCRVQA_TESTCORE': 'c5239fe77db8bdc1f2ad8e55e0d1fe97',
35
+ 'TextVQA_VAL': 'b233b31f551bbf4056f2f955da3a92cd',
36
+ 'DocVQA_VAL': 'd5ee77e1926ff10690d469c56b73eabf',
37
+ 'DocVQA_TEST': '6a2f28cac26ef2d3447374e8c6f6c8e9',
38
+ 'InfoVQA_VAL': '2342e9c225222f0ef4dec545ebb126fe',
39
+ 'InfoVQA_TEST': 'df535bf51b88dc9718252c34131a6227',
40
+ 'ChartQA_TEST': 'c902e0aa9be5582a7aad6dcf52734b42',
41
+ 'GQA_TestDev_Balanced': 'fead7df22befc1ed3ca2b62ea26fa17b',
42
+ }
43
+
44
+ def build_prompt(self, line):
45
+ msgs = super().build_prompt(line)
46
+ assert msgs[-1]['type'] == 'text'
47
+ msgs[-1]['value'] += '\nAnswer the question using a single word or phrase.'
48
+ return msgs
49
+
50
+ # It returns a DataFrame
51
+ def evaluate(self, eval_file, **judge_kwargs):
52
+ from .utils.vqa_eval import hit_calculate, process_line
53
+
54
+ data = load(eval_file)
55
+ dataset = self.dataset_name
56
+ assert 'answer' in data and 'prediction' in data
57
+ data['prediction'] = [str(x) for x in data['prediction']]
58
+ data['answer'] = [str(x) for x in data['answer']]
59
+ lt = len(data)
60
+ pool = mp.Pool(16)
61
+ lines = [data.iloc[i] for i in range(lt)]
62
+ if listinstr(['TextVQA'], dataset):
63
+ res = pool.map(partial(process_line, method='vqa_score'), lines)
64
+ elif listinstr(['ChartQA'], dataset):
65
+ res = pool.map(partial(process_line, method='relaxed_accuracy'), lines)
66
+ elif listinstr(['OCRVQA', 'GQA'], dataset):
67
+ res = pool.map(partial(process_line, method='accuracy'), lines)
68
+ elif listinstr(['DocVQA', 'InfoVQA'], dataset):
69
+ res = pool.map(partial(process_line, method='anls'), lines)
70
+ else: # default using vqa_score to calculate score
71
+ res = pool.map(process_line, lines)
72
+ hit = hit_calculate(res, dataset)
73
+ ret = dict()
74
+ if 'split' in data:
75
+ splits = set(data['split'])
76
+ for sp in splits:
77
+ sub = [r for l, r in zip(lines, res) if l['split'] == sp]
78
+ # [np.mean(x['match']) >= full_score_weight for x in sub]
79
+ hit = hit_calculate(sub, dataset)
80
+ ret[sp] = np.mean(hit) * 100
81
+ sub = [r for l, r in zip(lines, res)]
82
+ hit = hit_calculate(sub, dataset)
83
+ ret['Overall'] = np.mean(hit) * 100
84
+ else:
85
+ ret['Overall'] = np.mean(hit) * 100
86
+ if 'category' in data:
87
+ cates = list(set(data['category']))
88
+ cates.sort()
89
+ for c in cates:
90
+ sub = [r for l, r in zip(lines, res) if l['category'] == c]
91
+ # [np.mean(x['match']) >= full_score_weight for x in sub]
92
+ hit = hit_calculate(sub, dataset)
93
+ ret[c] = np.mean(hit) * 100
94
+ ret = d2df(ret)
95
+ ret = ret.round(2)
96
+
97
+ suffix = eval_file.split('.')[-1]
98
+ result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
99
+ dump(ret, result_file)
100
+ return ret
101
+
102
+
103
+ class VizWiz(ImageBaseDataset):
104
+ TYPE = 'VQA'
105
+ DATASET_URL = {
106
+ 'VizWiz': 'https://opencompass.openxlab.space/utils/VLMEval/VizWiz.tsv'
107
+ }
108
+ DATASET_MD5 = {
109
+ 'VizWiz': 'fa4ac4164467563ed2fac6eac6631bd0'
110
+ }
111
+
112
+ @classmethod
113
+ def evaluate(self, eval_file, **judge_kwargs):
114
+ from .utils.vqa_eval import hit_calculate, process_line
115
+
116
+ suffix = eval_file.split('.')[-1]
117
+ result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
118
+
119
+ if not osp.exists(result_file):
120
+ data = load(eval_file)
121
+ assert 'answers' in data and 'prediction' in data
122
+ data['prediction'] = [str(x) for x in data['prediction']]
123
+ data['answer'] = [str(x) for x in data['answers']]
124
+
125
+ lt = len(data)
126
+ pool = mp.Pool(16)
127
+ lines = [data.iloc[i] for i in range(lt)]
128
+ res = pool.map(process_line, lines)
129
+
130
+ hit = hit_calculate(res, 'VizWiz')
131
+ ret = dict()
132
+
133
+ ret['Overall'] = np.mean(hit) * 100
134
+ ret = d2df(ret)
135
+ ret = ret.round(2)
136
+
137
+ dump(ret, result_file)
138
+
139
+ retz = pd.read_csv(result_file)
140
+ return retz
141
+
142
+
143
+ class OCRBench(ImageBaseDataset):
144
+ TYPE = 'VQA'
145
+ DATASET_URL = {
146
+ 'OCRBench': 'https://opencompass.openxlab.space/utils/VLMEval/OCRBench.tsv'
147
+ }
148
+ DATASET_MD5 = {'OCRBench': 'e953d98a987cc6e26ef717b61260b778'}
149
+
150
+ # It returns a dictionary
151
+ @classmethod
152
+ def evaluate(self, eval_file, **judge_kwargs):
153
+ OCRBench_score = {
154
+ 'Regular Text Recognition': 0,
155
+ 'Irregular Text Recognition': 0,
156
+ 'Artistic Text Recognition': 0,
157
+ 'Handwriting Recognition': 0,
158
+ 'Digit String Recognition': 0,
159
+ 'Non-Semantic Text Recognition': 0,
160
+ 'Scene Text-centric VQA': 0,
161
+ 'Doc-oriented VQA': 0,
162
+ 'Key Information Extraction': 0,
163
+ 'Handwritten Mathematical Expression Recognition': 0,
164
+ }
165
+
166
+ data = load(eval_file)
167
+ lt = len(data)
168
+ lines = [data.iloc[i] for i in range(lt)]
169
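+ # Substring matching: a sample scores a point if any ground-truth answer appears in the prediction
+ # (whitespace-stripped for handwritten math expressions, case-insensitive otherwise).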
+ for i in tqdm(range(len(lines))):
170
+ line = lines[i]
171
+ predict = str(line['prediction'])
172
+ answers = eval(line['answer'])
173
+ category = line['category']
174
+ if category == 'Handwritten Mathematical Expression Recognition':
175
+ for j in range(len(answers)):
176
+ answer = answers[j].strip().replace('\n', ' ').replace(' ', '')
177
+ predict = predict.strip().replace('\n', ' ').replace(' ', '')
178
+ if answer in predict:
179
+ OCRBench_score[category] += 1
180
+ break
181
+ else:
182
+ for j in range(len(answers)):
183
+ answer = answers[j].lower().strip().replace('\n', ' ')
184
+ predict = predict.lower().strip().replace('\n', ' ')
185
+ if answer in predict:
186
+ OCRBench_score[category] += 1
187
+ break
188
+
189
+ final_score_dict = {}
190
+ final_score_dict['Text Recognition'] = \
191
+ (OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition']
192
+ + OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition']
193
+ + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition'])
194
+ final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA']
195
+ final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA']
196
+ final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction']
197
+ final_score_dict['Handwritten Mathematical Expression Recognition'] = \
198
+ (OCRBench_score['Handwritten Mathematical Expression Recognition'])
199
+ final_score_dict['Final Score'] = \
200
+ (final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA']
201
+ + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction']
202
+ + final_score_dict['Handwritten Mathematical Expression Recognition'])
203
+ final_score_dict['Final Score Norm'] = (float(final_score_dict['Final Score']) / 10)
204
+ score_pth = eval_file.replace('.xlsx', '_score.json')
205
+ dump(final_score_dict, score_pth)
206
+ return final_score_dict
207
+
208
+
209
+ class MathVista(ImageBaseDataset):
210
+ TYPE = 'VQA'
211
+ DATASET_URL = {
212
+ 'MathVista_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVista_MINI.tsv'
213
+ }
214
+ DATASET_MD5 = {'MathVista_MINI': 'f199b98e178e5a2a20e7048f5dcb0464'}
215
+
216
+ # It returns a DataFrame
217
+ @classmethod
218
+ def evaluate(self, eval_file, **judge_kwargs):
219
+ from .utils.mathvista import MathVista_auxeval, MathVista_acc
220
+
221
+ model = judge_kwargs['model']
222
+ suffix = eval_file.split('.')[-1]
223
+ storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
224
+ tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
225
+ nproc = judge_kwargs.pop('nproc', 4)
226
+
227
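+ # MathVista_auxeval post-processes each prediction with the LLM judge; partial results are cached in the .pkl
+ # so interrupted runs can resume, then MathVista_acc aggregates the accuracy.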
+ if not osp.exists(storage):
228
+ data = load(eval_file)
229
+ model = build_judge(max_tokens=128, **judge_kwargs)
230
+ assert model.working(), ('MathVista evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
231
+ lt = len(data)
232
+ lines = [data.iloc[i] for i in range(lt)]
233
+ tups = [(model, line) for line in lines]
234
+ indices = [line['index'] for line in lines]
235
+
236
+ ans = {}
237
+ if osp.exists(tmp_file):
238
+ ans = load(tmp_file)
239
+ tups = [x for x, i in zip(tups, indices) if i not in ans]
240
+ indices = [i for i in indices if i not in ans]
241
+
242
+ if len(indices):
243
+ new_results = track_progress_rich(
244
+ MathVista_auxeval,
245
+ tups,
246
+ nproc=nproc,
247
+ chunksize=nproc,
248
+ keys=indices,
249
+ save=tmp_file,
250
+ )
251
+ ans = load(tmp_file)
252
+ for k, v in zip(indices, new_results):
253
+ assert k in ans
254
+ assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
255
+
256
+ data['res'] = [ans[idx]['res'] for idx in data['index']]
257
+ data['log'] = [ans[idx]['log'] for idx in data['index']]
258
+ dump(data, storage)
259
+
260
+ score = MathVista_acc(storage)
261
+ score_pth = storage.replace('.xlsx', '_score.csv')
262
+ dump(score, score_pth)
263
+ return score
264
+
265
+
266
+ class MathVerse(ImageBaseDataset):
267
+ TYPE = 'VQA'
268
+ DATASET_URL = {
269
+ 'MathVerse_MINI': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIV.tsv', # noqa
270
+ 'MathVerse_MINI_Vision_Only': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVOnly.tsv', # noqa
271
+ 'MathVerse_MINI_Vision_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVDom.tsv', # noqa
272
+ 'MathVerse_MINI_Vision_Intensive': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVInt.tsv', # noqa
273
+ 'MathVerse_MINI_Text_Lite': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITLite.tsv', # noqa
274
+ 'MathVerse_MINI_Text_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITDom.tsv', # noqa
275
+ }
276
+ DATASET_MD5 = {
277
+ 'MathVerse_MINI': '5017caca32b7fa110c350a1bea861b65',
278
+ 'MathVerse_MINI_Vision_Only': '68a11d4680014ac881fa37adeadea3a4',
279
+ 'MathVerse_MINI_Vision_Dominant': 'b8fb63852d261ab2aaefba29cc2414d3',
280
+ 'MathVerse_MINI_Vision_Intensive': '01cbd35be202bb0c4873a4186a63bc19',
281
+ 'MathVerse_MINI_Text_Lite': '19e4b13bdd30b89a03b2e358bcfefa04',
282
+ 'MathVerse_MINI_Text_Dominant': '4f5cd2fa6630ea00bb11d6fde1f6fe6a',
283
+ }
284
+
285
+ # It returns a DataFrame
286
+ @classmethod
287
+ def evaluate(self, eval_file, **judge_kwargs):
288
+ from .utils.mathverse import MathVerse_auxeval_extract, MathVerse_auxeval_score, MathVerse_acc
289
+
290
+ model = judge_kwargs['model']
291
+ suffix = eval_file.split('.')[-1]
292
+ storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx')
293
+ tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl')
294
+ storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx')
295
+ tmp_file_score = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl')
296
+ nproc = judge_kwargs.pop('nproc', 4)
297
+ # stage1: extract the answer
298
+ if not osp.exists(storage_extract):
299
+ data = load(eval_file)
300
+ model = build_judge(max_tokens=128, **judge_kwargs)
301
+ assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
302
+ lt = len(data)
303
+ lines = [data.iloc[i] for i in range(lt)]
304
+ tups = [(model, line) for line in lines]
305
+ indices = [line['index'] for line in lines]
306
+
307
+ ans = {}
308
+ if osp.exists(tmp_file_extract):
309
+ ans = load(tmp_file_extract)
310
+ tups = [x for x, i in zip(tups, indices) if i not in ans]
311
+ indices = [i for i in indices if i not in ans]
312
+
313
+ if len(indices):
314
+ new_results = track_progress_rich(
315
+ MathVerse_auxeval_extract,
316
+ tups,
317
+ nproc=nproc,
318
+ chunksize=nproc,
319
+ keys=indices,
320
+ save=tmp_file_extract,
321
+ )
322
+ ans = load(tmp_file_extract)
323
+ for k, v in zip(indices, new_results):
324
+ assert k in ans
325
+ assert ans[k]['log_extract'] == v['log_extract'] and ans[k]['extract'] == v['extract']
326
+
327
+ data['extract'] = [ans[idx]['extract'] for idx in data['index']]
328
+ data['log_extract'] = [ans[idx]['log_extract'] for idx in data['index']]
329
+ dump(data, storage_extract)
330
+
331
+ # stage2: score the answer
332
+ if not osp.exists(storage_score):
333
+ data = load(storage_extract)
334
+ model = build_judge(max_tokens=128, **judge_kwargs)
335
+ assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
336
+ lt = len(data)
337
+ lines = [data.iloc[i] for i in range(lt)]
338
+ tups = [(model, line) for line in lines]
339
+ indices = [line['index'] for line in lines]
340
+
341
+ ans = {}
342
+ if osp.exists(tmp_file_score):
343
+ ans = load(tmp_file_score)
344
+ tups = [x for x, i in zip(tups, indices) if i not in ans]
345
+ indices = [i for i in indices if i not in ans]
346
+
347
+ if len(indices):
348
+ new_results = track_progress_rich(
349
+ MathVerse_auxeval_score,
350
+ tups,
351
+ nproc=nproc,
352
+ chunksize=nproc,
353
+ keys=indices,
354
+ save=tmp_file_score,
355
+ )
356
+ ans = load(tmp_file_score)
357
+ for k, v in zip(indices, new_results):
358
+ assert k in ans
359
+ assert ans[k]['log_score'] == v['log_score'] and ans[k]['score'] == v['score']
360
+
361
+ data['score'] = [ans[idx]['score'] for idx in data['index']]
362
+ data['log_score'] = [ans[idx]['log_score'] for idx in data['index']]
363
+ dump(data, storage_score)
364
+
365
+ score = MathVerse_acc(storage_score)
366
+ score_pth = storage_score.replace('.xlsx', '.csv')
367
+ dump(score, score_pth)
368
+ return score
369
+
370
+
371
+ class MathVision(ImageBaseDataset):
372
+ TYPE = 'VQA'
373
+ DATASET_URL = {
374
+ 'MathVision': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv',
375
+ 'MathVision_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision_MINI.tsv'
376
+ }
377
+ DATASET_MD5 = {
378
+ 'MathVision': '93f6de14f7916e598aa1b7165589831e',
379
+ 'MathVision_MINI': '060fe4fa5d868987ce179307bd5f8a33'
380
+ }
381
+
382
+ # It returns a DataFrame
383
+ @classmethod
384
+ def evaluate(self, eval_file, **judge_kwargs):
385
+ from .utils.mathv import MATH_V_auxeval, MATH_V_acc
386
+
387
+ if 'model' in judge_kwargs:
388
+ model = judge_kwargs['model']
389
+ else:
390
+ model = os.path.basename(os.environ.get('LOCAL_LLM'))
391
+ suffix = eval_file.split('.')[-1]
392
+ storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
393
+ tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
394
+ nproc = judge_kwargs.pop('nproc', 4)
395
+
396
+ if not osp.exists(storage):
397
+ data = load(eval_file)
398
+ model = build_judge(max_tokens=128, **judge_kwargs)
399
+ assert model.working(), ('MATH-Vision evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
400
+ lt = len(data)
401
+ lines = [data.iloc[i] for i in range(lt)]
402
+ tups = [(model, line) for line in lines]
403
+ indices = [line['index'] for line in lines]
404
+
405
+ ans = {}
406
+ if osp.exists(tmp_file):
407
+ ans = load(tmp_file)
408
+ tups = [x for x, i in zip(tups, indices) if i not in ans]
409
+ indices = [i for i in indices if i not in ans]
410
+
411
+ if len(indices):
412
+ new_results = track_progress_rich(
413
+ MATH_V_auxeval,
414
+ tups,
415
+ nproc=nproc,
416
+ chunksize=nproc,
417
+ keys=indices,
418
+ save=tmp_file,
419
+ )
420
+ ans = load(tmp_file)
421
+ for k, v in zip(indices, new_results):
422
+ assert k in ans
423
+ assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
424
+
425
+ data['res'] = [ans[idx]['res'] for idx in data['index']]
426
+ data['log'] = [ans[idx]['log'] for idx in data['index']]
427
+ dump(data, storage)
428
+
429
+ score = MATH_V_acc(storage)
430
+ score_pth = storage.replace('.xlsx', '_score.csv')
431
+ dump(score, score_pth)
432
+ return score
433
+
434
+
435
+ class OlympiadBench(ImageBaseDataset):
436
+ TYPE = 'VQA_ex_prompt'
437
+ DATASET_URL = {
438
+ 'OlympiadBench': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench.tsv',
439
+ 'OlympiadBench_EN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_EN.tsv',
440
+ 'OlympiadBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_CN.tsv'
441
+ }
442
+ DATASET_MD5 = {
443
+ 'OlympiadBench': '9735ae0f0299eae1e7d07f5a7feab914',
444
+ 'OlympiadBench_EN': '5c68e100d394351fc7049f29d4d4efed',
445
+ 'OlympiadBench_CN': 'ea01b16788955702c79650c701e5b623'
446
+ }
447
+
448
+ def dump_image(self, line):
449
+ os.makedirs(self.img_root, exist_ok=True)
450
+
451
+ tgt_path_z = []
452
+ if isinstance(line['image'], list):
453
+ for i in range(len(line['image'])):
454
+ tgt_path = osp.join(self.img_root, f"{line['index']}--{i+1}.jpg")
455
+ if not read_ok(tgt_path):
456
+ decode_base64_to_image_file(line['image'][i], tgt_path)
457
+ tgt_path_z.append(tgt_path)
458
+ else:
459
+ tgt_path = osp.join(self.img_root, f"{line['index']}.jpg")
460
+ if not read_ok(tgt_path):
461
+ decode_base64_to_image_file(line['image'], tgt_path)
462
+ tgt_path_z.append(tgt_path)
463
+ return tgt_path_z
464
+
465
+ def build_prompt(self, line):
466
+
467
+ from .utils.olympiadbench import get_answer_type_text, make_input
468
+
469
+ self.is_chinese = 'zh' in line['source']
470
+ self.is_math = 'maths' in line['source']
471
+ self.is_theorem_proving = 'TP' in line['source']
472
+
473
+ if self.is_chinese:
474
+ subject_content = '数学' if self.is_math else '物理'
475
+ if self.is_theorem_proving:
476
+ prompt = (
477
+ f"以下是中国{subject_content}竞赛中的证明题。请根据题目的要求,运用逻辑推理及常用定理证明题目中的命题。"
478
+ "证明过程中使用的变量和公式请使用LaTeX格式表示。"
479
+ )
480
+ else:
481
+ answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=True,
482
+ multiple_answer=line['is_multiple_answer'])
483
+ if line['is_multiple_answer']:
484
+ multiple_answer_text = '\\boxed{用英文逗号连接的多个答案}'
485
+ else:
486
+ multiple_answer_text = '\\boxed{答案}'
487
+ unit_text = ''
488
+ if line['unit']:
489
+ multiple_answer_text += '(单位)'
490
+ unit_text = ',注意答案的单位不要放在\\boxed{}中'
491
+ prompt = (
492
+ f'以下是中国{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。'
493
+ f'解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以“所以最终答案是{multiple_answer_text}。”'
494
+ f'显式给出结果{unit_text}。'
495
+ )
496
+ else:
497
+ subject_content = 'Math' if self.is_math else 'Physics'
498
+ if self.is_theorem_proving:
499
+ prompt = (
500
+ f'The following is a theorem proving problem from an International {subject_content} competition. '
501
+ 'Please use logical reasoning and common theorems to prove the proposition in the problem '
502
+ 'according to the given requirements. '
503
+ 'Please use LaTeX format to represent the variables and formulas used in the proof.'
504
+ )
505
+ else:
506
+ if line['is_multiple_answer']:
507
+ multiple_answer_text = '\\boxed{multiple answers connected with commas}'
508
+ else:
509
+ multiple_answer_text = '\\boxed{answer}'
510
+ unit_text = ''
511
+ if line['unit']:
512
+ multiple_answer_text += '(unit)'
513
+ unit_text = ', note that the unit of the answer should not be included in \\boxed{}'
514
+ answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=False,
515
+ multiple_answer=line['is_multiple_answer'])
516
+ prompt = (
517
+ f'The following is an open-ended problem from an International {subject_content} competition. '
518
+ f'{answer_type_text}Please calculate the answer according to the given requirements and '
519
+ 'the information provided. Please use LaTeX format to represent the variables and formulas '
520
+ 'used in the solution process and results. Please end your solution with "So the final answer '
521
+ f'is {multiple_answer_text}." and give the result explicitly{unit_text}.'
522
+ )
523
+
524
+ if self.is_math:
525
+ input = make_input(prompt, line['question'])
526
+ else:
527
+ if 'context' in line.keys() and str(line['context']) != 'nan': # cannot be null
528
+ input = make_input(prompt, line['context'] + '\n' + line['question'])
529
+ else:
530
+ input = make_input(prompt, line['question'])
531
+
532
+ ret = [dict(type='text', value=input)]
533
+ tgt_path = self.dump_image(line)
534
+
535
+ ret.extend([dict(type='image', value=s) for s in tgt_path])
536
+
537
+ return ret
538
+
539
+ @classmethod
540
+ def evaluate(self, eval_file, **judge_kwargs):
541
+ from .utils.olympiadbench import MathJudger, extract_answer
542
+ judger = MathJudger()
543
+
544
+ suffix = eval_file.split('.')[-1]
545
+ name_str1 = 'judge'
546
+ name_str2 = 'score'
547
+ result_file = eval_file.replace(f'.{suffix}', f'_{name_str1}_result.xlsx')
548
+ score_file = eval_file.replace(f'.{suffix}', f'_{name_str2}_result.csv')
549
+
550
+ if not osp.exists(result_file):
551
+ data = load(eval_file)
552
+ scorez = []
553
+
554
+ for i in tqdm(data.iterrows()):
555
+ line = i[1]
556
+ model_answer = line['prediction']
557
+ is_chinese = 'zh' in line['source']
558
+ model_answer = extract_answer(is_chinese, model_answer, is_deepseek=False)
559
+ answer_type = line['answer_type']
560
+
561
+ final_answer = line['final_answer'][2:-2]
562
+
563
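+ # Tuple-type answers are judged with the default tolerance; for the rest, the per-item 'error' field
+ # (a single value or comma-separated list), when present, sets the numeric precision for MathJudger.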
+ if str(answer_type) != 'nan' and 'Tuple' in answer_type:
564
+ judge_result = judger.judge(model_answer, final_answer)
565
+ else:
566
+ if str(line['error']) != 'nan':
567
+ if ',' in line['error']:
568
+ precisions = line['error'].split(',')
569
+ precisions = [float(p) if p else 1e-8 for p in precisions]
570
+ judge_result = judger.judge(model_answer, final_answer, precisions)
571
+ else:
572
+ precision = float(line['error'])
573
+ judge_result = judger.judge(model_answer, final_answer, precision)
574
+ else:
575
+ judge_result = judger.judge(model_answer, final_answer)
576
+ scorez.append(judge_result)
577
+
578
+ data['score'] = scorez
579
+ dump(data, result_file)
580
+
581
+ judge_file = load(result_file)
582
+
583
+ if not osp.exists(score_file):
584
+ name_list = ['OE_MM_maths_en_COMP', 'OE_MM_maths_zh_CEE', 'OE_MM_maths_zh_COMP', 'OE_MM_physics_en_COMP',
585
+ 'OE_MM_physics_zh_CEE','OE_TO_maths_en_COMP', 'OE_TO_maths_zh_CEE', 'OE_TO_maths_zh_COMP',
586
+ 'OE_TO_physics_en_COMP', 'OE_TO_physics_zh_CEE']
587
+
588
+ sample_list = [[] for _ in range(len(name_list))]
589
+ for i in judge_file.iterrows():
590
+ line = i[1]
591
+ for j in range(len(name_list)):
592
+ if line['source'] == name_list[j]:
593
+ sample_list[j].append(line['score'])
594
+
595
+ acc_dict = {}
596
+ correct_list = []
597
+
598
+ # fine-grained
599
+ for i in range(len(name_list)):
600
+ correct_num = 0
601
+ for j in sample_list[i]:
602
+ if j:
603
+ correct_num += 1
604
+ correct_list.append(correct_num)
605
+ acc = 100 * correct_num / len(sample_list[i])
606
+ acc_dict[name_list[i]] = [acc]
607
+
608
+ # grouped by language and subject (zh/en x maths/physics)
609
+ labela = ['zh', 'en']
610
+ labelb = ['maths', 'physics']
611
+
612
+ grain_list = [[x,y] for x in labela for y in labelb]
613
+ for j in grain_list:
614
+ dict_name = j[0] + "_" + j[1]
615
+ correct_num = 0
616
+ full_num = 0
617
+ for i in range(len(name_list)):
618
+ if all(k in name_list[i] for k in j):
619
+ correct_num += correct_list[i]
620
+ full_num += len(sample_list[i])
621
+ acc = 100 * correct_num / full_num
622
+ acc_dict[dict_name] = [acc]
623
+
624
+ # grouped by subject (maths / physics)
625
+ grain_list = ['maths', 'physics']
626
+ for j in grain_list:
627
+ dict_name = j
628
+ correct_num = 0
629
+ full_num = 0
630
+ for i in range(len(name_list)):
631
+ if j in name_list[i]:
632
+ correct_num += correct_list[i]
633
+ full_num += len(sample_list[i])
634
+ acc = 100 * correct_num / full_num
635
+ acc_dict[dict_name] = [acc]
636
+
637
+ # AVG
638
+ correct_num = sum(correct_list)
639
+ acc = 100 * correct_num / len(judge_file)
640
+ acc_dict['AVG'] = [acc]
641
+
642
+ acc_pd = pd.DataFrame(acc_dict)
643
+ acc_pd.to_csv(score_file, index=False, encoding='gbk')
644
+
645
+ accdz = pd.read_csv(score_file)
646
+ return accdz
647
+
648
+
649
+ class LLaVABench(ImageBaseDataset):
650
+ TYPE = 'VQA'
651
+ DATASET_URL = {'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv'}
652
+ DATASET_MD5 = {'LLaVABench': 'd382a093f749a697820d3dadd61c8428'}
653
+
654
+ # It returns a DataFrame
655
+ @classmethod
656
+ def evaluate(self, eval_file, **judge_kwargs):
657
+ from .utils.llavabench import (
658
+ build_prompt,
659
+ LLaVABench_atomeval,
660
+ LLaVABench_score,
661
+ )
662
+
663
+ suffix = '.' + eval_file.split('.')[-1]
664
+ record_file = eval_file.replace(suffix, '_openai_result' + suffix)
665
+ score_file = eval_file.replace(suffix, '_score.csv')
666
+ nproc = judge_kwargs.pop('nproc', 4)
667
+ system_prompt = 'You are a helpful and precise assistant for checking the quality of the answer.'
668
+
669
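+ # LLaVABench_atomeval returns two judge ratings per sample; they are stored as gpt4_score and score
+ # and later aggregated by LLaVABench_score.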
+ if not osp.exists(record_file):
670
+ data = load(eval_file)
671
+ lines = [data.iloc[i] for i in range(len(data))]
672
+ model = build_judge(temperature=0.2, system_prompt=system_prompt, **judge_kwargs)
673
+ assert model.working(), ('LLaVABench evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
674
+
675
+ prompts = [build_prompt(line) for line in lines]
676
+ tups = [(model, prompt) for prompt in prompts]
677
+ scores = track_progress_rich(LLaVABench_atomeval, tups, nproc=nproc, chunksize=nproc)
678
+ data['gpt4_score'] = [x[0] for x in scores]
679
+ data['score'] = [x[1] for x in scores]
680
+ dump(data, record_file)
681
+
682
+ data = load(record_file)
683
+ ret = LLaVABench_score(data).round(1)
684
+ dump(ret, score_file)
685
+ return ret
686
+
687
+
688
+ class MMVet(ImageBaseDataset):
689
+ TYPE = 'VQA'
690
+ DATASET_URL = {
691
+ 'MMVet': 'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv'
692
+ }
693
+ DATASET_MD5 = {'MMVet': '748aa6d4aa9d4de798306a63718455e3'}
694
+
695
+ # It returns a DataFrame
696
+ @classmethod
697
+ def evaluate(self, eval_file, **judge_kwargs):
698
+ from .utils.mmvet import MMVet_auxeval, MMVet_acc
699
+
700
+ suffix = eval_file.split('.')[-1]
701
+ model = judge_kwargs['model']
702
+ storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
703
+ tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
704
+ nproc = judge_kwargs.pop('nproc', 4)
705
+ if not osp.exists(storage):
706
+ data = load(eval_file)
707
+ model = build_judge(max_tokens=3, **judge_kwargs)
708
+ assert model.working(), ('MMVet evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
709
+
710
+ lt = len(data)
711
+ lines = [data.iloc[i] for i in range(lt)]
712
+ tups = [(model, line) for line in lines]
713
+ indices = [line['index'] for line in lines]
714
+
715
+ ans = load(tmp_file) if osp.exists(tmp_file) else {}
716
+ tups = [x for x, i in zip(tups, indices) if i not in ans]
717
+ indices = [i for i in indices if i not in ans]
718
+
719
+ if len(indices):
720
+ new_results = track_progress_rich(
721
+ MMVet_auxeval,
722
+ tups,
723
+ nproc=nproc,
724
+ chunksize=nproc,
725
+ keys=indices,
726
+ save=tmp_file,
727
+ )
728
+ ans = load(tmp_file)
729
+ for k, v in zip(indices, new_results):
730
+ assert k in ans
731
+ assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score']
732
+ data['score'] = [ans[idx]['score'] for idx in data['index']]
733
+ data['log'] = [ans[idx]['log'] for idx in data['index']]
734
+ dump(data, storage)
735
+
736
+ score, score_fine = MMVet_acc(storage)
737
+ score_pth = storage.replace('.xlsx', '_score.csv')
738
+ score_fine_pth = storage.replace('.xlsx', '_score_fine.csv')
739
+ dump(score, score_pth)
740
+ dump(score_fine, score_fine_pth)
741
+ return score
742
+
743
+
744
+ class MTVQADataset(ImageBaseDataset):
745
+ TYPE = 'VQA'
746
+ DATASET_URL = {'MTVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MTVQA_TEST.tsv'}
747
+ DATASET_MD5 = {'MTVQA_TEST': 'd87c17dbab934b7cd89c0a3c1c5657f4'}
748
+
749
+ @classmethod
750
+ def evaluate(self, eval_file, **judge_kwargs):
751
+ data = load(eval_file)
752
+ assert 'answer' in data and 'prediction' in data and 'category' in data
753
+ data['prediction'] = [str(x) for x in data['prediction']]
754
+ data['answer'] = [str(x) for x in data['answer']]
755
+ if 'split' in data:
756
+ assert np.all([x.lower() == 'test' for x in data['split']]), 'We only support MTVQA_TEST for now. '
757
+ lt = len(data)
758
+ category_scores = defaultdict(list)
759
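+ # A prediction is correct if the normalized (lower-cased, period-stripped) answer is a substring of it;
+ # scores are averaged per language category and overall ('Average').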
+ for i in range(lt):
760
+ line = data.iloc[i]
761
+ ans = line['answer'].strip().lower().replace('.', '')
762
+ pred = line['prediction'].strip().lower().replace('.', '')
763
+ cate = line['category']
764
+ score = 1.0 if ans in pred else 0.0
765
+ category_scores[cate].append(score)
766
+ category_scores['Average'].append(score)
767
+ # Calculate the average score for each category, the score is normalized to [0, 100]
768
+ category_averages = {category: np.mean(scores) * 100 for category, scores in category_scores.items()}
769
+
770
+ suffix = eval_file.split('.')[-1]
771
+ result_file = eval_file.replace(f'.{suffix}', '_acc.json')
772
+ dump(category_averages, result_file)
773
+
774
+ return category_averages
775
+
776
+ # MT-VQA adopts a custom prompt
777
+ def build_prompt(self, line):
778
+ msgs = super().build_prompt(line)
779
+ assert sum([x['type'] == 'text' for x in msgs]) == 1
780
+ for item in msgs:
781
+ if item['type'] == 'text':
782
+ item['value'] += '\nAnswer the question using a word or phrase in the language of the question.'
783
+ return msgs
784
+
785
+
786
+ class TableVQABench(ImageBaseDataset):
787
+ TYPE = 'VQA'
788
+ DATASET_URL = {
789
+ 'TableVQABench': 'https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/mentor-vil/datasets/tablevqa-bench.tsv'
790
+ }
791
+ DATASET_MD5 = {'TableVQABench': '2550adc61bdc82d8e62f3b003de7c62d'}
792
+
793
+ from .utils.tablevqabench import FINTABNETQA_PROMPT, VTABFACT_PROMPT, VWTQ_PROMPT
794
+
795
+ # It returns a DataFrame
796
+ @classmethod
797
+ def evaluate(self, eval_file, **judge_kwargs):
798
+ import pandas as pd
799
+ from .utils.tablevqabench import evaluate_fintabnet, evaluate_tabfact, evaluate_wtq
800
+
801
+ data = load(eval_file)
802
+ assert 'answer' in data and 'prediction' in data
803
+
804
+ data['prediction'] = data['prediction'].str.replace('^Answer: ', '', regex=True)
805
+ data_group = dict(tuple(data.groupby('split')))
806
+ eval_result = {'split': [], 'average_scores': []}
807
+ for split in ['fintabnetqa', 'vtabfact', 'vwtq', 'vwtq_syn']:
808
+ data_split = data_group[split].to_dict(orient='records')
809
+ if split == 'fintabnetqa':
810
+ split_eval_meta = evaluate_fintabnet(data_split, ['accuracy'])
811
+ elif split == 'vtabfact':
812
+ split_eval_meta = evaluate_tabfact(data_split, ['accuracy'])
813
+ elif split == 'vwtq' or split == 'vwtq_syn':
814
+ split_eval_meta = evaluate_wtq(data_split, ['accuracy'])
815
+ eval_result['split'].append(split)
816
+ eval_result['average_scores'].append(split_eval_meta['average_scores'])
817
+
818
+ suffix = eval_file.split('.')[-1]
819
+ result_file = eval_file.replace(f'.{suffix}', '_acc.csv')
820
+ eval_result = pd.DataFrame(eval_result)
821
+ dump(eval_result, result_file)
822
+
823
+ return eval_result
824
+
825
+ # TableVQABench adopts a custom prompt
826
+ def build_prompt(self, line):
827
+ msgs = super().build_prompt(line)
828
+ assert sum([x['type'] == 'text' for x in msgs]) == 1
829
+ for item in msgs:
830
+ if item['type'] == 'text':
831
+ if line['split'] == 'fintabnetqa':
832
+ item['value'] = self.FINTABNETQA_PROMPT.format_map({'question': item['value']})
833
+ elif line['split'] == 'vtabfact':
834
+ item['value'] = self.VTABFACT_PROMPT.format_map({'question': item['value']})
835
+ elif line['split'] == 'vwtq_syn' or line['split'] == 'vwtq':
836
+ item['value'] = self.VWTQ_PROMPT.format_map({'question': item['value']})
837
+ return msgs
838
+
839
+
840
+ class CustomVQADataset(ImageBaseDataset):
841
+ TYPE = 'VQA'
842
+
843
+ def load_data(self, dataset):
844
+ data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv')
845
+
846
+ if file_size(data_path, 'GB') > 1:
847
+ local_path = data_path.replace('.tsv', '_local.tsv')
848
+ if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None):
849
+ from ..tools import LOCALIZE
850
+
851
+ LOCALIZE(data_path, local_path)
852
+ data_path = local_path
853
+ return load(data_path)
854
+
855
+ def evaluate(self, eval_file, **judge_kwargs):
856
+ raise NotImplementedError
857
+
858
+
859
+ class CRPE(ImageBaseDataset):
860
+ TYPE = 'VQA'
861
+ DATASET_URL = {
862
+ 'CRPE_EXIST': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_EXIST.tsv',
863
+ 'CRPE_RELATION': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_RELATION.tsv'
864
+ }
865
+ DATASET_MD5 = {
866
+ 'CRPE_EXIST': '315584e23ac1ff7f8719ed3b7ad90f08',
867
+ 'CRPE_RELATION': 'bad7094cde0b572288f4b119c2d0c656'}
868
+
869
+ @classmethod
870
+ def evaluate(self, eval_file, **judge_kwargs):
871
+ from .utils.crpe import is_correct
872
+ # find-image, count-text, find-text,
873
+ # infer-choose, count-image, visual-reasoning
874
+ score = {
875
+ 'exist': 0,
876
+ 'subject': 0,
877
+ 'predicate': 0,
878
+ 'object': 0,
879
+ 'total': 0,
880
+ }
881
+ num = {
882
+ 'exist': 0,
883
+ 'subject': 0,
884
+ 'predicate': 0,
885
+ 'object': 0,
886
+ 'total': 0,
887
+ }
888
+ final_score_dict = {
889
+ 'exist': 0,
890
+ 'subject': 0,
891
+ 'predicate': 0,
892
+ 'object': 0,
893
+ 'total': 0,
894
+ }
895
+ data = load(eval_file)
896
+ lt = len(data)
897
+ lines = [data.iloc[i] for i in range(lt)]
898
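+ # Accumulate hit and total counts per category (exist/subject/predicate/object) and overall;
+ # the reported scores are hit ratios.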
+ for i in tqdm(range(len(lines))):
899
+ line = lines[i]
900
+ predict = str(line['prediction'])
901
+ answers = str(line['answer'])
902
+ # print("predict =", predict)
903
+ # print("answers =", answers)
904
+ category = line['category']
905
+ if is_correct(answers, predict):
906
+ score[category] += 1
907
+ score['total'] += 1
908
+ num[category] += 1
909
+ num['total'] += 1
910
+
911
+ for category in ['exist', 'subject', 'predicate', 'object', 'total']:
912
+ if num[category] != 0:
913
+ final_score_dict[category] = score[category] / num[category]
914
+ else:
915
+ final_score_dict[category] = None
916
+
917
+ score_pth = eval_file.replace('.xlsx', '_score.json')
918
+ dump(final_score_dict, score_pth)
919
+ return final_score_dict
920
+
921
+ def build_prompt(self, line):
922
+ ROOT = LMUDataRoot()
923
+ msgs = super().build_prompt(line)
924
+ for msg in msgs:
925
+ if msg['type'] == 'image':
926
+ msg['value'] = osp.join(osp.join(ROOT, 'images', self.dataset_name), msg['value'])
927
+ return msgs
928
+
929
+
930
+ class QSpatial(ImageBaseDataset):
931
+ TYPE = 'VQA'
932
+ DATASET_URL = {
933
+ 'QSpatial_plus': '',
934
+ 'QSpatial_scannet': ''
935
+ }
936
+
937
+ # NOTE: To evaluate Q-Spatial-ScanNet, you need to get the permission from ScanNet website
938
+ # Once you get the permission, you can use the helper code here to download and extract necessary images:
939
+ # https://github.com/andrewliao11/Q-Spatial-Bench-code?tab=readme-ov-file#for-qspatial_scannet
940
+ qspatial_root = "TO_BE_REPLACED_WITH_THE_PATH_TO_QSPATIAL_DATASET"
941
+ url = "https://raw.githubusercontent.com/andrewliao11/Q-Spatial-Bench-code/refs/heads/main/prompt_templates/"
942
+
943
+ def post_build(self, dataset):
944
+ # Download the prompt templates from github
945
+
946
+ links = [
947
+ self.url + "system_prompt.txt",
948
+ self.url + "spatial_prompt_single.txt",
949
+ self.url + "spatial_prompt_steps.txt",
950
+ self.url + "standard_prompt.txt",
951
+ self.url + "zero_shot_prompt.txt"
952
+ ]
953
+ with tempfile.TemporaryDirectory() as temp_dir:
954
+ for link in links:
955
+ tgt_path = os.path.join(temp_dir, link.split("/")[-1])
956
+ os.system(f"wget {link} -O {tgt_path}")
957
+
958
+ self.system_prompt = open(os.path.join(temp_dir, "system_prompt.txt")).read()
959
+ self._prompt_templates = dict(
960
+ spatial_prompt_single=open(os.path.join(temp_dir, "spatial_prompt_single.txt")).read(),
961
+ spatial_prompt_steps=open(os.path.join(temp_dir, "spatial_prompt_steps.txt")).read(),
962
+ standard_prompt=open(os.path.join(temp_dir, "standard_prompt.txt")).read(),
963
+ zero_shot_prompt=open(os.path.join(temp_dir, "zero_shot_prompt.txt")).read(),
964
+ )
965
+
966
+ # Given one data record, return the built prompt (a multi-modal message), can override
967
+ def build_prompt(self, line):
968
+
969
+ text_prompt_template = self._prompt_templates["spatial_prompt_single"]
970
+ env = SandboxedEnvironment()
971
+ text_prompt = env.from_string(text_prompt_template).render(question=line["question"])
972
+ tgt_path = self.dump_image(line)
973
+
974
+ msgs = []
975
+ if isinstance(tgt_path, list):
976
+ msgs.extend([dict(type='image', value=p) for p in tgt_path])
977
+ else:
978
+ msgs = [dict(type='image', value=tgt_path)]
979
+
980
+ msgs.append(dict(type='text', value=f"{self.system_prompt}\n{text_prompt}"))
981
+ return msgs
982
+
983
+ # Given the dataset name, return the dataset as a pandas dataframe, can override
984
+ def load_data(self, dataset):
985
+ import io
986
+ import pandas as pd
987
+ from datasets import load_dataset
988
+
989
+ hf_dataset = load_dataset("andrewliao11/Q-Spatial-Bench", split=dataset)
990
+ df = hf_dataset.to_pandas()
991
+
992
+ df.reset_index(drop=True, inplace=True)
993
+ df['index'] = df.index
994
+ df['answer'] = list(zip(df['answer_value'], df['answer_unit']))
995
+ df = df[['index'] + [col for col in df.columns if col != 'index']]
996
+
997
+ if dataset == "QSpatial_scannet":
998
+ df = df.drop(columns=["image"])
999
+ df["image"] = [Image.open(os.path.join(self.qspatial_root, image_path)) for image_path in df["image_path"]]
1000
+ else:
1001
+ df["image"] = [Image.open(io.BytesIO(image_dict["bytes"])) for image_dict in df["image"]]
1002
+
1003
+ df["image"] = [encode_image_to_base64(image) for image in df["image"]]
1004
+ return df
1005
+
1006
+ @classmethod
1007
+ def get_multiplier(self, unit):
1008
+
1009
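+ # Return how many centimetres one unit is (e.g. 'm' -> 100, 'ft' -> 30.48); unknown units map to 0.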
+ unit = unit.lower()
1010
+ if unit in ["meters", "meter", "m", "metre", "metres"]:
1011
+ multiplier = 100
1012
+ elif unit in ["centimeters", "centimeter", "cm"]:
1013
+ multiplier = 1
1014
+ elif unit in ["feet", "foot", "ft"]:
1015
+ multiplier = 30.48
1016
+ elif unit in ["inch", "inches", "in"]:
1017
+ multiplier = 2.54
1018
+ elif unit in ["mm"]:
1019
+ multiplier = 0.1
1020
+ else:
1021
+ print(f"Unknown unit: {unit}")
1022
+ multiplier = 0.
1023
+
1024
+ return multiplier
1025
+
1026
+ @classmethod
1027
+ def parse_string(self, input_str):
1028
+ # Regular expression to match the pattern (number or range, text)
1029
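+ # e.g. "(1.5-2.5, meters)" -> midpoint 2.0 * 100 = 200.0 (value in centimetres)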
+ match = re.match(r'\(([\d.-]+), (.+)\)', input_str)
1030
+ if match:
1031
+ number_part = match.group(1)
1032
+ text = match.group(2)
1033
+
1034
+ if '-' in number_part:
1035
+ start, end = map(float, number_part.split('-'))
1036
+ number = (start + end) / 2
1037
+ else:
1038
+ number = float(number_part)
1039
+
1040
+ return number * self.get_multiplier(text)
1041
+ else:
1042
+ print(f"Unable to parse the input string {input_str}")
1043
+ return 0
1044
+
1045
+ @classmethod
1046
+ def parse_prediction(self, vlm_response):
1047
+ # Value
1048
+ pattern = r'scalar{([^}]*)}'
1049
+ str_inside_scalar_boxes = re.findall(pattern, vlm_response)[-1]
1050
+ scalar_list = re.findall(r'\d+\.?\d*', str_inside_scalar_boxes)
1051
+ parsed_scalar = np.array(scalar_list).astype(float).mean()
1052
+
1053
+ # Unit
1054
+ pattern = r'distance_unit{([^}]*)}'
1055
+ str_inside_unit_boxes = re.findall(pattern, vlm_response)
1056
+ parsed_unit = str_inside_unit_boxes[-1]
1057
+
1058
+ pred_value_in_cms = parsed_scalar * self.get_multiplier(parsed_unit)
1059
+ return pred_value_in_cms
1060
+
1061
+ # It returns a dictionary
1062
+ @classmethod
1063
+ def evaluate(self, eval_file, **judge_kwargs):
1064
+
1065
+ data = load(eval_file)
1066
+ if "model" in judge_kwargs:
1067
+ from .utils.qspatial import QSpatial_auxeval
1068
+
1069
+ # extract using model
1070
+ model = judge_kwargs['model']
1071
+ suffix = eval_file.split('.')[-1]
1072
+ storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx')
1073
+ tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl')
1074
+ nproc = judge_kwargs.pop('nproc', 4)
1075
+
1076
+ if not osp.exists(storage):
1077
+ model = build_judge(max_tokens=128, **judge_kwargs)
1078
+
1079
+ assert model.working(), ('Evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE)
1080
+ lt = len(data)
1081
+ lines = [data.iloc[i] for i in range(lt)]
1082
+ tups = [(model, line) for line in lines]
1083
+ indices = [line['index'] for line in lines]
1084
+
1085
+ ans = {}
1086
+ if osp.exists(tmp_file):
1087
+ ans = load(tmp_file)
1088
+ tups = [x for x, i in zip(tups, indices) if i not in ans]
1089
+ indices = [i for i in indices if i not in ans]
1090
+
1091
+ if len(indices):
1092
+ new_results = track_progress_rich(
1093
+ QSpatial_auxeval,
1094
+ tups,
1095
+ nproc=nproc,
1096
+ chunksize=nproc,
1097
+ keys=indices,
1098
+ save=tmp_file,
1099
+ )
1100
+ ans = load(tmp_file)
1101
+ for k, v in zip(indices, new_results):
1102
+ assert k in ans
1103
+ assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res']
1104
+
1105
+ data['res'] = [ans[idx]['res'] for idx in data['index']]
1106
+ data['log'] = [ans[idx]['log'] for idx in data['index']]
1107
+ dump(data, storage)
1108
+
1109
+ data = load(storage)
1110
+
1111
+ pred_value_in_cms = []
1112
+ for res in data["res"]:
1113
+ try:
1114
+ pred_value_in_cms.append(self.parse_string(res))
1115
+ except ValueError:
1116
+ pred_value_in_cms.append(0.)
1117
+
1118
+ pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8
1119
+ else:
1120
+ # regex parsing
1121
+ pred_value_in_cms = []
1122
+ n_errors_in_parsing = 0
1123
+ for pred in data["prediction"]:
1124
+ try:
1125
+ parsed_value = self.parse_prediction(pred)
1126
+ except IndexError:
1127
+ n_errors_in_parsing += 1
1128
+ parsed_value = 1e-8
1129
+
1130
+ pred_value_in_cms.append(parsed_value)
1131
+
1132
+ print(f"Encounter {n_errors_in_parsing} errors in parsing")
1133
+ pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8
1134
+
1135
+ # Ground truth
1136
+ ground_truth_value_in_cms = []
1137
+ for answer in data["answer"]:
1138
+ value, unit = eval(answer)
1139
+ ground_truth_value_in_cms.append(value * self.get_multiplier(unit))
1140
+ ground_truth_value_in_cms = np.array(ground_truth_value_in_cms) + 1e-8
1141
+
1142
+ # Calculate the score
1143
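+ # delta_2 / delta_1.5: a prediction counts as correct when max(pred/gt, gt/pred) is below 2 (resp. 1.5).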
+ pred_gt = pred_value_in_cms / ground_truth_value_in_cms
1144
+ gt_pred = ground_truth_value_in_cms / pred_value_in_cms
1145
+ delta_2 = np.stack([pred_gt, gt_pred]).max(0) < 2.
1146
+ delta_1_point_5 = np.stack([pred_gt, gt_pred]).max(0) < 1.5
1147
+
1148
+ data["eval_score_delta_2"] = delta_2
1149
+ data["eval_score_delta_1_point_5"] = delta_1_point_5
1150
+
1151
+ final_score_dict = {
1152
+ "delta_2": delta_2.mean(),
1153
+ "delta_1_point_5": delta_1_point_5.mean()
1154
+ }
1155
+ for question_type in set(data["question_type"]):
1156
+ filtered_data = data[data["question_type"] == question_type]
1157
+ delta_2_per_question_type = filtered_data["eval_score_delta_2"].mean()
1158
+ delta_1_point_5_per_question_type = filtered_data["eval_score_delta_1_point_5"].mean()
1159
+ final_score_dict.update({f"{question_type}_delta_2": delta_2_per_question_type})
1160
+ final_score_dict.update({f"{question_type}_delta_1_point_5": delta_1_point_5_per_question_type})
1161
+
1162
+ score_pth = eval_file.replace('.xlsx', '_score.json')
1163
+ dump(final_score_dict, score_pth)
1164
+ return final_score_dict
1165
+
1166
+
1167
+ class MMNIAH(ImageBaseDataset):
1168
+ TYPE = 'VQA'
1169
+ DATASET_URL = {
1170
+ 'MM_NIAH_VAL':
1171
+ 'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/MM_NIAH_VAL.tsv',
1172
+ 'MM_NIAH_TEST':
1173
+ ['https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-aa',
1174
+ 'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ab',
1175
+ 'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ac',
1176
+ 'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ad',
1177
+ 'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ae']}
1178
+ DATASET_MD5 = {'MM_NIAH_VAL': '27e5a8c3cef7746cb38f89cd86c474c5',
1179
+ 'MM_NIAH_TEST': 'f490eb2a43096307465fe9e7ef13497c'}
1180
+
1181
+ def prepare_tsv(self, url, file_md5=None):
1182
+ import os
1183
+ data_root = LMUDataRoot()
1184
+ os.makedirs(data_root, exist_ok=True)
1185
+ update_flag = False
1186
+ file_name = 'MM_NIAH_VAL.tsv' if 'MM_NIAH_VAL' in url else 'MM_NIAH_TEST.tsv'
1187
+ data_path = osp.join(data_root, file_name)
1188
+ if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
1189
+ pass
1190
+ elif file_name == 'MM_NIAH_TEST.tsv':
1191
+ warnings.warn('The dataset tsv is not downloaded')
1192
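+ # MM_NIAH_TEST is shipped as five binary chunks (part-aa ... part-ae); download any missing chunk,
+ # then concatenate them into the final tsv below.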
+ for i in range(len(url)):
1193
+ if osp.exists(osp.join(data_root, 'part-a' + chr(ord('a') + i))):
1194
+ print('part-a' + chr(ord('a') + i) + ' already exists')
1195
+ continue
1196
+ download_file(url[i], data_path)
1197
+ file_prefix = 'part-'
1198
+ output_file = data_path
1199
+ split_files = sorted([f for f in os.listdir(data_root) if f.startswith(file_prefix)])
1200
+ with open(output_file, 'wb') as outfile:
1201
+ # Read each split file in turn and append its contents to the output file
1202
+ for filename in split_files:
1203
+ with open(osp.join(data_root, filename), 'rb') as infile:
1204
+ outfile.write(infile.read())
1205
+ update_flag = True
1206
+ else:
1207
+ warnings.warn('The dataset tsv is not downloaded')
1208
+ download_file(url, data_path)
1209
+ update_flag = True
1210
+
1211
+ if file_size(data_path, 'GB') > 1:
1212
+ local_path = data_path.replace('.tsv', '_local.tsv')
1213
+ if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
1214
+ from ..tools import LOCALIZE
1215
+ LOCALIZE(data_path, local_path)
1216
+ data_path = local_path
1217
+ return load(data_path)
1218
+
1219
+ @classmethod
1220
+ def evaluate(self, eval_file, **judge_kwargs):
1221
+ from .utils.mmniah import is_correct
1222
+ # find-image, count-text, find-text,
1223
+ # infer-choose, count-image, visual-reasoning
1224
+ MMNIAH_score = {
1225
+ 'count-text': 0,
1226
+ 'find-image': 0,
1227
+ 'find-text': 0,
1228
+ 'infer-choose': 0,
1229
+ 'count-image': 0,
1230
+ 'visual-reasoning': 0,
1231
+ 'total': 0,
1232
+ }
1233
+ MMNIAH_num = {
1234
+ 'count-text': 0,
1235
+ 'find-image': 0,
1236
+ 'find-text': 0,
1237
+ 'infer-choose': 0,
1238
+ 'count-image': 0,
1239
+ 'visual-reasoning': 0,
1240
+ 'total': 0,
1241
+ }
1242
+ final_score_dict = {
1243
+ 'count-text': 0,
1244
+ 'find-image': 0,
1245
+ 'find-text': 0,
1246
+ 'infer-choose': 0,
1247
+ 'count-image': 0,
1248
+ 'visual-reasoning': 0,
1249
+ 'total': 0,
1250
+ }
1251
+ data = load(eval_file)
1252
+ lt = len(data)
1253
+ lines = [data.iloc[i] for i in range(lt)]
1254
+ for i in tqdm(range(len(lines))):
1255
+ line = lines[i]
1256
+ predict = line['prediction']
1257
+ answers = line['answer']
1258
+ category = line['category']
1259
+ if category in ['visual-reasoning', 'find-image']:
1260
+ answers = int(answers)
1261
+ if is_correct(answers, predict):
1262
+ MMNIAH_score[category] += 1
1263
+ MMNIAH_score['total'] += 1
1264
+ MMNIAH_num[category] += 1
1265
+ MMNIAH_num['total'] += 1
1266
+
1267
+ for category in ['find-image', 'count-text', 'find-text',
1268
+ 'infer-choose', 'count-image', 'visual-reasoning', 'total']:
1269
+ if MMNIAH_num[category] != 0:
1270
+ final_score_dict[category] = MMNIAH_score[category] / MMNIAH_num[category]
1271
+ else:
1272
+ final_score_dict[category] = None
1273
+
1274
+ score_pth = eval_file.replace('.xlsx', '_score.json')
1275
+ dump(final_score_dict, score_pth)
1276
+ return final_score_dict
1277
+
1278
+ def build_prompt(self, line):
1279
+ msgs = super().build_prompt(line)
1280
+ if isinstance(line, int):
1281
+ line = self.data.iloc[line]
1282
+ totalchoice = line['multi-choice options']
1283
+ totalchoice = eval(totalchoice)
1284
+ # find-image, count-text, find-text,
1285
+ # infer-choose, count-image, visual-reasoning
1286
+ context = msgs[-1]['value']
1287
+ context = eval(context)
1288
+ question = context[0] + '\n' + context[1]
1289
+ # tgt_path is the list of all image file paths
1290
+ tgt_path = []
1291
+ for i in range(len(msgs) - 1):
1292
+ tgt_path.append(msgs[i]['value'])
1293
+ choices = totalchoice[0]
1294
+ choices_image = totalchoice[1]
1295
+ if choices:
1296
+ for c_idx, c in enumerate(choices):
1297
+ question = f"{question}\n{chr(c_idx + ord('A'))}. {c}"
1298
+ question += "\nAnswer with the option's letter from the given choices directly."
1299
+ elif choices_image:
1300
+ for c_idx in range(len(choices_image)):
1301
+ question = f"{question}\n{chr(c_idx + ord('A'))}. <image>"
1302
+ question += "\nAnswer with the option's letter from the given choices directly."
1303
+ else:
1304
+ question += '\nAnswer the question using a single word or phrase.'
1305
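+ # Wrap with sentinels so splitting on '<image>' keeps leading/trailing text;
+ # '<start>' (7 chars) and '<end>' (5 chars) are stripped again further down.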
+ question = '<start>' + question + '<end>'
1306
+ question = question.split('<image>')
1307
+ if choices_image:
1308
+ for i in range(len(question) - 5):
1309
+ question[i] = question[i] + '\n<image>'
1310
+ for i in range(len(question) - 5, len(question) - 1):
1311
+ question[i] = question[i] + '<image>'
1312
+ else:
1313
+ for i in range(len(question) - 1):
1314
+ question[i] = question[i] + '\n<image>'
1315
+ assert len(tgt_path) + 1 == len(question)
1316
+ context = []
1317
+ for i in range(len(tgt_path)):
1318
+ context.append(question[i])
1319
+ context.append(tgt_path[i])
1320
+ context.append(question[-1])
1321
+ context[0] = context[0][7:]
1322
+ context[-1] = context[-1][:-5]
1323
+ msgs = []
1324
+ for i in range(len(context)):
1325
+ if i % 2 == 0:
1326
+ msgs.append(dict(type='text', value=context[i]))
1327
+ else:
1328
+ ROOT = LMUDataRoot()
1329
+ msgs.append(dict(type='image', value=osp.join(osp.join(ROOT, 'images', self.dataset_name), context[i])))
1330
+ # Drop empty text segments; build a new list rather than removing items while iterating
+ msgs = [m for m in msgs if m['value'] != '']
1333
+ return msgs
vlmeval/VLMEvalKit_old/vlmeval/dataset/image_yorn.py ADDED
@@ -0,0 +1,95 @@
1
+ from ..smp import *
2
+ from ..utils import *
3
+ from .image_base import ImageBaseDataset
4
+ from .utils import build_judge, DEBUG_MESSAGE
5
+
6
+
7
+ class ImageYORNDataset(ImageBaseDataset):
8
+
9
+ TYPE = 'Y/N'
10
+
11
+ DATASET_URL = {
12
+ 'MME': 'https://opencompass.openxlab.space/utils/VLMEval/MME.tsv',
13
+ 'HallusionBench': 'https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv',
14
+ 'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv',
15
+ 'AMBER': 'https://huggingface.co/datasets/yifanzhang114/AMBER_base64/resolve/main/AMBER.tsv',
16
+ }
17
+
18
+ DATASET_MD5 = {
19
+ 'MME': 'b36b43c3f09801f5d368627fb92187c3',
20
+ 'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c',
21
+ 'POPE': 'c12f5acb142f2ef1f85a26ba2fbe41d5',
22
+ 'AMBER': '970d94c0410916166e0a76ba75da7934',
23
+ }
24
+
25
+ # It returns a dataframe
26
+ def evaluate(self, eval_file, **judge_kwargs):
27
+ from .utils.yorn import YOrN_Extraction, YOrN_auxeval
28
+ from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating, AMBER_rating
29
+
30
+ dataset = self.dataset_name
31
+ data = load(eval_file)
32
+ data['prediction'] = [str(x) for x in data['prediction']]
33
+ storage = eval_file.replace('.xlsx', '_auxmatch.xlsx')
34
+ tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
35
+ nproc = judge_kwargs.pop('nproc', 4)
36
+
37
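+ # Rule-based Yes/No extraction first; entries still 'Unknown' are re-judged with an LLM
+ # when a judge model is configured and working.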
+ if not osp.exists(storage):
38
+ ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])}
39
+ if osp.exists(tmp_file):
40
+ tmp = load(tmp_file)
41
+ for k in tmp:
42
+ if ans_map[k] == 'Unknown' and tmp[k] != 'Unknown':
43
+ ans_map[k] = tmp[k]
44
+
45
+ data['extracted'] = [ans_map[x] for x in data['index']]
46
+ unknown = data[data['extracted'] == 'Unknown']
47
+
48
+ model = judge_kwargs.get('model', 'exact_matching')
49
+ if model == 'exact_matching':
50
+ model = None
51
+ elif gpt_key_set():
52
+ model = build_judge(**judge_kwargs)
53
+ if not model.working():
54
+ warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
55
+ warnings.warn(DEBUG_MESSAGE)
56
+ model = None
57
+ else:
58
+ model = None
59
+ warnings.warn('OPENAI_API_KEY is not working properly, will use exact matching for evaluation')
60
+
61
+ if model is not None:
62
+ lt = len(unknown)
63
+ lines = [unknown.iloc[i] for i in range(lt)]
64
+ tups = [(model, line) for line in lines]
65
+ indices = list(unknown['index'])
66
+ if len(tups):
67
+ res = track_progress_rich(
68
+ YOrN_auxeval, tups, nproc=nproc, chunksize=nproc, keys=indices, save=tmp_file)
69
+ for k, v in zip(indices, res):
70
+ ans_map[k] = v
71
+
72
+ data['extracted'] = [ans_map[x] for x in data['index']]
73
+ dump(data, storage)
74
+
75
+ data = load(storage)
76
+ if listinstr(['AMBER'], dataset):
77
+ data['score'] = (data['answer'].str.lower() == data['extracted'].str.lower())
78
+ else:
79
+ data['score'] = (data['answer'] == data['extracted'])
80
+ dump(data, storage)
81
+
82
+ if dataset is not None and listinstr(['MME'], dataset):
83
+ score = MME_rating(storage)
84
+ elif dataset is not None and listinstr(['Hallusion'], dataset):
85
+ score = Hallusion_rating(storage)
86
+ elif dataset is not None and listinstr(['POPE'], dataset):
87
+ score = POPE_rating(storage)
88
+ elif dataset is not None and listinstr(['AMBER'], dataset):
89
+ score = AMBER_rating(storage)
90
+ else:
91
+ score = default_rating(storage)
92
+
93
+ score_tgt = eval_file.replace('.xlsx', '_score.csv')
94
+ dump(score, score_tgt)
95
+ return score
vlmeval/VLMEvalKit_old/vlmeval/dataset/miabench.py ADDED
@@ -0,0 +1,167 @@
1
+ import json
2
+ import os
3
+
4
+ import pandas as pd
5
+
6
+ from .image_base import ImageBaseDataset
7
+ from ..smp import *
8
+ from .utils import build_judge, DEBUG_MESSAGE
9
+ from ..utils import track_progress_rich
10
+
11
+
12
+ def generate_prompt(d):
13
+ question = d['question']
14
+ weights = eval(d['component_weight'])
15
+ components = eval(d['components'])
16
+ num_of_component = int(d['num_of_component'])
17
+ response = d['prediction']
18
+
19
+ if num_of_component == 1:
20
+ components = f"The first component is: '{components[0]}'. "
21
+ score = f"The first component is worth: {weights[0]} scores. "
22
+ elif num_of_component == 2:
23
+ components = f"The first component is: '{components[0]}', and the second component is '{components[1]}'. "
24
+ score = f"The first and second component is each worth {weights[0]} and {weights[1]} scores. "
25
+ elif num_of_component == 3:
26
+ components = (
27
+ f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
28
+ f"and the third component is '{components[2]}'. "
29
+ )
30
+ score = (
31
+ "The first, second, and third component is each worth "
32
+ f"{weights[0]}, {weights[1]}, and {weights[2]} scores."
33
+ )
34
+ elif num_of_component == 4:
35
+ components = (
36
+ f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
37
+ f"and the third component is '{components[2]}', and the fourth component is '{components[3]}'. "
38
+ )
39
+ score = (
40
+ "The first, second, third, and fourth component is each worth "
41
+ f"{weights[0]}, {weights[1]}, {weights[2]}, and {weights[3]} scores."
42
+ )
43
+ elif num_of_component == 5:
44
+ components = (
45
+ f"The first component is: '{components[0]}', and the second component is '{components[1]}', "
46
+ f"and the third component is '{components[2]}', and the fourth component is '{components[3]}', "
47
+ f"and the fifth component is '{components[4]}'. "
48
+ )
49
+ score = (
50
+ "The first, second, third, fourth, and fifth component is each worth "
51
+ f"{weights[0]}, {weights[1]}, {weights[2]}, {weights[3]}, and {weights[4]} scores."
52
+ )
53
+
54
+ return (
55
+ "Here is an instruction for a multimodal LLM: '"
56
+ f"{question}"
57
+ "'. You need to grade if the response from the model follows each component of the instruction. "
58
+ f"{components}"
59
+ "The response is: '"
60
+ f"{response}"
61
+ "'. You need to score the response and be strict. The total score ranges from 0 to 10, "
62
+ "depending on if the response follows the instruction. "
63
+ f"{score}"
64
+ "List scores of each component, and the total score in one sentence in this format: "
65
+ "score of component 1: x/2, score of component 2: y/8, total score: z/10. Then explain your reasons."
66
+ )
67
+
68
+
69
+ def process_rawscore(component_type, raw_score):
70
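+ # Parse judge output of the form 'score of component 1: x/2, ..., total score: z/10' into fractional scores.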
+ first_sentence = raw_score.split('.')[0].split(',')
71
+ score_dict = {}
72
+ for i in range(len(first_sentence) - 1):
73
+ score_ = first_sentence[i].split(':')[1][1:].split('/')
74
+ score = int(score_[0]) / int(score_[1])
75
+ score_dict[component_type[i]] = score
76
+ total_score_ = first_sentence[i + 1].split(':')[1][1:].split('/')
77
+ total_score = int(total_score_[0]) / int(total_score_[1])
78
+ score_dict['total_score'] = total_score
79
+ return score_dict
80
+
81
+
82
+ def get_score_dict(data, score_raw):
83
+ cat_score_dict = {}
84
+ for i in range(len(data)):
85
+ try:
86
+ cmp = data['component_type'][i][2:-2]
87
+ cmp_list = cmp.split('\', \'')
88
+ score_dict = process_rawscore(cmp_list, score_raw[i])
89
+ for key, val in score_dict.items():
90
+ if key not in cat_score_dict.keys():
91
+ cat_score_dict[key] = [val]
92
+ else:
93
+ cat_score_dict[key].append(val)
94
+ except:
95
+ pass
96
+ cat_score_dict_average = {}
97
+ for key, val in cat_score_dict.items():
98
+ cat_score_dict_average[key] = sum(val) / len(val)
99
+ return cat_score_dict_average
100
+
101
+
102
+ class MIABench(ImageBaseDataset):
103
+ TYPE = 'VQA'
104
+
105
+ DATASET_URL = {
106
+ 'MIA-Bench': 'https://opencompass.openxlab.space/utils/VLMEval/Mia-Bench.tsv',
107
+ }
108
+ DATASET_MD5 = {
109
+ 'MIA-Bench': '0b9de595f4dd40af18a69b94d89aba82',
110
+ }
111
+
112
+ @classmethod
113
+ def evaluate(self, eval_file, **judge_kwargs):
114
+ judge_name = judge_kwargs.pop('model', 'gpt-4o')
115
+
116
+ model = build_judge(model=judge_name, **judge_kwargs)
117
+ suffix = eval_file.split('.')[-1]
118
+
119
+ storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx') # noqa: F841
120
+ tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl') # noqa: F841
121
+ nproc = judge_kwargs.pop('nproc', 4) # noqa: F841
122
+
123
+ if not osp.exists(storage):
124
+ data = load(eval_file)
125
+ num_samples = len(data)
126
+ lines = [data.loc[i] for i in range(num_samples)]
127
+ prompts = [generate_prompt(line) for line in lines]
128
+ org_data = MIABench('MIA-Bench').data
129
+ img_map = {x: y for x, y in zip(org_data['index'], org_data['image'])}
130
+ image_b64 = [img_map[idx] for idx in data['index']]
131
+ indices = list(data['index'])
132
+ mm_messages = [
133
+ dict(message=[
134
+ dict(type='text', value=prompt),
135
+ dict(type='image', value=f'data:image/jpeg;base64,{b64}')
136
+ ])
137
+ for prompt, b64 in zip(prompts, image_b64)
138
+ ]
139
+
140
+ res = {}
141
+ if osp.exists(tmp_file):
142
+ res = load(tmp_file)
143
+
144
+ jobs = {k: v for k, v in zip(indices, mm_messages) if k not in res}
145
+ job_keys = list(jobs.keys())
146
+ job_vals = [jobs[k] for k in job_keys]
147
+
148
+ resps = track_progress_rich(
149
+ model.generate,
150
+ job_vals,
151
+ nproc=nproc,
152
+ chunksize=nproc,
153
+ keys=job_keys,
154
+ save=tmp_file,
155
+ )
156
+ for k, resp in zip(job_keys, resps):
157
+ res[k] = resp
158
+ data['score_raw'] = [res[idx] for idx in indices]
159
+ dump(data, storage)
160
+
161
+ goresult = load(storage)
162
+ results = get_score_dict(goresult, goresult['score_raw'])
163
+ result_pth = storage.replace('.xlsx', '_score.csv')
164
+ results_pd = pd.DataFrame.from_dict(list(results.items()))
165
+ dump(results_pd, result_pth)
166
+
167
+ return results
vlmeval/VLMEvalKit_old/vlmeval/dataset/mmmath.py ADDED
@@ -0,0 +1,446 @@
1
+ import re
2
+ import json
3
+ import sympy as sp
4
+ import numpy as np
5
+ from sympy import simplify, Eq, sympify, Pow, pi
6
+ from sympy.parsing.latex import parse_latex
7
+ import sys
8
+ import math
9
+ import os
10
+ import argparse
11
+
12
+ from .image_base import ImageBaseDataset
13
+ from ..utils import track_progress_rich
14
+ from ..smp import load, dump
15
+
16
+
17
+ class AutoScoringJudge:
18
+ def __init__(self):
19
+ # Map of special symbols to their replacements
20
+ self.special_signal_map = {
21
+ "\\left": "",
22
+ "\\right": "",
23
+ "厘米":"",
24
+ # "∶": ":",
25
+ ",": ",",
26
+ "$": "",
27
+ "(":"(",
28
+ ")":")",
29
+ "\\infty":"oo",
30
+ "\\colon ":":",
31
+ # "\\approx": "=",
32
+ # "\\simeq": "=",
33
+ # "\\sim": "=",
34
+ # "^\\prime": "'",
35
+ # "^{\\prime}": "'",
36
+ "+":"+",
37
+ "\\, ": "",
38
+ "\\,":"",
39
+ "^\\circ": "",
40
+ "^{\\circ}": "",
41
+ # "%": "",
42
+ }
43
+ self.pi = parse_latex("\\pi")
44
+ # MM-Math default precision
45
+ self.precision = 1e-2
46
+
47
+ def trans_greater_sign_to_interval(self, expr:str):
48
+ expr_tmp = expr.split("<")
49
+ return "(" + expr_tmp[0] + ", " + expr_tmp[-1] + ")"
50
+
51
+ def split_by_comma(self, expr: str):
52
+ # Splits expressions by commas outside of brackets
53
+ in_bracket_num = 0
54
+ splitted_expr = []
55
+ start_idx = 0
56
+ for i, char in enumerate(expr):
57
+ if char in ["(", "["]:
58
+ in_bracket_num += 1
59
+ elif char in [")", "]"]:
60
+ in_bracket_num -= 1
61
+ elif char == "," and in_bracket_num == 0:
62
+ splitted_expr.append(expr[start_idx:i].strip())
63
+ start_idx = i + 1
64
+
65
+ if start_idx < len(expr):
66
+ splitted_expr.append(expr[start_idx:].strip())
67
+
68
+ return splitted_expr
69
+
70
+ def trans_plus_minus_sign(self, expr_list: list):
71
+ # Translates plus-minus signs into separate expressions
72
+ new_expr_list = []
73
+ for expr in expr_list:
74
+ if "\\pm" in expr:
75
+ new_expr_list.append(expr.replace("\\pm", "+"))
76
+ new_expr_list.append(expr.replace("\\pm", "-"))
77
+ else:
78
+ new_expr_list.append(expr)
79
+
80
+ return new_expr_list
81
+
82
+ def judge(self, expression1, expression2, precision=1e-2):
83
+ # Judge if two expressions are equal (expression1 is considered as the Ground Truth)
84
+ # Default precision is a list for supporting multiple expressions
85
+ precision = precision if isinstance(precision, list) else [precision]
86
+
87
+ try:
88
+ expression1, expression2 = self.preprocess(expression1, expression2)
89
+ except:
90
+ return False
91
+ if expression1 == expression2:
92
+ # print("Exactly equal")
93
+ return True
94
+
95
+ # Remove Chinese characters from the string, as answers like "yes" or "no" in Chinese have been considered
96
+ expression1 = expression1 if re.fullmatch(r"[\u4e00-\u9fff]+", expression1) else re.sub(r'[\u4e00-\u9fff]+', '', expression1) # noqa: E501
97
+ expression2 = expression2 if re.fullmatch(r'[\u4e00-\u9fff]+', expression2) else re.sub(r'[\u4e00-\u9fff]+', '', expression2) # noqa: E501
98
+ # Check if two < or > in expression
99
+ if self.is_two_greater_sign(expression1):
100
+ expression1 = self.trans_greater_sign_to_interval(expression1)
101
+
102
+ if self.is_two_greater_sign(expression2):
103
+ expression2 = self.trans_greater_sign_to_interval(expression2)
104
+
105
+ expression1 = self.split_by_comma(expression1)
106
+ expression2 = self.split_by_comma(expression2)
107
+
108
+ temp_list1 = self.trans_plus_minus_sign(expression1)
109
+ temp_list2 = self.trans_plus_minus_sign(expression2)
110
+
111
+ # Set up a list for allowed errors
112
+ if len(precision) <= 1:
113
+ precision = precision * len(temp_list1)
114
+
115
+ if len(temp_list1) != len(temp_list2):
116
+ return False
117
+
118
+ # Check if elements in both lists can be paired and are equal
119
+ idx = -1
120
+ while len(temp_list1) != 0:
121
+ idx = (idx + 1) % len(temp_list1)
122
+
123
+ item1 = temp_list1[idx]
124
+ self.precision = precision[idx]
125
+
126
+ for item2 in temp_list2:
127
+ if self.is_equal(item1, item2):
128
+ temp_list1.remove(item1)
129
+ temp_list2.remove(item2)
130
+ precision.remove(self.precision)
131
+ break
132
+ else:
133
+ # If no match was found, return False
134
+ return False
135
+
136
+ # If all elements are matched, return True
137
+ return True
138
+
139
+ def is_interval(self, expr):
140
+ # Checks if an expression is an interval
141
+ return expr.startswith(("(", "[")) and expr.endswith((")", "]"))
142
+
143
+ def is_two_greater_sign(self, expr):
144
+ match = re.findall(r'<', expr)
145
+ return len(match) == 2
146
+
147
+ def sympy_sub_pi(self, expression_sympy):
148
+ # Replaces the symbol for pi in sympy expressions with its numerical value
149
+ return expression_sympy.subs(self.pi, math.pi)
150
+
151
+ def is_equal(self, expression1, expression2):
152
+ # Default first expression is ground truth. Check if expressions are equal in different aspects
153
+ if expression1 == expression2 and expression1 != "" and expression2 != "":
154
+ # print("Equivalent natively")
155
+ return True
156
+
157
+ # First check if both are intervals
158
+ if self.is_interval(expression1) and self.is_interval(expression2):
159
+ try:
160
+ if self.interval_equal(expression1, expression2):
161
+ # print("Interval equivalent")
162
+ return True
163
+ except:
164
+ return False
165
+
166
+ # Then check for numerical equality
167
+ try:
168
+ if self.numerical_equal(expression1, expression2):
169
+ # print("Numerically equivalent")
170
+ return True
171
+ except:
172
+ pass
173
+ # Then check if expressions are mathematically equal
174
+ try:
175
+ if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2):
176
+ # print("Expression equivalent")
177
+ return True
178
+ except:
179
+ pass
180
+
181
+ # Lastly, check for equation equality
182
+ try:
183
+ if self.equation_equal(expression1, expression2):
184
+ # print("Equation equivalent")
185
+ return True
186
+ except:
187
+ pass
188
+
189
+ return False
190
+
191
+ def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True):
192
+ # Check if two numerical values are equal within an allowed error range
193
+ # Includes possible percentage cases
194
+ reference = float(expression1)
195
+ prediction = float(expression2)
196
+
197
+ if include_percentage:
198
+ gt_result = [reference / 100, reference, reference * 100]
199
+ else:
200
+ gt_result = [reference]
201
+
202
+ for item in gt_result:
203
+ if abs(item - prediction) <= self.precision * 1.01:
204
+ return True
205
+ return False
206
+
207
+ def expression_equal(self, exp1, exp2):
208
+ # Check if two expressions are mathematically equivalent
209
+ # Extract expression and use sympy for equivalence checking
210
+ def extract_expression(expression):
211
+ if "=" in expression:
212
+ expression = expression.split("=")[1]
213
+ return expression.strip()
214
+
215
+ exp1 = extract_expression(exp1)
216
+ exp2 = extract_expression(exp2)
217
+
218
+ exp_too_long = len(exp1) > 300 or len(exp2) > 300
219
+
220
+ expr1_sym = sympify(parse_latex(exp1))
221
+ expr2_sym = sympify(parse_latex(exp2))
222
+ if expr1_sym == expr2_sym:
223
+ return True
224
+ else:
225
+ expr1_sym = self.sympy_sub_pi(expr1_sym)
226
+ expr2_sym = self.sympy_sub_pi(expr2_sym)
227
+
228
+ if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or \
229
+ (not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)):
230
+ return False
231
+ elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol):
232
+ try:
233
+ if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)):
234
+ print("These two numbers cannot be calculated by the current computer for: "
235
+ f"\"{str(expr1_sym)}\" and \"{str(expr2_sym)}\"")
236
+ return False
237
+ if exp_too_long:
238
+ print(f'Expression {exp1} or {exp2} is too long to compute. ')
239
+ return False
240
+ if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01:
241
+ return True
242
+ else:
243
+ return False
244
+ except:
245
+ return False
246
+ elif exp_too_long:
247
+ print(f'Expression {exp1} or {exp2} is too long to compute. ')
248
+ return False
249
+ else:
250
+ try:
251
+ simplified_expr = simplify(expr1_sym - expr2_sym)
252
+ num_value = simplified_expr.evalf()
253
+ return abs(num_value) < 1e-3
254
+ except:
255
+ return False
256
+
257
+ def equation_equal(self, expression1, expression2):
258
+ # Check if two equations are mathematically equivalent
259
+ # Simplify equations and use sympy for equivalence checking
260
+ def simplify_equation(latex_eq):
261
+ lhs, rhs = latex_eq.split('=')
262
+
263
+ lhs_expr = parse_latex(lhs)
264
+ rhs_expr = parse_latex(rhs)
265
+
266
+ equation = Eq(lhs_expr, rhs_expr)
267
+
268
+ simplified_eq = simplify(equation.lhs - equation.rhs)
269
+
270
+ return simplified_eq
271
+
272
+ expr1_sym = simplify_equation(expression1)
273
+ expr2_sym = simplify_equation(expression2)
274
+
275
+ division_result_1 = simplify(expr1_sym / expr2_sym)
276
+ division_result_2 = simplify(expr2_sym / expr1_sym)
277
+
278
+ if ((division_result_1.is_Integer and division_result_1 != 0) or # noqa: W504
279
+ (division_result_2.is_Integer and division_result_2 != 0)):
280
+ return True
281
+ else:
282
+ return False
283
+
284
+ def interval_equal(self, expression1, expression2):
285
+ # Check if two intervals are mathematically equivalent
286
+ def compare_two_interval(inter1, inter2):
287
+ if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]:
288
+ return False
289
+
290
+ inter1 = inter1.strip('[]()')
291
+ inter2 = inter2.strip('[]()')
292
+
293
+ items_1 = inter1.split(',')
294
+ items_2 = inter2.split(',')
295
+
296
+ for item_1, item_2 in zip(items_1, items_2):
297
+ if not self.expression_equal(item_1, item_2):
298
+ return False
299
+ return True
300
+
301
+ interval1 = expression1
302
+ interval2 = expression2
303
+
304
+ if interval1 == interval2:
305
+ return True
306
+ else:
307
+ inter_list1 = interval1.split("\\cup")
308
+ inter_list2 = interval2.split("\\cup")
309
+
310
+ if len(inter_list1) != len(inter_list2):
311
+ return False
312
+ else:
313
+ for inter1, inter2 in zip(inter_list1, inter_list2):
314
+ if not compare_two_interval(inter1, inter2):
315
+ return False
316
+ return True
317
+
318
+ def preprocess(self, expression1, expression2):
319
+ # Preprocess expressions to extract and replace special symbols
320
+ def extract_boxed_content(latex_str):
321
+ boxed_matches = re.finditer(r'\\boxed{', latex_str)
322
+ results = ""
323
+
324
+ for match in boxed_matches:
325
+ start_index = match.end()
326
+ end_index = start_index
327
+ stack = 1
328
+
329
+ while stack > 0 and end_index < len(latex_str):
330
+ if latex_str[end_index] == '{':
331
+ stack += 1
332
+ elif latex_str[end_index] == '}':
333
+ stack -= 1
334
+ end_index += 1
335
+
336
+ if stack == 0:
337
+ content = latex_str[start_index:end_index - 1]
338
+ results += content + ","
339
+ else:
340
+ raise ValueError("Mismatched braces in LaTeX string.")
341
+
342
+ if results == "":
343
+ last_line_ans = latex_str.strip().split("\n")[-1]
344
+ dollar_pattern = r"\$(.*?)\$"
345
+ answers = re.findall(dollar_pattern, last_line_ans)
346
+
347
+ if answers:
348
+ for ans in answers:
349
+ results += ans + ","
350
+ else:
351
+ results = latex_str
352
+
353
+ return results
354
+
355
+ def special_symbol_replace(expression):
356
+
357
+ expression = expression.replace("\\text{cm}^2", '').replace("\\text{cm}", "").replace("\\,cm", '').replace("\\text{ cm}", '').replace("cm", '').replace("\\text{分米}^2", '').replace("cm^{2}", '').replace("60 \\text{ cm}^2",'').replace("\\ \\text{m}", "").replace("\\text{米}","").strip() # noqa: E501
358
+
359
+ expression = re.sub(r"(.+)m$", r"\1", expression)
360
+
361
+ if "\\in " in expression:
362
+ expression = expression.split("\\in ")[1]
363
+
364
+ for signal in self.special_signal_map:
365
+ expression = expression.replace(signal, self.special_signal_map[signal])
366
+
367
+ expression = re.sub(r'(\\sin|\\cos|\\tan)(\d+)', r'\1((\2/180)\\pi)', expression)
368
+
369
+ expression = expression.strip("\n,.:;^_=+`!@#%^&*~,。")
370
+
371
+ pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}'
372
+ expression = re.sub(pattern, r'\1', expression)
373
+
374
+ return expression
375
+
376
+ exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2)
377
+
378
+ exp1, exp2 = special_symbol_replace(exp1), special_symbol_replace(exp2)
379
+
380
+ return exp1, exp2
381
+
382
+ def can_compute_power(self, expr):
383
+ # Checks if a power expression can be computed
384
+ if isinstance(expr, Pow):
385
+ base, exp = expr.as_base_exp()
386
+ if base.is_number and exp.is_number:
387
+ MAX_EXP = 1000 # Adjust based on computing environment
388
+ if abs(exp.evalf()) > MAX_EXP:
389
+ return False
390
+ else:
391
+ return True
392
+ else:
393
+ return False
394
+ else:
395
+ return True # Not a power expression, can compute
396
+
397
+
398
+ class MMMath(ImageBaseDataset):
399
+
400
+ TYPE = 'VQA'
401
+
402
+ DATASET_URL = {
403
+ 'MM-Math': 'https://opencompass.openxlab.space/utils/VLMEval/MM-Math.tsv',
404
+ }
405
+ DATASET_MD5 = {
406
+ 'MM-Math': '1f064ed7c4e0e8926a3fa65849419ca5',
407
+ }
408
+
409
+ @classmethod
410
+ def evaluate(self, eval_file, **kwargs):
411
+
412
+ data = load(eval_file)
413
+ judger = AutoScoringJudge()
414
+ func = judger.judge
415
+
416
+ tups = [dict(expression1=x, expression2=y) for x, y in zip(data['answer'], data['prediction'])]
417
+
418
+ res = track_progress_rich(func, tups, nproc=16)
419
+ data['hit'] = res
420
+ dump(data, eval_file)
421
+
422
+ score_file = eval_file.replace('.xlsx', '_score.json')
423
+ score = {}
424
+ score['overall'] = np.mean(data['hit'])
425
+ # Results by Difficulty
426
+ difficulties = set(data['difficulty'])
427
+ for d in difficulties:
428
+ score[f'Difficulty-{d}'] = np.mean(data[data['difficulty'] == d]['hit'])
429
+
430
+ # Results by Year
431
+ years = set(data['year'])
432
+ for y in years:
433
+ score[f'Year-{y}'] = np.mean(data[data['year'] == y]['hit'])
434
+
435
+ # Results by Knowledge-L1
436
+ points = set(data['knowledge_l1'])
437
+ for p in points:
438
+ score[f'Knowledge-L1-{p}'] = np.mean(data[data['knowledge_l1'] == p]['hit'])
439
+
440
+ # Results by Knowledge-L2
441
+ points = set(data['knowledge_l2'])
442
+ for p in points:
443
+ score[f'Knowledge-L2-{p}'] = np.mean(data[data['knowledge_l2'] == p]['hit'])
444
+
445
+ dump(score, score_file)
446
+ return score
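To make the matching logic above concrete, a short usage sketch of AutoScoringJudge.judge. This is a sketch only: it assumes the package is importable as vlmeval.dataset.mmmath and that the antlr4 runtime required by sympy's LaTeX parser is installed.

    # Hypothetical answer/prediction pairs; the ground truth comes first.
    from vlmeval.dataset.mmmath import AutoScoringJudge

    judge = AutoScoringJudge()
    print(judge.judge(r"\boxed{\frac{1}{2}}", "0.5"))       # True: numerically equal
    print(judge.judge(r"\boxed{x^2-1}", r"(x-1)(x+1)"))     # True: symbolically equivalent via sympy
    print(judge.judge(r"\boxed{3}", "4"))                   # False: outside the allowed tolerance

The default tolerance is the MM-Math precision of 1e-2 set in __init__; a different value (or one value per comma-separated sub-answer) can be supplied through the precision argument of judge.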
vlmeval/VLMEvalKit_old/vlmeval/dataset/mvbench.py ADDED
@@ -0,0 +1,668 @@
1
+ import huggingface_hub
2
+ from huggingface_hub import snapshot_download
3
+ from ..smp import *
4
+ from .video_base import VideoBaseDataset
5
+ from .utils import build_judge, DEBUG_MESSAGE
6
+ from ..utils import track_progress_rich
7
+ import torchvision.transforms as T
8
+ from torchvision import transforms
9
+ from torchvision.transforms.functional import InterpolationMode
10
+ from decord import VideoReader, cpu
11
+ import imageio
12
+ import cv2
13
+ import zipfile
14
+ import os
15
+ import glob
16
+ from .utils.mvbench import *
17
+
18
+ FAIL_MSG = 'Failed to obtain answer via API.'
19
+
20
+
21
+ class MVBench(VideoBaseDataset):
22
+
23
+ MD5 = 'fd21d36522cdedd46d84dc46715ad832'
24
+ SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \
25
+ the detail and movement of objects, and the action and pose of persons. \
26
+ Based on your observations, select the best option that accurately addresses the question.
27
+ """
28
+
29
+ TYPE = 'Video-MCQ'
30
+
31
+ def __init__(self, dataset='MVBench', pack=False):
32
+ self.type_data_list = {
33
+ 'Action Sequence': ('action_sequence.json',
34
+ 'your_data_path/star/Charades_v1_480/', 'video', True), # has start & end
35
+ 'Action Prediction': ('action_prediction.json',
36
+ 'your_data_path/star/Charades_v1_480/', 'video', True), # has start & end
37
+ 'Action Antonym': ('action_antonym.json',
38
+ 'your_data_path/ssv2_video/', 'video', False),
39
+ 'Fine-grained Action': ('fine_grained_action.json',
40
+ 'your_data_path/Moments_in_Time_Raw/videos/', 'video', False),
41
+ 'Unexpected Action': ('unexpected_action.json',
42
+ 'your_data_path/FunQA_test/test/', 'video', False),
43
+ 'Object Existence': ('object_existence.json',
44
+ 'your_data_path/clevrer/video_validation/', 'video', False),
45
+ 'Object Interaction': ('object_interaction.json',
46
+ 'your_data_path/star/Charades_v1_480/', 'video', True), # has start & end
47
+ 'Object Shuffle': ('object_shuffle.json',
48
+ 'your_data_path/perception/videos/', 'video', False),
49
+ 'Moving Direction': ('moving_direction.json',
50
+ 'your_data_path/clevrer/video_validation/', 'video', False),
51
+ 'Action Localization': ('action_localization.json',
52
+ 'your_data_path/sta/sta_video/', 'video', True), # has start & end
53
+ 'Scene Transition': ('scene_transition.json',
54
+ 'your_data_path/scene_qa/video/', 'video', False),
55
+ 'Action Count': ('action_count.json',
56
+ 'your_data_path/perception/videos/', 'video', False),
57
+ 'Moving Count': ('moving_count.json',
58
+ 'your_data_path/clevrer/video_validation/', 'video', False),
59
+ 'Moving Attribute': ('moving_attribute.json',
60
+ 'your_data_path/clevrer/video_validation/', 'video', False),
61
+ 'State Change': ('state_change.json',
62
+ 'your_data_path/perception/videos/', 'video', False),
63
+ 'Fine-grained Pose': ('fine_grained_pose.json',
64
+ 'your_data_path/nturgbd/', 'video', False),
65
+ 'Character Order': ('character_order.json',
66
+ 'your_data_path/perception/videos/', 'video', False),
67
+ 'Egocentric Navigation': ('egocentric_navigation.json',
68
+ 'your_data_path/vlnqa/', 'video', False),
69
+ 'Episodic Reasoning': ('episodic_reasoning.json',
70
+ 'your_data_path/tvqa/frames_fps3_hq/', 'frame', True), # has start & end, read frame
71
+ 'Counterfactual Inference': ('counterfactual_inference.json',
72
+ 'your_data_path/clevrer/video_validation/', 'video', False),
73
+ }
74
+ super().__init__(dataset=dataset, pack=pack)
75
+
76
+ @classmethod
77
+ def supported_datasets(cls):
78
+ return ['MVBench']
79
+
80
+ def prepare_dataset(self, dataset_name='MVBench', repo_id='OpenGVLab/MVBench'):
81
+ def check_integrity(pth):
82
+ data_file = osp.join(pth, f'{dataset_name}.tsv')
83
+
84
+ if not os.path.exists(data_file):
85
+ return False
86
+
87
+ if md5(data_file) != self.MD5:
88
+ return False
89
+
90
+ data = load(data_file)
91
+ for idx, item in data.iterrows():
92
+ if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
93
+ return False
94
+ return True
95
+
96
+ if modelscope_flag_set():
97
+ repo_id = 'modelscope/MVBench'
98
+
99
+ cache_path = get_cache_path(repo_id, branch='main')
100
+ if cache_path is not None and check_integrity(cache_path):
101
+ dataset_path = cache_path
102
+ else:
103
+ def unzip_hf_zip(pth):
104
+ pth = os.path.join(pth, 'video/')
105
+ for filename in os.listdir(pth):
106
+ if filename.endswith('.zip'):
107
+ # Build the full path of the zip file
108
+ zip_path = os.path.join(pth, filename)
109
+
110
+ # Extract the ZIP archive
111
+ with zipfile.ZipFile(zip_path, 'r') as zip_ref:
112
+ zip_ref.extractall(pth)
113
+
114
+ def generate_tsv(pth):
115
+ data_file = osp.join(pth, f'{dataset_name}.tsv')
116
+ if os.path.exists(data_file) and md5(data_file) == self.MD5:
117
+ return
118
+ json_data_dir = os.path.join(pth, 'json')
119
+ self.data_list = []
120
+ for k, v in self.type_data_list.items():
121
+ with open(os.path.join(json_data_dir, v[0]), 'r') as f:
122
+ json_data = json.load(f)
123
+ for data in json_data:
124
+ if os.path.exists(os.path.join(pth, v[1].replace('your_data_path', 'video'), data['video'])):
125
+ self.data_list.append({
126
+ 'task_type': k,
127
+ 'prefix': v[1].replace('your_data_path', 'video'),
128
+ 'data_type': v[2],
129
+ 'bound': v[3],
130
+ 'start': data['start'] if 'start' in data.keys() else None,
131
+ 'end': data['end'] if 'end' in data.keys() else None,
132
+ 'video': data['video'],
133
+ 'question': data['question'],
134
+ 'answer': data['answer'],
135
+ 'candidates': data['candidates']
136
+ })
137
+ else:
138
+ print(
139
+ 'NTURGB-D zip file is removed according to MVBench, you can view it at '
140
+ 'https://huggingface.co/datasets/OpenGVLab/MVBench for detailed reason.'
141
+ )
142
+ raise Exception(
143
+ f"{os.path.join(v[1].replace('your_data_path', 'video'), data['video'])} does not exist"
144
+ )
145
+
146
+ data_df = pd.DataFrame(self.data_list)
147
+ data_df = data_df.assign(index=range(len(data_df)))
148
+ data_df.to_csv(data_file, sep='\t', index=False)
149
+
150
+ def move_files(pth):
151
+ src_folder = os.path.join(pth, 'video/data0613')
152
+ if not os.path.exists(src_folder):
153
+ return
154
+ for subdir in os.listdir(src_folder):
155
+ subdir_path = os.path.join(src_folder, subdir)
156
+ if os.path.isdir(subdir_path):
157
+ for subsubdir in os.listdir(subdir_path):
158
+ subsubdir_path = os.path.join(subdir_path, subsubdir)
159
+ if os.path.isdir(subsubdir_path):
160
+ for item in os.listdir(subsubdir_path):
161
+ item_path = os.path.join(subsubdir_path, item)
162
+ target_folder = os.path.join(pth, 'video', subdir, subsubdir)
163
+ if not os.path.exists(target_folder):
164
+ os.makedirs(target_folder)
165
+ target_path = os.path.join(target_folder, item)
166
+ try:
167
+ shutil.move(item_path, target_path)
168
+ except Exception as e:
169
+ print(f"Error moving {item_path} to {target_path}: {e}")
170
+
171
+ if modelscope_flag_set():
172
+ from modelscope import dataset_snapshot_download
173
+ dataset_path = dataset_snapshot_download(dataset_id=repo_id, revision='master')
174
+ else:
175
+ hf_token = os.environ.get('HUGGINGFACE_TOKEN')
176
+ huggingface_hub.login(hf_token)
177
+ dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset')
178
+ unzip_hf_zip(dataset_path)
179
+ move_files(dataset_path)
180
+ generate_tsv(dataset_path)
181
+
182
+ data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
183
+
184
+ self.decord_method = {
185
+ 'video': self.read_video,
186
+ 'gif': self.read_gif,
187
+ 'frame': self.read_frame,
188
+ }
189
+
190
+ self.nframe = 8
191
+ self.frame_fps = 3
192
+
193
+ # transform
194
+ self.transform = T.Compose([
195
+ Stack(),
196
+ ToTorchFormatTensor()
197
+ ])
198
+
199
+ return dict(root=dataset_path, data_file=data_file)
200
+
201
+ def get_index(self, bound, fps, max_frame, first_idx=0):
202
+ if bound:
203
+ start, end = bound[0], bound[1]
204
+ else:
205
+ start, end = -100000, 100000
206
+ start_idx = max(first_idx, round(start * fps))
207
+ end_idx = min(round(end * fps), max_frame)
208
+ seg_size = float(end_idx - start_idx) / self.num_segments
209
+ frame_indices = np.array([
210
+ int(start_idx + (seg_size / 2) + np.round(seg_size * idx))
211
+ for idx in range(self.num_segments)
212
+ ])
213
+ return frame_indices
214
+
215
+ def read_video(self, video_path, bound=None):
216
+ vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
217
+ max_frame = len(vr) - 1
218
+ fps = float(vr.get_avg_fps())
219
+
220
+ images_group = list()
221
+ frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
222
+ for frame_index in frame_indices:
223
+ img = Image.fromarray(vr[frame_index].asnumpy())
224
+ images_group.append(img)
225
+ torch_imgs = self.transform(images_group)
226
+ return torch_imgs
227
+
228
+ def read_gif(self, video_path, bound=None, fps=25):
229
+ gif = imageio.get_reader(video_path)
230
+ max_frame = len(gif) - 1
231
+
232
+ images_group = list()
233
+ frame_indices = self.get_index(bound, fps, max_frame, first_idx=0)
234
+ for index, frame in enumerate(gif):
235
+ if index in frame_indices:
236
+ img = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB)
237
+ img = Image.fromarray(img)
238
+ images_group.append(img)
239
+ torch_imgs = self.transform(images_group)
240
+ return torch_imgs
241
+
242
+ def read_frame(self, video_path, bound=None, fps=3):
243
+ max_frame = len(os.listdir(video_path))
244
+ images_group = list()
245
+ frame_indices = self.get_index(bound, fps, max_frame, first_idx=1) # frame_idx starts from 1
246
+ for frame_index in frame_indices:
247
+ img = Image.open(os.path.join(video_path, f'{frame_index:05d}.jpg'))
248
+ images_group.append(img)
249
+ torch_imgs = self.transform(images_group)
250
+ return torch_imgs
251
+
252
+ def save_video_frames(self, imgs, video_name, frames):
253
+
254
+ frame_paths = self.frame_paths(video_name, frames)
255
+ flag = np.all([osp.exists(p) for p in frame_paths])
256
+
257
+ if not flag:
258
+ block_size = imgs.size(0) // frames
259
+ split_tensors = torch.split(imgs, block_size)
260
+ to_pil = transforms.ToPILImage()
261
+ images = [to_pil(arr) for arr in split_tensors]
262
+ for im, pth in zip(images, frame_paths):
263
+ if not osp.exists(pth):
264
+ im.save(pth)
265
+
266
+ return frame_paths
267
+
268
+ def qa_template(self, data):
269
+ question = f"Question: {data['question']}\n"
270
+ question += 'Options:\n'
271
+ answer = data['answer']
272
+ answer_idx = -1
273
+ for idx, c in enumerate(eval(data['candidates'])):
274
+ question += f"({chr(ord('A') + idx)}) {c}\n"
275
+ if c == answer:
276
+ answer_idx = idx
277
+ question = question.rstrip()
278
+ answer = f"({chr(ord('A') + answer_idx)}) {answer}"
279
+ return question, answer
280
+
281
+ def load_into_video_and_process(self, line):
282
+ try:
283
+ from moviepy.editor import VideoFileClip, ImageSequenceClip
284
+ except:
285
+ raise ImportError(
286
+ 'MoviePy is not installed, please install it by running "pip install moviepy==1.0.3"'
287
+ )
288
+ video_path = os.path.join(self.data_root, line['prefix'], line['video'])
289
+
290
+ if line['data_type'] in ['gif'] or os.path.splitext(video_path)[1] in ['.webm']:
291
+ processed_video_path = video_path.replace(os.path.splitext(video_path)[1], '.mp4')
292
+ if not os.path.exists(processed_video_path):
293
+ # using MoviePy to transform GIF, webm into mp4 format
294
+ gif_clip = VideoFileClip(video_path)
295
+ gif_clip.write_videofile(processed_video_path, codec='libx264')
296
+ gif_clip.close()
297
+ elif line['data_type'] in ['frame']:
298
+ input_images = os.path.join(video_path, '*.jpg')
299
+ processed_video_path = f'{video_path}.mp4'
300
+ if not os.path.exists(processed_video_path):
301
+ # using MoviePy to transform images into mp4
302
+ image_files = sorted(glob.glob(input_images))
303
+ image_clip = ImageSequenceClip(image_files, fps=self.frame_fps)
304
+ image_clip.write_videofile(processed_video_path, codec='libx264')
305
+ image_clip.close()
306
+ else:
307
+ processed_video_path = video_path
308
+
309
+ if line['bound']:
310
+ base_name, suffix = os.path.splitext(processed_video_path)
311
+ output_video_path = f'{base_name}_processed{suffix}'
312
+ if not os.path.exists(output_video_path):
313
+ video_clip = VideoFileClip(processed_video_path)
314
+ clip = video_clip.subclip(line['start'], min(line['end'], video_clip.duration))
315
+ clip.write_videofile(output_video_path)
316
+ clip.close()
317
+ else:
318
+ output_video_path = processed_video_path
319
+
320
+ return output_video_path
321
+
322
+ def save_video_into_images(self, line, num_frames):
323
+ bound = None
324
+ if line['bound']:
325
+ bound = (
326
+ line['start'],
327
+ line['end'],
328
+ )
329
+ video_path = os.path.join(self.data_root, line['prefix'], line['video'])
330
+ decord_method = self.decord_method[line['data_type']]
331
+ self.num_segments = num_frames if num_frames > 0 else self.nframe
332
+ torch_imgs = decord_method(video_path, bound)
333
+ img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments)
334
+ return img_frame_paths
335
+
336
+ def build_prompt(self, line, num_frames, video_llm, fps):
337
+ if fps > 0:
338
+ raise ValueError('MVBench does not support fps setting, please transfer to MVBench_MP4!')
339
+ if isinstance(line, int):
340
+ assert line < len(self)
341
+ line = self.data.iloc[line]
342
+
343
+ question, answer = self.qa_template(line)
344
+ message = [dict(type='text', value=self.SYS, role='system')]
345
+ message.append(dict(type='text', value=question))
346
+ if video_llm:
347
+ new_video_path = self.load_into_video_and_process(line)
348
+ message.append(dict(type='video', value=new_video_path))
349
+ else:
350
+ img_frame_paths = self.save_video_into_images(line, num_frames)
351
+ for im in img_frame_paths:
352
+ message.append(dict(type='image', value=im))
353
+ message.append(dict(type='text', value='\nOnly give the best option.'))
354
+ message.append(dict(type='text', value='Best option:(', role='assistant'))
355
+ return message
356
+
357
+ @classmethod
358
+ def evaluate(self, eval_file, **judge_kwargs):
359
+
360
+ assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
361
+
362
+ tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
363
+ tgt_file = eval_file.replace('.xlsx', '_rating.json')
364
+ score_file = eval_file.replace('.xlsx', '_score.xlsx')
365
+
366
+ if not osp.exists(score_file):
367
+ model = judge_kwargs.setdefault('model', 'chatgpt-0125')
368
+ assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
369
+
370
+ if model == 'exact_matching':
371
+ model = None
372
+ elif gpt_key_set():
373
+ model = build_judge(**judge_kwargs)
374
+ if not model.working():
375
+ warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
376
+ warnings.warn(DEBUG_MESSAGE)
377
+ model = None
378
+ else:
379
+ warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
380
+ model = None
381
+ res = {} if not osp.exists(tmp_file) else load(tmp_file)
382
+ res = {k: v for k, v in res.items() if FAIL_MSG not in v}
383
+
384
+ data = load(eval_file)
385
+ data_un = data[~pd.isna(data['prediction'])]
386
+
387
+ for idx in data_un['index']:
388
+ ans = data.loc[data['index'] == idx, 'answer'].values[0]
389
+ pred = data.loc[data['index'] == idx, 'prediction'].values[0]
390
+ options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
391
+ answer_idx = -1
392
+ for id, c in enumerate(options):
393
+ if c == ans:
394
+ answer_idx = id
395
+ ans = f"({chr(ord('A') + answer_idx)}) {ans}"
396
+ input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
397
+ for id, option_content in enumerate(eval(input_item['candidates'])):
398
+ input_item[chr(ord('A') + id)] = option_content
399
+ if option_content == input_item['answer']:
400
+ input_item['answer'] = chr(ord('A') + id)
401
+
402
+ if FAIL_MSG in pred:
403
+ data.loc[idx, 'score'] = -1
404
+ else:
405
+ data.loc[idx, 'score'] = int(check_ans_with_model(
406
+ pred, ans, model,
407
+ input_item,
408
+ 'MVBench'
409
+ ))
410
+
411
+ rejected = [x for x in data['score'] if x == -1]
412
+
413
+ print(
414
+ f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
415
+ f'failed to obtain the score for another {len(rejected)} questions. '
416
+ f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
417
+ )
418
+
419
+ dump(data, score_file)
420
+
421
+ rating = get_dimension_rating(score_file)
422
+ dump(rating, tgt_file)
423
+ return rating
424
+
425
+
426
+ class MVBench_MP4(VideoBaseDataset):
427
+
428
+ MP4_MD5 = '5c8c6f8b7972c2de65a629590f7c42f5'
429
+ SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \
430
+ the detail and movement of objects, and the action and pose of persons. \
431
+ Based on your observations, select the best option that accurately addresses the question.
432
+ """
433
+ TYPE = 'Video-MCQ'
434
+
435
+ def __init__(self, dataset='MVBench_MP4', pack=False):
436
+ super().__init__(dataset=dataset, pack=pack)
437
+
438
+ @classmethod
439
+ def supported_datasets(cls):
440
+ return ['MVBench_MP4']
441
+
442
+ def prepare_dataset(self, dataset_name='MVBench_MP4', repo_id='OpenGVLab/MVBench'):
443
+ def check_integrity(pth):
444
+ data_file = osp.join(pth, f'{dataset_name}.tsv')
445
+
446
+ if not os.path.exists(data_file):
447
+ return False
448
+
449
+ if md5(data_file) != self.MP4_MD5:
450
+ return False
451
+
452
+ data = load(data_file)
453
+ for idx, item in data.iterrows():
454
+ if not osp.exists(osp.join(pth, item['prefix'], item['video'])):
455
+ return False
456
+ return True
457
+
458
+ if modelscope_flag_set():
459
+ repo_id = 'modelscope/MVBench'
460
+
461
+ cache_path = get_cache_path(repo_id, branch='video')
462
+ if cache_path is not None and check_integrity(cache_path):
463
+ dataset_path = cache_path
464
+ else:
465
+ def generate_tsv(pth):
466
+ data_file = osp.join(pth, f'{dataset_name}.tsv')
467
+ if os.path.exists(data_file) and md5(data_file) == self.MP4_MD5:
468
+ return
469
+ json_data_path = os.path.join(dataset_path, 'test.json')
470
+ json_data = load(json_data_path)
471
+ root_data_dict = json_data['root']
472
+ self.data_list = []
473
+ for k, v in json_data['meta'].items():
474
+ for item in v:
475
+ self.data_list.append({
476
+ 'task_type': k,
477
+ 'prefix': root_data_dict[k],
478
+ 'video': item['video'],
479
+ 'question': item['question'],
480
+ 'answer': item['answer'],
481
+ 'candidates': item['candidates']
482
+ })
483
+ data_df = pd.DataFrame(self.data_list)
484
+ data_df = data_df.assign(index=range(len(data_df)))
485
+ data_df.to_csv(data_file, sep='\t', index=False)
486
+
487
+ if modelscope_flag_set():
488
+ from modelscope import dataset_snapshot_download
489
+ dataset_path = dataset_snapshot_download(dataset_id=repo_id, revision='video')
490
+ else:
491
+ hf_token = os.environ.get('HUGGINGFACE_TOKEN')
492
+ huggingface_hub.login(hf_token)
493
+ dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset', revision='video')
494
+ generate_tsv(dataset_path)
495
+
496
+ data_file = osp.join(dataset_path, f'{dataset_name}.tsv')
497
+
498
+ self.nframe = 8
499
+
500
+ # transform
501
+ self.transform = T.Compose([
502
+ Stack(),
503
+ ToTorchFormatTensor()
504
+ ])
505
+
506
+ return dict(root=dataset_path, data_file=data_file)
507
+
508
+ def qa_template(self, data):
509
+ question = f"Question: {data['question']}\n"
510
+ question += 'Options:\n'
511
+ answer = data['answer']
512
+ answer_idx = -1
513
+ for idx, c in enumerate(eval(data['candidates'])):
514
+ question += f"({chr(ord('A') + idx)}) {c}\n"
515
+ if c == answer:
516
+ answer_idx = idx
517
+ question = question.rstrip()
518
+ answer = f"({chr(ord('A') + answer_idx)}) {answer}"
519
+ return question, answer
520
+
521
+ def get_index_by_frame(self, max_frame):
522
+ seg_size = float(max_frame) / self.num_segments
523
+ frame_indices = np.array([
524
+ int((seg_size / 2) + np.round(seg_size * idx))
525
+ for idx in range(self.num_segments)
526
+ ])
527
+ return frame_indices
528
+
529
+ def get_index_by_fps(self, vid, fps):
530
+ total_frames = len(vid)
531
+ video_fps = vid.get_avg_fps()
532
+ total_duration = total_frames / video_fps
533
+ required_frames = int(total_duration * fps)
534
+ step_size = video_fps / fps
535
+ frame_indices = np.array([int(i * step_size) for i in range(required_frames)])
536
+ self.num_segments = len(frame_indices)
537
+ return frame_indices
538
+
539
+ def read_video(self, video_path, fps=-1):
540
+ vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
541
+ max_frame = len(vr) - 1
542
+
543
+ images_group = list()
544
+ if fps < 0:
545
+ frame_indices = self.get_index_by_frame(max_frame)
546
+ else:
547
+ frame_indices = self.get_index_by_fps(vr, fps)
548
+
549
+ for frame_index in frame_indices:
550
+ img = Image.fromarray(vr[frame_index].asnumpy())
551
+ images_group.append(img)
552
+ torch_imgs = self.transform(images_group)
553
+ return torch_imgs
554
+
555
+ def save_video_frames(self, imgs, video_name, frames, fps):
556
+ if fps > 0:
557
+ frame_paths = self.frame_paths_fps(video_name, frames, fps)
558
+ else:
559
+ frame_paths = self.frame_paths(video_name, frames)
560
+ flag = np.all([osp.exists(p) for p in frame_paths])
561
+
562
+ if not flag:
563
+ block_size = imgs.size(0) // frames
564
+ split_tensors = torch.split(imgs, block_size)
565
+ to_pil = transforms.ToPILImage()
566
+ images = [to_pil(arr) for arr in split_tensors]
567
+ for im, pth in zip(images, frame_paths):
568
+ if not osp.exists(pth):
569
+ im.save(pth)
570
+
571
+ return frame_paths
572
+
573
+ def save_video_into_images(self, line, num_frames, fps=-1):
574
+ video_path = os.path.join(self.data_root, line['prefix'], line['video'])
575
+ if fps <= 0:
576
+ self.num_segments = num_frames if num_frames > 0 else self.nframe
577
+ else:
578
+ self.num_segments = 0
579
+ torch_imgs = self.read_video(video_path, fps)
580
+ img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments, fps)
581
+ return img_frame_paths
582
+
583
+ def build_prompt(self, line, num_frames, video_llm, fps):
584
+ if isinstance(line, int):
585
+ assert line < len(self)
586
+ line = self.data.iloc[line]
587
+
588
+ question, answer = self.qa_template(line)
589
+ message = [dict(type='text', value=self.SYS, role='system')]
590
+ message.append(dict(type='text', value=question))
591
+ video_path = os.path.join(self.data_root, line['prefix'], line['video'])
592
+ if video_llm:
593
+ message.append(dict(type='video', value=video_path))
594
+ else:
595
+ img_frame_paths = self.save_video_into_images(line, num_frames, fps)
596
+ for im in img_frame_paths:
597
+ message.append(dict(type='image', value=im))
598
+ message.append(dict(type='text', value='\nOnly give the best option.'))
599
+ message.append(dict(type='text', value='Best option:(', role='assistant'))
600
+ return message
601
+
602
+ @classmethod
603
+ def evaluate(self, eval_file, **judge_kwargs):
604
+
605
+ assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file'
606
+
607
+ tmp_file = eval_file.replace('.xlsx', '_tmp.pkl')
608
+ tgt_file = eval_file.replace('.xlsx', '_rating.json')
609
+ score_file = eval_file.replace('.xlsx', '_score.xlsx')
610
+
611
+ if not osp.exists(score_file):
612
+ model = judge_kwargs.setdefault('model', 'chatgpt-0125')
613
+ assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125']
614
+
615
+ if model == 'exact_matching':
616
+ model = None
617
+ elif gpt_key_set():
618
+ model = build_judge(**judge_kwargs)
619
+ if not model.working():
620
+ warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation')
621
+ warnings.warn(DEBUG_MESSAGE)
622
+ model = None
623
+ else:
624
+ warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation')
625
+ model = None
626
+ res = {} if not osp.exists(tmp_file) else load(tmp_file)
627
+ res = {k: v for k, v in res.items() if FAIL_MSG not in v}
628
+
629
+ data = load(eval_file)
630
+ data_un = data[~pd.isna(data['prediction'])]
631
+
632
+ for idx in data_un['index']:
633
+ ans = data.loc[data['index'] == idx, 'answer'].values[0]
634
+ pred = data.loc[data['index'] == idx, 'prediction'].values[0]
635
+ options = eval(data.loc[data['index'] == idx, 'candidates'].values[0])
636
+ answer_idx = -1
637
+ for id, c in enumerate(options):
638
+ if c == ans:
639
+ answer_idx = id
640
+ ans = f"({chr(ord('A') + answer_idx)}) {ans}"
641
+ input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0]
642
+ for id, option_content in enumerate(eval(input_item['candidates'])):
643
+ input_item[chr(ord('A') + id)] = option_content
644
+ if option_content == input_item['answer']:
645
+ input_item['answer'] = chr(ord('A') + id)
646
+
647
+ if FAIL_MSG in pred:
648
+ data.loc[idx, 'score'] = -1
649
+ else:
650
+ data.loc[idx, 'score'] = int(check_ans_with_model(
651
+ pred, ans, model,
652
+ input_item,
653
+ 'MVBench_MP4'
654
+ ))
655
+
656
+ rejected = [x for x in data['score'] if x == -1]
657
+
658
+ print(
659
+ f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, '
660
+ f'failed to obtain the score for another {len(rejected)} questions. '
661
+ f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.'
662
+ )
663
+
664
+ dump(data, score_file)
665
+
666
+ rating = get_dimension_rating(score_file)
667
+ dump(rating, tgt_file)
668
+ return rating
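As a side note on the sampling scheme used throughout this file: get_index (and get_index_by_frame in the MP4 variant) picks the midpoint of each of num_segments equal spans between the start and end frame. A standalone sketch of the same arithmetic, with made-up numbers:

    import numpy as np

    def midpoint_indices(start_idx, end_idx, num_segments):
        # Split [start_idx, end_idx] into equal spans and take each span's midpoint.
        seg_size = float(end_idx - start_idx) / num_segments
        return np.array([
            int(start_idx + seg_size / 2 + np.round(seg_size * idx))
            for idx in range(num_segments)
        ])

    print(midpoint_indices(0, 80, 8))   # [ 5 15 25 35 45 55 65 75]

With fps-based sampling (get_index_by_fps), the step size is instead video_fps / target_fps and the number of sampled frames follows from the clip duration.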
vlmeval/VLMEvalKit_old/vlmeval/dataset/text_base.py ADDED
@@ -0,0 +1,88 @@
1
+ from abc import abstractmethod
2
+ from ..smp import *
3
+
4
+
5
+ class TextBaseDataset:
6
+ MODALITY = 'TEXT'
7
+ DATASET_URL = {}
8
+ DATASET_MD5 = {}
9
+
10
+ def __init__(self, dataset='MMBench', **kwargs):
11
+ self.dataset_name = dataset
12
+
13
+ data = self.load_data(dataset)
14
+
15
+ data['index'] = [str(x) for x in data['index']]
16
+
17
+ if np.all([istype(x, int) for x in data['index']]):
18
+ data['index'] = [int(x) for x in data['index']]
19
+
20
+ self.data = data
21
+ self.post_build(dataset)
22
+
23
+ def __len__(self):
24
+ return len(self.data)
25
+
26
+ def __getitem__(self, idx):
27
+ return dict(self.data.iloc[idx])
28
+
29
+ def prepare_tsv(self, url, file_md5=None):
30
+ data_root = LMUDataRoot()
31
+ os.makedirs(data_root, exist_ok=True)
32
+ update_flag = False
33
+ file_name = url.split('/')[-1]
34
+ data_path = osp.join(data_root, file_name)
35
+ if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5):
36
+ pass
37
+ else:
38
+ warnings.warn('The dataset tsv is not downloaded')
39
+ download_file(url, data_path)
40
+ update_flag = True
41
+
42
+ if file_size(data_path, 'GB') > 1:
43
+ local_path = data_path.replace('.tsv', '_local.tsv')
44
+ if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag:
45
+ from ..tools import LOCALIZE
46
+ LOCALIZE(data_path, local_path)
47
+ data_path = local_path
48
+ return load(data_path)
49
+
50
+ def dump_image(self, line):
51
+ return []
52
+
53
+ def display(self, line):
54
+ if isinstance(line, int):
55
+ line = self.data.iloc[line]
56
+ assert isinstance(line, pd.Series) or isinstance(line, dict)
57
+ mmqa_display(line)
58
+
59
+ # Return a list of dataset names that are supported by this class, can override
60
+ @classmethod
61
+ def supported_datasets(cls):
62
+ return list(cls.DATASET_URL)
63
+
64
+ # Given the dataset name, return the dataset as a pandas dataframe, can override
65
+ def load_data(self, dataset):
66
+ url = self.DATASET_URL[dataset]
67
+ file_md5 = self.DATASET_MD5[dataset]
68
+ return self.prepare_tsv(url, file_md5)
69
+
70
+ # Post built hook, will be called after the dataset is built, can override
71
+ def post_build(self, dataset):
72
+ pass
73
+
74
+ # Given one data record, return the built prompt (a multi-modal message), can override
75
+ def build_prompt(self, line):
76
+ if isinstance(line, int):
77
+ line = self.data.iloc[line]
78
+
79
+ question = line['question']
80
+
81
+ msgs = []
82
+ msgs.append(dict(type='text', value=question))
83
+ return msgs
84
+
85
+ # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe
86
+ @abstractmethod
87
+ def evaluate(self, eval_file, **judge_kwargs):
88
+ pass
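To make the extension points above concrete, here is a minimal hypothetical subclass. The dataset name, URL, and checksum are placeholders, not a real benchmark; the import paths assume the package layout used in this commit:

    from vlmeval.dataset.text_base import TextBaseDataset
    from vlmeval.smp import load

    class MyTextBench(TextBaseDataset):
        # Placeholder entries: load_data() downloads the TSV and (optionally) checks its MD5.
        DATASET_URL = {'MyTextBench': 'https://example.com/MyTextBench.tsv'}
        DATASET_MD5 = {'MyTextBench': None}

        def build_prompt(self, line):
            if isinstance(line, int):
                line = self.data.iloc[line]
            # Prepend a task instruction to the raw question text.
            return [dict(type='text', value='Answer concisely: ' + line['question'])]

        def evaluate(self, eval_file, **judge_kwargs):
            # Trivial exact-match accuracy over the prediction file.
            data = load(eval_file)
            return {'acc': float((data['prediction'] == data['answer']).mean())}

Instantiating MyTextBench('MyTextBench') would then fetch the TSV into LMUDataRoot() and expose the rows through __getitem__ and build_prompt, exactly as the base class above implements.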
vlmeval/VLMEvalKit_old/vlmeval/dataset/vcr.py ADDED
@@ -0,0 +1,335 @@
1
+ import uuid
2
+ from functools import partial
3
+ from .image_base import ImageBaseDataset
4
+ from ..smp import *
5
+
6
+ rouge = None
7
+ nlp_en = None
8
+ nlp_zh = None
9
+ nlp = None
10
+
11
+
12
+ def initialize():
13
+ import evaluate
14
+ import spacy
15
+
16
+ global rouge, nlp_en, nlp_zh, nlp
17
+
18
+ try:
19
+ rouge = evaluate.load('rouge', experiment_id=str(uuid.uuid4()))
20
+ except Exception as e:
21
+ logging.critical(f'{type(e)}: {e}')
22
+ logging.critical('Please first `pip install rouge_score`.')
23
+
24
+ try:
25
+ nlp_en = spacy.load('en_core_web_sm')
26
+ except Exception as e:
27
+ logging.warning(f'{type(e)}: {e}')
28
+ logging.warning('Will automatically download en_core_web_sm via spacy.')
29
+ spacy.cli.download('en_core_web_sm')
30
+ nlp_en = spacy.load('en_core_web_sm')
31
+
32
+ try:
33
+ nlp_zh = spacy.load('zh_core_web_sm')
34
+ except Exception as e:
35
+ logging.warning(f'{type(e)}: {e}')
36
+ logging.warning('Will automatically download zh_core_web_sm via spacy.')
37
+ spacy.cli.download('zh_core_web_sm')
38
+ nlp_zh = spacy.load('zh_core_web_sm')
39
+
40
+ nlp = {'en': nlp_en, 'zh': nlp_zh}
41
+
42
+
43
+ def rough_filter(answer_text):
44
+ if "I can't" in answer_text:
45
+ return False
46
+ elif 'I cannot' in answer_text:
47
+ return False
48
+ elif 'sorry' in answer_text.lower():
49
+ return False
50
+ if '无法' in answer_text:
51
+ return False
52
+ elif '抱歉' in answer_text:
53
+ return False
54
+ else:
55
+ return True
56
+
57
+
58
+ def zero_template(crossed_text):
59
+ return {
60
+ 'crossed_text': crossed_text,
61
+ 'max_sim_val': 0,
62
+ 'max_sim_string': '',
63
+ 'precision': 0,
64
+ 'recall': 0,
65
+ 'f1': 0,
66
+ 'jaccard': 0,
67
+ 'rouge1': 0,
68
+ 'exact_match': 0,
69
+ }
70
+
71
+
72
+ def tokenize(text, language):
73
+ """
74
+ Tokenize the text and return the tokens.
75
+
76
+ Parameters:
77
+ text (str): The text to tokenize.
78
+ language (str): The language of the text.
79
+
80
+ Returns:
81
+ list: The list of tokens.
82
+ """
83
+ assert language in ['en', 'zh']
84
+ nlp_language = nlp[language]
85
+ processed_text = nlp_language(text)
86
+ return [token.text for token in processed_text]
87
+
88
+
89
+ def find_best_match(needle, hay, language, rouge):
90
+ """
91
+ Finds the best matching n-gram in the haystack for the given needle.
92
+
93
+ Parameters:
94
+ needle (str): The string to find.
95
+ hay (str): The text to search within.
96
+
97
+ Returns:
98
+ tuple: The highest similarity value and the best matching string.
99
+ """
100
+ assert language in ['en', 'zh']
101
+ from nltk.util import ngrams
102
+ from difflib import SequenceMatcher as SM
103
+
104
+ tokens_hay = tokenize(hay, language)
105
+ tokens_needle = tokenize(needle, language)
106
+
107
+ splitter = '' if language == 'zh' else ' '
108
+ ngrams_ = ngrams(tokens_hay, len(tokens_needle))
109
+ max_sim_val = 0
110
+ max_sim_string = ''
111
+ max_sim_ngram = []
112
+ tokens_needle_set = set(tokens_needle)
113
+ ngrams_hasjoint = [
114
+ ngram
115
+ for ngram in ngrams_
116
+ if not set(ngram).isdisjoint(tokens_needle_set)
117
+ ]
118
+
119
+ for ngram in ngrams_hasjoint:
120
+ hay_ngram = splitter.join(ngram)
121
+ similarity = SM(None, hay_ngram, needle).ratio()
122
+ if similarity > max_sim_val:
123
+ max_sim_val = similarity
124
+ max_sim_string = hay_ngram
125
+ max_sim_ngram = ngram
126
+
127
+ # Evaluate
128
+ if len(max_sim_ngram) == 0:
129
+ return {
130
+ 'crossed_text': needle,
131
+ 'max_sim_val': 0,
132
+ 'max_sim_string': '',
133
+ 'precision': 0,
134
+ 'recall': 0,
135
+ 'f1': 0,
136
+ 'jaccard': 0,
137
+ 'rouge1': 0,
138
+ 'exact_match': 0,
139
+ }
140
+ pred_set = set(max_sim_ngram)
141
+ ref_set = set(tokens_needle)
142
+ correct_tokens = pred_set.intersection(ref_set)
143
+ len_correct_tokens = len(correct_tokens)
144
+
145
+ precision = len_correct_tokens / len(pred_set)
146
+ recall = len_correct_tokens / len(ref_set)
147
+ if (precision + recall) == 0:
148
+ f1 = 0
149
+ else:
150
+ f1 = 2 * precision * recall / (precision + recall)
151
+ union = pred_set.union(ref_set)
152
+ jaccard = len_correct_tokens / len(union) if len(union) > 0 else 0
153
+ rouge_1 = rouge.compute(
154
+ predictions=[max_sim_string],
155
+ references=[needle],
156
+ tokenizer=partial(tokenize, language=language),
157
+ rouge_types=['rouge1'],
158
+ )['rouge1']
159
+ exact_match = float(list(max_sim_ngram) == list(tokens_needle))
160
+ out = {
161
+ 'crossed_text': needle,
162
+ 'max_sim_string': max_sim_string,
163
+ 'max_sim_val': max_sim_val,
164
+ 'precision': precision,
165
+ 'recall': recall,
166
+ 'f1': f1,
167
+ 'jaccard': jaccard,
168
+ 'rouge1': rouge_1,
169
+ 'exact_match': exact_match,
170
+ }
171
+ return out
172
+
173
+
174
+ def process_match_single_new(
175
+ image_id, prediction, answer, language, progress
176
+ ):
177
+ """
178
+ process the inference results for a single image and calculate the metrics
179
+
180
+ Parameters:
181
+ image_id (int): The image id (question id).
182
+ prediction (str): The prediction text.
183
+ answer (Union[str, List[str]]): The answer text, or a list of answer texts. The masked n-grams in the image.
184
+ language (str): The language of the text. Can be "en" or "zh".
185
+ rouge (rouge): The rouge metric object.
186
+ progress (multiprocessing.Queue): The progress queue.
187
+
188
+ Returns:
189
+ tuple: The image id (question_id, int) and the result per id (dict of dict of dict).
190
+ """
191
+ result_per_id = {image_id: {}}
192
+ if isinstance(answer, str):
193
+ answer = eval(answer)
194
+ assert isinstance(answer, list)
195
+ result = prediction.split('Assistant: ')[-1]
196
+ for i, crossed_text in enumerate(answer):
197
+ if rough_filter(result):
198
+ find_best_match_result = find_best_match(
199
+ crossed_text, result, language, rouge
200
+ )
201
+ if i == 0:
202
+ result_per_id[image_id] = {str(i): find_best_match_result}
203
+ else:
204
+ result_per_id[image_id][str(i)] = find_best_match_result
205
+ else:
206
+ if i == 0:
207
+ result_per_id[image_id] = {str(i): zero_template(crossed_text)}
208
+ else:
209
+ result_per_id[image_id][str(i)] = zero_template(crossed_text)
210
+ progress.put(1)
211
+ return image_id, result_per_id
212
+
213
+
214
+ class VCRDataset(ImageBaseDataset):
215
+ TYPE = 'VQA'
216
+
217
+ URL_PREFIX = 'https://huggingface.co/datasets/vcr-org'
218
+
219
+ DATASET_URL = {
220
+ 'VCR_EN_EASY_500': f'{URL_PREFIX}/VCR-wiki-en-easy-test-500/resolve/main/VCR-wiki-en-easy-test-500.tsv',
221
+ 'VCR_EN_EASY_100': f'{URL_PREFIX}/VCR-wiki-en-easy-test-100/resolve/main/VCR-wiki-en-easy-test-100.tsv',
222
+ 'VCR_EN_EASY_ALL': f'{URL_PREFIX}/VCR-wiki-en-easy-test/resolve/main/VCR-wiki-en-easy-test.tsv',
223
+ 'VCR_EN_HARD_500': f'{URL_PREFIX}/VCR-wiki-en-hard-test-500/resolve/main/VCR-wiki-en-hard-test-500.tsv',
224
+ 'VCR_EN_HARD_100': f'{URL_PREFIX}/VCR-wiki-en-hard-test-100/resolve/main/VCR-wiki-en-hard-test-100.tsv',
225
+ 'VCR_EN_HARD_ALL': f'{URL_PREFIX}/VCR-wiki-en-hard-test/resolve/main/VCR-wiki-en-hard-test.tsv',
226
+         'VCR_ZH_EASY_500': f'{URL_PREFIX}/VCR-wiki-zh-easy-test-500/resolve/main/VCR-wiki-zh-easy-test-500.tsv',
+         'VCR_ZH_EASY_100': f'{URL_PREFIX}/VCR-wiki-zh-easy-test-100/resolve/main/VCR-wiki-zh-easy-test-100.tsv',
+         'VCR_ZH_EASY_ALL': f'{URL_PREFIX}/VCR-wiki-zh-easy-test/resolve/main/VCR-wiki-zh-easy-test.tsv',
+         'VCR_ZH_HARD_500': f'{URL_PREFIX}/VCR-wiki-zh-hard-test-500/resolve/main/VCR-wiki-zh-hard-test-500.tsv',
+         'VCR_ZH_HARD_100': f'{URL_PREFIX}/VCR-wiki-zh-hard-test-100/resolve/main/VCR-wiki-zh-hard-test-100.tsv',
+         'VCR_ZH_HARD_ALL': f'{URL_PREFIX}/VCR-wiki-zh-hard-test/resolve/main/VCR-wiki-zh-hard-test.tsv',
+     }
+
+     DATASET_MD5 = {
+         'VCR_EN_EASY_500': 'fd9258db52f8685dc710619a0ea0a261',
+         'VCR_EN_EASY_100': '9df5d7266683458621ecbe122beb72f0',
+         'VCR_EN_EASY_ALL': '8a9b96885f251d1c85f42f84073327f1',
+         'VCR_EN_HARD_500': '0a22a85080b6a1f52b1f95e302d43df4',
+         'VCR_EN_HARD_100': '1b20f5cbcbeae0b0bec77f7a36143958',
+         'VCR_EN_HARD_ALL': '2d8b8b1ee0eba0e0b618fd3aa7d9710e',
+         'VCR_ZH_EASY_500': 'beca5fd54176adf44cf94bd9b50cf048',
+         'VCR_ZH_EASY_100': '4a86a5678a79844d6d22ab0629c51cd5',
+         'VCR_ZH_EASY_ALL': '5050fe7f0027ad2068fd4c7f220edaea',
+         'VCR_ZH_HARD_500': '617e3360f75c54455625cb0a8da5c1e7',
+         'VCR_ZH_HARD_100': 'b0e38c85f5d5e63894a3b881c372a62b',
+         'VCR_ZH_HARD_ALL': '54bbfef448206518b03127ef8b61404c',
+     }
+
+     def __init__(self, dataset='VCR_EN_EASY_500', skip_noimg=True):
+         super().__init__(dataset, skip_noimg)
+
+         initialize()
+         self.language = 'en' if 'EN' in dataset else 'zh'
+         self.difficulty = 'easy' if 'EASY' in dataset else 'hard'
+
+     # def build_prompt(self, line):
+     #     msgs = super().build_prompt(line)
+     #     assert msgs[-1]['type'] == 'text'
+     #     if self.language == 'zh':
+     #         msgs[-1]['value'] += '图像中被覆盖的文本是什么?请在不输出解释的情况下还原被覆盖的文本。'
+     #     else:
+     #         msgs[-1]['value'] += ('What is the covered texts in the image? '
+     #                               'Please restore the covered texts without outputting the explanations.')
+     #     return msgs
+
+     def evaluate(self, eval_file, **judge_kwargs):
+         import multiprocessing
+
+         vcr_score_list = {'Exact_Match': [], 'Jaccard': []}
+         vcr_score = {'Exact_Match': 0, 'Jaccard': 0}
+         logger = get_logger('Evaluation')
+         data = load(eval_file)
+
+         lt = len(data)
+         lines = [data.iloc[i] for i in range(lt)]
+
+         pool = multiprocessing.Pool()
+         manager = multiprocessing.Manager()
+         progress_queue = manager.Queue()
+         results = []
+
+         overall_results = {str(image_id): {} for image_id in range(len(lines))}
+
+         for instance_id, instance in enumerate(lines):
+             results.append(
+                 pool.apply_async(
+                     process_match_single_new,
+                     args=(
+                         str(instance_id),
+                         instance['prediction'],
+                         instance['answer'],
+                         self.language,
+                         progress_queue,
+                     ),
+                 )
+             )
+         pool.close()
+
+         # Display progress bar
+         for _ in tqdm(range(len(results))):
+             progress_queue.get()
+
+         pool.join()
+
+         # Merging results into overall_result
+         for result in results:
+             image_id, result_per_id = result.get()
+             overall_results[str(image_id)].update(result_per_id[image_id])
+             for blank_id_str in result_per_id[image_id].keys():
+                 vcr_score_list['Exact_Match'].append(
+                     result_per_id[image_id][blank_id_str]['exact_match']
+                 )
+                 vcr_score_list['Jaccard'].append(
+                     result_per_id[image_id][blank_id_str]['jaccard']
+                 )
+         vcr_score['Exact_Match'] = np.mean(vcr_score_list['Exact_Match'])
+         vcr_score['Jaccard'] = np.mean(vcr_score_list['Jaccard'])
+         results_out = {
+             k: v for i in range(len(results)) for k, v in results[i].get()[1].items()
+         }
+         results_with_metrics = {
+             'Exact_Match': vcr_score['Exact_Match'],
+             'Jaccard': vcr_score['Jaccard'],
+             'Predictions': results_out,
+         }
+         score_pth = eval_file.replace(
+             '.xlsx', f'{self.language}_{self.difficulty}_score.json'
+         )
+         dump(results_with_metrics, score_pth)
+         logger.info(
+             f'VCR successfully finished evaluating {eval_file}, results saved in {score_pth}'
+         )
+         logger.info('Score: ')
+         for key, value in vcr_score.items():
+             logger.info('{}:{}'.format(key, value))
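Note on the two metrics aggregated above: each covered-text blank is scored with an exact-match flag and a Jaccard overlap between predicted and reference tokens, and the per-blank scores are then averaged. The actual scoring happens in `process_match_single_new`, which is defined earlier in vcr.py and not shown in this hunk. The sketch below is only an illustrative approximation: the helper name `blank_scores` and the whitespace tokenization are assumptions, not the library's implementation.

# Hypothetical sketch of the per-blank metrics (assumed names and tokenization,
# not the actual process_match_single_new implementation).
def blank_scores(prediction: str, answer: str) -> dict:
    # Exact match: the whole restored text must equal the reference (case-insensitive here).
    exact_match = float(prediction.strip().lower() == answer.strip().lower())
    # Jaccard: token-set overlap between prediction and reference.
    pred_tokens = set(prediction.lower().split())
    gt_tokens = set(answer.lower().split())
    union = pred_tokens | gt_tokens
    jaccard = len(pred_tokens & gt_tokens) / len(union) if union else 0.0
    return {'exact_match': exact_match, 'jaccard': jaccard}

# Example:
# blank_scores('the quick fox', 'the quick brown fox')
# -> {'exact_match': 0.0, 'jaccard': 0.75}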
vlmeval/VLMEvalKit_old/vlmeval/dataset/video_concat_dataset.py ADDED
@@ -0,0 +1,83 @@
+ from ..smp import *
+ from .video_base import VideoBaseDataset
+
+
+ class ConcatVideoDataset(VideoBaseDataset):
+     # This dataset takes multiple dataset names as input and aggregates them into a single dataset.
+     # Each single dataset should not have a field named `SUB_DATASET`.
+
+     DATASET_SETS = {}
+
+     def __init__(self, dataset):
+         from . import build_dataset
+         datasets = self.DATASET_SETS[dataset]
+         self.dataset_map = {}
+         # The name of the compilation
+         self.dataset_name = dataset
+         self.datasets = datasets
+         for dname in datasets:
+             dataset = build_dataset(dname)
+             assert dataset is not None, dataset
+             self.dataset_map[dname] = dataset
+         TYPES = [x.TYPE for x in self.dataset_map.values()]
+         MODALITIES = [x.MODALITY for x in self.dataset_map.values()]
+         # assert np.all([x == TYPES[0] for x in TYPES]), (datasets, TYPES)
+         assert np.all([x == MODALITIES[0] for x in MODALITIES]), (datasets, MODALITIES)
+         self.TYPE = TYPES
+         self.MODALITY = MODALITIES[0]
+         data_all = []
+         for dname in datasets:
+             data = self.dataset_map[dname].data
+             data['SUB_DATASET'] = [dname] * len(data)
+             data_all.append(data)
+
+         data = pd.concat(data_all)
+         data['original_index'] = data.pop('index')
+         data['index'] = np.arange(len(data))
+         self.data = data
+
+     def build_prompt(self, line, num_frames, video_llm, fps):
+         if isinstance(line, int):
+             line = self.data.iloc[line]
+         idx = line['original_index']
+         dname = line['SUB_DATASET']
+         org_data = self.dataset_map[dname].data
+         org_line = cp.deepcopy(org_data[org_data['index'] == idx]).iloc[0]
+         return self.dataset_map[dname].build_prompt(org_line, num_frames, video_llm, fps)
+
+     def dump_image(self, line):
+         # Assert all images are pre-dumped
+         assert 'image' not in line
+         assert 'image_path' in line
+         tgt_path = toliststr(line['image_path'])
+         return tgt_path
+
+     @classmethod
+     def supported_datasets(cls):
+         return []  # list(cls.DATASET_SETS)
+
+     def evaluate(self, eval_file, **judge_kwargs):
+         suffix = eval_file.split('.')[-1]
+         # First, split the eval_file by dataset
+         data_all = load(eval_file)
+         for dname in self.datasets:
+             tgt = eval_file.replace(self.dataset_name, dname)
+             data_sub = data_all[data_all['SUB_DATASET'] == dname]
+             data_sub.pop('index')
+             data_sub['index'] = data_sub.pop('original_index')
+             data_sub.pop('SUB_DATASET')
+             dump(data_sub, tgt)
+         # Then, evaluate each dataset separately
+         results_all = {}
+         for dname in self.datasets:
+             tgt = eval_file.replace(self.dataset_name, dname)
+             res = self.dataset_map[dname].evaluate(tgt, **judge_kwargs)
+             results_all.update(res)
+
+         result = pd.DataFrame(results_all, index=['success', 'overall'])
+         result = result.T
+         for idx, item in result.iterrows():
+             result.loc[idx, 'acc'] = round(item['success'] / item['overall'] * 100, 1)
+         score_file = eval_file.replace(f'.{suffix}', '_acc.csv')
+         dump(result, score_file)
+         return result
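A usage sketch for the class above: a concrete compilation subclasses ConcatVideoDataset and fills in DATASET_SETS, mapping the compilation name to its constituent video datasets; evaluation then splits the prediction file by SUB_DATASET, scores each piece with the sub-dataset's own evaluator, and merges the accuracies. The subclass name and sub-dataset names below are placeholders for illustration only, not datasets registered in VLMEvalKit.

# Hypothetical subclass; 'SubDatasetA' / 'SubDatasetB' are assumed names that would
# need to be real datasets known to build_dataset for this to run end to end.
class MyVideoSuite(ConcatVideoDataset):
    DATASET_SETS = {
        'MyVideoSuite': ['SubDatasetA', 'SubDatasetB'],
    }

    @classmethod
    def supported_datasets(cls):
        # Expose the compilation name (the base class deliberately returns []).
        return list(cls.DATASET_SETS)

# suite = MyVideoSuite('MyVideoSuite')        # builds each sub-dataset, concatenates their
#                                             # data, and tags every row with SUB_DATASET
# suite.evaluate('MyVideoSuite_model.xlsx')   # per-sub-dataset scores merged into one table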
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (220 Bytes). View file
 
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-311.pyc ADDED
Binary file (274 Bytes). View file
 
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-38.pyc ADDED
Binary file (218 Bytes). View file
 
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-310.pyc ADDED
Binary file (12.7 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-311.pyc ADDED
Binary file (25.5 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-38.pyc ADDED
Binary file (12.8 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/log.cpython-310.pyc ADDED
Binary file (1.23 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/log.cpython-38.pyc ADDED
Binary file (1.23 kB). View file
 
vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/misc.cpython-310.pyc ADDED
Binary file (10.3 kB). View file