diff --git a/PaddleMIX/comfyui/ComfyUI_ppdiffusers/utils/callbacks.py b/PaddleMIX/comfyui/ComfyUI_ppdiffusers/utils/callbacks.py new file mode 100644 index 0000000000000000000000000000000000000000..76a2c1174bae33984665236dfb831db057c123a1 --- /dev/null +++ b/PaddleMIX/comfyui/ComfyUI_ppdiffusers/utils/callbacks.py @@ -0,0 +1,20 @@ +# Copyright (c) 2024 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from typing import Dict + + +def progress_callback(pbar, cls, step, timestep, kwargs) -> Dict: + pbar.update(1) + return {} diff --git a/PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SD15/workflow_SD1.5_inpaint.json b/PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SD15/workflow_SD1.5_inpaint.json new file mode 100644 index 0000000000000000000000000000000000000000..0b5e754d345ec9b92657e233583ee60097bfa66e --- /dev/null +++ b/PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SD15/workflow_SD1.5_inpaint.json @@ -0,0 +1,515 @@ +{ + "last_node_id": 23, + "last_link_id": 33, + "nodes": [ + { + "id": 4, + "type": "PaddleSDVaeDecoder", + "pos": [ + 1011, + 398 + ], + "size": { + "0": 210, + "1": 46 + }, + "flags": {}, + "order": 11, + "mode": 0, + "inputs": [ + { + "name": "latent", + "type": "LATENT", + "link": 33, + "label": "latent" + }, + { + "name": "sd_pipe", + "type": "PIPELINE", + "link": 4, + "label": "sd_pipe" + } + ], + "outputs": [ + { + "name": "image", + "type": "IMAGE", + "links": [ + 3 + ], + "shape": 3, + "label": "image", + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "PaddleSDVaeDecoder" + } + }, + { + "id": 5, + "type": "PaddleSaveImage", + "pos": [ + 1478, + 470 + ], + "size": { + "0": 315, + "1": 270 + }, + "flags": {}, + "order": 12, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 3, + "label": "images" + } + ], + "properties": { + "Node name for S&R": "PaddleSaveImage" + }, + "widgets_values": [ + "ComfyUI" + ] + }, + { + "id": 7, + "type": "LoadImage", + "pos": [ + 50, + 588 + ], + "size": { + "0": 315, + "1": 314 + }, + "flags": {}, + "order": 0, + "mode": 0, + "outputs": [ + { + "name": "IMAGE", + "type": "IMAGE", + "links": [ + 30 + ], + "shape": 3, + "label": "IMAGE", + "slot_index": 0 + }, + { + "name": "MASK", + "type": "MASK", + "links": [ + 28 + ], + "shape": 3, + "label": "MASK", + "slot_index": 1 + } + ], + "properties": { + "Node name for S&R": "LoadImage" + }, + "widgets_values": [ + "clipspace/clipspace-mask-572957.png [input]", + "image" + ], + "color": "#322", + "bgcolor": "#533" + }, + { + "id": 15, + "type": "PromptInput", + "pos": [ + 479, + 1004 + ], + "size": { + "0": 400, + "1": 200 + }, + "flags": {}, + "order": 1, + "mode": 0, + "outputs": [ + { + "name": "prompt", + "type": "PROMPT", + "links": [ + 31 + ], + "shape": 3, + "label": "prompt", + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "PromptInput" + }, + "widgets_values": [ + "1girl, blue hair" + ] + }, + { + "id": 12, + "type": "PromptInput", + "pos": [ + 965, + 954 + ], + "size": { + 
"0": 400, + "1": 200 + }, + "flags": {}, + "order": 2, + "mode": 0, + "outputs": [ + { + "name": "prompt", + "type": "PROMPT", + "links": [ + 32 + ], + "shape": 3, + "label": "prompt", + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "PromptInput" + }, + "widgets_values": [ + "low, error, ugly" + ] + }, + { + "id": 19, + "type": "Note", + "pos": [ + 1406, + 968 + ], + "size": { + "0": 210, + "1": 58 + }, + "flags": {}, + "order": 3, + "mode": 0, + "properties": { + "text": "" + }, + "widgets_values": [ + "这里填负向画面提示 (不想要的内容)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 18, + "type": "Note", + "pos": [ + 254, + 1013 + ], + "size": { + "0": 210, + "1": 58 + }, + "flags": {}, + "order": 4, + "mode": 0, + "properties": { + "text": "" + }, + "widgets_values": [ + "这里填正向画面提示 (想要的内容)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 21, + "type": "Note", + "pos": [ + 990, + 543 + ], + "size": { + "0": 217.51138305664062, + "1": 164.82931518554688 + }, + "flags": {}, + "order": 5, + "mode": 0, + "properties": { + "text": "" + }, + "widgets_values": [ + "- denoise是重绘幅度,越高程度越大\n- steps是画笔绘制的步数\n- number是每次同时绘制的张数\n- cfg可以调整画面细节参数\n- scheduler是不同的去噪声方式" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 22, + "type": "Note", + "pos": [ + 1835, + 498 + ], + "size": { + "0": 210, + "1": 58 + }, + "flags": {}, + "order": 6, + "mode": 0, + "properties": { + "text": "" + }, + "widgets_values": [ + "这里是最终结果" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 23, + "type": "Note", + "pos": [ + 324, + 227 + ], + "size": { + "0": 210, + "1": 58 + }, + "flags": {}, + "order": 7, + "mode": 0, + "properties": { + "text": "" + }, + "widgets_values": [ + "这里选择喜欢的AIGC大模型" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 17, + "type": "PaddleSDInpaintPipe", + "pos": [ + 628, + 526 + ], + "size": { + "0": 315, + "1": 282 + }, + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "sd_pipe", + "type": "PIPELINE", + "link": 29, + "label": "sd_pipe", + "slot_index": 0 + }, + { + "name": "image", + "type": "IMAGE", + "link": 30, + "label": "image" + }, + { + "name": "mask", + "type": "MASK", + "link": 28, + "label": "mask", + "slot_index": 2 + }, + { + "name": "prompt", + "type": "PROMPT", + "link": 31, + "label": "prompt", + "slot_index": 3 + }, + { + "name": "negative_prompt", + "type": "PROMPT", + "link": 32, + "label": "negative_prompt" + } + ], + "outputs": [ + { + "name": "latent", + "type": "LATENT", + "links": [ + 33 + ], + "shape": 3, + "label": "latent", + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "PaddleSDInpaintPipe" + }, + "widgets_values": [ + 0.7000000000000001, + 20, + 1, + 1064456556884681, + "randomize", + 7.5, + "euler" + ] + }, + { + "id": 1, + "type": "PaddleSDCheckpointLoader", + "pos": [ + -36, + 291 + ], + "size": { + "0": 315, + "1": 58 + }, + "flags": {}, + "order": 8, + "mode": 0, + "outputs": [ + { + "name": "sd_pipe", + "type": "PIPELINE", + "links": [ + 4, + 29 + ], + "shape": 3, + "label": "sd_pipe", + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "PaddleSDCheckpointLoader" + }, + "widgets_values": [ + "sd15/人物写真_majicmixRealistic_v7.safetensors" + ] + }, + { + "id": 20, + "type": "Note", + "pos": [ + -204, + 673 + ], + "size": { + "0": 210, + "1": 58 + }, + "flags": {}, + "order": 9, + "mode": 0, + "properties": { + "text": "" + }, + "widgets_values": [ + "这里上传原图像,右键可以打开MaskEditor进行mask绘制。" + ], + "color": "#432", + "bgcolor": "#653" + } + ], + "links": 
[ + [ + 3, + 4, + 0, + 5, + 0, + "IMAGE" + ], + [ + 4, + 1, + 0, + 4, + 1, + "PIPELINE" + ], + [ + 28, + 7, + 1, + 17, + 2, + "MASK" + ], + [ + 29, + 1, + 0, + 17, + 0, + "PIPELINE" + ], + [ + 30, + 7, + 0, + 17, + 1, + "IMAGE" + ], + [ + 31, + 15, + 0, + 17, + 3, + "PROMPT" + ], + [ + 32, + 12, + 0, + 17, + 4, + "PROMPT" + ], + [ + 33, + 17, + 0, + 4, + 0, + "LATENT" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.6303940863128514, + "offset": [ + 628.0768100805229, + 63.29978438298349 + ] + } + }, + "version": 0.4 +} \ No newline at end of file diff --git a/PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SD15/workflow_SD1.5_text2img.json b/PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SD15/workflow_SD1.5_text2img.json new file mode 100644 index 0000000000000000000000000000000000000000..2c7ae117bf7b83242009df5ea56c9ec43af5e9e3 --- /dev/null +++ b/PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SD15/workflow_SD1.5_text2img.json @@ -0,0 +1,416 @@ +{ + "last_node_id": 25, + "last_link_id": 42, + "nodes": [ + { + "id": 4, + "type": "PaddleSDVaeDecoder", + "pos": [ + 1011, + 398 + ], + "size": { + "0": 210, + "1": 46 + }, + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "latent", + "type": "LATENT", + "link": 42, + "label": "latent" + }, + { + "name": "sd_pipe", + "type": "PIPELINE", + "link": 4, + "label": "sd_pipe" + } + ], + "outputs": [ + { + "name": "image", + "type": "IMAGE", + "links": [ + 3 + ], + "shape": 3, + "label": "image", + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "PaddleSDVaeDecoder" + } + }, + { + "id": 19, + "type": "Note", + "pos": [ + 1406, + 968 + ], + "size": { + "0": 210, + "1": 58 + }, + "flags": {}, + "order": 0, + "mode": 0, + "properties": { + "text": "" + }, + "widgets_values": [ + "这里填负向画面提示 (不想要的内容)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 18, + "type": "Note", + "pos": [ + 254, + 1013 + ], + "size": { + "0": 210, + "1": 58 + }, + "flags": {}, + "order": 1, + "mode": 0, + "properties": { + "text": "" + }, + "widgets_values": [ + "这里填正向画面提示 (想要的内容)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 21, + "type": "Note", + "pos": [ + 990, + 543 + ], + "size": { + "0": 217.51138305664062, + "1": 164.82931518554688 + }, + "flags": {}, + "order": 2, + "mode": 0, + "properties": { + "text": "" + }, + "widgets_values": [ + "- denoise是重绘幅度,越高程度越大\n- steps是画笔绘制的步数\n- number是每次同时绘制的张数\n- cfg可以调整画面细节参数\n- scheduler是不同的去噪声方式" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 22, + "type": "Note", + "pos": [ + 1835, + 498 + ], + "size": { + "0": 210, + "1": 58 + }, + "flags": {}, + "order": 3, + "mode": 0, + "properties": { + "text": "" + }, + "widgets_values": [ + "这里是最终结果" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 23, + "type": "Note", + "pos": [ + 324, + 227 + ], + "size": { + "0": 210, + "1": 58 + }, + "flags": {}, + "order": 4, + "mode": 0, + "properties": { + "text": "" + }, + "widgets_values": [ + "这里选择喜欢的AIGC大模型" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 1, + "type": "PaddleSDCheckpointLoader", + "pos": [ + -36, + 291 + ], + "size": { + "0": 315, + "1": 58 + }, + "flags": {}, + "order": 5, + "mode": 0, + "outputs": [ + { + "name": "sd_pipe", + "type": "PIPELINE", + "links": [ + 4, + 39 + ], + "shape": 3, + "label": "sd_pipe", + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "PaddleSDCheckpointLoader" + }, + "widgets_values": [ + "sd15/25D风_revAnimated_v122.safetensors" + ] + }, + { + "id": 15, + 
"type": "PromptInput", + "pos": [ + 479, + 1004 + ], + "size": { + "0": 400, + "1": 200 + }, + "flags": {}, + "order": 6, + "mode": 0, + "outputs": [ + { + "name": "prompt", + "type": "PROMPT", + "links": [ + 40 + ], + "shape": 3, + "label": "prompt", + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "PromptInput" + }, + "widgets_values": [ + "1boy, blue hair, cute, anime style" + ] + }, + { + "id": 12, + "type": "PromptInput", + "pos": [ + 965, + 964 + ], + "size": { + "0": 400, + "1": 200 + }, + "flags": {}, + "order": 7, + "mode": 0, + "outputs": [ + { + "name": "prompt", + "type": "PROMPT", + "links": [ + 41 + ], + "shape": 3, + "label": "prompt", + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "PromptInput" + }, + "widgets_values": [ + "low, error, ugly, (extra hand), wrong hand, nsfw, nude, extra head" + ] + }, + { + "id": 5, + "type": "PaddleSaveImage", + "pos": [ + 1478, + 470 + ], + "size": { + "0": 315, + "1": 270 + }, + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 3, + "label": "images" + } + ], + "properties": { + "Node name for S&R": "PaddleSaveImage" + }, + "widgets_values": [ + "ComfyUI" + ] + }, + { + "id": 25, + "type": "PaddleSDText2ImagePipe", + "pos": [ + 636, + 537 + ], + "size": { + "0": 315, + "1": 266 + }, + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "sd_pipe", + "type": "PIPELINE", + "link": 39, + "label": "sd_pipe" + }, + { + "name": "prompt", + "type": "PROMPT", + "link": 40, + "label": "prompt" + }, + { + "name": "negative_prompt", + "type": "PROMPT", + "link": 41, + "label": "negative_prompt" + } + ], + "outputs": [ + { + "name": "latent", + "type": "LATENT", + "links": [ + 42 + ], + "shape": 3, + "label": "latent", + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "PaddleSDText2ImagePipe" + }, + "widgets_values": [ + 20, + 512, + 768, + 1, + 61130596064161, + "randomize", + 7.5, + "euler" + ] + } + ], + "links": [ + [ + 3, + 4, + 0, + 5, + 0, + "IMAGE" + ], + [ + 4, + 1, + 0, + 4, + 1, + "PIPELINE" + ], + [ + 39, + 1, + 0, + 25, + 0, + "PIPELINE" + ], + [ + 40, + 15, + 0, + 25, + 1, + "PROMPT" + ], + [ + 41, + 12, + 0, + 25, + 2, + "PROMPT" + ], + [ + 42, + 25, + 0, + 4, + 0, + "LATENT" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.7627768444385535, + "offset": [ + 342.353878460601, + -167.10478701820625 + ] + } + }, + "version": 0.4 +} \ No newline at end of file diff --git a/PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SDXL/workflow_SDXL_text2img.json b/PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SDXL/workflow_SDXL_text2img.json new file mode 100644 index 0000000000000000000000000000000000000000..9ed8e318171fe23a44935c6856c89ebe5e942cc6 --- /dev/null +++ b/PaddleMIX/comfyui/ComfyUI_ppdiffusers/workflows/SDXL/workflow_SDXL_text2img.json @@ -0,0 +1,416 @@ +{ + "last_node_id": 28, + "last_link_id": 51, + "nodes": [ + { + "id": 19, + "type": "Note", + "pos": [ + 1406, + 968 + ], + "size": { + "0": 210, + "1": 58 + }, + "flags": {}, + "order": 0, + "mode": 0, + "properties": { + "text": "" + }, + "widgets_values": [ + "这里填负向画面提示 (不想要的内容)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 18, + "type": "Note", + "pos": [ + 254, + 1013 + ], + "size": { + "0": 210, + "1": 58 + }, + "flags": {}, + "order": 1, + "mode": 0, + "properties": { + "text": "" + }, + "widgets_values": [ + "这里填正向画面提示 (想要的内容)" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 21, + "type": 
"Note", + "pos": [ + 990, + 543 + ], + "size": { + "0": 217.51138305664062, + "1": 164.82931518554688 + }, + "flags": {}, + "order": 2, + "mode": 0, + "properties": { + "text": "" + }, + "widgets_values": [ + "- denoise是重绘幅度,越高程度越大\n- steps是画笔绘制的步数\n- number是每次同时绘制的张数\n- cfg可以调整画面细节参数\n- scheduler是不同的去噪声方式" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 22, + "type": "Note", + "pos": [ + 1835, + 498 + ], + "size": { + "0": 210, + "1": 58 + }, + "flags": {}, + "order": 3, + "mode": 0, + "properties": { + "text": "" + }, + "widgets_values": [ + "这里是最终结果" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 23, + "type": "Note", + "pos": [ + 324, + 227 + ], + "size": { + "0": 210, + "1": 58 + }, + "flags": {}, + "order": 4, + "mode": 0, + "properties": { + "text": "" + }, + "widgets_values": [ + "这里选择喜欢的AIGC大模型" + ], + "color": "#432", + "bgcolor": "#653" + }, + { + "id": 5, + "type": "PaddleSaveImage", + "pos": [ + 1478, + 470 + ], + "size": { + "0": 315, + "1": 270 + }, + "flags": {}, + "order": 10, + "mode": 0, + "inputs": [ + { + "name": "images", + "type": "IMAGE", + "link": 51, + "label": "images" + } + ], + "properties": { + "Node name for S&R": "PaddleSaveImage" + }, + "widgets_values": [ + "ComfyUI" + ] + }, + { + "id": 12, + "type": "PromptInput", + "pos": [ + 965, + 964 + ], + "size": { + "0": 400, + "1": 200 + }, + "flags": {}, + "order": 5, + "mode": 0, + "outputs": [ + { + "name": "prompt", + "type": "PROMPT", + "links": [ + 48 + ], + "shape": 3, + "label": "prompt", + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "PromptInput" + }, + "widgets_values": [ + "low, error, ugly, (extra hand), wrong hand, nsfw, nude, extra head" + ] + }, + { + "id": 28, + "type": "PaddleSDXLVaeDecoder", + "pos": [ + 1115.8165436384072, + 359.29368984194616 + ], + "size": { + "0": 210, + "1": 46 + }, + "flags": {}, + "order": 9, + "mode": 0, + "inputs": [ + { + "name": "latent", + "type": "LATENT", + "link": 50, + "label": "latent" + }, + { + "name": "sd_pipe", + "type": "PIPELINE", + "link": 49, + "label": "sd_pipe" + } + ], + "outputs": [ + { + "name": "image", + "type": "IMAGE", + "links": [ + 51 + ], + "shape": 3, + "label": "image", + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "PaddleSDXLVaeDecoder" + } + }, + { + "id": 27, + "type": "PaddleSDXLCheckpointLoader", + "pos": [ + 53, + 413 + ], + "size": { + "0": 315, + "1": 58 + }, + "flags": {}, + "order": 6, + "mode": 0, + "outputs": [ + { + "name": "sd_pipe", + "type": "PIPELINE", + "links": [ + 45, + 49 + ], + "shape": 3, + "label": "sd_pipe", + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "PaddleSDXLCheckpointLoader" + }, + "widgets_values": [ + "sdxl/MJ5风格_SDXL_Dream.safetensors" + ] + }, + { + "id": 15, + "type": "PromptInput", + "pos": [ + 479, + 1004 + ], + "size": { + "0": 400, + "1": 200 + }, + "flags": {}, + "order": 7, + "mode": 0, + "outputs": [ + { + "name": "prompt", + "type": "PROMPT", + "links": [ + 44 + ], + "shape": 3, + "label": "prompt", + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "PromptInput" + }, + "widgets_values": [ + "1girl, cool, blue hair, cute, sunset, niji anime style" + ] + }, + { + "id": 26, + "type": "PaddleSDXLText2ImagePipe", + "pos": [ + 503, + 573 + ], + "size": { + "0": 315, + "1": 266 + }, + "flags": {}, + "order": 8, + "mode": 0, + "inputs": [ + { + "name": "sd_pipe", + "type": "PIPELINE", + "link": 45, + "label": "sd_pipe" + }, + { + "name": "prompt", + "type": "PROMPT", + "link": 44, + "label": 
"prompt" + }, + { + "name": "negative_prompt", + "type": "PROMPT", + "link": 48, + "label": "negative_prompt" + } + ], + "outputs": [ + { + "name": "latent", + "type": "LATENT", + "links": [ + 50 + ], + "shape": 3, + "label": "latent", + "slot_index": 0 + } + ], + "properties": { + "Node name for S&R": "PaddleSDXLText2ImagePipe" + }, + "widgets_values": [ + 20, + 512, + 768, + 1, + 351732349249869, + "randomize", + 5, + "euler" + ] + } + ], + "links": [ + [ + 44, + 15, + 0, + 26, + 1, + "PROMPT" + ], + [ + 45, + 27, + 0, + 26, + 0, + "PIPELINE" + ], + [ + 48, + 12, + 0, + 26, + 2, + "PROMPT" + ], + [ + 49, + 27, + 0, + 28, + 1, + "PIPELINE" + ], + [ + 50, + 26, + 0, + 28, + 0, + "LATENT" + ], + [ + 51, + 28, + 0, + 5, + 0, + "IMAGE" + ] + ], + "groups": [], + "config": {}, + "extra": { + "ds": { + "scale": 0.5730855330116872, + "offset": [ + 113.53226463291708, + -145.5843663012114 + ] + } + }, + "version": 0.4 +} \ No newline at end of file diff --git a/vlmeval/VLMEvalKit_old/docs/en/.readthedocs.yaml b/vlmeval/VLMEvalKit_old/docs/en/.readthedocs.yaml new file mode 100644 index 0000000000000000000000000000000000000000..c6cf8e2a075ea15f39dc7aba8faa98f464f52fe6 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/docs/en/.readthedocs.yaml @@ -0,0 +1,17 @@ +version: 2 + +# Set the version of Python and other tools you might need +build: + os: ubuntu-22.04 + tools: + python: "3.8" + +formats: + - epub + +sphinx: + configuration: docs/en/conf.py + +python: + install: + - requirements: requirements/docs.txt diff --git a/vlmeval/VLMEvalKit_old/docs/en/ConfigSystem.md b/vlmeval/VLMEvalKit_old/docs/en/ConfigSystem.md new file mode 100644 index 0000000000000000000000000000000000000000..6b245f35ea6ef83896f47ad5433f4aa71715e32f --- /dev/null +++ b/vlmeval/VLMEvalKit_old/docs/en/ConfigSystem.md @@ -0,0 +1,57 @@ +# Config System + +By default, VLMEvalKit launches the evaluation by setting the model name(s) (defined in `/vlmeval/config.py`) and dataset name(s) (defined in `vlmeval/dataset/__init__.py`) in the `run.py` script with the `--model` and `--data` arguments. Such approach is simple and efficient in most scenarios, however, it may not be flexible enough when the user wants to evaluate multiple models / datasets with different settings. + +To address this, VLMEvalKit provides a more flexible config system. The user can specify the model and dataset settings in a json file, and pass the path to the config file to the `run.py` script with the `--config` argument. Here is a sample config json: + +```json +{ + "model": { + "GPT4o_20240806_T00_HIGH": { + "class": "GPT4V", + "model": "gpt-4o-2024-08-06", + "temperature": 0, + "img_detail": "high" + }, + "GPT4o_20240806_T10_Low": { + "class": "GPT4V", + "model": "gpt-4o-2024-08-06", + "temperature": 1.0, + "img_detail": "low" + } + }, + "data": { + "MME-RealWorld-Lite": { + "class": "MMERealWorld", + "dataset": "MME-RealWorld-Lite" + }, + "MMBench_DEV_EN_V11": { + "class": "ImageMCQDataset", + "dataset": "MMBench_DEV_EN_V11" + } + } +} +``` + +Explanation of the config json: + +1. Now we support two fields: `model` and `data`, each of which is a dictionary. The key of the dictionary is the name of the model / dataset (set by the user), and the value is the setting of the model / dataset. +2. For items in `model`, the value is a dictionary containing the following keys: + - `class`: The class name of the model, which should be a class name defined in `vlmeval/vlm/__init__.py` (open-source models) or `vlmeval/api/__init__.py` (API models). 
+ - Other kwargs: Other kwargs are model-specific parameters, please refer to the definition of the model class for detailed usage. For example, `model`, `temperature`, `img_detail` are arguments of the `GPT4V` class. It's noteworthy that the `model` argument is required by most model classes. +3. For the dictionary `data`, we suggest users to use the official dataset name as the key (or part of the key), since we frequently determine the post-processing / judging settings based on the dataset name. For items in `data`, the value is a dictionary containing the following keys: + - `class`: The class name of the dataset, which should be a class name defined in `vlmeval/dataset/__init__.py`. + - Other kwargs: Other kwargs are dataset-specific parameters, please refer to the definition of the dataset class for detailed usage. Typically, the `dataset` argument is required by most dataset classes. + +Saving the example config json to `config.json`, you can launch the evaluation by: + +```bash +python run.py --config config.json +``` + +That will generate the following output files under the working directory `$WORK_DIR` (Following the format `{$WORK_DIR}/{$MODEL_NAME}/{$MODEL_NAME}_{$DATASET_NAME}_*`): + +- `$WORK_DIR/GPT4o_20240806_T00_HIGH/GPT4o_20240806_T00_HIGH_MME-RealWorld-Lite*` +- `$WORK_DIR/GPT4o_20240806_T10_Low/GPT4o_20240806_T10_Low_MME-RealWorld-Lite*` +- `$WORK_DIR/GPT4o_20240806_T00_HIGH/GPT4o_20240806_T00_HIGH_MMBench_DEV_EN_V11*` +- `$WORK_DIR/GPT4o_20240806_T10_Low/GPT4o_20240806_T10_Low_MMBench_DEV_EN_V11*` diff --git a/vlmeval/VLMEvalKit_old/docs/en/Development.md b/vlmeval/VLMEvalKit_old/docs/en/Development.md new file mode 100644 index 0000000000000000000000000000000000000000..c0a4637e98feef1257698d81b23a15b2b61843d4 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/docs/en/Development.md @@ -0,0 +1,146 @@ +# Develop new Benchmark / MLLM + +> 🛠️ How to implement a new Benchmark / VLM in VLMEvalKit? + +## Implement a new benchmark + +Example PR: **Math-Vision Benchmark** ([#292](https://github.com/open-compass/VLMEvalKit/pull/292/files)) + +In VLMEvalKit, benchmarks are organized as dataset classes. When you try to implement a new benchmark, you can either reuse existing dataset classes (*e.g.*, You can reuse `ImageMCQDataset` when implementing a new multi-choice benchmark), or support a new dataset class. Each dataset must have the following two member functions (either reuse the one of the parent class or implement your own): + +- `build_prompt(self, line)`: The function input `line` is an integer (the sample index) or a `pd.Series` object (the raw record of the sample). The function outputs a `multi-modal message`, serving as the input of an MLLM. The `multi-modal message` is an interleaved list of multi-modal messages adopting the following format (the example includes an image and a text message): `[dict(type='image', value=IMAGE_PTH), dict(type='text', value=prompt)]`. +- `evaluate(self, eval_file, **judge_kwargs)`: The function input `eval_file` is the MLLM prediction (typically in `.xlsx` format). If the benchmark requires an external LLM (typically GPT) for evaluation, then `judge_kwargs` can pass the arguments for the LLM. The function outputs the benchmark evaluation results (metrics) in the form of `dict` or `pd.DataFrame`. + +We then brief the typical steps to implement a new benchmark under VLMEvalKit: + +### 1. Prepare your benchmark tsv file + +Currently, we organize a benchmark as one single TSV file. 
During inference, the data file will be automatically downloaded from the defined `DATASET_URL` link to `$LMUData` (the default path is `$HOME/LMUData` if not set explicitly). You can upload the prepared TSV file to a downloadable address (e.g., Huggingface) or send it to us at . We will assist in uploading the dataset to the server. You can also customize the `LMUData` path via the environment variable `LMUData=/path/to/your/data`. + +The contents of the TSV file consist of the following fields (a minimal construction sketch is given after Table 1): + +| Dataset Name \ Fields | index | image | image_path | question | hint | multi-choice
options | answer | category | l2-category | split | +| --------------------------------------- | ----- | ----- | ---------- | -------- | ---- | ----------------------- | ------ | -------- | ----------- | ----- | +| MMBench_DEV_[CN/EN] | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ | +| MMBench_TEST_[CN/EN] | ✅ | ✅ | | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ | +| CCBench | ✅ | ✅ | | ✅ | | ✅ | ✅ | ✅ | | | +| SEEDBench_IMG | ✅ | ✅ | | ✅ | | ✅ | ✅ | ✅ | | | +| MME | ✅ | ✅ | | ✅ | | | ✅ | ✅ | | | +| CORE_MM | ✅ | ✅ | ✅ | ✅ | | | | ✅ | | | +| MMVet | ✅ | ✅ | | ✅ | | | ✅ | ✅ | | | +| MMMU_DEV_VAL | ✅ | ✅ | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | +| COCO_VAL | ✅ | ✅ | | | | | ✅ | | | | +| OCRVQA_[TEST/TESTCORE] | ✅ | ✅ | | ✅ | | | ✅ | | | | +| TextVQA_VAL | ✅ | ✅ | | ✅ | | | ✅ | | | | +| VCR_[EN/ZH]\_[EASY/HARD]\_[ALL/500/100] | ✅ | ✅ | | ✅ | | | ✅ | | | | +| MMMB_[en/cn/pt/ar/tr/ru] | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | |✅ | +| MMBench_dev_[en/cn/pt/ar/tr/ru] | ✅ | ✅ | | ✅ | ✅ | ✅ | ✅ | ✅ | ✅ |✅ | + +
+<div align="center"><b>Table 1. TSV fields of supported datasets.</b></div>
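For a concrete picture of a single row, here is a minimal, hypothetical sketch of assembling such a TSV with pandas and the base64 helper described in the field notes below; the option columns (`A`-`D`) and the exact import path are illustrative assumptions rather than part of the official spec.

```python
# A minimal sketch (not an official tool) of building a tiny benchmark TSV.
# Assumptions: pandas is available, the base64 helper is importable as shown
# (see the field notes below), and multi-choice options are stored as columns A-D.
import pandas as pd
from vlmeval.smp import encode_image_file_to_base64  # assumed import path

records = [
    dict(
        index=0,                                          # unique integer per line
        image=encode_image_file_to_base64('demo/0.jpg'),  # base64 of the image file
        question='What is the color of the sky in the picture?',
        A='blue', B='green', C='red', D='yellow',         # hypothetical option columns
        answer='A',
    ),
]
pd.DataFrame(records).to_csv('MyBenchmark.tsv', sep='\t', index=False)
```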
+ +**Intro to mandatory fields in the `TSV` file:** + +- **index:** Integer, Unique for each line in `tsv` +- **image:** The base64 of the image, you can use APIs implemented in `vlmeval/smp/vlm.py` for encoding and decoding: + - Encoding: `encode_image_to_base64 `(for PIL Image) / `encode_image_file_to_base64` (for image file path) + - Decoding: `decode_base64_to_image`(for PIL Image) / `decode_base64_to_image_file` (for image file path) +- **question**: The question corresponding to the image, a string +- **answer**: The answer to the question, a string. The `test` split does not need this field + +### 2. Cutomize your benchmark prompt + +`ImageBaseDataset` defines the default prompt format. If you need to add prompts specific to the dataset or input data in the `Interleave` format to the model, you can implement this through the `build_prompt(line)` function. This function takes a line from a TSV file as input, containing fields such as index, image, question, etc. The function returns a dictionary list of multimodal messages `msg` in the format `[dict(type='image', value=IMAGE_PTH), dict(type='text', value=prompt)]`, including the image path and the text prompt to be input into VLMs. For interleave type inputs, you can directly place the dictionary of the image path at the image token position. + +### 3. Cutomize your benchmark metrics + +To add evaluation for a new benchmark, you need to customize a class object to implement the dataset’s metrics calculation. Multimodal datasets inherit from the `ImageBaseDataset` object in `vlmeval/dataset/image_base.py`. The TYPE defines the type of dataset, `DATASET_URL` is the download address of the dataset, and `DATASET_MD5` is the MD5 checksum for consistency checking of the dataset file. + +In this class, **you need to implement** the `evaluate(eval_file, **judge_kwargs)` class function to calculate metrics and output results for the custom dataset. The function input `eval_file` is the path to the model prediction results file `{model_name}_{dataset}.xlsx`. This file can be read as a pandas.DataFrame using the `load(eval_file)` method, containing fields such as index, question, answer, category, prediction, etc. The judge_kwargs will pass a dictionary related to evaluation, such as the name of the `judge model`, the number of API request threads, etc. **The return value** of the function is the calculated accuracy and other metrics, formatted as a dictionary composed of lists, organized into a pandas.DataFrame. + +## Implement a new model + +Example PR: **Support LLaVA-Next-Interleave** ([#294](https://github.com/open-compass/VLMEvalKit/pull/294)) + +**1. Support `generate_inner` API (mandatory).** + +All existing models are implemented in `vlmeval/vlm`. For a minimal model, your model class **must implement the method** `generate_inner(msgs, dataset=None)`. In this function, you feed a multi-modal message to your VLM and return the VLM prediction (which is a string). The optional argument `dataset` can be used as the flag for the model to switch among various inference strategies. + +The multi-modal messages `msgs` is a list of dictionaries, each dictionary has two keys: type and value: +- `type`: We currently support two types, choices are ["image", "text"]. +- `value`: When type=='text' , the value is the text message (a single string); when type=='image', the value can be the local path of an image file, or the image URL. + +Currently a multi-modal message may contain arbitrarily interleaved images and texts. 
If your model does not support that, one practice is to take the first image and the concatenated text messages as the input. You can set `INTERLEAVE = False` in your model class and use `self.message_to_promptimg(message, dataset=dataset)` to build your prompt and the first image's path. + +Here are some examples of multi-modal messages: + +```python +IMAGE_PTH = 'assets/apple.jpg' +IMAGE_URL = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/assets/apple.jpg' +msg1 = [ + dict(type='image', value=IMAGE_PTH), + dict(type='text', value='What is in this image?') +] +msg2 = [ + dict(type='image', value=IMAGE_URL), + dict(type='image', value=IMAGE_URL), + dict(type='text', value='How many apples are there in these images?') +] +response = model.generate(msg1) +``` + +For convenience's sake, we also support taking a list of strings as input. In that case, we will check whether each string is an image path or an image URL and automatically convert it to the list[dict] format: + +```python +IMAGE_PTH = 'assets/apple.jpg' +IMAGE_URL = 'https://raw.githubusercontent.com/open-compass/VLMEvalKit/main/assets/apple.jpg' +msg1 = [IMAGE_PTH, 'What is in this image?'] +msg2 = [IMAGE_URL, IMAGE_URL, 'How many apples are there in these images?'] +response = model.generate(msg1) +``` + +**Support Custom Prompt (optional).** + +Besides, your model can support **custom prompt building** by implementing two optional methods: `use_custom_prompt(dataset)` and `build_prompt(line, dataset=None)`. + +Both functions take the dataset name as the input: + +- `use_custom_prompt(dataset)` returns a boolean flag, indicating whether the model should use the custom prompt building strategy. +- If `use_custom_prompt(dataset)` returns True, `build_prompt(line, dataset)` should return a custom-built multimodal message for the corresponding `dataset`, given `line`, a dictionary that includes the necessary information of a data sample. If `use_custom_prompt(dataset)` returns False, the default prompt building strategy will be used. + +**Support multi-turn chatting (optional).** + +You can also support multi-turn chatting and evaluation with your VLM by implementing the `chat_inner(message, dataset)` function. The function outputs a single string response, and `message` is a list of chat history entries that follows the format below. + +```python +# Assume msg1, msg2, msg3, ... are multi-modal messages following the previously described format +# `chat_inner` takes the following chat history list as input: +message = [ + dict(role='user', content=msg1), + dict(role='assistant', content=msg2), + dict(role='user', content=msg3), + dict(role='assistant', content=msg4), + ...... + dict(role='user', content=msgn), +] +# `message` should contain an odd number of chat utterances; the roles should alternate between "user" and "assistant", with the last utterance coming from "user". 
+# The chat function will call `chat_inner` +response = model.chat(message) +``` + +### Example PRs: + +- VLM that doesn't support interleaved images and texts, and does not use custom prompts: [[Model] Support glm-4v-9b](https://github.com/open-compass/VLMEvalKit/pull/221) +- VLM that supports interleaved images and texts and custom prompts: [Add MiniCPM-Llama3-V-2.5](https://github.com/open-compass/VLMEvalKit/pull/205) +- VLM API: [Feature add glmv](https://github.com/open-compass/VLMEvalKit/pull/201) + +## Contribute to VLMEvalKit + +If you want to contribute codes to **VLMEvalKit**, please do the pre-commit check before you submit a PR. That helps to keep the code tidy. + +```bash +# Under the directory of VLMEvalKit, install the pre-commit hook: +pip install pre-commit +pre-commit install +pre-commit run --all-files +# Then you can commit your code. +``` diff --git a/vlmeval/VLMEvalKit_old/docs/en/Makefile b/vlmeval/VLMEvalKit_old/docs/en/Makefile new file mode 100644 index 0000000000000000000000000000000000000000..d4bb2cbb9eddb1bb1b4f366623044af8e4830919 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/docs/en/Makefile @@ -0,0 +1,20 @@ +# Minimal makefile for Sphinx documentation +# + +# You can set these variables from the command line, and also +# from the environment for the first two. +SPHINXOPTS ?= +SPHINXBUILD ?= sphinx-build +SOURCEDIR = . +BUILDDIR = _build + +# Put it first so that "make" without argument is like "make help". +help: + @$(SPHINXBUILD) -M help "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) + +.PHONY: help Makefile + +# Catch-all target: route all unknown targets to Sphinx using the new +# "make mode" option. $(O) is meant as a shortcut for $(SPHINXOPTS). +%: Makefile + @$(SPHINXBUILD) -M $@ "$(SOURCEDIR)" "$(BUILDDIR)" $(SPHINXOPTS) $(O) diff --git a/vlmeval/VLMEvalKit_old/docs/en/Quickstart.md b/vlmeval/VLMEvalKit_old/docs/en/Quickstart.md new file mode 100644 index 0000000000000000000000000000000000000000..bbd78307e3890f0c63d51a2b54e82ee4fe1f099b --- /dev/null +++ b/vlmeval/VLMEvalKit_old/docs/en/Quickstart.md @@ -0,0 +1,148 @@ +# Quickstart + +Before running the evaluation script, you need to **configure** the VLMs and set the model_paths properly. + +After that, you can use a single script `run.py` to inference and evaluate multiple VLMs and benchmarks at a same time. + +## Step 0. Installation & Setup essential keys + +**Installation.** + +```bash +git clone https://github.com/open-compass/VLMEvalKit.git +cd VLMEvalKit +pip install -e . +``` + +**Setup Keys.** + +To infer with API models (GPT-4v, Gemini-Pro-V, etc.) or use LLM APIs as the **judge or choice extractor**, you need to first setup API keys. VLMEvalKit will use an judge **LLM** to extract answer from the output if you set the key, otherwise it uses the **exact matching** mode (find "Yes", "No", "A", "B", "C"... in the output strings). **The exact matching can only be applied to the Yes-or-No tasks and the Multi-choice tasks.** +- You can place the required keys in `$VLMEvalKit/.env` or directly set them as the environment variable. If you choose to create a `.env` file, its content will look like: + + ```bash + # The .env file, place it under $VLMEvalKit + # API Keys of Proprietary VLMs + # QwenVL APIs + DASHSCOPE_API_KEY= + # Gemini w. 
Google Cloud Backends + GOOGLE_API_KEY= + # OpenAI API + OPENAI_API_KEY= + OPENAI_API_BASE= + # StepAI API + STEPAI_API_KEY= + # REKA API + REKA_API_KEY= + # GLMV API + GLMV_API_KEY= + # CongRong API + CW_API_BASE= + CW_API_KEY= + # SenseChat-V API + SENSECHAT_AK= + SENSECHAT_SK= + # Hunyuan-Vision API + HUNYUAN_SECRET_KEY= + HUNYUAN_SECRET_ID= + # You can also set a proxy for calling api models during the evaluation stage + EVAL_PROXY= + ``` + +- Fill the blanks with your API keys (if necessary). Those API keys will be automatically loaded when doing the inference and evaluation. +## Step 1. Configuration + +**VLM Configuration**: All VLMs are configured in `vlmeval/config.py`. Few legacy VLMs (like MiniGPT-4, LLaVA-v1-7B) requires additional configuration (configuring the code / model_weight root in the config file). During evaluation, you should use the model name specified in `supported_VLM` in `vlmeval/config.py` to select the VLM. Make sure you can successfully infer with the VLM before starting the evaluation with the following command `vlmutil check {MODEL_NAME}`. + +## Step 2. Evaluation + +**New!!!** We integrated a new config system to enable more flexible evaluation settings. Check the [Document](/docs/en/ConfigSystem.md) or run `python run.py --help` for more details 🔥🔥🔥 + +We use `run.py` for evaluation. To use the script, you can use `$VLMEvalKit/run.py` or create a soft-link of the script (to use the script anywhere): + +**Arguments** + +- `--data (list[str])`: Set the dataset names that are supported in VLMEvalKit (names can be found in the codebase README). +- `--model (list[str])`: Set the VLM names that are supported in VLMEvalKit (defined in `supported_VLM` in `vlmeval/config.py`). +- `--mode (str, default to 'all', choices are ['all', 'infer'])`: When `mode` set to "all", will perform both inference and evaluation; when set to "infer", will only perform the inference. +- `--nproc (int, default to 4)`: The number of threads for OpenAI API calling. +- `--work-dir (str, default to '.')`: The directory to save evaluation results. +- `--nframe (int, default to 8)`: The number of frames to sample from a video, only applicable to the evaluation of video benchmarks. +- `--pack (bool, store_true)`: A video may associate with multiple questions, if `pack==True`, will ask all questions for a video in a single query. + +**Command for Evaluating Image Benchmarks ** + +You can run the script with `python` or `torchrun`: + +```bash +# When running with `python`, only one VLM instance is instantiated, and it might use multiple GPUs (depending on its default behavior). +# That is recommended for evaluating very large VLMs (like IDEFICS-80B-Instruct). + +# IDEFICS-80B-Instruct on MMBench_DEV_EN, MME, and SEEDBench_IMG, Inference and Evalution +python run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model idefics_80b_instruct --verbose +# IDEFICS-80B-Instruct on MMBench_DEV_EN, MME, and SEEDBench_IMG, Inference only +python run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model idefics_80b_instruct --verbose --mode infer + +# When running with `torchrun`, one VLM instance is instantiated on each GPU. It can speed up the inference. +# However, that is only suitable for VLMs that consume small amounts of GPU memory. + +# IDEFICS-9B-Instruct, Qwen-VL-Chat, mPLUG-Owl2 on MMBench_DEV_EN, MME, and SEEDBench_IMG. On a node with 8 GPU. Inference and Evaluation. 
+torchrun --nproc-per-node=8 run.py --data MMBench_DEV_EN MME SEEDBench_IMG --model idefics_80b_instruct qwen_chat mPLUG-Owl2 --verbose +# Qwen-VL-Chat on MME. On a node with 2 GPU. Inference and Evaluation. +torchrun --nproc-per-node=2 run.py --data MME --model qwen_chat --verbose +``` + +**Command for Evaluating Video Benchmarks** + +```bash +# When running with `python`, only one VLM instance is instantiated, and it might use multiple GPUs (depending on its default behavior). +# That is recommended for evaluating very large VLMs (like IDEFICS-80B-Instruct). + +# IDEFICS2-8B on MMBench-Video, with 8 frames as inputs and vanilla evaluation. On a node with 8 GPUs. +torchrun --nproc-per-node=8 run.py --data MMBench-Video --model idefics2_8b --nframe 8 +# GPT-4o (API model) on MMBench-Video, with 16 frames as inputs and pack evaluation (all questions of a video in a single query). +python run.py --data MMBench-Video --model GPT4o --nframe 16 --pack +``` + +The evaluation results will be printed as logs, besides. **Result Files** will also be generated in the directory `$YOUR_WORKING_DIRECTORY/{model_name}`. Files ending with `.csv` contain the evaluated metrics. + +## Deploy a local language model as the judge / choice extractor +The default setting mentioned above uses OpenAI's GPT as the judge LLM. However, you can also deploy a local judge LLM with [LMDeploy](https://github.com/InternLM/lmdeploy). + +First install: +``` +pip install lmdeploy openai +``` + +And then deploy a local judge LLM with the single line of code. LMDeploy will automatically download the model from Huggingface. Assuming we use internlm2-chat-1_8b as the judge, port 23333, and the key sk-123456 (the key must start with "sk-" and follow with any number you like): +``` +lmdeploy serve api_server internlm/internlm2-chat-1_8b --server-port 23333 +``` + +You need to get the model name registered by LMDeploy with the following python code: +``` +from openai import OpenAI +client = OpenAI( + api_key='sk-123456', + base_url="http://0.0.0.0:23333/v1" +) +model_name = client.models.list().data[0].id +``` + +Now set some environment variables to tell VLMEvalKit how to use the local judge LLM. As mentioned above, you can also set them in `$VLMEvalKit/.env` file: +``` +OPENAI_API_KEY=sk-123456 +OPENAI_API_BASE=http://0.0.0.0:23333/v1/chat/completions +LOCAL_LLM= +``` + +Finally, you can run the commands in step 2 to evaluate your VLM with the local judge LLM. + +Note that + +- If you hope to deploy the judge LLM in a single GPU and evaluate your VLM on other GPUs because of limited GPU memory, try `CUDA_VISIBLE_DEVICES=x` like +``` +CUDA_VISIBLE_DEVICES=0 lmdeploy serve api_server internlm/internlm2-chat-1_8b --server-port 23333 +CUDA_VISIBLE_DEVICES=1,2,3 torchrun --nproc-per-node=3 run.py --data HallusionBench --model qwen_chat --verbose +``` +- If the local judge LLM is not good enough in following the instructions, the evaluation may fail. Please report such failures (e.g., by issues). +- It's possible to deploy the judge LLM in different ways, e.g., use a private LLM (not from HuggingFace) or use a quantized LLM. Please refer to the [LMDeploy doc](https://lmdeploy.readthedocs.io/en/latest/serving/api_server.html). You can use any other deployment framework if they support OpenAI API. 
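Before launching a full run against the local judge, it can be worth a one-off sanity check of the endpoint. The snippet below is just a sketch (not part of VLMEvalKit); it assumes the LMDeploy server from the example above is listening on port 23333 with the key `sk-123456`, and simply sends a single test request through the OpenAI-compatible API.

```python
# Sanity-check the locally deployed judge LLM before running an evaluation.
# Assumes the LMDeploy server shown above is running at http://0.0.0.0:23333
# with the key sk-123456 (both values are placeholders from the example).
from openai import OpenAI

client = OpenAI(api_key='sk-123456', base_url='http://0.0.0.0:23333/v1')
model_name = client.models.list().data[0].id  # name registered by LMDeploy

resp = client.chat.completions.create(
    model=model_name,
    messages=[{'role': 'user', 'content': 'Reply with the single word OK.'}],
    temperature=0,
)
print(model_name, '->', resp.choices[0].message.content)
```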
diff --git a/vlmeval/VLMEvalKit_old/docs/en/conf.py b/vlmeval/VLMEvalKit_old/docs/en/conf.py new file mode 100644 index 0000000000000000000000000000000000000000..360c1622dd18fcca8c033af9122383cd66c5f686 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/docs/en/conf.py @@ -0,0 +1,234 @@ +# flake8: noqa +# Configuration file for the Sphinx documentation builder. +# +# This file only contains a selection of the most common options. For a full +# list see the documentation: +# https://www.sphinx-doc.org/en/master/usage/configuration.html + +# -- Path setup -------------------------------------------------------------- + +# If extensions (or modules to document with autodoc) are in another directory, +# add these directories to sys.path here. If the directory is relative to the +# documentation root, use os.path.abspath to make it absolute, like shown here. +# +import os +import ast +import subprocess +import sys + +import pytorch_sphinx_theme +from sphinx.builders.html import StandaloneHTMLBuilder + +sys.path.insert(0, os.path.abspath('../../')) + +# -- Project information ----------------------------------------------------- + +project = 'VLMEvalKit' +copyright = '2023, VLMEvalKit' +author = 'VLMEvalKit Authors' + +# The full version, including alpha/beta/rc tags +version_file = '../../vlmeval/__init__.py' + + +def get_version(): + with open(version_file, 'r') as f: + file_content = f.read() + # Parse the file content into an abstract syntax tree (AST) + tree = ast.parse(file_content, filename=version_file) + + # Iterate through the body of the AST, looking for an assignment to __version__ + for node in tree.body: + if isinstance(node, ast.Assign): + for target in node.targets: + if isinstance(target, ast.Name) and target.id == '__version__': + return node.value.s + raise ValueError('__version__ not found') + + +release = get_version() + +# -- General configuration --------------------------------------------------- + +# Add any Sphinx extension module names here, as strings. They can be +# extensions coming with Sphinx (named 'sphinx.ext.*') or your custom +# ones. +extensions = [ + 'sphinx.ext.autodoc', + 'sphinx.ext.autosummary', + 'sphinx.ext.intersphinx', + 'sphinx.ext.napoleon', + 'sphinx.ext.viewcode', + 'myst_parser', + 'sphinx_copybutton', + 'sphinx_tabs.tabs', + 'notfound.extension', + 'sphinxcontrib.jquery', + 'sphinx_design', +] + +# Add any paths that contain templates here, relative to this directory. +templates_path = ['_templates'] + +# The suffix(es) of source filenames. +# You can specify multiple suffix as a list of string: +# +source_suffix = { + '.rst': 'restructuredtext', + '.md': 'markdown', +} + +language = 'en' + +# The master toctree document. +root_doc = 'index' +html_context = { + 'github_version': 'latest', +} +# List of patterns, relative to source directory, that match files and +# directories to ignore when looking for source files. +# This pattern also affects html_static_path and html_extra_path. +exclude_patterns = ['_build', 'Thumbs.db', '.DS_Store'] + +# -- Options for HTML output ------------------------------------------------- + +# The theme to use for HTML and HTML Help pages. See the documentation for +# a list of builtin themes. +# +html_theme = 'pytorch_sphinx_theme' +html_theme_path = [pytorch_sphinx_theme.get_html_theme_path()] + +# Theme options are theme-specific and customize the look and feel of a theme +# further. For a list of options available for each theme, see the +# documentation. 
+# yapf: disable +html_theme_options = { + 'menu': [ + { + 'name': 'GitHub', + 'url': 'https://github.com/open-compass/VLMEvalKit' + }, + ], + # Specify the language of shared menu + 'menu_lang': 'en', + # Disable the default edit on GitHub + 'default_edit_on_github': False, +} +# yapf: enable + +# Add any paths that contain custom static files (such as style sheets) here, +# relative to this directory. They are copied after the builtin static files, +# so a file named "default.css" will overwrite the builtin "default.css". +html_static_path = ['_static'] +html_css_files = [ + 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.css', + 'css/readthedocs.css' +] +html_js_files = [ + 'https://cdn.datatables.net/v/bs4/dt-1.12.1/datatables.min.js', + 'js/custom.js' +] + +# -- Options for HTMLHelp output --------------------------------------------- + +# Output file base name for HTML help builder. +htmlhelp_basename = 'vlmevalkitdoc' + +# -- Options for LaTeX output ------------------------------------------------ + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + # + # 'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + # + # 'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + # + # 'preamble': '', +} + +# Grouping the document tree into LaTeX files. List of tuples +# (source start file, target name, title, +# author, documentclass [howto, manual, or own class]). +latex_documents = [ + (root_doc, 'vlmevalkit.tex', 'VLMEvalKit Documentation', author, + 'manual'), +] + +# -- Options for manual page output ------------------------------------------ + +# One entry per manual page. List of tuples +# (source start file, name, description, authors, manual section). +man_pages = [(root_doc, 'vlmevalkit', 'VLMEvalKit Documentation', [author], + 1)] + +# -- Options for Texinfo output ---------------------------------------------- + +# Grouping the document tree into Texinfo files. List of tuples +# (source start file, target name, title, author, +# dir menu entry, description, category) +texinfo_documents = [ + (root_doc, 'vlmevalkit', 'VLMEvalKit Documentation', author, + 'VLMEvalKit Authors', 'AGI evaluation toolbox and benchmark.', + 'Miscellaneous'), +] + +# -- Options for Epub output ------------------------------------------------- + +# Bibliographic Dublin Core info. +epub_title = project + +# The unique identifier of the text. This can be a ISBN number +# or the project homepage. +# +# epub_identifier = '' + +# A unique identification for the text. +# +# epub_uid = '' + +# A list of files that should not be packed into the epub file. +epub_exclude_files = ['search.html'] + +# set priority when building html +StandaloneHTMLBuilder.supported_image_types = [ + 'image/svg+xml', 'image/gif', 'image/png', 'image/jpeg' +] + +# -- Extension configuration ------------------------------------------------- +# Ignore >>> when copying code +copybutton_prompt_text = r'>>> |\.\.\. ' +copybutton_prompt_is_regexp = True + +# Auto-generated header anchors +myst_heading_anchors = 3 +# Enable "colon_fence" extension of myst. 
+myst_enable_extensions = ['colon_fence', 'dollarmath'] + +# Configuration for intersphinx +intersphinx_mapping = { + 'python': ('https://docs.python.org/3', None), + 'numpy': ('https://numpy.org/doc/stable', None), + 'torch': ('https://pytorch.org/docs/stable/', None), + 'mmengine': ('https://mmengine.readthedocs.io/en/latest/', None), + 'transformers': + ('https://huggingface.co/docs/transformers/main/en/', None), +} +napoleon_custom_sections = [ + # Custom sections for data elements. + ('Meta fields', 'params_style'), + ('Data fields', 'params_style'), +] + +# Disable docstring inheritance +autodoc_inherit_docstrings = False +# Mock some imports during generate API docs. +autodoc_mock_imports = ['rich', 'attr', 'einops'] +# Disable displaying type annotations, these can be very verbose +autodoc_typehints = 'none' + +# The not found page +notfound_template = '404.html' diff --git a/vlmeval/VLMEvalKit_old/docs/en/index.rst b/vlmeval/VLMEvalKit_old/docs/en/index.rst new file mode 100644 index 0000000000000000000000000000000000000000..425c7de4de85670f8fd7a64d65fb786a9006f7e1 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/docs/en/index.rst @@ -0,0 +1,41 @@ +Welcome to the VLMEvalKit Tutorial! +========================================== + +VLMEvalKit Getting Started Guide +------------------------------- + +To help users get started quickly, we recommend the following process: + +- For users who want to use VLMEvalKit, we recommend reading the "Start Your First Step" section to set up the environment and start a mini-experiment to familiarize yourself with the process. + +- If you want to customize more modules, such as adding datasets and models, we provide an "Advanced Tutorial." + +We always welcome users' PRs (Pull Requests) and Issues to improve VLMEvalKit! + +.. _Start Your First Step: +.. toctree:: + :maxdepth: 1 + :caption: Start Your First Step + + Quickstart.md + +.. _Advanced Tutorial: +.. toctree:: + :maxdepth: 1 + :caption: Advanced Tutorial + + Development.md + ConfigSystem.md + +.. _Other Notes: +.. 
toctree:: + :maxdepth: 1 + :caption: Other Notes + + Contributors.md + +Index and Tables +================== + +* :ref:`genindex` +* :ref:`search` diff --git a/vlmeval/VLMEvalKit_old/docs/zh-CN/_static/css/readthedocs.css b/vlmeval/VLMEvalKit_old/docs/zh-CN/_static/css/readthedocs.css new file mode 100644 index 0000000000000000000000000000000000000000..c83beffd261d9d7cb79dc499aec7187474639d89 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/docs/zh-CN/_static/css/readthedocs.css @@ -0,0 +1,63 @@ +.header-logo { + background-image: url("../image/logo.svg"); + background-size: 275px 80px; + height: 80px; + width: 275px; +} + + +@media screen and (min-width: 1100px) { + .header-logo { + top: -25px; + } +} + +pre { + white-space: pre; +} + +@media screen and (min-width: 2000px) { + .pytorch-content-left { + width: 1200px; + margin-left: 30px; + } + article.pytorch-article { + max-width: 1200px; + } + .pytorch-breadcrumbs-wrapper { + width: 1200px; + } + .pytorch-right-menu.scrolling-fixed { + position: fixed; + top: 45px; + left: 1580px; + } +} + + +article.pytorch-article section code { + padding: .2em .4em; + background-color: #f3f4f7; + border-radius: 5px; +} + +/* Disable the change in tables */ +article.pytorch-article section table code { + padding: unset; + background-color: unset; + border-radius: unset; +} + +table.autosummary td { + width: 50% +} + +img.align-center { + display: block; + margin-left: auto; + margin-right: auto; +} + +article.pytorch-article p.rubric { + font-weight: bold; +} diff --git a/vlmeval/VLMEvalKit_old/docs/zh-CN/_static/image/logo.svg b/vlmeval/VLMEvalKit_old/docs/zh-CN/_static/image/logo.svg new file mode 100644 index 0000000000000000000000000000000000000000..043530572afb48d0eac26b4b53d448aae6e9a9af --- /dev/null +++ b/vlmeval/VLMEvalKit_old/docs/zh-CN/_static/image/logo.svg @@ -0,0 +1,24 @@ + + + +Created with Fabric.js 5.3.0 + + + + + + + + + + + + + VLMEvalKit + diff --git a/vlmeval/VLMEvalKit_old/docs/zh-CN/_templates/404.html b/vlmeval/VLMEvalKit_old/docs/zh-CN/_templates/404.html new file mode 100644 index 0000000000000000000000000000000000000000..64910175d5d69946845b04d5e6a378de205e8388 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/docs/zh-CN/_templates/404.html @@ -0,0 +1,18 @@ +{% extends "layout.html" %} + +{% block body %} + +

+<h1>Page Not Found</h1>
+
+<p>
+  The page you are looking for cannot be found.
+</p>
+
+<p>
+  If you just switched documentation versions, it is likely that the page you were on has been moved. You can look for
+  it in the table of contents on the left, or go to the homepage.
+</p>
+ + +{% endblock %} diff --git a/vlmeval/VLMEvalKit_old/vlmeval/__pycache__/__init__.cpython-311.pyc b/vlmeval/VLMEvalKit_old/vlmeval/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..e0c7576d4acf626496556139a40cabb02739bc47 Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/__pycache__/__init__.cpython-311.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/__pycache__/__init__.cpython-38.pyc b/vlmeval/VLMEvalKit_old/vlmeval/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..494bf2f9b348ac3c963575d413efc6963054f76c Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/__pycache__/__init__.cpython-38.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/__pycache__/config.cpython-310.pyc b/vlmeval/VLMEvalKit_old/vlmeval/__pycache__/config.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..57b1fdb50ba13918c43eb0ae7e98fdc7d3dce323 Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/__pycache__/config.cpython-310.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/__pycache__/inference_mt.cpython-310.pyc b/vlmeval/VLMEvalKit_old/vlmeval/__pycache__/inference_mt.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..2d7775e16c858b5f754097189b162e57f26a4856 Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/__pycache__/inference_mt.cpython-310.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/__pycache__/inference_video.cpython-310.pyc b/vlmeval/VLMEvalKit_old/vlmeval/__pycache__/inference_video.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..ff236c9972cf7e26b92d0e401c541fc64ea52d04 Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/__pycache__/inference_video.cpython-310.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/api/bluelm_v_api.py b/vlmeval/VLMEvalKit_old/vlmeval/api/bluelm_v_api.py new file mode 100644 index 0000000000000000000000000000000000000000..a994f468b22c124665584f72b526775b8c05651b --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/api/bluelm_v_api.py @@ -0,0 +1,120 @@ +from vlmeval.smp import * +from vlmeval.api.base import BaseAPI +import os +import json + + +def multimodal(images, text, url, key, temperature=0, max_tokens=1024, history=[]): + if images: + pics = [] + for image in images: + with open(image, 'rb') as f: + pic = base64.b64encode(f.read()).decode('utf-8') + pics.append(pic) + data = {'images': pics, 'text': text, 'key': key, 'temperature': temperature, 'max_new_tokens': max_tokens} + else: + data = {'text': text, 'key': key, 'temperature': temperature, 'max_new_tokens': max_tokens} + response = requests.post(url, json=data, headers={'Content-Type': 'application/json'}) + response = json.loads(response.text) + return response + + +class BlueLMWrapper(BaseAPI): + is_api: bool = True + + def __init__(self, + model: str = 'BlueLM-V-v3.0', + retry: int = 5, + wait: int = 5, + verbose: bool = True, + temperature: float = 0.0, + system_prompt: str = None, + max_tokens: int = 1024, + key: str = None, + url: str = 'http://api-ai.vivo.com.cn/multimodal', + **kwargs): + + self.model = model + self.fail_msg = 'Failed to obtain answer BlueLM-V API. 
' + self.max_tokens = max_tokens + self.temperature = temperature + self.url = url + self.key = key + + if self.key is None: + self.key = os.environ.get('BLUELM_V_API_KEY', None) + assert self.key is not None, ( + 'Please set the API Key (obtain it here: ' + 'contact by email : shuai.ren@vivo.com' + ) + + super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + def message_to_promptimg(self, message, dataset=None): + + num_images = len([x for x in message if x['type'] == 'image']) + if num_images == 0: + prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text']) + image = None + elif num_images == 1: + prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text']) + image = [x['value'] for x in message if x['type'] == 'image'] + else: + prompt = '\n'.join([x['value'] if x['type'] == 'text' else '' for x in message]) + if dataset == 'BLINK': + image = concat_images_vlmeval( + [x['value'] for x in message if x['type'] == 'image'], + target_size=512) + else: + image = [x['value'] for x in message if x['type'] == 'image'] + + if dataset in ['MMBench_DEV_EN_V11', 'MMBench_DEV_CN_V11', 'MMBench_TEST_EN_V11', 'MMBench_TEST_CN_V11', + 'AI2D_TEST', 'AI2D_TEST_TO_MASK', 'MMMU_DEV_VAL']: + prompt = prompt.replace('Please select the correct answer from the options above.', + 'Answer with the option’s letter from the given choices directly.') + elif dataset in ['ChartQA_TEST']: + prompt = prompt.replace('Answer the question using a single word or phrase.', + 'Answer the question using a single number or phrase.') + elif dataset in ['DocVQA_VAL', 'DocVQA_TEST', ]: + prompt = prompt.replace('Answer the question using a single word or phrase.', + 'Give the short answer directly.') + elif dataset in ['TextVQA_VAL']: + prompt = prompt.replace('Answer the question using a single word or phrase.', + 'When the provided information is insufficient, respond with ’Unanswerable’.' + 'Answer the question using a single word or phrase.') + elif dataset in ['MTVQA_TEST']: + prompt = prompt.replace('\nAnswer the question using a word or phrase in the language of the question.', '') + elif dataset in ['MathVista_MINI']: + if 'Choices:' in prompt: + prompt = prompt.replace('Choices:', 'Options:').replace('Hint:', 'Context:') + for i in range(1, 7): # replace A ~ F + prompt = prompt.replace(f'({chr(64 + i)})', f'{chr(64 + i)}.') + prompt += '\nAnswer with the option’s letter from the given choices directly.' + else: + prompt += '\nAnswer the question using a single word or phrase.' + + return prompt, image + + def generate_inner(self, inputs, **kwargs) -> str: + + assert isinstance(inputs, str) or isinstance(inputs, list) + pure_text = np.all([x['type'] == 'text' for x in inputs]) + assert not pure_text + + prompt, image_path = self.message_to_promptimg(inputs, kwargs['dataset']) + + try: + response = multimodal(image_path, prompt, self.url, self.key, self.temperature, self.max_tokens) + answer = response['result'] + return 0, answer, 'Succeeded! 
' + except Exception as err: + if self.verbose: + self.logger.error(f'{type(err)}: {err}') + self.logger.error(f'The input messages are {inputs}.') + return -1, '', '' + + +class BlueLM_V_API(BlueLMWrapper): + + def generate(self, message, dataset=None): + return super(BlueLM_V_API, self).generate(message, dataset=dataset) diff --git a/vlmeval/VLMEvalKit_old/vlmeval/api/gpt.py b/vlmeval/VLMEvalKit_old/vlmeval/api/gpt.py new file mode 100644 index 0000000000000000000000000000000000000000..1e86cc733b463d2c7dbf429074fbf9dca4894006 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/api/gpt.py @@ -0,0 +1,263 @@ +from ..smp import * +import os +import sys +from .base import BaseAPI + +APIBASES = { + 'OFFICIAL': 'https://api.openai.com/v1/chat/completions', +} + + +def GPT_context_window(model): + length_map = { + 'gpt-4': 8192, + 'gpt-4-0613': 8192, + 'gpt-4-turbo-preview': 128000, + 'gpt-4-1106-preview': 128000, + 'gpt-4-0125-preview': 128000, + 'gpt-4-vision-preview': 128000, + 'gpt-4-turbo': 128000, + 'gpt-4-turbo-2024-04-09': 128000, + 'gpt-3.5-turbo': 16385, + 'gpt-3.5-turbo-0125': 16385, + 'gpt-3.5-turbo-1106': 16385, + 'gpt-3.5-turbo-instruct': 4096, + } + if model in length_map: + return length_map[model] + else: + return 128000 + + +class OpenAIWrapper(BaseAPI): + + is_api: bool = True + + def __init__(self, + model: str = 'gpt-3.5-turbo-0613', + retry: int = 5, + wait: int = 5, + key: str = None, + verbose: bool = False, + system_prompt: str = None, + temperature: float = 0, + timeout: int = 60, + api_base: str = None, + max_tokens: int = 1024, + img_size: int = 512, + img_detail: str = 'low', + use_azure: bool = False, + **kwargs): + + self.model = model + self.cur_idx = 0 + self.fail_msg = 'Failed to obtain answer via API. ' + self.max_tokens = max_tokens + self.temperature = temperature + self.use_azure = use_azure + + if 'step-1v' in model: + env_key = os.environ.get('STEPAI_API_KEY', '') + if key is None: + key = env_key + elif 'yi-vision' in model: + env_key = os.environ.get('YI_API_KEY', '') + if key is None: + key = env_key + elif 'internvl2-pro' in model: + env_key = os.environ.get('InternVL2_PRO_KEY', '') + if key is None: + key = env_key + else: + if use_azure: + env_key = os.environ.get('AZURE_OPENAI_API_KEY', None) + assert env_key is not None, 'Please set the environment variable AZURE_OPENAI_API_KEY. ' + + if key is None: + key = env_key + assert isinstance(key, str), ( + 'Please set the environment variable AZURE_OPENAI_API_KEY to your openai key. ' + ) + else: + env_key = os.environ.get('OPENAI_API_KEY', '') + if key is None: + key = env_key + assert isinstance(key, str) and key.startswith('sk-'), ( + f'Illegal openai_key {key}. ' + 'Please set the environment variable OPENAI_API_KEY to your openai key. ' + ) + + self.key = key + assert img_size > 0 or img_size == -1 + self.img_size = img_size + assert img_detail in ['high', 'low'] + self.img_detail = img_detail + self.timeout = timeout + + super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + if use_azure: + api_base_template = ( + '{endpoint}openai/deployments/{deployment_name}/chat/completions?api-version={api_version}' + ) + endpoint = os.getenv('AZURE_OPENAI_ENDPOINT', None) + assert endpoint is not None, 'Please set the environment variable AZURE_OPENAI_ENDPOINT. ' + deployment_name = os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME', None) + assert deployment_name is not None, 'Please set the environment variable AZURE_OPENAI_DEPLOYMENT_NAME. 
' + api_version = os.getenv('OPENAI_API_VERSION', None) + assert api_version is not None, 'Please set the environment variable OPENAI_API_VERSION. ' + + self.api_base = api_base_template.format( + endpoint=os.getenv('AZURE_OPENAI_ENDPOINT'), + deployment_name=os.getenv('AZURE_OPENAI_DEPLOYMENT_NAME'), + api_version=os.getenv('OPENAI_API_VERSION') + ) + else: + if api_base is None: + if 'OPENAI_API_BASE' in os.environ and os.environ['OPENAI_API_BASE'] != '': + self.logger.info('Environment variable OPENAI_API_BASE is set. Will use it as api_base. ') + api_base = os.environ['OPENAI_API_BASE'] + else: + api_base = 'OFFICIAL' + + assert api_base is not None + + if api_base in APIBASES: + self.api_base = APIBASES[api_base] + elif api_base.startswith('http'): + self.api_base = api_base + else: + self.logger.error('Unknown API Base. ') + raise NotImplementedError + + self.logger.info(f'Using API Base: {self.api_base}; API Key: {self.key}') + + # inputs can be a lvl-2 nested list: [content1, content2, content3, ...] + # content can be a string or a list of image & text + def prepare_itlist(self, inputs): + assert np.all([isinstance(x, dict) for x in inputs]) + has_images = np.sum([x['type'] == 'image' for x in inputs]) + if has_images: + content_list = [] + for msg in inputs: + if msg['type'] == 'text': + content_list.append(dict(type='text', text=msg['value'])) + elif msg['type'] == 'image': + from PIL import Image + img = Image.open(msg['value']) + b64 = encode_image_to_base64(img, target_size=self.img_size) + img_struct = dict(url=f'data:image/jpeg;base64,{b64}', detail=self.img_detail) + content_list.append(dict(type='image_url', image_url=img_struct)) + else: + assert all([x['type'] == 'text' for x in inputs]) + text = '\n'.join([x['value'] for x in inputs]) + content_list = [dict(type='text', text=text)] + return content_list + + def prepare_inputs(self, inputs): + input_msgs = [] + if self.system_prompt is not None: + input_msgs.append(dict(role='system', content=self.system_prompt)) + assert isinstance(inputs, list) and isinstance(inputs[0], dict) + assert np.all(['type' in x for x in inputs]) or np.all(['role' in x for x in inputs]), inputs + if 'role' in inputs[0]: + assert inputs[-1]['role'] == 'user', inputs[-1] + for item in inputs: + input_msgs.append(dict(role=item['role'], content=self.prepare_itlist(item['content']))) + else: + input_msgs.append(dict(role='user', content=self.prepare_itlist(inputs))) + return input_msgs + + def generate_inner(self, inputs, **kwargs) -> str: + input_msgs = self.prepare_inputs(inputs) + temperature = kwargs.pop('temperature', self.temperature) + max_tokens = kwargs.pop('max_tokens', self.max_tokens) + + context_window = GPT_context_window(self.model) + new_max_tokens = min(max_tokens, context_window - self.get_token_len(inputs)) + if 0 < new_max_tokens <= 100 and new_max_tokens < max_tokens: + self.logger.warning( + 'Less than 100 tokens left, ' + 'may exceed the context window with some additional meta symbols. ' + ) + if new_max_tokens <= 0: + return 0, self.fail_msg + 'Input string longer than context window. ', 'Length Exceeded. 
' + max_tokens = new_max_tokens + + # Will send request if use Azure, dk how to use openai client for it + if self.use_azure: + headers = {'Content-Type': 'application/json', 'api-key': self.key} + elif 'internvl2-pro' in self.model: + headers = {'Content-Type': 'application/json', 'Authorization': self.key} + else: + headers = {'Content-Type': 'application/json', 'Authorization': f'Bearer {self.key}'} + payload = dict( + model=self.model, + messages=input_msgs, + max_tokens=max_tokens, + n=1, + temperature=temperature, + **kwargs) + response = requests.post( + self.api_base, + headers=headers, data=json.dumps(payload), timeout=self.timeout * 1.1) + ret_code = response.status_code + ret_code = 0 if (200 <= int(ret_code) < 300) else ret_code + answer = self.fail_msg + try: + resp_struct = json.loads(response.text) + answer = resp_struct['choices'][0]['message']['content'].strip() + except Exception as err: + if self.verbose: + self.logger.error(f'{type(err)}: {err}') + self.logger.error(response.text if hasattr(response, 'text') else response) + + return ret_code, answer, response + + def get_image_token_len(self, img_path, detail='low'): + import math + if detail == 'low': + return 85 + + im = Image.open(img_path) + height, width = im.size + if width > 1024 or height > 1024: + if width > height: + height = int(height * 1024 / width) + width = 1024 + else: + width = int(width * 1024 / height) + height = 1024 + + h = math.ceil(height / 512) + w = math.ceil(width / 512) + total = 85 + 170 * h * w + return total + + def get_token_len(self, inputs) -> int: + import tiktoken + try: + enc = tiktoken.encoding_for_model(self.model) + except Exception as err: + if 'gpt' in self.model.lower(): + if self.verbose: + self.logger.warning(f'{type(err)}: {err}') + enc = tiktoken.encoding_for_model('gpt-4') + else: + return 0 + assert isinstance(inputs, list) + tot = 0 + for item in inputs: + if 'role' in item: + tot += self.get_token_len(item['content']) + elif item['type'] == 'text': + tot += len(enc.encode(item['value'])) + elif item['type'] == 'image': + tot += self.get_image_token_len(item['value'], detail=self.img_detail) + return tot + + +class GPT4V(OpenAIWrapper): + + def generate(self, message, dataset=None): + return super(GPT4V, self).generate(message) diff --git a/vlmeval/VLMEvalKit_old/vlmeval/api/sensechat_vision.py b/vlmeval/VLMEvalKit_old/vlmeval/api/sensechat_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..e4ace4ca5ef2c9fb7bcdc07e4a297268037591f0 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/api/sensechat_vision.py @@ -0,0 +1,257 @@ +from vlmeval.smp import * +from vlmeval.api.base import BaseAPI +from vlmeval.dataset import img_root_map +from vlmeval.dataset import DATASET_TYPE + + +class SenseChatVisionWrapper(BaseAPI): + + is_api: bool = True + + def __init__(self, + model: str = 'SenseChat-5-Vision', + retry: int = 5, + wait: int = 5, + ak: str = None, + sk: str = None, + verbose: bool = True, + system_prompt: str = None, + max_tokens: int = 1024, + proxy: str = None, + **kwargs): + + self.model = model + self.fail_msg = 'Failed to obtain answer via API. 
' + self.ak = os.environ.get('SENSECHAT_AK', None) if ak is None else ak + self.sk = os.environ.get('SENSECHAT_SK', None) if sk is None else sk + assert self.ak is not None and self.sk is not None + self.max_new_tokens = max_tokens + super().__init__(wait=wait, retry=retry, system_prompt=system_prompt, verbose=verbose, **kwargs) + + def dump_image(self, line, dataset): + """Dump the image(s) of the input line to the corresponding dataset folder. + + Args: + line (line of pd.DataFrame): The raw input line. + dataset (str): The name of the dataset. + + Returns: + str | list[str]: The paths of the dumped images. + """ + ROOT = LMUDataRoot() + assert isinstance(dataset, str) + img_root = osp.join(ROOT, 'images', img_root_map[dataset] if dataset in img_root_map else dataset) + os.makedirs(img_root, exist_ok=True) + if 'image' in line: + if isinstance(line['image'], list): + tgt_path = [] + assert 'image_path' in line + for img, im_name in zip(line['image'], line['image_path']): + path = osp.join(img_root, im_name) + if not read_ok(path): + decode_base64_to_image_file(img, path) + tgt_path.append(path) + else: + tgt_path = osp.join(img_root, f"{line['index']}.jpg") + if not read_ok(tgt_path): + decode_base64_to_image_file(line['image'], tgt_path) + tgt_path = [tgt_path] + else: + assert 'image_path' in line + tgt_path = toliststr(line['image_path']) + + return tgt_path + + def image_to_base64(self, image_path): + import base64 + with open(image_path, 'rb') as image_file: + encoded_string = base64.b64encode(image_file.read()) + return encoded_string.decode('utf-8') + + def encode_jwt_token(self, ak, sk): + import jwt + headers = {'alg': 'HS256', 'typ': 'JWT'} + payload = { + 'iss': ak, + 'exp': int(time.time()) + + 1800, # 填写您期望的有效时间,此处示例代表当前时间+30分钟 + 'nbf': int(time.time()) - 5, # 填写您期望的生效时间,此处示例代表当前时间-5秒 + } + token = jwt.encode(payload, sk, headers=headers) + return token + + def use_custom_prompt(self, dataset): + return True + + def build_multi_choice_prompt(self, line, dataset=None): + question = line['question'] + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + if hint is not None: + question = hint + '\n' + question + + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + for key, item in options.items(): + question += f'\n{key}. {item}' + prompt = question + + if len(options): + prompt += '\n请直接回答选项字母。' if cn_string( + prompt) else "\nAnswer with the option's letter from the given choices directly." + else: + prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.' + + return prompt + + def build_prompt(self, line, dataset=None): + assert self.use_custom_prompt(dataset) + assert dataset is None or isinstance(dataset, str) + + tgt_path = self.dump_image(line, dataset) + + if dataset is not None and listinstr(['MME'], dataset): + question = line['question'] + prompt = question + ' Answer the question using a single word or phrase.' + elif dataset is not None and listinstr(['HallusionBench'], dataset): + question = line['question'] + prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.' 
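+        # HallusionBench is scored as yes/no, so the suffix above forces a single-word
+        # binary answer; the branches below attach their own dataset-specific
+        # answer-format hints instead.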
+ elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ' and 'MMMU' not in dataset: + prompt = self.build_multi_choice_prompt(line, dataset) + elif dataset is not None and DATASET_TYPE(dataset) == 'VQA': + if 'MathVista' in dataset: + prompt = line['question'] + elif listinstr(['LLaVABench'], dataset): + question = line['question'] + prompt = question + '\nAnswer this question in detail.' + elif listinstr(['MMVet'], dataset): + prompt = line['question'] + else: + question = line['question'] + prompt = question + '\nAnswer the question using a single word or phrase.' + elif dataset is not None and 'MMMU' in dataset: + question = line['question'] + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + for key, item in options.items(): + question += f'\n{key}. {item}' + prompt = { + 'multiple-choice': 'You are an expert in {}. Please solve the university-level {} examination question, which includes interleaved images and text. Your output should be divided into two parts: First, reason about the correct answer. Then write the answer in the following format where X is exactly one of the choices given by the problem: "ANSWER: X". If you are uncertain of the correct answer, guess the most likely one.', # noqa: E501 + 'open': 'You are an expert in {}. Please solve the university-level {} examination question, which includes interleaved images and text. Your output should be divided into two parts: First, reason about the correct answer. Then write the answer in the following format where X is only the answer and nothing else: "ANSWER: X"' # noqa: E501 + } + subject = '_'.join(line['id'].split('_')[1:-1]) + prompt = prompt[line['question_type']].format(subject, subject) + '\n' + question + else: + prompt = line['question'] + + message = [dict(type='text', value=prompt)] + message.extend([dict(type='image', value=s) for s in tgt_path]) + + return message + + def message_to_promptimg(self, message, dataset=None): + if dataset is None or listinstr(['MMMU', 'BLINK'], dataset): + prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text']) + image = [[x['value'] for x in message if x['type'] == 'image'][0]] + else: + prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text']) + image = [x['value'] for x in message if x['type'] == 'image'] + return prompt, image + + def generate_inner(self, inputs, **kwargs) -> str: + assert isinstance(inputs, str) or isinstance(inputs, list) + inputs = [inputs] if isinstance(inputs, str) else inputs + dataset = kwargs.get('dataset', None) + + if dataset is not None and listinstr(['ChartQA_TEST'], dataset): + self.max_num = 12 + elif dataset is not None and listinstr(['DocVQA_VAL', 'DocVQA_TEST'], dataset): + self.max_num = 18 + elif dataset is not None and listinstr(['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench'], dataset): + self.max_num = 24 + else: + self.max_num = 6 + + if dataset is None: + pass + elif listinstr(['AI2D_TEST'], dataset): + self.max_new_tokens = 10 + elif 'MMMU' in dataset: + self.max_new_tokens = 1024 + elif 'MMBench' in dataset: + self.max_new_tokens = 100 + + prompt, image = self.message_to_promptimg(message=inputs, dataset=dataset) + + url = 'https://api.sensenova.cn/v1/llm/chat-completions' + api_secret_key = self.encode_jwt_token(self.ak, self.sk) + + content = [{ + 'image_base64': self.image_to_base64(item), + 'image_file_id': '', + 'image_url': '', + 'text': '', + 'text': '', + 'type': 'image_base64' + } for item in image] + + content.append({ + 
'image_base64': '', + 'image_file_id': '', + 'image_url': '', + 'text': prompt, + 'type': 'text' + }) + + message = [{'content': content, 'role': 'user'}] + + data = { + 'messages': message, + 'max_new_tokens': self.max_new_tokens, + 'model': self.model, + 'stream': False, + } + headers = { + 'Content-type': 'application/json', + 'Authorization': 'Bearer ' + api_secret_key + } + + response = requests.post( + url, + headers=headers, + json=data, + ) + request_id = response.headers['x-request-id'] + + time.sleep(1) + try: + assert response.status_code == 200 + response = response.json()['data']['choices'][0]['message'].strip() + if dataset is not None and 'MMMU' in dataset: + response = response.split('ANSWER: ')[-1].strip() + if self.verbose: + self.logger.info(f'inputs: {inputs}\nanswer: {response}') + return 0, response, 'Succeeded! ' + except Exception as err: + if self.verbose: + self.logger.error('---------------------------ERROR---------------------------') + self.logger.error(response.json()) + self.logger.error(f'{type(err)}: {err}') + self.logger.error('---------------------------request_id---------------------------' + request_id) + self.logger.error( + 'api error' + response.json()['error']['message'] + + str([input['value'] if input['type'] == 'image' else None for input in inputs]) + ) + self.logger.error(f'The input messages are {inputs}.') + return -1, response.json()['error']['message'], '' + + +class SenseChatVisionAPI(SenseChatVisionWrapper): + + def generate(self, message, dataset=None): + return super(SenseChatVisionAPI, self).generate(message, dataset=dataset) diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/__init__.py b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..66a3908f67bda34978ee45a729c3b3b9a1a84576 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__init__.py @@ -0,0 +1,228 @@ +import warnings + +from .image_base import img_root_map, ImageBaseDataset +from .image_caption import ImageCaptionDataset +from .image_yorn import ImageYORNDataset +from .image_mcq import ( + ImageMCQDataset, MMMUDataset, CustomMCQDataset, MUIRDataset, GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, + NaturalBenchDataset +) +from .image_mt import MMDUDataset +from .image_vqa import ( + ImageVQADataset, MathVision, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench, + CustomVQADataset, CRPE, MathVerse, OlympiadBench, QSpatial, VizWiz, MMNIAH +) + +from .text_mcq import CustomTextMCQDataset, TextMCQDataset + +from .vcr import VCRDataset +from .mmlongbench import MMLongBench +from .dude import DUDE +from .slidevqa import SlideVQA + +from .mmbench_video import MMBenchVideo +from .videomme import VideoMME +from .mvbench import MVBench, MVBench_MP4 +from .mlvu import MLVU, MLVU_MCQ, MLVU_OpenEnded +from .tempcompass import TempCompass, TempCompass_Captioning, TempCompass_MCQ, TempCompass_YorN +from .longvideobench import LongVideoBench +from .video_concat_dataset import ConcatVideoDataset +from .mmgenbench import MMGenBench + +from .miabench import MIABench +from .wildvision import WildVision +from .mmmath import MMMath +from .dynamath import Dynamath +from .utils import * +from ..smp import * + + +class ConcatDataset(ImageBaseDataset): + # This dataset takes multiple dataset names as input and aggregate them into a single dataset. 
+ # Each single dataset should not have a field named `SUB_DATASET` + + DATASET_SETS = { + 'MMMB': ['MMMB_ar', 'MMMB_cn', 'MMMB_en', 'MMMB_pt', 'MMMB_ru', 'MMMB_tr'], + 'MTL_MMBench_DEV': [ + 'MMBench_dev_ar', 'MMBench_dev_cn', 'MMBench_dev_en', + 'MMBench_dev_pt', 'MMBench_dev_ru', 'MMBench_dev_tr' + ] + } + + def __init__(self, dataset): + datasets = self.DATASET_SETS[dataset] + self.dataset_map = {} + # The name of the compliation + self.dataset_name = dataset + self.datasets = datasets + for dname in datasets: + dataset = build_dataset(dname) + assert dataset is not None, dataset + self.dataset_map[dname] = dataset + TYPES = [x.TYPE for x in self.dataset_map.values()] + MODALITIES = [x.MODALITY for x in self.dataset_map.values()] + assert np.all([x == TYPES[0] for x in TYPES]), (datasets, TYPES) + assert np.all([x == MODALITIES[0] for x in MODALITIES]), (datasets, MODALITIES) + self.TYPE = TYPES[0] + self.MODALITY = MODALITIES[0] + data_all = [] + for dname in datasets: + data = self.dataset_map[dname].data + data['SUB_DATASET'] = [dname] * len(data) + data_new = localize_df(data, dname, nproc=16) + data_all.append(data_new) + + data = pd.concat(data_all) + data['original_index'] = data.pop('index') + data['index'] = np.arange(len(data)) + self.data = data + + def build_prompt(self, line): + if isinstance(line, int): + line = self.data.iloc[line] + idx = line['original_index'] + dname = line['SUB_DATASET'] + org_data = self.dataset_map[dname].data + org_line = cp.deepcopy(org_data[org_data['index'] == idx]).iloc[0] + return self.dataset_map[dname].build_prompt(org_line) + + def dump_image(self, line): + # Assert all images are pre-dumped + assert 'image' not in line + assert 'image_path' in line + tgt_path = toliststr(line['image_path']) + return tgt_path + + @classmethod + def supported_datasets(cls): + return list(cls.DATASET_SETS) + + def evaluate(self, eval_file, **judge_kwargs): + suffix = eval_file.split('.')[-1] + # First, split the eval_file by dataset + data_all = load(eval_file) + for dname in self.datasets: + tgt = eval_file.replace(self.dataset_name, dname) + data_sub = data_all[data_all['SUB_DATASET'] == dname] + data_sub.pop('index') + data_sub['index'] = data_sub.pop('original_index') + data_sub.pop('SUB_DATASET') + dump(data_sub, tgt) + # Then, evaluate each dataset separately + results_all = [] + for dname in self.datasets: + tgt = eval_file.replace(self.dataset_name, dname) + res = self.dataset_map[dname].evaluate(tgt, **judge_kwargs) + assert isinstance(res, pd.DataFrame) + res['DATASET'] = [dname] * len(res) + results_all.append(res) + result = pd.concat(results_all) + score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + dump(result, score_file) + return result + + +# Add new supported dataset class here +IMAGE_DATASET = [ + ImageCaptionDataset, ImageYORNDataset, ImageMCQDataset, ImageVQADataset, MathVision, + MMMUDataset, OCRBench, MathVista, LLaVABench, MMVet, MTVQADataset, TableVQABench, + MMLongBench, VCRDataset, MMDUDataset, DUDE, SlideVQA, MUIRDataset, + GMAIMMBenchDataset, MMERealWorld, HRBenchDataset, CRPE, MathVerse, NaturalBenchDataset, + MIABench, OlympiadBench, WildVision, MMMath, QSpatial, Dynamath, MMGenBench, VizWiz, MMNIAH +] + +VIDEO_DATASET = [ + MMBenchVideo, VideoMME, MVBench, MVBench_MP4, LongVideoBench, + MLVU, MLVU_MCQ, MLVU_OpenEnded, + TempCompass, TempCompass_MCQ, TempCompass_Captioning, TempCompass_YorN +] + +TEXT_DATASET = [ + TextMCQDataset +] + +CUSTOM_DATASET = [ + CustomMCQDataset, CustomVQADataset, CustomTextMCQDataset +] + 
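+# Note: the class lists above act as a simple registry. build_dataset() below walks
+# DATASET_CLASSES and instantiates the first class whose supported_datasets() contains
+# the requested name; unknown names fall back to the Custom* datasets, which are loaded
+# from a local <name>.tsv under LMUDataRoot().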
+DATASET_COLLECTION = [ConcatDataset, ConcatVideoDataset] + +DATASET_CLASSES = IMAGE_DATASET + VIDEO_DATASET + TEXT_DATASET + CUSTOM_DATASET + DATASET_COLLECTION +SUPPORTED_DATASETS = [] +for DATASET_CLS in DATASET_CLASSES: + SUPPORTED_DATASETS.extend(DATASET_CLS.supported_datasets()) + + +def DATASET_TYPE(dataset, *, default: str = 'MCQ') -> str: + for cls in DATASET_CLASSES: + if dataset in cls.supported_datasets(): + if hasattr(cls, 'TYPE'): + return cls.TYPE + # Have to add specific routine to handle ConcatDataset + if dataset in ConcatDataset.DATASET_SETS: + dataset_list = ConcatDataset.DATASET_SETS[dataset] + TYPES = [DATASET_TYPE(dname) for dname in dataset_list] + assert np.all([x == TYPES[0] for x in TYPES]), (dataset_list, TYPES) + return TYPES[0] + + if 'openended' in dataset.lower(): + return 'VQA' + warnings.warn(f'Dataset {dataset} is a custom one and not annotated as `openended`, will treat as {default}. ') + return default + + +def DATASET_MODALITY(dataset, *, default: str = 'IMAGE') -> str: + if dataset is None: + warnings.warn(f'Dataset is not specified, will treat modality as {default}. ') + return default + for cls in DATASET_CLASSES: + if dataset in cls.supported_datasets(): + if hasattr(cls, 'MODALITY'): + return cls.MODALITY + # Have to add specific routine to handle ConcatDataset + if dataset in ConcatDataset.DATASET_SETS: + dataset_list = ConcatDataset.DATASET_SETS[dataset] + MODALITIES = [DATASET_MODALITY(dname) for dname in dataset_list] + assert np.all([x == MODALITIES[0] for x in MODALITIES]), (dataset_list, MODALITIES) + return MODALITIES[0] + + if 'VIDEO' in dataset.lower(): + return 'VIDEO' + elif 'IMAGE' in dataset.lower(): + return 'IMAGE' + warnings.warn(f'Dataset {dataset} is a custom one, will treat modality as {default}. ') + return default + + +def build_dataset(dataset_name, **kwargs): + for cls in DATASET_CLASSES: + if dataset_name in cls.supported_datasets(): + return cls(dataset=dataset_name, **kwargs) + + warnings.warn(f'Dataset {dataset_name} is not officially supported. ') + + data_file = osp.join(LMUDataRoot(), f'{dataset_name}.tsv') + if not osp.exists(data_file): + warnings.warn(f'Data file {data_file} does not exist. Dataset building failed. ') + return None + + data = load(data_file) + if 'question' not in [x.lower() for x in data.columns]: + warnings.warn(f'Data file {data_file} does not have a `question` column. Dataset building failed. ') + return None + + if 'A' in data and 'B' in data: + if 'image' in data or 'image_path' in data: + warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom MCQ dataset. ') + return CustomMCQDataset(dataset=dataset_name, **kwargs) + else: + warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom Text MCQ dataset. ') + return CustomTextMCQDataset(dataset=dataset_name, **kwargs) + else: + warnings.warn(f'Will assume unsupported dataset {dataset_name} as a Custom VQA dataset. 
') + return CustomVQADataset(dataset=dataset_name, **kwargs) + + +__all__ = [ + 'build_dataset', 'img_root_map', 'build_judge', 'extract_answer_from_item', 'prefetch_answer', 'DEBUG_MESSAGE' +] + [cls.__name__ for cls in DATASET_CLASSES] diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/__init__.cpython-38.pyc b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..9541f2ba06f9ca2b1f72d01f6a5c09c7760caade Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/__init__.cpython-38.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/dude.cpython-310.pyc b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/dude.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..53db3223af0be3be00c4f0264594a4a778e3cd77 Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/dude.cpython-310.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/image_caption.cpython-310.pyc b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/image_caption.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..54b42284a044497bc3a0aeb5a8f45e00c0fd3d91 Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/image_caption.cpython-310.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/image_caption.cpython-38.pyc b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/image_caption.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a42952d5c6b772b439a704f85cd096599de3a58e Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/image_caption.cpython-38.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/longvideobench.cpython-38.pyc b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/longvideobench.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..078884ba2505e3a57abc901a9d9daebda00a127e Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/longvideobench.cpython-38.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/mmbench_video.cpython-310.pyc b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/mmbench_video.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..0be3f9c1a150109c88c83590936de337c169c7af Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/mmbench_video.cpython-310.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/slidevqa.cpython-310.pyc b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/slidevqa.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..a64de616ce5402533709e5ad9b7a3fac289a5eac Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/slidevqa.cpython-310.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/slidevqa.cpython-38.pyc b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/slidevqa.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..493514851ff24db4dd515043c5557dd8dab79bc6 Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/slidevqa.cpython-38.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/text_base.cpython-38.pyc 
b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/text_base.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d80862a3f7401135d4b3416983239876fee104ec Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/dataset/__pycache__/text_base.cpython-38.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/image_caption.py b/vlmeval/VLMEvalKit_old/vlmeval/dataset/image_caption.py new file mode 100644 index 0000000000000000000000000000000000000000..cfc2e38106dbc65ee01c381d83e598b0d00dcd4d --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/dataset/image_caption.py @@ -0,0 +1,89 @@ +from .image_base import ImageBaseDataset +from ..smp import * + +MY_PROMPT = ''' +Hãy mô tả chi tiết người bức ảnh. Hãy sử dụng tiếng Việt. +Hãy miêu tả về áo, quần, đầu/mặt, giày/dép, ba lô/túi xách, điện thoại, phương tiện di chuyển,... +''' + + +class COCO_Caption_Scorer: + def __init__(self, ref, gt): + from pycocoevalcap.bleu.bleu import Bleu + from pycocoevalcap.rouge.rouge import Rouge + from pycocoevalcap.cider.cider import Cider + + self.ref = ref + self.gt = gt + print("setting up scorers...") + self.scorers = [ + (Bleu(4), ["Bleu_1", "Bleu_2", "Bleu_3", "Bleu_4"]), + (Rouge(), "ROUGE_L"), + (Cider(), "CIDEr"), + ] + + def compute_scores(self): + total_scores = {} + for scorer, method in self.scorers: + print("computing %s score..." % (scorer.method())) + score, scores = scorer.compute_score(self.gt, self.ref) + if isinstance(method, list): + for sc, scs, m in zip(score, scores, method): + print("%s: %0.3f" % (m, sc * 100)) + total_scores["Bleu"] = [x * 100 for x in score] + else: + print("%s: %0.3f" % (method, score * 100)) + total_scores[method] = score * 100 + + print("*****DONE*****") + for key, value in total_scores.items(): + print("{}:{}".format(key, value)) + return total_scores + + +class ImageCaptionDataset(ImageBaseDataset): + + TYPE = "Caption" + + DATASET_URL = { + "COCO_VAL": "https://opencompass.openxlab.space/utils/VLMEval/COCO_VAL.tsv", + } + + DATASET_MD5 = { + "COCO_VAL": "72a5079dead060269ac222c5aa5128af", + } + + def load_data(self, dataset): + global MY_PROMPT + data = super().load_data(dataset) + if "question" not in data: + data["question"] = [MY_PROMPT] * len(data) + return data + + # def load_data(self, dataset): + # data = super().load_data(dataset) + # if "question" not in data: + # data["question"] = [ + # ( + # "Please describe this image in general. Directly provide the description, " + # 'do not include prefix like "This image depicts". 
' + # ) + # ] * len(data) + # return data + + # It returns a dictionary of scores + @classmethod + def evaluate(self, eval_file, **kwargs): + data = load(eval_file) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + ref, gt = {}, {} + for i, line in enumerate(lines): + ref[str(i)] = [str(line["prediction"])] + gt[str(i)] = eval(line["answer"]) + + scorer = COCO_Caption_Scorer(ref, gt) + coco_caption_score_dict = scorer.compute_scores() + score_pth = eval_file.replace(".xlsx", "_score.json") + dump(coco_caption_score_dict, score_pth) + return coco_caption_score_dict diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/image_vqa.py b/vlmeval/VLMEvalKit_old/vlmeval/dataset/image_vqa.py new file mode 100644 index 0000000000000000000000000000000000000000..0323e4c9f1d5836e35aa03bf3b3be97adb68bac6 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/dataset/image_vqa.py @@ -0,0 +1,1333 @@ +import os +import re +import tempfile +from functools import partial +from jinja2.sandbox import SandboxedEnvironment +from jinja2 import Template + +import pandas as pd + +from .image_base import ImageBaseDataset +from .utils import build_judge, DEBUG_MESSAGE +from ..smp import * +from ..utils import track_progress_rich +import ipdb + + +class ImageVQADataset(ImageBaseDataset): + TYPE = 'VQA' + + DATASET_URL = { + 'OCRVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TEST.tsv', + 'OCRVQA_TESTCORE': 'https://opencompass.openxlab.space/utils/VLMEval/OCRVQA_TESTCORE.tsv', + 'TextVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/TextVQA_VAL.tsv', + 'DocVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_VAL.tsv', + 'DocVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/DocVQA_TEST.tsv', + 'InfoVQA_VAL': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_VAL.tsv', + 'InfoVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/InfoVQA_TEST.tsv', + 'ChartQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/ChartQA_TEST.tsv', + 'GQA_TestDev_Balanced': 'https://opencompass.openxlab.space/utils/VLMEval/GQA_TestDev_Balanced.tsv', + } + + DATASET_MD5 = { + 'OCRVQA_TEST': 'ca46a6d74b403e9d6c0b670f6fc00db9', + 'OCRVQA_TESTCORE': 'c5239fe77db8bdc1f2ad8e55e0d1fe97', + 'TextVQA_VAL': 'b233b31f551bbf4056f2f955da3a92cd', + 'DocVQA_VAL': 'd5ee77e1926ff10690d469c56b73eabf', + 'DocVQA_TEST': '6a2f28cac26ef2d3447374e8c6f6c8e9', + 'InfoVQA_VAL': '2342e9c225222f0ef4dec545ebb126fe', + 'InfoVQA_TEST': 'df535bf51b88dc9718252c34131a6227', + 'ChartQA_TEST': 'c902e0aa9be5582a7aad6dcf52734b42', + 'GQA_TestDev_Balanced': 'fead7df22befc1ed3ca2b62ea26fa17b', + } + + def build_prompt(self, line): + msgs = super().build_prompt(line) + assert msgs[-1]['type'] == 'text' + msgs[-1]['value'] += '\nAnswer the question using a single word or phrase.' 
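+        # Keeping answers to a single word or phrase lets evaluate() below score them
+        # with the VQA-score / ANLS / relaxed-accuracy string matching used per dataset.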
+ return msgs + + # It returns a DataFrame + def evaluate(self, eval_file, **judge_kwargs): + from .utils.vqa_eval import hit_calculate, process_line + + data = load(eval_file) + dataset = self.dataset_name + assert 'answer' in data and 'prediction' in data + data['prediction'] = [str(x) for x in data['prediction']] + data['answer'] = [str(x) for x in data['answer']] + lt = len(data) + pool = mp.Pool(16) + lines = [data.iloc[i] for i in range(lt)] + if listinstr(['TextVQA'], dataset): + res = pool.map(partial(process_line, method='vqa_score'), lines) + elif listinstr(['ChartQA'], dataset): + res = pool.map(partial(process_line, method='relaxed_accuracy'), lines) + elif listinstr(['OCRVQA', 'GQA'], dataset): + res = pool.map(partial(process_line, method='accuracy'), lines) + elif listinstr(['DocVQA', 'InfoVQA'], dataset): + res = pool.map(partial(process_line, method='anls'), lines) + else: # default using vqa_score to calculate score + res = pool.map(process_line, lines) + hit = hit_calculate(res, dataset) + ret = dict() + if 'split' in data: + splits = set(data['split']) + for sp in splits: + sub = [r for l, r in zip(lines, res) if l['split'] == sp] + # [np.mean(x['match']) >= full_score_weight for x in sub] + hit = hit_calculate(sub, dataset) + ret[sp] = np.mean(hit) * 100 + sub = [r for l, r in zip(lines, res)] + hit = hit_calculate(sub, dataset) + ret['Overall'] = np.mean(hit) * 100 + else: + ret['Overall'] = np.mean(hit) * 100 + if 'category' in data: + cates = list(set(data['category'])) + cates.sort() + for c in cates: + sub = [r for l, r in zip(lines, res) if l['category'] == c] + # [np.mean(x['match']) >= full_score_weight for x in sub] + hit = hit_calculate(sub, dataset) + ret[c] = np.mean(hit) * 100 + ret = d2df(ret) + ret.round(2) + + suffix = eval_file.split('.')[-1] + result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + dump(ret, result_file) + return ret + + +class VizWiz(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'VizWiz': 'https://opencompass.openxlab.space/utils/VLMEval/VizWiz.tsv' + } + DATASET_MD5 = { + 'VizWiz': 'fa4ac4164467563ed2fac6eac6631bd0' + } + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + from .utils.vqa_eval import hit_calculate, process_line + + suffix = eval_file.split('.')[-1] + result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + + if not osp.exists(result_file): + data = load(eval_file) + assert 'answers' in data and 'prediction' in data + data['prediction'] = [str(x) for x in data['prediction']] + data['answer'] = [str(x) for x in data['answers']] + + lt = len(data) + pool = mp.Pool(16) + lines = [data.iloc[i] for i in range(lt)] + res = pool.map(process_line, lines) + + hit = hit_calculate(res, 'VizWiz') + ret = dict() + + ret['Overall'] = np.mean(hit) * 100 + ret = d2df(ret) + ret.round(2) + + dump(ret, result_file) + + retz = pd.read_csv(result_file) + return retz + + +class OCRBench(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'OCRBench': 'https://opencompass.openxlab.space/utils/VLMEval/OCRBench.tsv' + } + DATASET_MD5 = {'OCRBench': 'e953d98a987cc6e26ef717b61260b778'} + + # It returns a dictionary + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + OCRBench_score = { + 'Regular Text Recognition': 0, + 'Irregular Text Recognition': 0, + 'Artistic Text Recognition': 0, + 'Handwriting Recognition': 0, + 'Digit String Recognition': 0, + 'Non-Semantic Text Recognition': 0, + 'Scene Text-centric VQA': 0, + 'Doc-oriented VQA': 0, + 'Key Information Extraction': 0, + 'Handwritten 
Mathematical Expression Recognition': 0, + } + + data = load(eval_file) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + for i in tqdm(range(len(lines))): + line = lines[i] + predict = str(line['prediction']) + answers = eval(line['answer']) + category = line['category'] + if category == 'Handwritten Mathematical Expression Recognition': + for j in range(len(answers)): + answer = answers[j].strip().replace('\n', ' ').replace(' ', '') + predict = predict.strip().replace('\n', ' ').replace(' ', '') + if answer in predict: + OCRBench_score[category] += 1 + break + else: + for j in range(len(answers)): + answer = answers[j].lower().strip().replace('\n', ' ') + predict = predict.lower().strip().replace('\n', ' ') + if answer in predict: + OCRBench_score[category] += 1 + break + + final_score_dict = {} + final_score_dict['Text Recognition'] = \ + (OCRBench_score['Regular Text Recognition'] + OCRBench_score['Irregular Text Recognition'] + + OCRBench_score['Artistic Text Recognition'] + OCRBench_score['Handwriting Recognition'] + + OCRBench_score['Digit String Recognition'] + OCRBench_score['Non-Semantic Text Recognition']) + final_score_dict['Scene Text-centric VQA'] = OCRBench_score['Scene Text-centric VQA'] + final_score_dict['Doc-oriented VQA'] = OCRBench_score['Doc-oriented VQA'] + final_score_dict['Key Information Extraction'] = OCRBench_score['Key Information Extraction'] + final_score_dict['Handwritten Mathematical Expression Recognition'] = \ + (OCRBench_score['Handwritten Mathematical Expression Recognition']) + final_score_dict['Final Score'] = \ + (final_score_dict['Text Recognition'] + final_score_dict['Scene Text-centric VQA'] + + final_score_dict['Doc-oriented VQA'] + final_score_dict['Key Information Extraction'] + + final_score_dict['Handwritten Mathematical Expression Recognition']) + final_score_dict['Final Score Norm'] = (float(final_score_dict['Final Score']) / 10) + score_pth = eval_file.replace('.xlsx', '_score.json') + dump(final_score_dict, score_pth) + return final_score_dict + + +class MathVista(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'MathVista_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVista_MINI.tsv' + } + DATASET_MD5 = {'MathVista_MINI': 'f199b98e178e5a2a20e7048f5dcb0464'} + + # It returns a DataFrame + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + from .utils.mathvista import MathVista_auxeval, MathVista_acc + + model = judge_kwargs['model'] + suffix = eval_file.split('.')[-1] + storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') + tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + nproc = judge_kwargs.pop('nproc', 4) + + if not osp.exists(storage): + data = load(eval_file) + model = build_judge(max_tokens=128, **judge_kwargs) + assert model.working(), ('MathVista evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = {} + if osp.exists(tmp_file): + ans = load(tmp_file) + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + MathVista_auxeval, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file, + ) + ans = load(tmp_file) + for k, v in zip(indices, new_results): + assert k in ans + assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res'] + + data['res'] = [ans[idx]['res'] for idx in 
data['index']] + data['log'] = [ans[idx]['log'] for idx in data['index']] + dump(data, storage) + + score = MathVista_acc(storage) + score_pth = storage.replace('.xlsx', '_score.csv') + dump(score, score_pth) + return score + + +class MathVerse(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'MathVerse_MINI': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIV.tsv', # noqa + 'MathVerse_MINI_Vision_Only': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVOnly.tsv', # noqa + 'MathVerse_MINI_Vision_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVDom.tsv', # noqa + 'MathVerse_MINI_Vision_Intensive': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINIVInt.tsv', # noqa + 'MathVerse_MINI_Text_Lite': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITLite.tsv', # noqa + 'MathVerse_MINI_Text_Dominant': 'http://opencompass.openxlab.space/utils/benchmarks/MathVerse/MathVerse_MINITDom.tsv', # noqa + } + DATASET_MD5 = { + 'MathVerse_MINI': '5017caca32b7fa110c350a1bea861b65', + 'MathVerse_MINI_Vision_Only': '68a11d4680014ac881fa37adeadea3a4', + 'MathVerse_MINI_Vision_Dominant': 'b8fb63852d261ab2aaefba29cc2414d3', + 'MathVerse_MINI_Vision_Intensive': '01cbd35be202bb0c4873a4186a63bc19', + 'MathVerse_MINI_Text_Lite': '19e4b13bdd30b89a03b2e358bcfefa04', + 'MathVerse_MINI_Text_Dominant': '4f5cd2fa6630ea00bb11d6fde1f6fe6a', + } + + # It returns a DataFrame + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + from .utils.mathverse import MathVerse_auxeval_extract, MathVerse_auxeval_score, MathVerse_acc + + model = judge_kwargs['model'] + suffix = eval_file.split('.')[-1] + storage_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.xlsx') + tmp_file_extract = eval_file.replace(f'.{suffix}', f'_{model}_extract.pkl') + storage_score = eval_file.replace(f'.{suffix}', f'_{model}_score.xlsx') + tmp_file_score = eval_file.replace(f'.{suffix}', f'_{model}_score.pkl') + nproc = judge_kwargs.pop('nproc', 4) + # stage1: extract the answer + if not osp.exists(storage_extract): + data = load(eval_file) + model = build_judge(max_tokens=128, **judge_kwargs) + assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = {} + if osp.exists(tmp_file_extract): + ans = load(tmp_file_extract) + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + MathVerse_auxeval_extract, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file_extract, + ) + ans = load(tmp_file_extract) + for k, v in zip(indices, new_results): + assert k in ans + assert ans[k]['log_extract'] == v['log_extract'] and ans[k]['extract'] == v['extract'] + + data['extract'] = [ans[idx]['extract'] for idx in data['index']] + data['log_extract'] = [ans[idx]['log_extract'] for idx in data['index']] + dump(data, storage_extract) + + # stage2: score the answer + if not osp.exists(storage_score): + data = load(storage_extract) + model = build_judge(max_tokens=128, **judge_kwargs) + assert model.working(), ('MathVerse evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + 
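+            # Pair the judge model with each extracted row; completed indices are
+            # cached in tmp_file_score so an interrupted evaluation can resume.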
indices = [line['index'] for line in lines] + + ans = {} + if osp.exists(tmp_file_score): + ans = load(tmp_file_score) + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + MathVerse_auxeval_score, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file_score, + ) + ans = load(tmp_file_score) + for k, v in zip(indices, new_results): + assert k in ans + assert ans[k]['log_score'] == v['log_score'] and ans[k]['score'] == v['score'] + + data['score'] = [ans[idx]['score'] for idx in data['index']] + data['log_score'] = [ans[idx]['log_score'] for idx in data['index']] + dump(data, storage_score) + + score = MathVerse_acc(storage_score) + score_pth = storage_score.replace('.xlsx', '.csv') + dump(score, score_pth) + return score + + +class MathVision(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'MathVision': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision.tsv', + 'MathVision_MINI': 'https://opencompass.openxlab.space/utils/VLMEval/MathVision_MINI.tsv' + } + DATASET_MD5 = { + 'MathVision': '93f6de14f7916e598aa1b7165589831e', + 'MathVision_MINI': '060fe4fa5d868987ce179307bd5f8a33' + } + + # It returns a DataFrame + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + from .utils.mathv import MATH_V_auxeval, MATH_V_acc + + if 'model' in judge_kwargs: + model = judge_kwargs['model'] + else: + model = os.path.basename(os.environ.get('LOCAL_LLM')) + suffix = eval_file.split('.')[-1] + storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') + tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + nproc = judge_kwargs.pop('nproc', 4) + + if not osp.exists(storage): + data = load(eval_file) + model = build_judge(max_tokens=128, **judge_kwargs) + assert model.working(), ('MATH-Vision evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = {} + if osp.exists(tmp_file): + ans = load(tmp_file) + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + MATH_V_auxeval, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file, + ) + ans = load(tmp_file) + for k, v in zip(indices, new_results): + assert k in ans + assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res'] + + data['res'] = [ans[idx]['res'] for idx in data['index']] + data['log'] = [ans[idx]['log'] for idx in data['index']] + dump(data, storage) + + score = MATH_V_acc(storage) + score_pth = storage.replace('.xlsx', '_score.csv') + dump(score, score_pth) + return score + + +class OlympiadBench(ImageBaseDataset): + TYPE = 'VQA_ex_prompt' + DATASET_URL = { + 'OlympiadBench': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench.tsv', + 'OlympiadBench_EN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_EN.tsv', + 'OlympiadBench_CN': 'https://opencompass.openxlab.space/utils/VLMEval/OlympiadBench_CN.tsv' + } + DATASET_MD5 = { + 'OlympiadBench': '9735ae0f0299eae1e7d07f5a7feab914', + 'OlympiadBench_EN': '5c68e100d394351fc7049f29d4d4efed', + 'OlympiadBench_CN': 'ea01b16788955702c79650c701e5b623' + } + + def dump_image(self, line): + os.makedirs(self.img_root, exist_ok=True) + + tgt_path_z = [] + if isinstance(line['image'], list): + for i in 
range(len(line['image'])): + tgt_path = osp.join(self.img_root, f"{line['index']}--{i+1}.jpg") + if not read_ok(tgt_path): + decode_base64_to_image_file(line['image'][i], tgt_path) + tgt_path_z.append(tgt_path) + else: + tgt_path = osp.join(self.img_root, f"{line['index']}.jpg") + if not read_ok(tgt_path): + decode_base64_to_image_file(line['image'], tgt_path) + tgt_path_z.append(tgt_path) + return tgt_path_z + + def build_prompt(self, line): + + from .utils.olympiadbench import get_answer_type_text, make_input + + self.is_chinese = 'zh' in line['source'] + self.is_math = 'maths' in line['source'] + self.is_theorem_proving = 'TP' in line['source'] + + if self.is_chinese: + subject_content = '数学' if self.is_math else '物理' + if self.is_theorem_proving: + prompt = ( + f"以下是中国{subject_content}竞赛中的证明题。请根据题目的要求,运用逻辑推理及常用定理证明题目中的命题。" + "证明过程中使用的变量和公式请使用LaTeX格式表示。" + ) + else: + answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=True, + multiple_answer=line['is_multiple_answer']) + if line['is_multiple_answer']: + multiple_answer_text = '\\boxed{用英文逗号连接的多个答案}' + else: + multiple_answer_text = '\\boxed{答案}' + unit_text = '' + if line['unit']: + multiple_answer_text += '(单位)' + unit_text = ',注意答案的单位不要放在\\boxed{}中' + prompt = ( + f'以下是中国{subject_content}竞赛中的解答题{answer_type_text}。请根据题目的要求和所提供的信息计算得出答案。' + f'解答过程和结果中使用的变量和公式请使用LaTeX格式表示。请在最后以“所以最终答案是{multiple_answer_text}。”' + f'显式给出结果{unit_text}。' + ) + else: + subject_content = 'Math' if self.is_math else 'Physics' + if self.is_theorem_proving: + prompt = ( + f'The following is a theorem proving problem from an International {subject_content} competition. ' + 'Please use logical reasoning and common theorems to prove the proposition in the problem ' + 'according to the given requirements. ' + 'Please use LaTeX format to represent the variables and formulas used in the proof.' + ) + else: + if line['is_multiple_answer']: + multiple_answer_text = '\\boxed{multiple answers connected with commas}' + else: + multiple_answer_text = '\\boxed{answer}' + unit_text = '' + if line['unit']: + multiple_answer_text += '(unit)' + unit_text = ', note that the unit of the answer should not be included in \\boxed{}' + answer_type_text = get_answer_type_text(line['answer_type'], is_chinese=False, + multiple_answer=line['is_multiple_answer']) + prompt = ( + f'The following is an open-ended problem from an International {subject_content} competition. ' + f'{answer_type_text}Please calculate the answer according to the given requirements and ' + 'the information provided. Please use LaTeX format to represent the variables and formulas ' + 'used in the solution process and results. Please end your solution with "So the final answer ' + f'is {multiple_answer_text}." and give the result explicitly{unit_text}.' 
+ ) + + if self.is_math: + input = make_input(prompt, line['question']) + else: + if 'context' in line.keys() and str(line['context']) != 'nan': # cannot be null + input = make_input(prompt, line['context'] + '\n' + line['question']) + else: + input = make_input(prompt, line['question']) + + ret = [dict(type='text', value=input)] + tgt_path = self.dump_image(line) + + ret.extend([dict(type='image', value=s) for s in tgt_path]) + + return ret + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + from .utils.olympiadbench import MathJudger, extract_answer + judger = MathJudger() + + suffix = eval_file.split('.')[-1] + name_str1 = 'judge' + name_str2 = 'score' + result_file = eval_file.replace(f'.{suffix}', f'_{name_str1}_result.xlsx') + score_file = eval_file.replace(f'.{suffix}', f'_{name_str2}_result.csv') + + if not osp.exists(result_file): + data = load(eval_file) + scorez = [] + + for i in tqdm(data.iterrows()): + line = i[1] + model_answer = line['prediction'] + is_chinese = 'zh' in line['source'] + model_answer = extract_answer(is_chinese, model_answer, is_deepseek=False) + answer_type = line['answer_type'] + + final_answer = line['final_answer'][2:-2] + + if str(answer_type) != 'nan' and 'Tuple' in answer_type: + judge_result = judger.judge(model_answer, final_answer) + else: + if str(line['error']) != 'nan': + if ',' in line['error']: + precisions = line['error'].split(',') + precisions = [float(p) if p else 1e-8 for p in precisions] + judge_result = judger.judge(model_answer, final_answer, precisions) + else: + precision = float(line['error']) + judge_result = judger.judge(model_answer, final_answer, precision) + else: + judge_result = judger.judge(model_answer, final_answer) + scorez.append(judge_result) + + data['score'] = scorez + dump(data, result_file) + + judge_file = load(result_file) + + if not osp.exists(score_file): + name_list = ['OE_MM_maths_en_COMP', 'OE_MM_maths_zh_CEE', 'OE_MM_maths_zh_COMP', 'OE_MM_physics_en_COMP', + 'OE_MM_physics_zh_CEE','OE_TO_maths_en_COMP', 'OE_TO_maths_zh_CEE', 'OE_TO_maths_zh_COMP', + 'OE_TO_physics_en_COMP', 'OE_TO_physics_zh_CEE'] + + sample_list = [[] for _ in range(len(name_list))] + for i in judge_file.iterrows(): + line = i[1] + for j in range(len(name_list)): + if line['source'] == name_list[j]: + sample_list[j].append(line['score']) + + acc_dict = {} + correct_list = [] + + # fine-grained + for i in range(len(name_list)): + correct_num = 0 + for j in sample_list[i]: + if j: + correct_num += 1 + correct_list.append(correct_num) + acc = 100 * correct_num / len(sample_list[i]) + acc_dict[name_list[i]] = [acc] + + # 4 grained + labela = ['zh', 'en'] + labelb = ['maths', 'physics'] + + grain_list = [[x,y] for x in labela for y in labelb] + for j in grain_list: + dict_name = j[0] + "_" + j[1] + correct_num = 0 + full_num = 0 + for i in range(len(name_list)): + if all(k in name_list[i] for k in j): + correct_num += correct_list[i] + full_num += len(sample_list[i]) + acc = 100 * correct_num / full_num + acc_dict[dict_name] = [acc] + + # 2 grained + grain_list = ['maths', 'physics'] + for j in grain_list: + dict_name = j + correct_num = 0 + full_num = 0 + for i in range(len(name_list)): + if j in name_list[i]: + correct_num += correct_list[i] + full_num += len(sample_list[i]) + acc = 100 * correct_num / full_num + acc_dict[dict_name] = [acc] + + # AVG + correct_num = sum(correct_list) + acc = 100 * correct_num / len(judge_file) + acc_dict['AVG'] = [acc] + + acc_pd = pd.DataFrame(acc_dict) + acc_pd.to_csv(score_file, index=False, 
encoding='gbk') + + accdz = pd.read_csv(score_file) + return accdz + + +class LLaVABench(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = {'LLaVABench': 'https://opencompass.openxlab.space/utils/VLMEval/LLaVABench.tsv'} + DATASET_MD5 = {'LLaVABench': 'd382a093f749a697820d3dadd61c8428'} + + # It returns a DataFrame + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + from .utils.llavabench import ( + build_prompt, + LLaVABench_atomeval, + LLaVABench_score, + ) + + suffix = '.' + eval_file.split('.')[-1] + record_file = eval_file.replace(suffix, '_openai_result' + suffix) + score_file = eval_file.replace(suffix, '_score.csv') + nproc = judge_kwargs.pop('nproc', 4) + system_prompt = 'You are a helpful and precise assistant for checking the quality of the answer.' + + if not osp.exists(record_file): + data = load(eval_file) + lines = [data.iloc[i] for i in range(len(data))] + model = build_judge(temperature=0.2, system_prompt=system_prompt, **judge_kwargs) + assert model.working(), ('LLaVABench evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE) + + prompts = [build_prompt(line) for line in lines] + tups = [(model, prompt) for prompt in prompts] + scores = track_progress_rich(LLaVABench_atomeval, tups, nproc=nproc, chunksize=nproc) + data['gpt4_score'] = [x[0] for x in scores] + data['score'] = [x[1] for x in scores] + dump(data, record_file) + + data = load(record_file) + ret = LLaVABench_score(data).round(1) + dump(ret, score_file) + return ret + + +class MMVet(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'MMVet': 'https://opencompass.openxlab.space/utils/VLMEval/MMVet.tsv' + } + DATASET_MD5 = {'MMVet': '748aa6d4aa9d4de798306a63718455e3'} + + # It returns a DataFrame + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + from .utils.mmvet import MMVet_auxeval, MMVet_acc + + suffix = eval_file.split('.')[-1] + model = judge_kwargs['model'] + storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') + tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + nproc = judge_kwargs.pop('nproc', 4) + if not osp.exists(storage): + data = load(eval_file) + model = build_judge(max_tokens=3, **judge_kwargs) + assert model.working(), ('MMVet evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE) + + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = load(tmp_file) if osp.exists(tmp_file) else {} + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + MMVet_auxeval, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file, + ) + ans = load(tmp_file) + for k, v in zip(indices, new_results): + assert k in ans + assert ans[k]['log'] == v['log'] and ans[k]['score'] == v['score'] + data['score'] = [ans[idx]['score'] for idx in data['index']] + data['log'] = [ans[idx]['log'] for idx in data['index']] + dump(data, storage) + + score, score_fine = MMVet_acc(storage) + score_pth = storage.replace('.xlsx', '_score.csv') + score_fine_pth = storage.replace('.xlsx', '_score_fine.csv') + dump(score, score_pth) + dump(score_fine, score_fine_pth) + return score + + +class MTVQADataset(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = {'MTVQA_TEST': 'https://opencompass.openxlab.space/utils/VLMEval/MTVQA_TEST.tsv'} + DATASET_MD5 = {'MTVQA_TEST': 'd87c17dbab934b7cd89c0a3c1c5657f4'} + + @classmethod + def evaluate(self, eval_file, 
**judge_kwargs): + data = load(eval_file) + assert 'answer' in data and 'prediction' in data and 'category' in data + data['prediction'] = [str(x) for x in data['prediction']] + data['answer'] = [str(x) for x in data['answer']] + if 'split' in data: + assert np.all([x.lower() == 'test' for x in data['split']]), 'We only support MTVQA_TEST for now. ' + lt = len(data) + category_scores = defaultdict(list) + for i in range(lt): + line = data.iloc[i] + ans = line['answer'].strip().lower().replace('.', '') + pred = line['prediction'].strip().lower().replace('.', '') + cate = line['category'] + score = 1.0 if ans in pred else 0.0 + category_scores[cate].append(score) + category_scores['Average'].append(score) + # Calculate the average score for each category, the score is normalized to [0, 100] + category_averages = {category: np.mean(scores) * 100 for category, scores in category_scores.items()} + + suffix = eval_file.split('.')[-1] + result_file = eval_file.replace(f'.{suffix}', '_acc.json') + dump(category_averages, result_file) + + return category_averages + + # MT-VQA adopts a custom prompt + def build_prompt(self, line): + msgs = super().build_prompt(line) + assert sum([x['type'] == 'text' for x in msgs]) == 1 + for item in msgs: + if item['type'] == 'text': + item['value'] += '\nAnswer the question using a word or phrase in the language of the question.' + return msgs + + +class TableVQABench(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'TableVQABench': 'https://pai-aigc-photog.oss-cn-hangzhou.aliyuncs.com/mentor-vil/datasets/tablevqa-bench.tsv' + } + DATASET_MD5 = {'TableVQABench': '2550adc61bdc82d8e62f3b003de7c62d'} + + from .utils.tablevqabench import FINTABNETQA_PROMPT, VTABFACT_PROMPT, VWTQ_PROMPT + + # It returns a DataFrame + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + import pandas as pd + from .utils.tablevqabench import evaluate_fintabnet, evaluate_tabfact, evaluate_wtq + + data = load(eval_file) + assert 'answer' in data and 'prediction' in data + + data['prediction'] = data['prediction'].str.replace('^Answer: ', '', regex=True) + data_group = dict(tuple(data.groupby('split'))) + eval_result = {'split': [], 'average_scores': []} + for split in ['fintabnetqa', 'vtabfact', 'vwtq', 'vwtq_syn']: + data_split = data_group[split].to_dict(orient='records') + if split == 'fintabnetqa': + split_eval_meta = evaluate_fintabnet(data_split, ['accuracy']) + elif split == 'vtabfact': + split_eval_meta = evaluate_tabfact(data_split, ['accuracy']) + elif split == 'vwtq' or split == 'vwtq_syn': + split_eval_meta = evaluate_wtq(data_split, ['accuracy']) + eval_result['split'].append(split) + eval_result['average_scores'].append(split_eval_meta['average_scores']) + + suffix = eval_file.split('.')[-1] + result_file = eval_file.replace(f'.{suffix}', '_acc.csv') + eval_result = pd.DataFrame(eval_result) + dump(eval_result, result_file) + + return eval_result + + # TableVQABench adopts a custom prompt + def build_prompt(self, line): + msgs = super().build_prompt(line) + assert sum([x['type'] == 'text' for x in msgs]) == 1 + for item in msgs: + if item['type'] == 'text': + if line['split'] == 'fintabnetqa': + item['value'] = self.FINTABNETQA_PROMPT.format_map({'question': item['value']}) + elif line['split'] == 'vtabfact': + item['value'] = self.VTABFACT_PROMPT.format_map({'question': item['value']}) + elif line['split'] == 'vwtq_syn' or line['split'] == 'vwtq': + item['value'] = self.VWTQ_PROMPT.format_map({'question': item['value']}) + return msgs + + +class 
CustomVQADataset(ImageBaseDataset): + TYPE = 'VQA' + + def load_data(self, dataset): + data_path = osp.join(LMUDataRoot(), f'{dataset}.tsv') + + if file_size(data_path, 'GB') > 1: + local_path = data_path.replace('.tsv', '_local.tsv') + if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None): + from ..tools import LOCALIZE + + LOCALIZE(data_path, local_path) + data_path = local_path + return load(data_path) + + def evaluate(self, eval_file, **judge_kwargs): + raise NotImplementedError + + +class CRPE(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'CRPE_EXIST': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_EXIST.tsv', + 'CRPE_RELATION': 'https://huggingface.co/datasets/petter12321/crpe_vlmevalkit/resolve/main/CRPE_RELATION.tsv' + } + DATASET_MD5 = { + 'CRPE_EXIST': '315584e23ac1ff7f8719ed3b7ad90f08', + 'CRPE_RELATION': 'bad7094cde0b572288f4b119c2d0c656'} + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + from .utils.crpe import is_correct + # find-image, count-text, find-text, + # infer-choose, count-image, visual-reasoning + score = { + 'exist': 0, + 'subject': 0, + 'predicate': 0, + 'object': 0, + 'total': 0, + } + num = { + 'exist': 0, + 'subject': 0, + 'predicate': 0, + 'object': 0, + 'total': 0, + } + final_score_dict = { + 'exist': 0, + 'subject': 0, + 'predicate': 0, + 'object': 0, + 'total': 0, + } + data = load(eval_file) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + for i in tqdm(range(len(lines))): + line = lines[i] + predict = str(line['prediction']) + answers = str(line['answer']) + # print("predict =", predict) + # print("answers =", answers) + category = line['category'] + if is_correct(answers, predict): + score[category] += 1 + score['total'] += 1 + num[category] += 1 + num['total'] += 1 + + for category in ['exist', 'subject', 'predicate', 'object', 'total']: + if num[category] != 0: + final_score_dict[category] = score[category] / num[category] + else: + final_score_dict[category] = None + + score_pth = eval_file.replace('.xlsx', '_score.json') + dump(final_score_dict, score_pth) + return final_score_dict + + def build_prompt(self, line): + ROOT = LMUDataRoot() + msgs = super().build_prompt(line) + for msg in msgs: + if msg['type'] == 'image': + msg['value'] = osp.join(osp.join(ROOT, 'images', self.dataset_name), msg['value']) + return msgs + + +class QSpatial(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'QSpatial_plus': '', + 'QSpatial_scannet': '' + } + + # NOTE: To evaluate Q-Spatial-ScanNet, you need to get the permission from ScanNet website + # Once you get the permission, you can use the helper code here to download and extract necessary images: + # https://github.com/andrewliao11/Q-Spatial-Bench-code?tab=readme-ov-file#for-qspatial_scannet + qspatial_root = "TO_BE_REPLACED_WITH_THE_PATH_TO_QSPATIAL_DATASET" + url = "https://raw.githubusercontent.com/andrewliao11/Q-Spatial-Bench-code/refs/heads/main/prompt_templates/" + + def post_build(self, dataset): + # Download the prompt templates from github + + links = [ + self.url + "system_prompt.txt", + self.url + "spatial_prompt_single.txt", + self.url + "spatial_prompt_steps.txt", + self.url + "standard_prompt.txt", + self.url + "zero_shot_prompt.txt" + ] + with tempfile.TemporaryDirectory() as temp_dir: + for link in links: + tgt_path = os.path.join(temp_dir, link.split("/")[-1]) + os.system(f"wget {link} -O {tgt_path}") + + self.system_prompt = open(os.path.join(temp_dir, 
"system_prompt.txt")).read() + self._prompt_templates = dict( + spatial_prompt_single=open(os.path.join(temp_dir, "spatial_prompt_single.txt")).read(), + spatial_prompt_steps=open(os.path.join(temp_dir, "spatial_prompt_steps.txt")).read(), + standard_prompt=open(os.path.join(temp_dir, "standard_prompt.txt")).read(), + zero_shot_prompt=open(os.path.join(temp_dir, "zero_shot_prompt.txt")).read(), + ) + + # Given one data record, return the built prompt (a multi-modal message), can override + def build_prompt(self, line): + + text_prompt_template = self._prompt_templates["spatial_prompt_single"] + env = SandboxedEnvironment() + text_prompt = env.from_string(text_prompt_template).render(question=line["question"]) + tgt_path = self.dump_image(line) + + msgs = [] + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + + msgs.append(dict(type='text', value=f"{self.system_prompt}\n{text_prompt}")) + return msgs + + # Given the dataset name, return the dataset as a pandas dataframe, can override + def load_data(self, dataset): + import io + import pandas as pd + from datasets import load_dataset + + hf_dataset = load_dataset("andrewliao11/Q-Spatial-Bench", split=dataset) + df = hf_dataset.to_pandas() + + df.reset_index(drop=True, inplace=True) + df['index'] = df.index + df['answer'] = list(zip(df['answer_value'], df['answer_unit'])) + df = df[['index'] + [col for col in df.columns if col != 'index']] + + if dataset == "QSpatial_scannet": + df = df.drop(columns=["image"]) + df["image"] = [Image.open(os.path.join(self.qspatial_root, image_path)) for image_path in df["image_path"]] + else: + df["image"] = [Image.open(io.BytesIO(image_dict["bytes"])) for image_dict in df["image"]] + + df["image"] = [encode_image_to_base64(image) for image in df["image"]] + return df + + @classmethod + def get_multiplier(self, unit): + + unit = unit.lower() + if unit in ["meters", "meter", "m", "metre", "metres"]: + multiplier = 100 + elif unit in ["centimeters", "centimeter", "cm"]: + multiplier = 1 + elif unit in ["feet", "foot", "ft"]: + multiplier = 30.48 + elif unit in ["inch", "inches", "in"]: + multiplier = 2.54 + elif unit in ["mm"]: + multiplier = 0.1 + else: + print(f"Unknown unit: {unit}") + multiplier = 0. 
+ + return multiplier + + @classmethod + def parse_string(self, input_str): + # Regular expression to match the pattern (number or range, text) + match = re.match(r'\(([\d.-]+), (.+)\)', input_str) + if match: + number_part = match.group(1) + text = match.group(2) + + if '-' in number_part: + start, end = map(float, number_part.split('-')) + number = (start + end) / 2 + else: + number = float(number_part) + + return number * self.get_multiplier(text) + else: + print(f"Unable to parse the input string {input_str}") + return 0 + + @classmethod + def parse_prediction(self, vlm_response): + # Value + pattern = r'scalar{([^}]*)}' + str_inside_scalar_boxes = re.findall(pattern, vlm_response)[-1] + scalar_list = re.findall(r'\d+\.?\d*', str_inside_scalar_boxes) + parsed_scalar = np.array(scalar_list).astype(float).mean() + + # Unit + pattern = r'distance_unit{([^}]*)}' + str_inside_unit_boxes = re.findall(pattern, vlm_response) + parsed_unit = str_inside_unit_boxes[-1] + + pred_value_in_cms = parsed_scalar * self.get_multiplier(parsed_unit) + return pred_value_in_cms + + # It returns a dictionary + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + + data = load(eval_file) + if "model" in judge_kwargs: + from .utils.qspatial import QSpatial_auxeval + + # extract using model + model = judge_kwargs['model'] + suffix = eval_file.split('.')[-1] + storage = eval_file.replace(f'.{suffix}', f'_{model}.xlsx') + tmp_file = eval_file.replace(f'.{suffix}', f'_{model}.pkl') + nproc = judge_kwargs.pop('nproc', 4) + + if not osp.exists(storage): + model = build_judge(max_tokens=128, **judge_kwargs) + + assert model.working(), ('Evaluation requires a working OPENAI API\n' + DEBUG_MESSAGE) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = [line['index'] for line in lines] + + ans = {} + if osp.exists(tmp_file): + ans = load(tmp_file) + tups = [x for x, i in zip(tups, indices) if i not in ans] + indices = [i for i in indices if i not in ans] + + if len(indices): + new_results = track_progress_rich( + QSpatial_auxeval, + tups, + nproc=nproc, + chunksize=nproc, + keys=indices, + save=tmp_file, + ) + ans = load(tmp_file) + for k, v in zip(indices, new_results): + assert k in ans + assert ans[k]['log'] == v['log'] and ans[k]['res'] == v['res'] + + data['res'] = [ans[idx]['res'] for idx in data['index']] + data['log'] = [ans[idx]['log'] for idx in data['index']] + dump(data, storage) + + data = load(storage) + + pred_value_in_cms = [] + for res in data["res"]: + try: + pred_value_in_cms.append(self.parse_string(res)) + except ValueError: + pred_value_in_cms.append(0.) 
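+ # The 1e-8 added below keeps the pred/GT and GT/pred ratios finite when a response could not be
+ # parsed (stored as 0.), avoiding division by zero in the delta_2 / delta_1.5 scoring further down.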
+ + pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8 + else: + # regex parsing + pred_value_in_cms = [] + n_errors_in_parsing = 0 + for pred in data["prediction"]: + try: + parsed_value = self.parse_prediction(pred) + except IndexError: + n_errors_in_parsing += 1 + parsed_value = 1e-8 + + pred_value_in_cms.append(parsed_value) + + print(f"Encounter {n_errors_in_parsing} errors in parsing") + pred_value_in_cms = np.array(pred_value_in_cms) + 1e-8 + + # Ground truth + ground_truth_value_in_cms = [] + for answer in data["answer"]: + value, unit = eval(answer) + ground_truth_value_in_cms.append(value * self.get_multiplier(unit)) + ground_truth_value_in_cms = np.array(ground_truth_value_in_cms) + 1e-8 + + # Calculate the score + pred_gt = pred_value_in_cms / ground_truth_value_in_cms + gt_pred = ground_truth_value_in_cms / pred_value_in_cms + delta_2 = np.stack([pred_gt, gt_pred]).max(0) < 2. + delta_1_point_5 = np.stack([pred_gt, gt_pred]).max(0) < 1.5 + + data["eval_score_delta_2"] = delta_2 + data["eval_score_delta_1_point_5"] = delta_1_point_5 + + final_score_dict = { + "delta_2": delta_2.mean(), + "delta_1_point_5": delta_1_point_5.mean() + } + for question_type in set(data["question_type"]): + filtered_data = data[data["question_type"] == question_type] + delta_2_per_question_type = filtered_data["eval_score_delta_2"].mean() + delta_1_point_5_per_question_type = filtered_data["eval_score_delta_1_point_5"].mean() + final_score_dict.update({f"{question_type}_delta_2": delta_2_per_question_type}) + final_score_dict.update({f"{question_type}_delta_1_point_5": delta_1_point_5_per_question_type}) + + score_pth = eval_file.replace('.xlsx', '_score.json') + dump(final_score_dict, score_pth) + return final_score_dict + + +class MMNIAH(ImageBaseDataset): + TYPE = 'VQA' + DATASET_URL = { + 'MM_NIAH_VAL': + 'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/MM_NIAH_VAL.tsv', + 'MM_NIAH_TEST': + ['https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-aa', + 'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ab', + 'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ac', + 'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ad', + 'https://huggingface.co/datasets/petter12321/MM-NIAH-VLMEvalKit/resolve/main/part-ae']} + DATASET_MD5 = {'MM_NIAH_VAL': '27e5a8c3cef7746cb38f89cd86c474c5', + 'MM_NIAH_TEST': 'f490eb2a43096307465fe9e7ef13497c'} + + def prepare_tsv(self, url, file_md5=None): + import os + data_root = LMUDataRoot() + os.makedirs(data_root, exist_ok=True) + update_flag = False + file_name = 'MM_NIAH_VAL.tsv' if 'MM_NIAH_VAL' in url else 'MM_NIAH_TEST.tsv' + data_path = osp.join(data_root, file_name) + if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5): + pass + elif file_name == 'MM_NIAH_TEST.tsv': + warnings.warn('The dataset tsv is not downloaded') + for i in range(len(url)): + if osp.exists(osp.join(data_root, 'part-a' + chr(ord('a') + i))): + print('part_a' + chr(ord('a') + i) + ' is existed') + continue + download_file(url[i], data_path) + file_prefix = 'part-' + output_file = data_path + split_files = sorted([f for f in os.listdir(data_root) if f.startswith(file_prefix)]) + with open(output_file, 'wb') as outfile: + # 逐个读取每个拆分文件并写入到输出文件 + for filename in split_files: + with open(osp.join(data_root, filename), 'rb') 
as infile: + outfile.write(infile.read()) + update_flag = True + else: + warnings.warn('The dataset tsv is not downloaded') + download_file(url, data_path) + update_flag = True + + if file_size(data_path, 'GB') > 1: + local_path = data_path.replace('.tsv', '_local.tsv') + if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag: + from ..tools import LOCALIZE + LOCALIZE(data_path, local_path) + data_path = local_path + return load(data_path) + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + from .utils.mmniah import is_correct + # find-image, count-text, find-text, + # infer-choose, count-image, visual-reasoning + MMNIAH_score = { + 'count-text': 0, + 'find-image': 0, + 'find-text': 0, + 'infer-choose': 0, + 'count-image': 0, + 'visual-reasoning': 0, + 'total': 0, + } + MMNIAH_num = { + 'count-text': 0, + 'find-image': 0, + 'find-text': 0, + 'infer-choose': 0, + 'count-image': 0, + 'visual-reasoning': 0, + 'total': 0, + } + final_score_dict = { + 'count-text': 0, + 'find-image': 0, + 'find-text': 0, + 'infer-choose': 0, + 'count-image': 0, + 'visual-reasoning': 0, + 'total': 0, + } + data = load(eval_file) + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + for i in tqdm(range(len(lines))): + line = lines[i] + predict = line['prediction'] + answers = line['answer'] + category = line['category'] + if category in ['visual-reasoning', 'find-image']: + answers = int(answers) + if is_correct(answers, predict): + MMNIAH_score[category] += 1 + MMNIAH_score['total'] += 1 + MMNIAH_num[category] += 1 + MMNIAH_num['total'] += 1 + + for category in ['find-image', 'count-text', 'find-text', + 'infer-choose', 'count-image', 'visual-reasoning', 'total']: + if MMNIAH_num[category] != 0: + final_score_dict[category] = MMNIAH_score[category] / MMNIAH_num[category] + else: + final_score_dict[category] = None + + score_pth = eval_file.replace('.xlsx', '_score.json') + dump(final_score_dict, score_pth) + return final_score_dict + + def build_prompt(self, line): + msgs = super().build_prompt(line) + if isinstance(line, int): + line = self.data.iloc[line] + totalchoice = line['multi-choice options'] + totalchoice = eval(totalchoice) + # find-image, count-text, find-text, + # infer-choose, count-image, visual-reasoning + context = msgs[-1]['value'] + context = eval(context) + question = context[0] + '\n' + context[1] + # tgt_path是所有图像地址列表 + tgt_path = [] + for i in range(len(msgs) - 1): + tgt_path.append(msgs[i]['value']) + choices = totalchoice[0] + choices_image = totalchoice[1] + if choices: + for c_idx, c in enumerate(choices): + question = f"{question}\n{chr(c_idx + ord('A'))}. {c}" + question += "\nAnswer with the option's letter from the given choices directly." + elif choices_image: + for c_idx in range(len(choices_image)): + question = f"{question}\n{chr(c_idx + ord('A'))}. " + question += "\nAnswer with the option's letter from the given choices directly." + else: + question += '\nAnswer the question using a single word or phrase.' 
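+ # The remainder of build_prompt splits the assembled question into text segments, interleaves them
+ # with the per-sample image paths (asserting one more text segment than images), trims the sentinel
+ # characters wrapped around the question, and drops empty text entries, so the returned message list
+ # alternates text and image items.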
+ question = '' + question + '' + question = question.split('') + if choices_image: + for i in range(len(question) - 5): + question[i] = question[i] + '\n' + for i in range(len(question) - 5, len(question) - 1): + question[i] = question[i] + '' + else: + for i in range(len(question) - 1): + question[i] = question[i] + '\n' + assert len(tgt_path) + 1 == len(question) + context = [] + for i in range(len(tgt_path)): + context.append(question[i]) + context.append(tgt_path[i]) + context.append(question[-1]) + context[0] = context[0][7:] + context[-1] = context[-1][:-5] + msgs = [] + for i in range(len(context)): + if i % 2 == 0: + msgs.append(dict(type='text', value=context[i])) + else: + ROOT = LMUDataRoot() + msgs.append(dict(type='image', value=osp.join(osp.join(ROOT, 'images', self.dataset_name), context[i]))) + for element in msgs: + if element['value'] == '': + msgs.remove(element) + return msgs diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/image_yorn.py b/vlmeval/VLMEvalKit_old/vlmeval/dataset/image_yorn.py new file mode 100644 index 0000000000000000000000000000000000000000..46083e6c3b8147901448a8919d20d3e58dfc2b9f --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/dataset/image_yorn.py @@ -0,0 +1,95 @@ +from ..smp import * +from ..utils import * +from .image_base import ImageBaseDataset +from .utils import build_judge, DEBUG_MESSAGE + + +class ImageYORNDataset(ImageBaseDataset): + + TYPE = 'Y/N' + + DATASET_URL = { + 'MME': 'https://opencompass.openxlab.space/utils/VLMEval/MME.tsv', + 'HallusionBench': 'https://opencompass.openxlab.space/utils/VLMEval/HallusionBench.tsv', + 'POPE': 'https://opencompass.openxlab.space/utils/VLMEval/POPE.tsv', + 'AMBER': 'https://huggingface.co/datasets/yifanzhang114/AMBER_base64/resolve/main/AMBER.tsv', + } + + DATASET_MD5 = { + 'MME': 'b36b43c3f09801f5d368627fb92187c3', + 'HallusionBench': '0c23ac0dc9ef46832d7a24504f2a0c7c', + 'POPE': 'c12f5acb142f2ef1f85a26ba2fbe41d5', + 'AMBER': '970d94c0410916166e0a76ba75da7934', + } + + # It returns a dataframe + def evaluate(self, eval_file, **judge_kwargs): + from .utils.yorn import YOrN_Extraction, YOrN_auxeval + from .utils.yorn import default_rating, MME_rating, Hallusion_rating, POPE_rating, AMBER_rating + + dataset = self.dataset_name + data = load(eval_file) + data['prediction'] = [str(x) for x in data['prediction']] + storage = eval_file.replace('.xlsx', '_auxmatch.xlsx') + tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') + nproc = judge_kwargs.pop('nproc', 4) + + if not osp.exists(storage): + ans_map = {k: YOrN_Extraction(v) for k, v in zip(data['index'], data['prediction'])} + if osp.exists(tmp_file): + tmp = load(tmp_file) + for k in tmp: + if ans_map[k] == 'Unknown' and tmp[k] != 'Unknown': + ans_map[k] = tmp[k] + + data['extracted'] = [ans_map[x] for x in data['index']] + unknown = data[data['extracted'] == 'Unknown'] + + model = judge_kwargs.get('model', 'exact_matching') + if model == 'exact_matching': + model = None + elif gpt_key_set(): + model = build_judge(**judge_kwargs) + if not model.working(): + warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation') + warnings.warn(DEBUG_MESSAGE) + model = None + else: + model = None + warnings.warn('OPENAI_API_KEY is not working properly, will use exact matching for evaluation') + + if model is not None: + lt = len(unknown) + lines = [unknown.iloc[i] for i in range(lt)] + tups = [(model, line) for line in lines] + indices = list(unknown['index']) + if len(tups): + res = track_progress_rich( + 
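+ # Re-run extraction with the judge model only for rows whose rule-based Yes/No extraction returned
+ # 'Unknown'; intermediate results are checkpointed to tmp_file so an interrupted run can resume.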
YOrN_auxeval, tups, nproc=nproc, chunksize=nproc, keys=indices, save=tmp_file) + for k, v in zip(indices, res): + ans_map[k] = v + + data['extracted'] = [ans_map[x] for x in data['index']] + dump(data, storage) + + data = load(storage) + if listinstr(['AMBER'], dataset): + data['score'] = (data['answer'].str.lower() == data['extracted'].str.lower()) + else: + data['score'] = (data['answer'] == data['extracted']) + dump(data, storage) + + if dataset is not None and listinstr(['MME'], dataset): + score = MME_rating(storage) + elif dataset is not None and listinstr(['Hallusion'], dataset): + score = Hallusion_rating(storage) + elif dataset is not None and listinstr(['POPE'], dataset): + score = POPE_rating(storage) + elif dataset is not None and listinstr(['AMBER'], dataset): + score = AMBER_rating(storage) + else: + score = default_rating(storage) + + score_tgt = eval_file.replace('.xlsx', '_score.csv') + dump(score, score_tgt) + return score diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/miabench.py b/vlmeval/VLMEvalKit_old/vlmeval/dataset/miabench.py new file mode 100644 index 0000000000000000000000000000000000000000..2e99d39ecb3a62957dbe2b8cdd0bdb3911532729 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/dataset/miabench.py @@ -0,0 +1,167 @@ +import json +import os + +import pandas as pd + +from .image_base import ImageBaseDataset +from ..smp import * +from .utils import build_judge, DEBUG_MESSAGE +from ..utils import track_progress_rich + + +def generate_prompt(d): + question = d['question'] + weights = eval(d['component_weight']) + components = eval(d['components']) + num_of_component = int(d['num_of_component']) + response = d['prediction'] + + if num_of_component == 1: + components = f"The first component is: '{components[0]}'. " + score = f"The first component is worth: {weights[0]} scores. " + elif num_of_component == 2: + components = f"The first component is: '{components[0]}', and the second component is '{components[1]}'. " + score = f"The first and second component is each worth {weights[0]} and {weights[1]} scores. " + elif num_of_component == 3: + components = ( + f"The first component is: '{components[0]}', and the second component is '{components[1]}', " + f"and the third component is '{components[2]}'. " + ) + score = ( + "The first, second, and third component is each worth " + f"{weights[0]}, {weights[1]}, and {weights[2]} scores." + ) + elif num_of_component == 4: + components = ( + f"The first component is: '{components[0]}', and the second component is '{components[1]}', " + f"and the third component is '{components[2]}', and the fourth component is '{components[3]}'. " + ) + score = ( + "The first, second, third, and fourth component is each worth " + f"{weights[0]}, {weights[1]}, {weights[2]}, and {weights[3]} scores." + ) + elif num_of_component == 5: + components = ( + f"The first component is: '{components[0]}', and the second component is '{components[1]}', " + f"and the third component is '{components[2]}', and the fourth component is '{components[3]}', " + f"and the fifth component is '{components[4]}'. " + ) + score = ( + "The first, second, third, fourth, and fifth component is each worth " + f"{weights[0]}, {weights[1]}, {weights[2]}, {weights[3]}, and {weights[4]} scores." + ) + + return ( + "Here is an instruction for a multimodal LLM: '" + f"{question}" + "'. You need to grade if the response from the model follows each component of the instruction. " + f"{components}" + "The response is: '" + f"{response}" + "'. 
You need to score the response and be strict. The total score ranges from 0 to 10, " + "depending on if the response follows the instruction. " + f"{score}" + "List scores of each component, and the total score in one sentence in this format: " + "score of component 1: x/2, score of component 2: y/8, total score: z/10. Then explain your reasons." + ) + + +def process_rawscore(component_type, raw_score): + first_sentence = raw_score.split('.')[0].split(',') + score_dict = {} + for i in range(len(first_sentence) - 1): + score_ = first_sentence[i].split(':')[1][1:].split('/') + score = int(score_[0]) / int(score_[1]) + score_dict[component_type[i]] = score + total_score_ = first_sentence[i + 1].split(':')[1][1:].split('/') + total_score = int(total_score_[0]) / int(total_score_[1]) + score_dict['total_score'] = total_score + return score_dict + + +def get_score_dict(data, score_raw): + cat_score_dict = {} + for i in range(len(data)): + try: + cmp = data['component_type'][i][2:-2] + cmp_list = cmp.split('\', \'') + score_dict = process_rawscore(cmp_list, score_raw[i]) + for key, val in score_dict.items(): + if key not in cat_score_dict.keys(): + cat_score_dict[key] = [val] + else: + cat_score_dict[key].append(val) + except: + pass + cat_score_dict_average = {} + for key, val in cat_score_dict.items(): + cat_score_dict_average[key] = sum(val) / len(val) + return cat_score_dict_average + + +class MIABench(ImageBaseDataset): + TYPE = 'VQA' + + DATASET_URL = { + 'MIA-Bench': 'https://opencompass.openxlab.space/utils/VLMEval/Mia-Bench.tsv', + } + DATASET_MD5 = { + 'MIA-Bench': '0b9de595f4dd40af18a69b94d89aba82', + } + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + judge_name = judge_kwargs.pop('model', 'gpt-4o') + + model = build_judge(model=judge_name, **judge_kwargs) + suffix = eval_file.split('.')[-1] + + storage = eval_file.replace(f'.{suffix}', f'_{judge_name}.xlsx') # noqa: F841 + tmp_file = eval_file.replace(f'.{suffix}', f'_{judge_name}.pkl') # noqa: F841 + nproc = judge_kwargs.pop('nproc', 4) # noqa: F841 + + if not osp.exists(storage): + data = load(eval_file) + num_samples = len(data) + lines = [data.loc[i] for i in range(num_samples)] + prompts = [generate_prompt(line) for line in lines] + org_data = MIABench('MIA-Bench').data + img_map = {x: y for x, y in zip(org_data['index'], org_data['image'])} + image_b64 = [img_map[idx] for idx in data['index']] + indices = list(data['index']) + mm_messages = [ + dict(message=[ + dict(type='text', value=prompt), + dict(type='image', value=f'data:image/jpeg;base64,{b64}') + ]) + for prompt, b64 in zip(prompts, image_b64) + ] + + res = {} + if osp.exists(tmp_file): + res = load(tmp_file) + + jobs = {k: v for k, v in zip(indices, mm_messages) if k not in res} + job_keys = list(jobs.keys()) + job_vals = [jobs[k] for k in job_keys] + + resps = track_progress_rich( + model.generate, + job_vals, + nproc=nproc, + chunksize=nproc, + keys=job_keys, + save=tmp_file, + ) + for k, resp in zip(job_keys, resps): + res[k] = resp + data['score_raw'] = [res[idx] for idx in indices] + dump(data, storage) + + goresult = load(storage) + results = get_score_dict(goresult, goresult['score_raw']) + result_pth = storage.replace('.xlsx', '_score.csv') + results_pd = pd.DataFrame.from_dict(list(results.items())) + dump(results_pd, result_pth) + + return results diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/mmmath.py b/vlmeval/VLMEvalKit_old/vlmeval/dataset/mmmath.py new file mode 100644 index 
0000000000000000000000000000000000000000..a6d78d57c82b4cd6ea9863fae2317bda825d2aaa --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/dataset/mmmath.py @@ -0,0 +1,446 @@ +import re +import json +import sympy as sp +import numpy as np +from sympy import simplify, Eq, sympify, Pow, pi +from sympy.parsing.latex import parse_latex +import sys +import math +import os +import argparse + +from .image_base import ImageBaseDataset +from ..utils import track_progress_rich +from ..smp import load, dump + + +class AutoScoringJudge: + def __init__(self): + # Map of special symbols to their replacements + self.special_signal_map = { + "\\left": "", + "\\right": "", + "厘米":"", + # "∶": ":", + ",": ",", + "$": "", + "(":"(", + ")":")", + "\\infty":"oo", + "\\colon ":":", + # "\\approx": "=", + # "\\simeq": "=", + # "\\sim": "=", + # "^\\prime": "'", + # "^{\\prime}": "'", + "+":"+", + "\\, ": "", + "\\,":"", + "^\\circ": "", + "^{\\circ}": "", + # "%": "", + } + self.pi = parse_latex("\\pi") + # MM-Math default precision + self.precision = 1e-2 + + def trans_greater_sign_to_interval(self, expr:str): + expr_tmp = expr.split("<") + return "(" + expr_tmp[0] + ", " + expr_tmp[-1] + ")" + + def split_by_comma(self, expr: str): + # Splits expressions by commas outside of brackets + in_bracket_num = 0 + splitted_expr = [] + start_idx = 0 + for i, char in enumerate(expr): + if char in ["(", "["]: + in_bracket_num += 1 + elif char in [")", "]"]: + in_bracket_num -= 1 + elif char == "," and in_bracket_num == 0: + splitted_expr.append(expr[start_idx:i].strip()) + start_idx = i + 1 + + if start_idx < len(expr): + splitted_expr.append(expr[start_idx:].strip()) + + return splitted_expr + + def trans_plus_minus_sign(self, expr_list: list): + # Translates plus-minus signs into separate expressions + new_expr_list = [] + for expr in expr_list: + if "\\pm" in expr: + new_expr_list.append(expr.replace("\\pm", "+")) + new_expr_list.append(expr.replace("\\pm", "-")) + else: + new_expr_list.append(expr) + + return new_expr_list + + def judge(self, expression1, expression2, precision=1e-2): + # Judge if two expressions are equal (expression1 is considered as the Ground Truth) + # Default precision is a list for supporting multiple expressions + precision = precision if isinstance(precision, list) else [precision] + + try: + expression1, expression2 = self.preprocess(expression1, expression2) + except: + return False + if expression1 == expression2: + # print("Exactly equal") + return True + + # Remove Chinese characters from the string, as answers like "yes" or "no" in Chinese have been considered + expression1 = expression1 if re.fullmatch(r"[\u4e00-\u9fff]+", expression1) else re.sub(r'[\u4e00-\u9fff]+', '', expression1) # noqa: E501 + expression2 = expression2 if re.fullmatch(r'[\u4e00-\u9fff]+', expression2) else re.sub(r'[\u4e00-\u9fff]+', '', expression2) # noqa: E501 + # Check if two < or > in expression + if self.is_two_greater_sign(expression1): + expression1 = self.trans_greater_sign_to_interval(expression1) + + if self.is_two_greater_sign(expression2): + expression2 = self.trans_greater_sign_to_interval(expression2) + + expression1 = self.split_by_comma(expression1) + expression2 = self.split_by_comma(expression2) + + temp_list1 = self.trans_plus_minus_sign(expression1) + temp_list2 = self.trans_plus_minus_sign(expression2) + + # Set up a list for allowed errors + if len(precision) <= 1: + precision = precision * len(temp_list1) + + if len(temp_list1) != len(temp_list2): + return False + + # Check if elements in both 
lists can be paired and are equal + idx = -1 + while len(temp_list1) != 0: + idx = (idx + 1) % len(temp_list1) + + item1 = temp_list1[idx] + self.precision = precision[idx] + + for item2 in temp_list2: + if self.is_equal(item1, item2): + temp_list1.remove(item1) + temp_list2.remove(item2) + precision.remove(self.precision) + break + else: + # If no match was found, return False + return False + + # If all elements are matched, return True + return True + + def is_interval(self, expr): + # Checks if an expression is an interval + return expr.startswith(("(", "[")) and expr.endswith((")", "]")) + + def is_two_greater_sign(self, expr): + match = re.findall(r'<', expr) + return len(match) == 2 + + def sympy_sub_pi(self, expression_sympy): + # Replaces the symbol for pi in sympy expressions with its numerical value + return expression_sympy.subs(self.pi, math.pi) + + def is_equal(self, expression1, expression2): + # Default first expression is ground truth. Check if expressions are equal in different aspects + if expression1 == expression2 and expression1 != "" and expression2 != "": + # print("Equivalent natively") + return True + + # First check if both are intervals + if self.is_interval(expression1) and self.is_interval(expression2): + try: + if self.interval_equal(expression1, expression2): + # print("Interval equivalent") + return True + except: + return False + + # Then check for numerical equality + try: + if self.numerical_equal(expression1, expression2): + # print("Numerically equivalent") + return True + except: + pass + # Then check if expressions are mathematically equal + try: + if self.expression_equal(expression1, expression2) and not ("=" in expression1 and "=" in expression2): + # print("Expression equivalent") + return True + except: + pass + + # Lastly, check for equation equality + try: + if self.equation_equal(expression1, expression2): + # print("Equation equivalent") + return True + except: + pass + + return False + + def numerical_equal(self, expression1: str, expression2: str, include_percentage: bool = True): + # Check if two numerical values are equal within an allowed error range + # Includes possible percentage cases + reference = float(expression1) + prediction = float(expression2) + + if include_percentage: + gt_result = [reference / 100, reference, reference * 100] + else: + gt_result = [reference] + + for item in gt_result: + if abs(item - prediction) <= self.precision * 1.01: + return True + return False + + def expression_equal(self, exp1, exp2): + # Check if two expressions are mathematically equivalent + # Extract expression and use sympy for equivalence checking + def extract_expression(expression): + if "=" in expression: + expression = expression.split("=")[1] + return expression.strip() + + exp1 = extract_expression(exp1) + exp2 = extract_expression(exp2) + + exp_too_long = len(exp1) > 300 or len(exp2) > 300 + + expr1_sym = sympify(parse_latex(exp1)) + expr2_sym = sympify(parse_latex(exp2)) + if expr1_sym == expr2_sym: + return True + else: + expr1_sym = self.sympy_sub_pi(expr1_sym) + expr2_sym = self.sympy_sub_pi(expr2_sym) + + if (expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol)) or \ + (not expr1_sym.has(sp.Symbol) and expr2_sym.has(sp.Symbol)): + return False + elif not expr1_sym.has(sp.Symbol) and not expr2_sym.has(sp.Symbol): + try: + if not (self.can_compute_power(expr1_sym) and self.can_compute_power(expr2_sym)): + print("These two numbers cannot be calculated by the current computer for: " + f"\"{str(expr1_sym)}\" and 
\"{str(expr2_sym)}\"") + return False + if exp_too_long: + print(f'Expression {exp1} or {exp2} is too long to compute. ') + return False + if abs(expr1_sym.evalf() - expr2_sym.evalf()) <= self.precision * 1.01: + return True + else: + return False + except: + return False + elif exp_too_long: + print(f'Expression {exp1} or {exp2} is too long to compute. ') + return False + else: + try: + simplified_expr = simplify(expr1_sym - expr2_sym) + num_value = simplified_expr.evalf() + return abs(num_value) < 1e-3 + except: + return False + + def equation_equal(self, expression1, expression2): + # Check if two equations are mathematically equivalent + # Simplify equations and use sympy for equivalence checking + def simplify_equation(latex_eq): + lhs, rhs = latex_eq.split('=') + + lhs_expr = parse_latex(lhs) + rhs_expr = parse_latex(rhs) + + equation = Eq(lhs_expr, rhs_expr) + + simplified_eq = simplify(equation.lhs - equation.rhs) + + return simplified_eq + + expr1_sym = simplify_equation(expression1) + expr2_sym = simplify_equation(expression2) + + division_result_1 = simplify(expr1_sym / expr2_sym) + division_result_2 = simplify(expr2_sym / expr1_sym) + + if ((division_result_1.is_Integer and division_result_1 != 0) or # noqa: W504 + (division_result_2.is_Integer and division_result_2 != 0)): + return True + else: + return False + + def interval_equal(self, expression1, expression2): + # Check if two intervals are mathematically equivalent + def compare_two_interval(inter1, inter2): + if inter1[0] != inter2[0] or inter1[-1] != inter2[-1]: + return False + + inter1 = inter1.strip('[]()') + inter2 = inter2.strip('[]()') + + items_1 = inter1.split(',') + items_2 = inter2.split(',') + + for item_1, item_2 in zip(items_1, items_2): + if not self.expression_equal(item_1, item_2): + return False + return True + + interval1 = expression1 + interval2 = expression2 + + if interval1 == interval2: + return True + else: + inter_list1 = interval1.split("\\cup") + inter_list2 = interval2.split("\\cup") + + if len(inter_list1) != len(inter_list2): + return False + else: + for inter1, inter2 in zip(inter_list1, inter_list2): + if not compare_two_interval(inter1, inter2): + return False + return True + + def preprocess(self, expression1, expression2): + # Preprocess expressions to extract and replace special symbols + def extract_boxed_content(latex_str): + boxed_matches = re.finditer(r'\\boxed{', latex_str) + results = "" + + for match in boxed_matches: + start_index = match.end() + end_index = start_index + stack = 1 + + while stack > 0 and end_index < len(latex_str): + if latex_str[end_index] == '{': + stack += 1 + elif latex_str[end_index] == '}': + stack -= 1 + end_index += 1 + + if stack == 0: + content = latex_str[start_index:end_index - 1] + results += content + "," + else: + raise ValueError("Mismatched braces in LaTeX string.") + + if results == "": + last_line_ans = latex_str.strip().split("\n")[-1] + dollar_pattern = r"\$(.*?)\$" + answers = re.findall(dollar_pattern, last_line_ans) + + if answers: + for ans in answers: + results += ans + "," + else: + results = latex_str + + return results + + def sepcial_symbol_replace(expression): + + expression = expression.replace("\\text{cm}^2", '').replace("\\text{cm}", "").replace("\\,cm", '').replace("\\text{ cm}", '').replace("cm", '').replace("\\text{分米}^2", '').replace("cm^{2}", '').replace("60 \\text{ cm}^2",'').replace("\\ \\text{m}", "").replace("\\text{米}","").strip() # noqa: E501 + + expression = re.sub(r"(.+)m$", r"\1", expression) + + if "\\in " in 
expression: + expression = expression.split("\\in ")[1] + + for signal in self.special_signal_map: + expression = expression.replace(signal, self.special_signal_map[signal]) + + expression = re.sub(r'(\\sin|\\cos|\\tan)(\d+)', r'\1((\2/180)\\pi)', expression) + + expression = expression.strip("\n,.:;^_=+`!@#%^&*~,。") + + pattern = r'\\(?:mathrm|mathbf)\{~?([^}]*)\}' + expression = re.sub(pattern, r'\1', expression) + + return expression + + exp1, exp2 = extract_boxed_content(expression1), extract_boxed_content(expression2) + + exp1, exp2 = sepcial_symbol_replace(exp1), sepcial_symbol_replace(exp2) + + return exp1, exp2 + + def can_compute_power(self, expr): + # Checks if a power expression can be computed + if isinstance(expr, Pow): + base, exp = expr.as_base_exp() + if base.is_number and exp.is_number: + MAX_EXP = 1000 # Adjust based on computing environment + if abs(exp.evalf()) > MAX_EXP: + return False + else: + return True + else: + return False + else: + return True # Not a power expression, can compute + + +class MMMath(ImageBaseDataset): + + TYPE = 'VQA' + + DATASET_URL = { + 'MM-Math': 'https://opencompass.openxlab.space/utils/VLMEval/MM-Math.tsv', + } + DATASET_MD5 = { + 'MM-Math': '1f064ed7c4e0e8926a3fa65849419ca5', + } + + @classmethod + def evaluate(self, eval_file, **kwargs): + + data = load(eval_file) + judger = AutoScoringJudge() + func = judger.judge + + tups = [dict(expression1=x, expression2=y) for x, y in zip(data['answer'], data['prediction'])] + + res = track_progress_rich(func, tups, nproc=16) + data['hit'] = res + dump(data, eval_file) + + score_file = eval_file.replace('.xlsx', '_score.json') + score = {} + score['overall'] = np.mean(data['hit']) + # Results by Difficulty + difficulties = set(data['difficulty']) + for d in difficulties: + score[f'Difficulty-{d}'] = np.mean(data[data['difficulty'] == d]['hit']) + + # Results by Year + years = set(data['year']) + for y in years: + score[f'Year-{y}'] = np.mean(data[data['year'] == y]['hit']) + + # Results by Knowledge-L1 + points = set(data['knowledge_l1']) + for p in points: + score[f'Knowledge-L1-{p}'] = np.mean(data[data['knowledge_l1'] == p]['hit']) + + # Results by Knowledge-L2 + points = set(data['knowledge_l2']) + for p in points: + score[f'Knowledge-L2-{p}'] = np.mean(data[data['knowledge_l2'] == p]['hit']) + + dump(score, score_file) + return score diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/mvbench.py b/vlmeval/VLMEvalKit_old/vlmeval/dataset/mvbench.py new file mode 100644 index 0000000000000000000000000000000000000000..a4c03c87e0e31d7b125430004c8a7748e5dbb81f --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/dataset/mvbench.py @@ -0,0 +1,668 @@ +import huggingface_hub +from huggingface_hub import snapshot_download +from ..smp import * +from .video_base import VideoBaseDataset +from .utils import build_judge, DEBUG_MESSAGE +from ..utils import track_progress_rich +import torchvision.transforms as T +from torchvision import transforms +from torchvision.transforms.functional import InterpolationMode +from decord import VideoReader, cpu +import imageio +import cv2 +import zipfile +import os +import glob +from .utils.mvbench import * + +FAIL_MSG = 'Failed to obtain answer via API.' + + +class MVBench(VideoBaseDataset): + + MD5 = 'fd21d36522cdedd46d84dc46715ad832' + SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \ +the detail and movement of objects, and the action and pose of persons. 
\ +Based on your observations, select the best option that accurately addresses the question. +""" + + TYPE = 'Video-MCQ' + + def __init__(self, dataset='MVBench', pack=False): + self.type_data_list = { + 'Action Sequence': ('action_sequence.json', + 'your_data_path/star/Charades_v1_480/', 'video', True), # has start & end + 'Action Prediction': ('action_prediction.json', + 'your_data_path/star/Charades_v1_480/', 'video', True), # has start & end + 'Action Antonym': ('action_antonym.json', + 'your_data_path/ssv2_video/', 'video', False), + 'Fine-grained Action': ('fine_grained_action.json', + 'your_data_path/Moments_in_Time_Raw/videos/', 'video', False), + 'Unexpected Action': ('unexpected_action.json', + 'your_data_path/FunQA_test/test/', 'video', False), + 'Object Existence': ('object_existence.json', + 'your_data_path/clevrer/video_validation/', 'video', False), + 'Object Interaction': ('object_interaction.json', + 'your_data_path/star/Charades_v1_480/', 'video', True), # has start & end + 'Object Shuffle': ('object_shuffle.json', + 'your_data_path/perception/videos/', 'video', False), + 'Moving Direction': ('moving_direction.json', + 'your_data_path/clevrer/video_validation/', 'video', False), + 'Action Localization': ('action_localization.json', + 'your_data_path/sta/sta_video/', 'video', True), # has start & end + 'Scene Transition': ('scene_transition.json', + 'your_data_path/scene_qa/video/', 'video', False), + 'Action Count': ('action_count.json', + 'your_data_path/perception/videos/', 'video', False), + 'Moving Count': ('moving_count.json', + 'your_data_path/clevrer/video_validation/', 'video', False), + 'Moving Attribute': ('moving_attribute.json', + 'your_data_path/clevrer/video_validation/', 'video', False), + 'State Change': ('state_change.json', + 'your_data_path/perception/videos/', 'video', False), + 'Fine-grained Pose': ('fine_grained_pose.json', + 'your_data_path/nturgbd/', 'video', False), + 'Character Order': ('character_order.json', + 'your_data_path/perception/videos/', 'video', False), + 'Egocentric Navigation': ('egocentric_navigation.json', + 'your_data_path/vlnqa/', 'video', False), + 'Episodic Reasoning': ('episodic_reasoning.json', + 'your_data_path/tvqa/frames_fps3_hq/', 'frame', True), # has start & end, read frame + 'Counterfactual Inference': ('counterfactual_inference.json', + 'your_data_path/clevrer/video_validation/', 'video', False), + } + super().__init__(dataset=dataset, pack=pack) + + @classmethod + def supported_datasets(cls): + return ['MVBench'] + + def prepare_dataset(self, dataset_name='MVBench', repo_id='OpenGVLab/MVBench'): + def check_integrity(pth): + data_file = osp.join(pth, f'{dataset_name}.tsv') + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MD5: + return False + + data = load(data_file) + for idx, item in data.iterrows(): + if not osp.exists(osp.join(pth, item['prefix'], item['video'])): + return False + return True + + if modelscope_flag_set(): + repo_id = 'modelscope/MVBench' + + cache_path = get_cache_path(repo_id, branch='main') + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + def unzip_hf_zip(pth): + pth = os.path.join(pth, 'video/') + for filename in os.listdir(pth): + if filename.endswith('.zip'): + # 构建完整的文件路径 + zip_path = os.path.join(pth, filename) + + # 解压 ZIP 文件 + with zipfile.ZipFile(zip_path, 'r') as zip_ref: + zip_ref.extractall(pth) + + def generate_tsv(pth): + data_file = osp.join(pth, f'{dataset_name}.tsv') + if 
os.path.exists(data_file) and md5(data_file) == self.MD5: + return + json_data_dir = os.path.join(pth, 'json') + self.data_list = [] + for k, v in self.type_data_list.items(): + with open(os.path.join(json_data_dir, v[0]), 'r') as f: + json_data = json.load(f) + for data in json_data: + if os.path.exists(os.path.join(pth, v[1].replace('your_data_path', 'video'), data['video'])): + self.data_list.append({ + 'task_type': k, + 'prefix': v[1].replace('your_data_path', 'video'), + 'data_type': v[2], + 'bound': v[3], + 'start': data['start'] if 'start' in data.keys() else None, + 'end': data['end'] if 'end' in data.keys() else None, + 'video': data['video'], + 'question': data['question'], + 'answer': data['answer'], + 'candidates': data['candidates'] + }) + else: + print( + 'NTURGB-D zip file is removed according to MVBench, you can view it at ' + 'https://huggingface.co/datasets/OpenGVLab/MVBench for detailed reason.' + ) + raise Exception( + f"{os.path.join(v[1].replace('your_data_path', 'video'), data['video'])} does not exist" + ) + + data_df = pd.DataFrame(self.data_list) + data_df = data_df.assign(index=range(len(data_df))) + data_df.to_csv(data_file, sep='\t', index=False) + + def move_files(pth): + src_folder = os.path.join(pth, 'video/data0613') + if not os.path.exists(src_folder): + return + for subdir in os.listdir(src_folder): + subdir_path = os.path.join(src_folder, subdir) + if os.path.isdir(subdir_path): + for subsubdir in os.listdir(subdir_path): + subsubdir_path = os.path.join(subdir_path, subsubdir) + if os.path.isdir(subsubdir_path): + for item in os.listdir(subsubdir_path): + item_path = os.path.join(subsubdir_path, item) + target_folder = os.path.join(pth, 'video', subdir, subsubdir) + if not os.path.exists(target_folder): + os.makedirs(target_folder) + target_path = os.path.join(target_folder, item) + try: + shutil.move(item_path, target_path) + except Exception as e: + print(f"Error moving {item_path} to {target_path}: {e}") + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + dataset_path = dataset_snapshot_download(dataset_id=repo_id, revision='master') + else: + hf_token = os.environ.get('HUGGINGFACE_TOKEN') + huggingface_hub.login(hf_token) + dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset') + unzip_hf_zip(dataset_path) + move_files(dataset_path) + generate_tsv(dataset_path) + + data_file = osp.join(dataset_path, f'{dataset_name}.tsv') + + self.decord_method = { + 'video': self.read_video, + 'gif': self.read_gif, + 'frame': self.read_frame, + } + + self.nframe = 8 + self.frame_fps = 3 + + # transform + self.transform = T.Compose([ + Stack(), + ToTorchFormatTensor() + ]) + + return dict(root=dataset_path, data_file=data_file) + + def get_index(self, bound, fps, max_frame, first_idx=0): + if bound: + start, end = bound[0], bound[1] + else: + start, end = -100000, 100000 + start_idx = max(first_idx, round(start * fps)) + end_idx = min(round(end * fps), max_frame) + seg_size = float(end_idx - start_idx) / self.num_segments + frame_indices = np.array([ + int(start_idx + (seg_size / 2) + np.round(seg_size * idx)) + for idx in range(self.num_segments) + ]) + return frame_indices + + def read_video(self, video_path, bound=None): + vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) + max_frame = len(vr) - 1 + fps = float(vr.get_avg_fps()) + + images_group = list() + frame_indices = self.get_index(bound, fps, max_frame, first_idx=0) + for frame_index in frame_indices: + img = 
Image.fromarray(vr[frame_index].asnumpy()) + images_group.append(img) + torch_imgs = self.transform(images_group) + return torch_imgs + + def read_gif(self, video_path, bound=None, fps=25): + gif = imageio.get_reader(video_path) + max_frame = len(gif) - 1 + + images_group = list() + frame_indices = self.get_index(bound, fps, max_frame, first_idx=0) + for index, frame in enumerate(gif): + if index in frame_indices: + img = cv2.cvtColor(frame, cv2.COLOR_RGBA2RGB) + img = Image.fromarray(img) + images_group.append(img) + torch_imgs = self.transform(images_group) + return torch_imgs + + def read_frame(self, video_path, bound=None, fps=3): + max_frame = len(os.listdir(video_path)) + images_group = list() + frame_indices = self.get_index(bound, fps, max_frame, first_idx=1) # frame_idx starts from 1 + for frame_index in frame_indices: + img = Image.open(os.path.join(video_path, f'{frame_index:05d}.jpg')) + images_group.append(img) + torch_imgs = self.transform(images_group) + return torch_imgs + + def save_video_frames(self, imgs, video_name, frames): + + frame_paths = self.frame_paths(video_name, frames) + flag = np.all([osp.exists(p) for p in frame_paths]) + + if not flag: + block_size = imgs.size(0) // frames + split_tensors = torch.split(imgs, block_size) + to_pil = transforms.ToPILImage() + images = [to_pil(arr) for arr in split_tensors] + for im, pth in zip(images, frame_paths): + if not osp.exists(pth): + im.save(pth) + + return frame_paths + + def qa_template(self, data): + question = f"Question: {data['question']}\n" + question += 'Options:\n' + answer = data['answer'] + answer_idx = -1 + for idx, c in enumerate(eval(data['candidates'])): + question += f"({chr(ord('A') + idx)}) {c}\n" + if c == answer: + answer_idx = idx + question = question.rstrip() + answer = f"({chr(ord('A') + answer_idx)}) {answer}" + return question, answer + + def load_into_video_and_process(self, line): + try: + from moviepy.editor import VideoFileClip, ImageSequenceClip + except: + raise ImportError( + 'MoviePy is not installed, please install it by running "pip install moviepy==1.0.3"' + ) + video_path = os.path.join(self.data_root, line['prefix'], line['video']) + + if line['data_type'] in ['gif'] or os.path.splitext(video_path)[1] in ['.webm']: + processed_video_path = video_path.replace(os.path.splitext(video_path)[1], '.mp4') + if not os.path.exists(processed_video_path): + # using MoviePy to transform GIF, webm into mp4 format + gif_clip = VideoFileClip(video_path) + gif_clip.write_videofile(processed_video_path, codec='libx264') + gif_clip.close() + elif line['data_type'] in ['frame']: + input_images = os.path.join(video_path, '*.jpg') + processed_video_path = f'{video_path}.mp4' + if not os.path.exists(processed_video_path): + # using MoviePy to transform images into mp4 + image_files = sorted(glob.glob(input_images)) + image_clip = ImageSequenceClip(image_files, fps=self.frame_fps) + image_clip.write_videofile(processed_video_path, codec='libx264') + image_clip.close() + else: + processed_video_path = video_path + + if line['bound']: + base_name, suffix = os.path.splitext(processed_video_path) + output_video_path = f'{base_name}_processed{suffix}' + if not os.path.exists(output_video_path): + video_clip = VideoFileClip(processed_video_path) + clip = video_clip.subclip(line['start'], min(line['end'], video_clip.duration)) + clip.write_videofile(output_video_path) + clip.close() + else: + output_video_path = processed_video_path + + return output_video_path + + def save_video_into_images(self, line, 
num_frames): + bound = None + if line['bound']: + bound = ( + line['start'], + line['end'], + ) + video_path = os.path.join(self.data_root, line['prefix'], line['video']) + decord_method = self.decord_method[line['data_type']] + self.num_segments = num_frames if num_frames > 0 else self.nframe + torch_imgs = decord_method(video_path, bound) + img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments) + return img_frame_paths + + def build_prompt(self, line, num_frames, video_llm, fps): + if fps > 0: + raise ValueError('MVBench does not support fps setting, please transfer to MVBench_MP4!') + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + question, answer = self.qa_template(line) + message = [dict(type='text', value=self.SYS, role='system')] + message.append(dict(type='text', value=question)) + if video_llm: + new_video_path = self.load_into_video_and_process(line) + message.append(dict(type='video', value=new_video_path)) + else: + img_frame_paths = self.save_video_into_images(line, num_frames) + for im in img_frame_paths: + message.append(dict(type='image', value=im)) + message.append(dict(type='text', value='\nOnly give the best option.')) + message.append(dict(type='text', value='Best option:(', role='assistant')) + return message + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + + assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + + tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') + tgt_file = eval_file.replace('.xlsx', '_rating.json') + score_file = eval_file.replace('.xlsx', '_score.xlsx') + + if not osp.exists(score_file): + model = judge_kwargs.setdefault('model', 'chatgpt-0125') + assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] + + if model == 'exact_matching': + model = None + elif gpt_key_set(): + model = build_judge(**judge_kwargs) + if not model.working(): + warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation') + warnings.warn(DEBUG_MESSAGE) + model = None + else: + warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None + res = {} if not osp.exists(tmp_file) else load(tmp_file) + res = {k: v for k, v in res.items() if FAIL_MSG not in v} + + data = load(eval_file) + data_un = data[~pd.isna(data['prediction'])] + + for idx in data_un['index']: + ans = data.loc[data['index'] == idx, 'answer'].values[0] + pred = data.loc[data['index'] == idx, 'prediction'].values[0] + options = eval(data.loc[data['index'] == idx, 'candidates'].values[0]) + answer_idx = -1 + for id, c in enumerate(options): + if c == ans: + answer_idx = id + ans = f"({chr(ord('A') + answer_idx)}) {ans}" + input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0] + for id, option_content in enumerate(eval(input_item['candidates'])): + input_item[chr(ord('A') + id)] = option_content + if option_content == input_item['answer']: + input_item['answer'] = chr(ord('A') + id) + + if FAIL_MSG in pred: + data.loc[idx, 'score'] = -1 + else: + data.loc[idx, 'score'] = int(check_ans_with_model( + pred, ans, model, + input_item, + 'MVBench' + )) + + rejected = [x for x in data['score'] if x == -1] + + print( + f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, ' + f'failed to obtain the score for another {len(rejected)} questions. ' + f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.' 
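+                # (-1 is assigned when a prediction contains FAIL_MSG; rows
+                # with a NaN prediction are the separate count derived from
+                # data_un above.)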
+ ) + + dump(data, score_file) + + rating = get_dimension_rating(score_file) + dump(rating, tgt_file) + return rating + + +class MVBench_MP4(VideoBaseDataset): + + MP4_MD5 = '5c8c6f8b7972c2de65a629590f7c42f5' + SYS = """Carefully watch the video and pay attention to the cause and sequence of events, \ +the detail and movement of objects, and the action and pose of persons. \ +Based on your observations, select the best option that accurately addresses the question. +""" + TYPE = 'Video-MCQ' + + def __init__(self, dataset='MVBench_MP4', pack=False): + super().__init__(dataset=dataset, pack=pack) + + @classmethod + def supported_datasets(cls): + return ['MVBench_MP4'] + + def prepare_dataset(self, dataset_name='MVBench_MP4', repo_id='OpenGVLab/MVBench'): + def check_integrity(pth): + data_file = osp.join(pth, f'{dataset_name}.tsv') + + if not os.path.exists(data_file): + return False + + if md5(data_file) != self.MP4_MD5: + return False + + data = load(data_file) + for idx, item in data.iterrows(): + if not osp.exists(osp.join(pth, item['prefix'], item['video'])): + return False + return True + + if modelscope_flag_set(): + repo_id = 'modelscope/MVBench' + + cache_path = get_cache_path(repo_id, branch='video') + if cache_path is not None and check_integrity(cache_path): + dataset_path = cache_path + else: + def generate_tsv(pth): + data_file = osp.join(pth, f'{dataset_name}.tsv') + if os.path.exists(data_file) and md5(data_file) == self.MP4_MD5: + return + json_data_path = os.path.join(dataset_path, 'test.json') + json_data = load(json_data_path) + root_data_dict = json_data['root'] + self.data_list = [] + for k, v in json_data['meta'].items(): + for item in v: + self.data_list.append({ + 'task_type': k, + 'prefix': root_data_dict[k], + 'video': item['video'], + 'question': item['question'], + 'answer': item['answer'], + 'candidates': item['candidates'] + }) + data_df = pd.DataFrame(self.data_list) + data_df = data_df.assign(index=range(len(data_df))) + data_df.to_csv(data_file, sep='\t', index=False) + + if modelscope_flag_set(): + from modelscope import dataset_snapshot_download + dataset_path = dataset_snapshot_download(dataset_id=repo_id, revision='video') + else: + hf_token = os.environ.get('HUGGINGFACE_TOKEN') + huggingface_hub.login(hf_token) + dataset_path = snapshot_download(repo_id=repo_id, repo_type='dataset', revision='video') + generate_tsv(dataset_path) + + data_file = osp.join(dataset_path, f'{dataset_name}.tsv') + + self.nframe = 8 + + # transform + self.transform = T.Compose([ + Stack(), + ToTorchFormatTensor() + ]) + + return dict(root=dataset_path, data_file=data_file) + + def qa_template(self, data): + question = f"Question: {data['question']}\n" + question += 'Options:\n' + answer = data['answer'] + answer_idx = -1 + for idx, c in enumerate(eval(data['candidates'])): + question += f"({chr(ord('A') + idx)}) {c}\n" + if c == answer: + answer_idx = idx + question = question.rstrip() + answer = f"({chr(ord('A') + answer_idx)}) {answer}" + return question, answer + + def get_index_by_frame(self, max_frame): + seg_size = float(max_frame) / self.num_segments + frame_indices = np.array([ + int((seg_size / 2) + np.round(seg_size * idx)) + for idx in range(self.num_segments) + ]) + return frame_indices + + def get_index_by_fps(self, vid, fps): + total_frames = len(vid) + video_fps = vid.get_avg_fps() + total_duration = total_frames / video_fps + required_frames = int(total_duration * fps) + step_size = video_fps / fps + frame_indices = np.array([int(i * step_size) for i in 
range(required_frames)]) + self.num_segments = len(frame_indices) + return frame_indices + + def read_video(self, video_path, fps=-1): + vr = VideoReader(video_path, ctx=cpu(0), num_threads=1) + max_frame = len(vr) - 1 + + images_group = list() + if fps < 0: + frame_indices = self.get_index_by_frame(max_frame) + else: + frame_indices = self.get_index_by_fps(vr, fps) + + for frame_index in frame_indices: + img = Image.fromarray(vr[frame_index].asnumpy()) + images_group.append(img) + torch_imgs = self.transform(images_group) + return torch_imgs + + def save_video_frames(self, imgs, video_name, frames, fps): + if fps > 0: + frame_paths = self.frame_paths_fps(video_name, frames, fps) + else: + frame_paths = self.frame_paths(video_name, frames) + flag = np.all([osp.exists(p) for p in frame_paths]) + + if not flag: + block_size = imgs.size(0) // frames + split_tensors = torch.split(imgs, block_size) + to_pil = transforms.ToPILImage() + images = [to_pil(arr) for arr in split_tensors] + for im, pth in zip(images, frame_paths): + if not osp.exists(pth): + im.save(pth) + + return frame_paths + + def save_video_into_images(self, line, num_frames, fps=-1): + video_path = os.path.join(self.data_root, line['prefix'], line['video']) + if fps <= 0: + self.num_segments = num_frames if num_frames > 0 else self.nframe + else: + self.num_segments = 0 + torch_imgs = self.read_video(video_path, fps) + img_frame_paths = self.save_video_frames(torch_imgs, line['video'], self.num_segments, fps) + return img_frame_paths + + def build_prompt(self, line, num_frames, video_llm, fps): + if isinstance(line, int): + assert line < len(self) + line = self.data.iloc[line] + + question, answer = self.qa_template(line) + message = [dict(type='text', value=self.SYS, role='system')] + message.append(dict(type='text', value=question)) + video_path = os.path.join(self.data_root, line['prefix'], line['video']) + if video_llm: + message.append(dict(type='video', value=video_path)) + else: + img_frame_paths = self.save_video_into_images(line, num_frames, fps) + for im in img_frame_paths: + message.append(dict(type='image', value=im)) + message.append(dict(type='text', value='\nOnly give the best option.')) + message.append(dict(type='text', value='Best option:(', role='assistant')) + return message + + @classmethod + def evaluate(self, eval_file, **judge_kwargs): + + assert eval_file.endswith('.xlsx'), 'data file should be an xlsx file' + + tmp_file = eval_file.replace('.xlsx', '_tmp.pkl') + tgt_file = eval_file.replace('.xlsx', '_rating.json') + score_file = eval_file.replace('.xlsx', '_score.xlsx') + + if not osp.exists(score_file): + model = judge_kwargs.setdefault('model', 'chatgpt-0125') + assert model in ['chatgpt-0125', 'exact_matching', 'gpt-4-0125'] + + if model == 'exact_matching': + model = None + elif gpt_key_set(): + model = build_judge(**judge_kwargs) + if not model.working(): + warnings.warn('OPENAI API is not working properly, will use exact matching for evaluation') + warnings.warn(DEBUG_MESSAGE) + model = None + else: + warnings.warn('OPENAI_API_KEY is not set properly, will use exact matching for evaluation') + model = None + res = {} if not osp.exists(tmp_file) else load(tmp_file) + res = {k: v for k, v in res.items() if FAIL_MSG not in v} + + data = load(eval_file) + data_un = data[~pd.isna(data['prediction'])] + + for idx in data_un['index']: + ans = data.loc[data['index'] == idx, 'answer'].values[0] + pred = data.loc[data['index'] == idx, 'prediction'].values[0] + options = eval(data.loc[data['index'] == idx, 
'candidates'].values[0]) + answer_idx = -1 + for id, c in enumerate(options): + if c == ans: + answer_idx = id + ans = f"({chr(ord('A') + answer_idx)}) {ans}" + input_item = data.loc[data['index'] == idx].to_dict(orient='records')[0] + for id, option_content in enumerate(eval(input_item['candidates'])): + input_item[chr(ord('A') + id)] = option_content + if option_content == input_item['answer']: + input_item['answer'] = chr(ord('A') + id) + + if FAIL_MSG in pred: + data.loc[idx, 'score'] = -1 + else: + data.loc[idx, 'score'] = int(check_ans_with_model( + pred, ans, model, + input_item, + 'MVBench_MP4' + )) + + rejected = [x for x in data['score'] if x == -1] + + print( + f'Among {len(data)} questions, failed to obtain prediction for {len(data) - len(data_un)} questions, ' + f'failed to obtain the score for another {len(rejected)} questions. ' + f'Those questions will be counted as -1 score in ALL rating, and will not be counted in VALID rating.' + ) + + dump(data, score_file) + + rating = get_dimension_rating(score_file) + dump(rating, tgt_file) + return rating diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/text_base.py b/vlmeval/VLMEvalKit_old/vlmeval/dataset/text_base.py new file mode 100644 index 0000000000000000000000000000000000000000..67bb9378915f4ca08ddec25ecfa02f27dca06d86 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/dataset/text_base.py @@ -0,0 +1,88 @@ +from abc import abstractmethod +from ..smp import * + + +class TextBaseDataset: + MODALITY = 'TEXT' + DATASET_URL = {} + DATASET_MD5 = {} + + def __init__(self, dataset='MMBench', **kwargs): + self.dataset_name = dataset + + data = self.load_data(dataset) + + data['index'] = [str(x) for x in data['index']] + + if np.all([istype(x, int) for x in data['index']]): + data['index'] = [int(x) for x in data['index']] + + self.data = data + self.post_build(dataset) + + def __len__(self): + return len(self.data) + + def __getitem__(self, idx): + return dict(self.data.iloc[idx]) + + def prepare_tsv(self, url, file_md5=None): + data_root = LMUDataRoot() + os.makedirs(data_root, exist_ok=True) + update_flag = False + file_name = url.split('/')[-1] + data_path = osp.join(data_root, file_name) + if osp.exists(data_path) and (file_md5 is None or md5(data_path) == file_md5): + pass + else: + warnings.warn('The dataset tsv is not downloaded') + download_file(url, data_path) + update_flag = True + + if file_size(data_path, 'GB') > 1: + local_path = data_path.replace('.tsv', '_local.tsv') + if not osp.exists(local_path) or os.environ.get('FORCE_LOCAL', None) or update_flag: + from ..tools import LOCALIZE + LOCALIZE(data_path, local_path) + data_path = local_path + return load(data_path) + + def dump_image(self, line): + return [] + + def display(self, line): + if isinstance(line, int): + line = self.data.iloc[line] + assert isinstance(line, pd.Series) or isinstance(line, dict) + mmqa_display(line) + + # Return a list of dataset names that are supported by this class, can override + @classmethod + def supported_datasets(cls): + return list(cls.DATASET_URL) + + # Given the dataset name, return the dataset as a pandas dataframe, can override + def load_data(self, dataset): + url = self.DATASET_URL[dataset] + file_md5 = self.DATASET_MD5[dataset] + return self.prepare_tsv(url, file_md5) + + # Post built hook, will be called after the dataset is built, can override + def post_build(self, dataset): + pass + + # Given one data record, return the built prompt (a multi-modal message), can override + def build_prompt(self, line): + if 
isinstance(line, int): + line = self.data.iloc[line] + + question = line['question'] + + msgs = [] + msgs.append(dict(type='text', value=question)) + return msgs + + # Given the prediction file, return the evaluation results in the format of a dictionary or pandas dataframe + @abstractmethod + def evaluate(self, eval_file, **judge_kwargs): + pass diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/vcr.py b/vlmeval/VLMEvalKit_old/vlmeval/dataset/vcr.py new file mode 100644 index 0000000000000000000000000000000000000000..c659c60f406eaec1a1c3508bb4ba323efeef29ea --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/dataset/vcr.py @@ -0,0 +1,335 @@ +import uuid +from functools import partial +from .image_base import ImageBaseDataset +from ..smp import * + +rouge = None +nlp_en = None +nlp_zh = None +nlp = None + + +def initialize(): + import evaluate + import spacy + + global rouge, nlp_en, nlp_zh, nlp + + try: + rouge = evaluate.load('rouge', experiment_id=str(uuid.uuid4())) + except Exception as e: + logging.critical(f'{type(e)}: {e}') + logging.critical('Please first `pip install rouge_score`.') + + try: + nlp_en = spacy.load('en_core_web_sm') + except Exception as e: + logging.warning(f'{type(e)}: {e}') + logging.warning('Will automatically download en_core_web_sm via spacy.') + spacy.cli.download('en_core_web_sm') + nlp_en = spacy.load('en_core_web_sm') + + try: + nlp_zh = spacy.load('zh_core_web_sm') + except Exception as e: + logging.warning(f'{type(e)}: {e}') + logging.warning('Will automatically download zh_core_web_sm via spacy.') + spacy.cli.download('zh_core_web_sm') + nlp_zh = spacy.load('zh_core_web_sm') + + nlp = {'en': nlp_en, 'zh': nlp_zh} + + +def rough_filter(answer_text): + if "I can't" in answer_text: + return False + elif 'I cannot' in answer_text: + return False + elif 'sorry' in answer_text.lower(): + return False + if '无法' in answer_text: + return False + elif '抱歉' in answer_text: + return False + else: + return True + + +def zero_template(crossed_text): + return { + 'crossed_text': crossed_text, + 'max_sim_val': 0, + 'max_sim_string': '', + 'precision': 0, + 'recall': 0, + 'f1': 0, + 'jaccard': 0, + 'rouge1': 0, + 'exact_match': 0, + } + + +def tokenize(text, language): + """ + Tokenize the text and return the tokens. + + Parameters: + text (str): The text to tokenize. + language (str): The language of the text. + + Returns: + list: The list of tokens. + """ + assert language in ['en', 'zh'] + nlp_language = nlp[language] + processed_text = nlp_language(text) + return [token.text for token in processed_text] + + +def find_best_match(needle, hay, language, rouge): + """ + Finds the best matching n-gram in the haystack for the given needle. + + Parameters: + needle (str): The string to find. + hay (str): The text to search within. + + Returns: + tuple: The highest similarity value and the best matching string. 
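+
+    Example (illustrative, traced by hand from the logic below):
+        find_best_match('brown fox', 'the quick brown fox jumps', 'en', rouge)
+        returns max_sim_string='brown fox'; since that 2-gram matches the
+        needle exactly, max_sim_val, precision, recall, f1, jaccard, rouge1
+        and exact_match all evaluate to 1.0.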
+ """ + assert language in ['en', 'zh'] + from nltk.util import ngrams + from difflib import SequenceMatcher as SM + + tokens_hay = tokenize(hay, language) + tokens_needle = tokenize(needle, language) + + splitter = '' if language == 'zh' else ' ' + ngrams_ = ngrams(tokens_hay, len(tokens_needle)) + max_sim_val = 0 + max_sim_string = '' + max_sim_ngram = [] + tokens_needle_set = set(tokens_needle) + ngrams_hasjoint = [ + ngram + for ngram in ngrams_ + if not set(ngram).isdisjoint(tokens_needle_set) + ] + + for ngram in ngrams_hasjoint: + hay_ngram = splitter.join(ngram) + similarity = SM(None, hay_ngram, needle).ratio() + if similarity > max_sim_val: + max_sim_val = similarity + max_sim_string = hay_ngram + max_sim_ngram = ngram + + # Evaluate + if len(max_sim_ngram) == 0: + return { + 'crossed_text': needle, + 'max_sim_val': 0, + 'max_sim_string': '', + 'precision': 0, + 'recall': 0, + 'f1': 0, + 'jaccard': 0, + 'rouge1': 0, + 'exact_match': 0, + } + pred_set = set(max_sim_ngram) + ref_set = set(tokens_needle) + correct_tokens = pred_set.intersection(ref_set) + len_correct_tokens = len(correct_tokens) + + precision = len_correct_tokens / len(pred_set) + recall = len_correct_tokens / len(ref_set) + if (precision + recall) == 0: + f1 = 0 + else: + f1 = 2 * precision * recall / (precision + recall) + union = pred_set.union(ref_set) + jaccard = len_correct_tokens / len(union) if len(union) > 0 else 0 + rouge_1 = rouge.compute( + predictions=[max_sim_string], + references=[needle], + tokenizer=partial(tokenize, language=language), + rouge_types=['rouge1'], + )['rouge1'] + exact_match = float(list(max_sim_ngram) == list(tokens_needle)) + out = { + 'crossed_text': needle, + 'max_sim_string': max_sim_string, + 'max_sim_val': max_sim_val, + 'precision': precision, + 'recall': recall, + 'f1': f1, + 'jaccard': jaccard, + 'rouge1': rouge_1, + 'exact_match': exact_match, + } + return out + + +def process_match_single_new( + image_id, prediction, answer, language, progress +): + """ + process the inference results for a single image and calculate the metrics + + Parameters: + image_id (int): The image id (question id). + prediction (str): The prediction text. + answer (Union[str, List[str]]): The answer text, or a list of answer texts. The masked n-grams in the image. + language (str): The language of the text. Can be "en" or "zh". + rouge (rouge): The rouge metric object. + progress (multiprocessing.Queue): The progress queue. + + Returns: + tuple: The image id (question_id, int) and the result per id (dict of dict of dict). 
+ """ + result_per_id = {image_id: {}} + if isinstance(answer, str): + answer = eval(answer) + assert isinstance(answer, list) + result = prediction.split('Assistant: ')[-1] + for i, crossed_text in enumerate(answer): + if rough_filter(result): + find_best_match_result = find_best_match( + crossed_text, result, language, rouge + ) + if i == 0: + result_per_id[image_id] = {str(i): find_best_match_result} + else: + result_per_id[image_id][str(i)] = find_best_match_result + else: + if i == 0: + result_per_id[image_id] = {str(i): zero_template(crossed_text)} + else: + result_per_id[image_id][str(i)] = zero_template(crossed_text) + progress.put(1) + return image_id, result_per_id + + +class VCRDataset(ImageBaseDataset): + TYPE = 'VQA' + + URL_PREFIX = 'https://huggingface.co/datasets/vcr-org' + + DATASET_URL = { + 'VCR_EN_EASY_500': f'{URL_PREFIX}/VCR-wiki-en-easy-test-500/resolve/main/VCR-wiki-en-easy-test-500.tsv', + 'VCR_EN_EASY_100': f'{URL_PREFIX}/VCR-wiki-en-easy-test-100/resolve/main/VCR-wiki-en-easy-test-100.tsv', + 'VCR_EN_EASY_ALL': f'{URL_PREFIX}/VCR-wiki-en-easy-test/resolve/main/VCR-wiki-en-easy-test.tsv', + 'VCR_EN_HARD_500': f'{URL_PREFIX}/VCR-wiki-en-hard-test-500/resolve/main/VCR-wiki-en-hard-test-500.tsv', + 'VCR_EN_HARD_100': f'{URL_PREFIX}/VCR-wiki-en-hard-test-100/resolve/main/VCR-wiki-en-hard-test-100.tsv', + 'VCR_EN_HARD_ALL': f'{URL_PREFIX}/VCR-wiki-en-hard-test/resolve/main/VCR-wiki-en-hard-test.tsv', + 'VCR_ZH_EASY_500': f'{URL_PREFIX}/VCR-wiki-zh-easy-test-500/resolve/main/VCR-wiki-zh-easy-test-500.tsv', + 'VCR_ZH_EASY_100': f'{URL_PREFIX}/VCR-wiki-zh-easy-test-100/resolve/main/VCR-wiki-zh-easy-test-100.tsv', + 'VCR_ZH_EASY_ALL': f'{URL_PREFIX}/VCR-wiki-zh-easy-test/resolve/main/VCR-wiki-zh-easy-test.tsv', + 'VCR_ZH_HARD_500': f'{URL_PREFIX}/VCR-wiki-zh-hard-test-500/resolve/main/VCR-wiki-zh-hard-test-500.tsv', + 'VCR_ZH_HARD_100': f'{URL_PREFIX}/VCR-wiki-zh-hard-test-100/resolve/main/VCR-wiki-zh-hard-test-100.tsv', + 'VCR_ZH_HARD_ALL': f'{URL_PREFIX}/VCR-wiki-zh-hard-test/resolve/main/VCR-wiki-zh-hard-test.tsv', + } + + DATASET_MD5 = { + 'VCR_EN_EASY_500': 'fd9258db52f8685dc710619a0ea0a261', + 'VCR_EN_EASY_100': '9df5d7266683458621ecbe122beb72f0', + 'VCR_EN_EASY_ALL': '8a9b96885f251d1c85f42f84073327f1', + 'VCR_EN_HARD_500': '0a22a85080b6a1f52b1f95e302d43df4', + 'VCR_EN_HARD_100': '1b20f5cbcbeae0b0bec77f7a36143958', + 'VCR_EN_HARD_ALL': '2d8b8b1ee0eba0e0b618fd3aa7d9710e', + 'VCR_ZH_EASY_500': 'beca5fd54176adf44cf94bd9b50cf048', + 'VCR_ZH_EASY_100': '4a86a5678a79844d6d22ab0629c51cd5', + 'VCR_ZH_EASY_ALL': '5050fe7f0027ad2068fd4c7f220edaea', + 'VCR_ZH_HARD_500': '617e3360f75c54455625cb0a8da5c1e7', + 'VCR_ZH_HARD_100': 'b0e38c85f5d5e63894a3b881c372a62b', + 'VCR_ZH_HARD_ALL': '54bbfef448206518b03127ef8b61404c', + } + + def __init__(self, dataset='VCR_EN_EASY_500', skip_noimg=True): + super().__init__(dataset, skip_noimg) + + initialize() + self.language = 'en' if 'EN' in dataset else 'zh' + self.difficulty = 'easy' if 'EASY' in dataset else 'hard' + + # def build_prompt(self, line): + # msgs = super().build_prompt(line) + # assert msgs[-1]['type'] == 'text' + # if self.language == 'zh': + # msgs[-1]['value'] += '图像中被覆盖的文本是什么?请在不输出解释的情况下还原被覆盖的文本。' + # else: + # msgs[-1]['value'] += ('What is the covered texts in the image? 
' + # 'Please restore the covered texts without outputting the explanations.') + # return msgs + + def evaluate(self, eval_file, **judge_kwargs): + import multiprocessing + + vcr_score_list = {'Exact_Match': [], 'Jaccard': []} + vcr_score = {'Exact_Match': 0, 'Jaccard': 0} + logger = get_logger('Evaluation') + data = load(eval_file) + + lt = len(data) + lines = [data.iloc[i] for i in range(lt)] + + pool = multiprocessing.Pool() + manager = multiprocessing.Manager() + progress_queue = manager.Queue() + results = [] + + overall_results = {str(image_id): {} for image_id in range(len(lines))} + + for instance_id, instance in enumerate(lines): + results.append( + pool.apply_async( + process_match_single_new, + args=( + str(instance_id), + instance['prediction'], + instance['answer'], + self.language, + progress_queue, + ), + ) + ) + pool.close() + + # Display progress bar + for _ in tqdm(range(len(results))): + progress_queue.get() + + pool.join() + + # Merging results into overall_result + for result in results: + image_id, result_per_id = result.get() + overall_results[str(image_id)].update(result_per_id[image_id]) + for blank_id_str in result_per_id[image_id].keys(): + vcr_score_list['Exact_Match'].append( + result_per_id[image_id][blank_id_str]['exact_match'] + ) + vcr_score_list['Jaccard'].append( + result_per_id[image_id][blank_id_str]['jaccard'] + ) + vcr_score['Exact_Match'] = np.mean(vcr_score_list['Exact_Match']) + vcr_score['Jaccard'] = np.mean(vcr_score_list['Jaccard']) + results_out = { + k: v for i in range(len(results)) for k, v in results[i].get()[1].items() + } + results_with_metrics = { + 'Exact_Match': vcr_score['Exact_Match'], + 'Jaccard': vcr_score['Jaccard'], + 'Predictions': results_out, + } + score_pth = eval_file.replace( + '.xlsx', f'{self.language}_{self.difficulty}_score.json' + ) + dump(results_with_metrics, score_pth) + logger.info( + f'VCR successfully finished evaluating {eval_file}, results saved in {score_pth}' + ) + logger.info('Score: ') + for key, value in vcr_score.items(): + logger.info('{}:{}'.format(key, value)) diff --git a/vlmeval/VLMEvalKit_old/vlmeval/dataset/video_concat_dataset.py b/vlmeval/VLMEvalKit_old/vlmeval/dataset/video_concat_dataset.py new file mode 100644 index 0000000000000000000000000000000000000000..5a03fd0be670e9479f3f6832c1d64b8643f09e76 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/dataset/video_concat_dataset.py @@ -0,0 +1,83 @@ +from ..smp import * +from .video_base import VideoBaseDataset + + +class ConcatVideoDataset(VideoBaseDataset): + # This dataset takes multiple dataset names as input and aggregate them into a single dataset. + # Each single dataset should not have a field named `SUB_DATASET` + + DATASET_SETS = {} + + def __init__(self, dataset): + from . 
import build_dataset + datasets = self.DATASET_SETS[dataset] + self.dataset_map = {} + # The name of the compliation + self.dataset_name = dataset + self.datasets = datasets + for dname in datasets: + dataset = build_dataset(dname) + assert dataset is not None, dataset + self.dataset_map[dname] = dataset + TYPES = [x.TYPE for x in self.dataset_map.values()] + MODALITIES = [x.MODALITY for x in self.dataset_map.values()] + # assert np.all([x == TYPES[0] for x in TYPES]), (datasets, TYPES) + assert np.all([x == MODALITIES[0] for x in MODALITIES]), (datasets, MODALITIES) + self.TYPE = TYPES + self.MODALITY = MODALITIES[0] + data_all = [] + for dname in datasets: + data = self.dataset_map[dname].data + data['SUB_DATASET'] = [dname] * len(data) + data_all.append(data) + + data = pd.concat(data_all) + data['original_index'] = data.pop('index') + data['index'] = np.arange(len(data)) + self.data = data + + def build_prompt(self, line, num_frames, video_llm, fps): + if isinstance(line, int): + line = self.data.iloc[line] + idx = line['original_index'] + dname = line['SUB_DATASET'] + org_data = self.dataset_map[dname].data + org_line = cp.deepcopy(org_data[org_data['index'] == idx]).iloc[0] + return self.dataset_map[dname].build_prompt(org_line, num_frames, video_llm, fps) + + def dump_image(self, line): + # Assert all images are pre-dumped + assert 'image' not in line + assert 'image_path' in line + tgt_path = toliststr(line['image_path']) + return tgt_path + + @classmethod + def supported_datasets(cls): + return [] # list(cls.DATASET_SETS) + + def evaluate(self, eval_file, **judge_kwargs): + suffix = eval_file.split('.')[-1] + # First, split the eval_file by dataset + data_all = load(eval_file) + for dname in self.datasets: + tgt = eval_file.replace(self.dataset_name, dname) + data_sub = data_all[data_all['SUB_DATASET'] == dname] + data_sub.pop('index') + data_sub['index'] = data_sub.pop('original_index') + data_sub.pop('SUB_DATASET') + dump(data_sub, tgt) + # Then, evaluate each dataset separately + results_all = {} + for dname in self.datasets: + tgt = eval_file.replace(self.dataset_name, dname) + res = self.dataset_map[dname].evaluate(tgt, **judge_kwargs) + results_all.update(res) + + result = pd.DataFrame(results_all, index=['success', 'overall']) + result = result.T + for idx, item in result.iterrows(): + result.loc[idx, 'acc'] = round(item['success'] / item['overall'] * 100, 1) + score_file = eval_file.replace(f'.{suffix}', '_acc.csv') + dump(result, score_file) + return result diff --git a/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-310.pyc b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..c8d0e260cc221751d70a162cde2d5d141c132191 Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-310.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-311.pyc b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..88cb7a18ad1cac4207e9a74fec00065f69b24803 Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-311.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-38.pyc b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..25a053cc2b9ff0cab99e03442dab105518a4cb69 Binary files 
/dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/__init__.cpython-38.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-310.pyc b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..d7f642e36f7dd6e6347139f7bdbd5e33cd078cfb Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-310.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-311.pyc b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-311.pyc new file mode 100644 index 0000000000000000000000000000000000000000..da7d53e88dc3cc05e5814074e22086d59c896f5c Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-311.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-38.pyc b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..217665fc6bcc3d09844eab8e0f81a36fc3017a40 Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/file.cpython-38.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/log.cpython-310.pyc b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/log.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..71b96c730d1f689fcc4c6c59e40c80d3fa4b7e1a Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/log.cpython-310.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/log.cpython-38.pyc b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/log.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..58d2d8c31d0d37ef39c6152816132f7eea7e0f7d Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/log.cpython-38.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/misc.cpython-310.pyc b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/misc.cpython-310.pyc new file mode 100644 index 0000000000000000000000000000000000000000..3b81c8e143e97bd23e314b084352b3d720085b97 Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/misc.cpython-310.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/misc.cpython-38.pyc b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/misc.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..031cb80bff6a9dc6abae16d6007f9f847d30e77e Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/misc.cpython-38.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/vlm.cpython-38.pyc b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/vlm.cpython-38.pyc new file mode 100644 index 0000000000000000000000000000000000000000..1ce6ece6f2c129a0849b2d243880864e70a63956 Binary files /dev/null and b/vlmeval/VLMEvalKit_old/vlmeval/smp/__pycache__/vlm.cpython-38.pyc differ diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/bunnyllama3.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/bunnyllama3.py new file mode 100644 index 0000000000000000000000000000000000000000..848e5b5919d6ca50c02580aa72a886931fc2ff49 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/bunnyllama3.py @@ -0,0 +1,133 @@ +import torch +import transformers +from transformers import AutoModelForCausalLM, AutoTokenizer +from PIL import Image +import warnings +import re + +from .base import BaseModel +from ..smp import * +from ..dataset import DATASET_TYPE + + +class 
BunnyLLama3(BaseModel): + INSTALL_REQ = False + INTERLEAVE = False + + def __init__(self, model_path='BAAI/Bunny-v1_1-Llama-3-8B-V', **kwargs): + assert model_path is not None + transformers.logging.set_verbosity_error() + transformers.logging.disable_progress_bar() + warnings.filterwarnings('ignore') + self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(model_path, device_map='auto', trust_remote_code=True) + self.kwargs = kwargs + + def use_custom_prompt(self, dataset): + if listinstr(['MCQ', 'Y/N'], DATASET_TYPE(dataset)) or listinstr(['mathvista'], dataset.lower()): + return True + else: + return False + + def build_prompt(self, line, dataset): + if dataset is None: + dataset = self.dataset + + if isinstance(line, int): + line = self.data.iloc[line] + + tgt_path = self.dump_image(line, dataset) + + prompt = line['question'] + + if DATASET_TYPE(dataset) == 'MCQ': + if listinstr(['mmmu'], dataset.lower()): + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + assert hint is None + + question = line['question'] + question = re.sub(r'', lambda x: x.group(0)[1:-1], question) + + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = '\n' + for key, item in options.items(): + options_prompt += f'({key}) {item}\n' + + prompt = question + if len(options): + prompt += options_prompt + prompt += "\nAnswer with the option's letter from the given choices directly." + else: + prompt += '\nAnswer the question using a single word or phrase.' + else: + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + prompt = '' + if hint is not None: + prompt += f'{hint}\n' + + question = line['question'] + + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = '\n' + for key, item in options.items(): + options_prompt += f'{key}. {item}\n' + + prompt += question + options_prompt + if listinstr(['cn', 'ccbench'], dataset.lower()): + prompt += '请直接回答选项字母。' + else: + prompt += "Answer with the option's letter from the given choices directly." + elif DATASET_TYPE(dataset) == 'Y/N': + if listinstr(['mme'], dataset.lower()): + if not listinstr( + ['code_reasoning', 'commonsense_reasoning', 'numerical_calculation', 'text_translation'], + line['category']): + prompt = prompt.replace(' Please answer yes or no.', + '\nAnswer the question using a single word or phrase.') + elif listinstr(['pope'], dataset.lower()): + prompt = prompt.replace(' Please answer yes or no.', + '\nAnswer the question using a single word or phrase.') + elif listinstr(['mathvista'], dataset.lower()): + match = re.search(r'Hint: (.*?)\nQuestion: (.*?)\n(Choices:\n(.*))?', prompt + '\n', re.DOTALL) + + prompt = match.group(2) + if match.group(4) is not None: + prompt += '\n' + match.group(4).rstrip('\n') + prompt += '\n' + match.group(1) + else: + raise ValueError( + f"Bunny doesn't implement a custom prompt for {dataset}. 
It should use the default prompt, but didn't.") + + msgs = [] + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + msgs.append(dict(type='text', value=prompt)) + + return msgs + + def generate_inner(self, message, dataset=None): + + prompt, image_path = self.message_to_promptimg(message, dataset=dataset) + + text = (f'A chat between a curious user and an artificial intelligence assistant. ' + f"The assistant gives helpful, detailed, and polite answers to the user's questions. " + f'USER: \n{prompt} ASSISTANT:') + + text_chunks = [self.tokenizer(chunk).input_ids for chunk in text.split('')] + input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1][1:], dtype=torch.long).unsqueeze(0) + image = Image.open(image_path).convert('RGB') + image_tensor = self.model.process_images([image], self.model.config).to(dtype=self.model.dtype) + + output_ids = self.model.generate(input_ids, images=image_tensor, max_new_tokens=128, use_cache=True)[0] + response = self.tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True) + return response diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/cambrian.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/cambrian.py new file mode 100644 index 0000000000000000000000000000000000000000..e97753298de4bd80cf5e0461d94af3b449bc459f --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/cambrian.py @@ -0,0 +1,84 @@ +import torch +from PIL import Image +from .base import BaseModel +from ..smp import * +import warnings + +IMAGE_TOKEN_INDEX = -200 +DEFAULT_IMAGE_TOKEN = '' +DEFAULT_IM_START_TOKEN = '' +DEFAULT_IM_END_TOKEN = '' + + +class Cambrian(BaseModel): + + INSTALL_REQ = True + INTERLEAVE = False + + def __init__(self, model_path='nyu-visionx/cambrian-8b', **kwargs): + assert model_path is not None + try: + from cambrian.conversation import conv_templates, SeparatorStyle + from cambrian.model.builder import load_pretrained_model + from cambrian.mm_utils import tokenizer_image_token, process_images, get_model_name_from_path + except Exception as e: + logging.critical('Please install cambrian from https://github.com/cambrian-mllm/cambrian.') + raise e + + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model( + model_path, + None, + model_name, + device_map=None + ) + + if '8b' in model_path: + self.conv_mode = 'llama_3' + elif '13b' in model_path: + self.conv_mode = 'vicuna_v1' + else: + self.conv_mode = 'chatml_direct' + + self.model_config = model.config + self.conv_templates = conv_templates + self.tokenizer_image_token = tokenizer_image_token + self.process_images = process_images + + self.tokenizer = tokenizer + self.image_processor = image_processor + self.model = model.to('cuda') + + def process(self, image, question): + if self.model_config.mm_use_im_start_end: + question = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + question + else: + question = DEFAULT_IMAGE_TOKEN + '\n' + question + conv = self.conv_templates[self.conv_mode].copy() + conv.append_message(conv.roles[0], question) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + image_size = [image.size] + image_tensor = self.process_images([image], self.image_processor, self.model_config) + input_ids = self.tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') + input_ids = input_ids.unsqueeze(0).cuda() + return input_ids, image_tensor, 
image_size, prompt + + def generate_inner(self, message, dataset=None): + prompt, image_path = self.message_to_promptimg(message, dataset=dataset) + image = Image.open(image_path).convert('RGB') + input_ids, image_tensor, image_sizes, prompt = self.process(image, prompt) + input_ids = input_ids.to(device='cuda', non_blocking=True) + with torch.inference_mode(): + output_ids = self.model.generate( + input_ids, + images=image_tensor, + image_sizes=image_sizes, + do_sample=False, + temperature=0, + num_beams=1, + max_new_tokens=512, + use_cache=True + ) + outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() + return outputs diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/chameleon.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/chameleon.py new file mode 100644 index 0000000000000000000000000000000000000000..6fd4320c05e27c9e93093325eac7b052c22da40e --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/chameleon.py @@ -0,0 +1,49 @@ +import os.path as osp +import warnings +from .base import BaseModel +from ..smp import * +from PIL import Image +import torch + + +class Chameleon(BaseModel): + + INSTALL_REQ = False + INTERLEAVE = True + + def __init__(self, model_path='facebook/chameleon-7b', **kwargs): + try: + from transformers import ChameleonProcessor, ChameleonForConditionalGeneration + except Exception as e: + logging.critical('Please install the latest transformers.') + raise e + + processor = ChameleonProcessor.from_pretrained(model_path) + model = ChameleonForConditionalGeneration.from_pretrained(model_path, torch_dtype=torch.bfloat16) + + self.model = model.cuda().eval() + self.processor = processor + + def generate_inner(self, message, dataset=None): + content, images = '', [] + for x in message: + if x['type'] == 'text': + content += x['value'] + elif x['type'] == 'image': + content += '\n' + images.append(Image.open(x['value'])) + + inputs = self.processor( + text=[content], + images=images, + padding=True, + return_tensors='pt' + ).to(device='cuda', dtype=torch.bfloat16) + generate_ids = self.model.generate(**inputs, max_new_tokens=512) + input_token_len = inputs.input_ids.shape[1] + text = self.processor.batch_decode( + generate_ids[:, input_token_len:], + skip_special_tokens=True, + clean_up_tokenization_spaces=False + )[0] + return text diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/cogvlm.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/cogvlm.py new file mode 100644 index 0000000000000000000000000000000000000000..d5d1ece94fe3d8c3a61b0f86b51f207a346528c7 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/cogvlm.py @@ -0,0 +1,131 @@ +import torch +from PIL import Image +from .base import BaseModel +from ..smp import * +from ..dataset import DATASET_TYPE +from transformers import AutoModelForCausalLM, LlamaTokenizer, AutoTokenizer + + +class GLM4v(BaseModel): + + INSTALL_REQ = False + INTERLEAVE = False + + def __init__(self, model_path='THUDM/glm-4v-9b', **kwargs): + assert model_path is not None + self.model_path = model_path + self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + trust_remote_code=True + ).to('cuda').eval() + gen_kwargs = {'max_length': 2048, 'do_sample': False} + gen_kwargs.update(kwargs) + self.kwargs = gen_kwargs + self.end_text_token = '<|endoftext|>' + + def generate_inner(self, message, dataset=None): + prompt, image_path = self.message_to_promptimg(message, dataset=dataset) + 
image = Image.open(image_path).convert('RGB') + if dataset is not None and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']: + prompt += '\nShort Answer.' + inputs = self.tokenizer.apply_chat_template( + [{'role': 'user', 'image': image, 'content': prompt}], + add_generation_prompt=True, tokenize=True, return_tensors='pt', return_dict=True + ) + inputs = inputs.to('cuda') + + with torch.no_grad(): + outputs = self.model.generate(**inputs, **self.kwargs) + outputs = outputs[:, inputs['input_ids'].shape[1]:] + response = self.tokenizer.decode(outputs[0]) + return response.split(self.end_text_token)[0] + + +class CogVlm(BaseModel): + + INSTALL_REQ = False + INTERLEAVE = False + + def __init__(self, model_path='THUDM/cogvlm2-llama3-chat-19B', tokenizer_name=None, **kwargs): + assert model_path is not None + model = AutoModelForCausalLM.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + trust_remote_code=True, + ).to('cuda').eval() + + self.kwargs = kwargs + if tokenizer_name: + tokenizer = LlamaTokenizer.from_pretrained(tokenizer_name) + gen_kwargs = {'max_length': 2048, 'do_sample': False} + self.end_text_token = '' + else: + tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + gen_kwargs = {'max_new_tokens': 2048, 'pad_token_id': 128002} + self.end_text_token = '<|end_of_text|>' + self.kwargs.update(gen_kwargs) + self.tokenizer = tokenizer + self.model = model + + def use_custom_prompt(self, dataset): + assert dataset is not None + if DATASET_TYPE(dataset) == 'MCQ': + return True + return False + + def build_prompt(self, line, dataset=None): + assert dataset is None or isinstance(dataset, str) + assert self.use_custom_prompt(dataset) + tgt_path = self.dump_image(line, dataset) + + if dataset is not None and DATASET_TYPE(dataset) == 'MCQ': + question = line['question'] + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + if hint is not None: + question = hint + '\n' + question + + option_candidate = string.ascii_uppercase + options = { + cand: line[cand] + for cand in option_candidate + if cand in line and not pd.isna(line[cand]) + } + for key, item in options.items(): + question += f'\n{key}. {item}' + prompt = question + + if not cn_string(prompt): + prompt = prompt + '\n' + "Answer with the option's letter from the given choices directly." + else: + prompt = prompt + '\n' + '请直接回答选项字母。' + else: + prompt = line['question'] + message = [dict(type='text', value=prompt)] + message.extend([dict(type='image', value=p) for p in tgt_path]) + + return message + + def generate_inner(self, message, dataset=None): + prompt, image_path = self.message_to_promptimg(message, dataset=dataset) + if dataset is not None and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']: + prompt += '\nShort Answer.' 
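+        # build_conversation_input_ids is CogVLM's own chat-packing helper: it
+        # returns input_ids, token_type_ids, attention_mask and preprocessed
+        # image tensors, which are batched with unsqueeze(0), moved to CUDA,
+        # and the image cast to bfloat16 to match the model weights.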
+ + image = Image.open(image_path).convert('RGB') + inputs = self.model.build_conversation_input_ids( + self.tokenizer, query=prompt, history=[], images=[image]) # chat mode + inputs = { + 'input_ids': inputs['input_ids'].unsqueeze(0).to('cuda'), + 'token_type_ids': inputs['token_type_ids'].unsqueeze(0).to('cuda'), + 'attention_mask': inputs['attention_mask'].unsqueeze(0).to('cuda'), + 'images': [[inputs['images'][0].to('cuda').to(torch.bfloat16)]], + } + + with torch.no_grad(): + outputs = self.model.generate(**inputs, **self.kwargs) + outputs = outputs[:, inputs['input_ids'].shape[1]:] + response = self.tokenizer.decode(outputs[0]) + response = response.split(self.end_text_token)[0].strip() + return response diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/eagle_x.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/eagle_x.py new file mode 100644 index 0000000000000000000000000000000000000000..2f02bc692589b901a84684aa8a9e2d80117d6f4e --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/eagle_x.py @@ -0,0 +1,180 @@ +import torch +from PIL import Image +from abc import abstractproperty +import sys +import os.path as osp +from .base import BaseModel +from ..smp import * +from ..dataset import DATASET_TYPE +import copy + + +# This function is used to split Eagle-X5-34B +def split_model(model_name): + import math + device_map = {} + num_gpus = torch.cuda.device_count() + rank, world_size = get_rank_and_world_size() + num_gpus = num_gpus // world_size + + num_layers_map = { + 'Eagle-X5-34B-Chat': 60, + 'Eagle-X5-34B-Plus': 60 + } + if model_name not in num_layers_map: + return 'cuda' + num_layers = num_layers_map[model_name] + 8 + # Since the first GPU will be used for ViT, treat it as 0.5 GPU. + num_layers_per_gpu = math.ceil(num_layers / num_gpus) + num_layers_per_gpu = [num_layers_per_gpu] * num_gpus + num_layers_per_gpu[-1] = num_layers - sum(num_layers_per_gpu[:-1]) + num_layers_per_gpu[0] -= 4 + layer_cnt = 0 + for i, num_layer in enumerate(num_layers_per_gpu): + for j in range(num_layer): + device_map[f'model.layers.{layer_cnt}'] = rank + world_size * i + layer_cnt += 1 + device_map['model.vision_tower'] = rank + device_map['model.embed_tokens'] = rank + device_map['model.norm'] = rank + device_map['model.rotary_emb'] = rank + device_map['model.mm_projector'] = rank + device_map['lm_head'] = rank + device_map[f'model.layers.{num_layers - 1}'] = rank + + logging.warning("Remove L157-L158 in https://github.com/NVlabs/EAGLE/blob/fef95f103b5e9899acbbe2c237e5b99147ab7e8e/eagle/model/builder.py to make it work properly.") # noqa: E501 + return device_map + + +class Eagle(BaseModel): + INSTALL_REQ = True + INTERLEAVE = True + + def __init__(self, + model_path='NVEagle/Eagle-X5-7B', + **kwargs): + try: + from eagle.model.builder import load_pretrained_model + from eagle.utils import disable_torch_init + from eagle.mm_utils import get_model_name_from_path + except Exception as e: + logging.critical('''Please install eagle before using Eagle, + you can install it from "https://github.com/NVlabs/EAGLE.git"''') + raise e + + warnings.warn('Please install the latest version of eagle from github before you evaluate the Eagle model.') + assert osp.exists(model_path) or splitlen(model_path) == 2 + model_name = get_model_name_from_path(model_path) + rank, world_size = get_rank_and_world_size() + + device_map = split_model(model_path.split('/')[-1]) + + self.tokenizer, self.model, self.image_processor, self.context_len = ( + load_pretrained_model(model_path, None, model_name, False, False, device_map=device_map) + 
) + self.model.eval() + self.conv_mode = 'vicuna_v1' + + default_kwargs = dict( + do_sample=True, + temperature=0.2, + top_p=0.5, + num_beams=1, + max_new_tokens=512, + use_cache=True + ) + + default_kwargs.update(kwargs) + self.kwargs = default_kwargs + warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') + torch.cuda.empty_cache() + + def generate_inner(self, message, dataset=None): + try: + from eagle import conversation as conversation_lib + from eagle.constants import (IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, + DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN) + from eagle.conversation import conv_templates, SeparatorStyle + from eagle.mm_utils import tokenizer_image_token, process_images, KeywordsStoppingCriteria + except Exception as e: + logging.critical('''Please install eagle before using Eagle, + you can install it from "https://github.com/NVlabs/EAGLE.git"''') + raise e + + kwargs = self.kwargs + + images = [] + prompt = '' + + for s in message: + if s['type'] == 'image': + images.append(s['value']) + elif s['type'] == 'text': + prompt += s['value'] + + DEFAULT_IMAGE_TOKEN = DEFAULT_IMAGE_TOKEN * len(images) + if self.model.config.mm_use_im_start_end: + prompt = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt + else: + prompt = DEFAULT_IMAGE_TOKEN + '\n' + prompt + + conv = conv_templates[self.conv_mode].copy() + conv.append_message(conv.roles[0], prompt) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + images = [Image.open(s).convert('RGB') for s in images] + + image_tensor = process_images(images, self.image_processor, self.model.config) + input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') + input_ids = input_ids.to(device='cuda', non_blocking=True) + image_tensor = image_tensor.to(dtype=torch.float16, device='cuda', non_blocking=True) + + with torch.inference_mode(): + output_ids = self.model.generate( + input_ids.unsqueeze(0), + images=image_tensor, + image_sizes=[img.size for img in images], + **kwargs + ) + + outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() + return outputs + + def use_custom_prompt(self, dataset): + assert dataset is not None + if listinstr(['MMMU'], dataset): + return False + if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet': + return True + return False + + def build_prompt(self, line, dataset=None): + assert dataset is None or isinstance(dataset, str) + assert self.use_custom_prompt(dataset) + tgt_path = self.dump_image(line, dataset) + question = line['question'] + if dataset == 'MMVet': + prompt = question + '\nAnswer the question directly. ' + elif DATASET_TYPE(dataset) == 'MCQ': + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = '' + for key, item in options.items(): + options_prompt += f'{key}. {item}\n' + + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + prompt = f'Hint: {hint}\n' if hint is not None else '' + prompt += f'{question}\n' + prompt += ( + f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. ' + if len(options) else 'Answer the question directly. 
' + ) + else: + raise NotImplementedError + + message = [dict(type='text', value=prompt)] + message.extend([dict(type='image', value=s) for s in tgt_path]) + return message diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/emu.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/emu.py new file mode 100644 index 0000000000000000000000000000000000000000..1051c799b78ce8f415b012e7fe4c0a2e902927ca --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/emu.py @@ -0,0 +1,89 @@ +import os +import torch +from PIL import Image +import os.path as osp +from .base import BaseModel +from ..smp import * + + +class Emu(BaseModel): + + INSTALL_REQ = False + INTERLEAVE = True + + def __init__(self, + model_path='BAAI/Emu2-Chat', + **kwargs): + + self.model_path = model_path + assert osp.exists(model_path) or splitlen(model_path) == 2 + + from transformers import AutoModelForCausalLM, AutoTokenizer + from accelerate import init_empty_weights, infer_auto_device_map, dispatch_model + + local_rank = os.environ.get('LOCAL_RANK', 0) + + device_num = torch.cuda.device_count() + assert local_rank * 2 <= device_num, 'The number of devices does not match the world size' + assert device_num >= 2, 'You need at least 2 GPUs to use EMU' + + device_1 = local_rank + device_2 = local_rank + device_num // 2 + + torch.cuda.set_device(device_1) + torch.cuda.set_device(device_2) + + tokenizer = AutoTokenizer.from_pretrained(model_path) # "BAAI/Emu2-Chat" + self.tokenizer = tokenizer + with init_empty_weights(): + model = AutoModelForCausalLM.from_pretrained( + model_path, # "BAAI/Emu2-Chat" + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + trust_remote_code=True) + + device_map = infer_auto_device_map( + model, + max_memory={ + device_1: '38GiB', + device_2: '38GiB' + }, + no_split_module_classes=['Block', 'LlamaDecoderLayer']) + + # input and output logits should be on same device + device_map['model.decoder.lm.lm_head'] = device_1 + + model = dispatch_model( + model, + device_map=device_map).eval() + + self.model = model + kwargs_default = dict(max_new_tokens=512, length_penalty=-1) + kwargs_default.update(kwargs) + self.kwargs = kwargs_default + warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') + + def generate_inner(self, message, dataset=None): + query, images = '', [] + for item in message: + if item['type'] == 'image': + images.append(Image.open(item['value']).convert('RGB')) + query += '[]' + elif item['type'] == 'text': + query += item['value'] + + inputs = self.model.build_input_ids( + text=[query], + tokenizer=self.tokenizer, + image=images + ) + + with torch.no_grad(): + outputs = self.model.generate( + input_ids=inputs['input_ids'], + attention_mask=inputs['attention_mask'], + image=inputs['image'].to(torch.bfloat16), + **self.kwargs) + + output_text = self.tokenizer.batch_decode(outputs, skip_special_tokens=True) + return output_text[0] diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/idefics.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/idefics.py new file mode 100644 index 0000000000000000000000000000000000000000..163e1bff409dea4080ba046974ba313e5bc41053 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/idefics.py @@ -0,0 +1,310 @@ +import torch +import os.path as osp +import warnings +from .base import BaseModel +from ..smp import splitlen, listinstr +from PIL import Image +from transformers import AutoProcessor, AutoModelForVision2Seq +from transformers.image_utils import load_image + + +class IDEFICS(BaseModel): + INSTALL_REQ = False + INTERLEAVE = True + + def __init__(self, model_path='HuggingFaceM4/idefics-9b-instruct', **kwargs): + assert osp.exists(model_path) or splitlen(model_path) == 2 + from transformers import IdeficsForVisionText2Text, AutoProcessor + + self.model = IdeficsForVisionText2Text.from_pretrained( + model_path, torch_dtype=torch.bfloat16, device_map='auto' + ) + self.processor = AutoProcessor.from_pretrained(model_path) + kwargs_default = {'max_new_tokens': 512} + kwargs_default.update(kwargs) + self.kwargs = kwargs_default + self.file_root = osp.dirname(__file__) + warnings.warn( + f'Following kwargs received: {self.kwargs}, will use as generation config. 
' + ) + + def generate_inner(self, message, dataset=None): + prompts = ( + ['Users:'] + + [msg['value'] if msg['type'] == 'text' else Image.open(msg['value']) for msg in message] + + ['', '\nAssistant: '] + ) + inputs = self.processor( + prompts, add_end_of_utterance_token=False, return_tensors='pt' + ).to('cuda') + exit_condition = self.processor.tokenizer( + '', add_special_tokens=False + ).input_ids + bad_words_ids = self.processor.tokenizer( + ['', ''], add_special_tokens=False + ).input_ids + + generated_ids = self.model.generate( + **inputs, + eos_token_id=exit_condition, + bad_words_ids=bad_words_ids, + # max_new_tokens=4096, + **self.kwargs, + ) + generated_text = self.processor.batch_decode( + generated_ids, skip_special_tokens=True + ) + text = generated_text[0].split('\nAssistant: ')[-1] + return text + + +class IDEFICS2(BaseModel): + INSTALL_REQ = True + INTERLEAVE = True + + def __init__(self, model_path='HuggingFaceM4/idefics2-8b', **kwargs): + assert model_path is not None + self.model_path = model_path + if 'Idefics3' in self.model_path.lower(): + warnings.warn('Install transfomers from source: PR https://github.com/open-compass/VLMEvalKit/pull/379') + warnings.warn('Reference: https://huggingface.co/HuggingFaceM4/Idefics3-8B-Llama3') + self.processor = AutoProcessor.from_pretrained(model_path) + model = AutoModelForVision2Seq.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + _attn_implementation='flash_attention_2', + device_map='cpu') + self.model = model.to('cuda') + + kwargs_default = {'max_new_tokens': 1024} + kwargs_default.update(kwargs) + self.kwargs = kwargs_default + warnings.warn( + f'Following kwargs received: {self.kwargs}, will use as generation config. ' + ) + torch.cuda.empty_cache() + + def _process(self, formatted_messages, formatted_images): + inputs = self.processor( + text=formatted_messages, images=formatted_images, return_tensors='pt' + ) + inputs = {k: v.to(self.model.device) for k, v in inputs.items()} + return inputs + + def build_prompt_default(self, message, add_brief=False, add_yes_or_no=False, change_the_img_place=False): + if change_the_img_place: + new_message = [] + for s in message: + if s['type'] == 'image': + new_message.append(s) + for s in message: + if s['type'] == 'text': + new_message.append(s) + message = new_message + prompt, images = 'User:', [] + for msg in message: + if msg['type'] == 'image': + img = load_image(msg['value']) + images.append(img) + prompt += '' + elif msg['type'] == 'text': + prompt += msg['value'].strip() + if add_brief: + prompt += '\nGive a very brief answer.' + if add_yes_or_no: + prompt += '\nAnswer yes or no.' 
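+        # The finished prompt follows the plain IDEFICS chat layout, roughly
+        # "User:" + one image placeholder per attached image + the question
+        # text + the optional brevity / yes-no instruction + "\nAssistant:",
+        # while the loaded images themselves are handed to the processor
+        # separately.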
+ prompt += '\nAssistant:' + return prompt, images + + def build_prompt_puremcq(self, message): + replace_mapping = { + '\nOptions:': '\nChoices:', + 'Please select the correct answer from the options above.': 'Answer with the letter.', + } + + prompt, images = 'User:', [] + for msg in message: + if msg['type'] == 'image': + img = load_image(msg['value']) + images.append(img) + prompt += '' + elif msg['type'] == 'text': + instruction = msg['value'].strip() + for k, v in replace_mapping.items(): + instruction = instruction.replace(k, v) + prompt += instruction + prompt += '\nAssistant: Answer:' + return prompt, images + + def build_prompt_mt(self, message): + prompt, images = '', [] + for msg in message: + if msg['role'] == 'user': + prompt += 'User: ' + elif msg['role'] == 'assistant': + prompt += 'Assistant: ' + for item in msg['content']: + if item['type'] == 'image': + img = load_image(item['value']) + images.append(img) + prompt += '' + elif item['type'] == 'text': + prompt += item['value'].strip() + prompt += '\n' + return prompt + 'Assistant: ' + + def build_prompt_mmbench(self, message): + replace_mapping = { + '\nOptions:': '\nChoices:', + 'Please select the correct answer from the options above.': 'Answer with a letter.', + } + + prompt, images = 'User:', [] + for msg in message: + if msg['type'] == 'image': + img = load_image(msg['value']) + images.append(img) + prompt += '' + elif msg['type'] == 'text': + instruction = msg['value'].strip() + for k, v in replace_mapping.items(): + instruction = instruction.replace(k, v) + # Swap hint and question + if instruction.startswith('Hint:'): + hint, question = instruction.split('\nQuestion:') + question, choices = question.split('\nChoices:') + instruction = ( + 'Question:' + question + '\n' + hint + '\nChoices:' + choices + ) + prompt += instruction + prompt += '\nAssistant: Answer:' + return prompt, images + + def build_prompt_mmmu(self, message): + replace_mapping = { + 'Question:': '', + 'Please select the correct answer from the options above.': 'Answer with the letter.', + '\nOptions:': '\nChoices:', + } + + prompt, images, img_counter = 'User: Question: ', [], 1 + for msg in message: + if msg['type'] == 'image': + prompt += f':\n' + img_counter += 1 + img_counter = 1 + + for msg in message: + if msg['type'] == 'image': + img = load_image(msg['value']) + images.append(img) + prompt += f' ' + img_counter += 1 + elif msg['type'] == 'text': + instruction = msg['value'].strip() + for k, v in replace_mapping.items(): + instruction = instruction.replace(k, v) + prompt += instruction.strip() + prompt += '\nAssistant:' + if 'A.' in prompt and 'B.' in prompt: + prompt += ' Answer:' + return prompt, images + + def build_prompt_mathvista(self, message): + replace_mapping = { + '(A) ': 'A. ', + '(B) ': 'B. ', + '(C) ': 'C. ', + '(D) ': 'D. ', + '(E) ': 'E. ', + '(F) ': 'F. ', + '(G) ': 'G. ', + '(H) ': 'H. ', + '\nOptions:': '\nChoices:', + 'Hint: ': '', + } + + prompt, images = 'User:', [] + for msg in message: + if msg['type'] == 'image': + img = load_image(msg['value']) + images.append(img) + prompt += '' + elif msg['type'] == 'text': + instruction = msg['value'].strip() + for k, v in replace_mapping.items(): + instruction = instruction.replace(k, v) + prompt += instruction.strip() + if 'A.' in prompt and 'B.' in prompt: + prompt += '\nAnswer with the letter.' + prompt += '\nAssistant:' + if 'A.' in prompt and 'B.' 
in prompt: + prompt += ' Answer:' + return prompt, images + + def chat_inner(self, message, dataset=None): + formatted_messages, formatted_images = self.build_prompt_mt(message) + inputs = self._process(formatted_messages, formatted_images) + + generated_ids = self.model.generate(**inputs, **self.kwargs) + generated_text = self.processor.batch_decode( + generated_ids[:, inputs['input_ids'].size(1):], skip_special_tokens=True + )[0] + response = generated_text.strip() + # print(dataset, " | ", formatted_messages.replace("\n", "\\n"), " | ", response.replace("\n", "\\n")) + return response + + def generate_inner(self, message, dataset=None): + if dataset in [ + 'MMBench_DEV_EN', 'MMBench_DEV_EN_V11', + 'MMBench_TEST_EN', 'MMBench_TEST_EN_V11', + 'MMBench_DEV_CN', 'MMBench_DEV_CN_V11', + 'MMBench_TEST_CN', 'MMBench_TEST_CN_V11', + 'MMBench', 'MMBench_V11', 'MMBench_CN', 'MMBench_CN_V11' + ]: + formatted_messages, formatted_images = self.build_prompt_mmbench(message) + elif dataset in ['MMMU_DEV_VAL', 'MMMU_TEST']: + formatted_messages, formatted_images = self.build_prompt_mmmu(message) + elif dataset in ['MathVista_MINI']: + formatted_messages, formatted_images = self.build_prompt_mathvista(message) + elif dataset in [ + 'MME', + 'MMVet', + 'OCRVQA_TEST', + 'OCRVQA_TESTCORE', + 'TextVQA_VAL', + 'ChartQA_TEST', + 'DocVQA_VAL', + 'DocVQA_TEST', + 'InfoVQA_VAL', + 'InfoVQA_TEST', + ]: + formatted_messages, formatted_images = self.build_prompt_default( + message, add_brief=True + ) + elif dataset == 'HallusionBench': + formatted_messages, formatted_images = self.build_prompt_default( + message, add_yes_or_no=True + ) + elif dataset in [ + 'MMStar', + 'SEEDBench_IMG', + 'AI2D_TEST', + 'ScienceQA_VAL', + 'ScienceQA_TEST', + ]: + formatted_messages, formatted_images = self.build_prompt_puremcq(message) + elif listinstr(['MLVU','TempCompass','MVBench'], dataset): + formatted_messages, formatted_images = self.build_prompt_default(message, change_the_img_place=True) + else: + formatted_messages, formatted_images = self.build_prompt_default(message) + + inputs = self._process(formatted_messages, formatted_images) + + generated_ids = self.model.generate(**inputs, **self.kwargs) + generated_text = self.processor.batch_decode( + generated_ids[:, inputs['input_ids'].size(1):], skip_special_tokens=True + )[0] + response = generated_text.strip() + # print(dataset, " | ", formatted_messages.replace("\n", "\\n"), " | ", response.replace("\n", "\\n")) + return response diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/janus.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/janus.py new file mode 100644 index 0000000000000000000000000000000000000000..bcb98c512db962f9b6d23e65db82c39d6eb22d49 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/janus.py @@ -0,0 +1,136 @@ +import sys +import torch +from transformers import AutoModelForCausalLM, AutoConfig +import warnings +from .base import BaseModel +from ..smp import * +from ..dataset import DATASET_TYPE + + +class Janus(BaseModel): + + INSTALL_REQ = True + INTERLEAVE = True + + def check_install(self): + try: + import janus + except Exception as e: + logging.critical( + 'Please first install janus from source codes in: https://github.com/deepseek-ai/Janus') + raise e + + def __init__(self, model_path='deepseek-ai/Janus-1.3B', **kwargs): + self.check_install() + assert model_path is not None + self.model_path = model_path + from janus.models import VLChatProcessor + + self.vl_chat_processor = VLChatProcessor.from_pretrained(model_path) + self.tokenizer = 
self.vl_chat_processor.tokenizer + + model = AutoModelForCausalLM.from_pretrained(model_path, trust_remote_code=True) + self.model = model.to(torch.bfloat16).cuda().eval() + + torch.cuda.empty_cache() + default_kwargs = dict( + max_new_tokens=512, + do_sample=False, + use_cache=True, + output_logits=False, + output_scores=False, + return_dict_in_generate=False) + + default_kwargs.update(kwargs) + self.kwargs = default_kwargs + warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') + + def prepare_inputs(self, message): + def prepare_itlist(msgs): + content, images = '', [] + for s in msgs: + if s['type'] == 'image': + images.append(s['value']) + content += '' + elif s['type'] == 'text': + content += s['value'] + return content, images + conversation = [] + if 'role' not in message[0]: + content, images = prepare_itlist(message) + conversation.append(dict(role='User', content=content, images=images)) + else: + role_map = {'user': 'User', 'assistant': 'Assistant'} + for msgs in message: + role = role_map[msgs['role']] + content, images = prepare_itlist(msgs['content']) + conversation.append(dict(role=role, content=content, images=images)) + conversation.append(dict(role='Assistant', content='')) + return conversation + + def generate_inner(self, message, dataset=None): + if dataset is None or not ('MMVet' in dataset): + self.vl_chat_processor.system_prompt = "" + else: + self.vl_chat_processor.system_prompt = "You are a helpful assistant. Please answer truthfully and write out your thinking step by step to be sure you get the right answer." # noqa: E501 + + conversation = self.prepare_inputs(message) + from janus.utils.io import load_pil_images + pil_images = load_pil_images(conversation) + prepare_inputs = self.vl_chat_processor(conversations=conversation, images=pil_images, force_batchify=True) + prepare_inputs = prepare_inputs.to(self.model.device, dtype=torch.bfloat16) + inputs_embeds = self.model.prepare_inputs_embeds(**prepare_inputs) + + outputs = self.model.language_model.generate( + inputs_embeds=inputs_embeds, + attention_mask=prepare_inputs.attention_mask, + pad_token_id=self.tokenizer.eos_token_id, + bos_token_id=self.tokenizer.bos_token_id, + eos_token_id=self.tokenizer.eos_token_id, + **self.kwargs) + answer = self.tokenizer.decode(outputs[0].cpu().tolist(), skip_special_tokens=True) + return answer + + def chat_inner(self, message, dataset=None): + return self.generate_inner(message, dataset=dataset) + + def use_custom_prompt(self, dataset): + assert dataset is not None + if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet': + return True + return False + + def build_prompt(self, line, dataset=None): + assert dataset is None or isinstance(dataset, str) + assert self.use_custom_prompt(dataset) + tgt_path = self.dump_image(line, dataset) + question = line['question'] + if DATASET_TYPE(dataset) == 'Y/N': + if dataset == 'POPE': + question = question.replace(" Please answer yes or no.", "") + prompt = '\n' + question + "\nAnswer the question using a single word or phrase." + elif DATASET_TYPE(dataset) == 'MCQ': + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = '' + for key, item in options.items(): + options_prompt += f'{key}. 
{item}\n' + + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + prompt = f'\nHint: {hint}\n' if hint is not None else '\n' + prompt += f'{question}\n' + prompt += ( + f"{options_prompt}\nAnswer with the option's letter from the given choices directly." + if len(options) else 'Answer the question directly. ' + ) + elif dataset == 'MMVet': + prompt = '\n' + question + else: + raise NotImplementedError + + message = [dict(type='image', value=s) for s in tgt_path] + message.extend([dict(type='text', value=prompt)]) + return message diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/llama_vision.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/llama_vision.py new file mode 100644 index 0000000000000000000000000000000000000000..9abbcfd95e890f1942591d3b41ca20655a495c2d --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/llama_vision.py @@ -0,0 +1,204 @@ +import torch +from PIL import Image +import os.path as osp +import sys +from .base import BaseModel +from ..smp import * +from ..dataset import DATASET_TYPE + + +class llama_vision(BaseModel): + + INSTALL_REQ = False + INTERLEAVE = False + + # This function is used to split Llama-3.2-90B + def split_model(self): + import math + device_map = {} + num_gpus = torch.cuda.device_count() + rank, world_size = get_rank_and_world_size() + num_gpus = num_gpus // world_size + + num_layers = 100 + # GPU0: -5, GPU-1: -7 + total_cost = num_layers + 5 + 7 + + # Since the first GPU will be used for ViT, treat it as 0.8 GPU. + num_layers_per_gpu = total_cost // num_gpus + num_layers_per_gpu = [num_layers_per_gpu] * num_gpus + # The total number of GPUs might be odd + num_layers_per_gpu[-1] = total_cost - sum(num_layers_per_gpu[:-1]) + num_layers_per_gpu[0] -= 5 + num_layers_per_gpu[-1] -= 7 + + layer_cnt = 0 + for i, num_layer in enumerate(num_layers_per_gpu): + for j in range(num_layer): + device_map[f'language_model.model.layers.{layer_cnt}'] = rank + world_size * i + layer_cnt += 1 + + device_map['vision_model'] = rank + device_map['language_model.model.embed_tokens'] = rank + device_map['language_model.model.rotary_emb'] = rank + device_map['language_model.model.norm'] = rank + world_size * (num_gpus - 1) + device_map['language_model.lm_head'] = rank + world_size * (num_gpus - 1) + device_map['multi_modal_projector'] = rank + world_size * (num_gpus - 1) + return device_map + + def __init__(self, model_path='meta-llama/Llama-3.2-11B-Vision-Instruct', **kwargs): + try: + from transformers import MllamaForConditionalGeneration, AutoProcessor + except Exception as e: + logging.critical('Please install transformers>=4.45.0 before using llama_vision.') + raise e + + rank, world_size = get_rank_and_world_size() + + if '11b' in model_path.lower() and auto_split_flag(): + assert world_size == 1, 'We only support world_size == 1 when AUTO_SPLIT is set for Llama-3.2-11B' + logging.warning('Currently, we only support to split the 11B model across all GPUs.') + self.model = MllamaForConditionalGeneration.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + device_map='auto', + ).eval() + elif '90b' in model_path.lower(): + device_map = self.split_model() + self.model = MllamaForConditionalGeneration.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + device_map=device_map, + ).eval() + else: + self.model = MllamaForConditionalGeneration.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + device_map='cpu', + ).cuda().eval() + + self.device = 'cuda' + self.processor = AutoProcessor.from_pretrained(model_path) + if 
'Instruct' in model_path: + kwargs_default = dict(do_sample=True, temperature=0.6, top_p=0.9) + else: + kwargs_default = dict(do_sample=False, max_new_tokens=512, temperature=0.0, top_p=None, num_beams=1) + kwargs.update(kwargs_default) + print(f'Following kwargs received: {kwargs}, will use as generation config. ') + self.kwargs = kwargs + self.model_name = model_path + + def use_custom_prompt(self, dataset): + if dataset is None: + return False + if listinstr(['AI2D', 'MMMU', 'MathVista', 'ChartQA', 'DocVQA'], dataset): + # For Certain dataset we use custom prompt + return True + else: + return False + + def build_prompt(self, line, dataset=None): + assert self.use_custom_prompt(dataset) + assert dataset is None or isinstance(dataset, str) + tgt_path = self.dump_image(line, dataset) + question = line['question'] + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + if listinstr(['AI2D'], dataset): + self.kwargs['max_new_tokens'] = 400 + for key, item in options.items(): + question += f'\n{key}. {item}' + if '11B' in self.model_name: + prompt = ( + f'Look at the scientific diagram carefully and answer the following question: {question}\n' + f'Think step by step and finally respond to the question ' + f"with only the correct option number as \"FINAL ANSWER\"." + f"Let's think step by step." + ) + elif '90B' in self.model_name: + prompt = ( + f'Look at the scientific diagram carefully and answer the following question: {question}\n' + f'Respond only with the correct option digit.' + ) + elif listinstr(['MMMU'], dataset): + self.kwargs['max_new_tokens'] = 2048 + options = '\n'.join([f'{key}. {item}' for key, item in options.items()]) + prompt = ( + f'Look at the image carefully and solve the following question step-by-step. ' + f'Question: {question} Options: {options} Indicate the correct answer at the end.' + ) + for i in range(len(tgt_path)): + prompt = prompt.replace(f'', '') + elif listinstr(['MathVista'], dataset): + self.kwargs['max_new_tokens'] = 2048 + prompt = f'{question}' + elif listinstr(['ChartQA'], dataset): + self.kwargs['max_new_tokens'] = 512 + if '11B' in self.model_name: + prompt = ( + f'You are provided a chart image and will be asked a question. ' + f'You have to think through your answer and provide a step-by-step solution. ' + f'Once you have the solution, write the final answer in at most a few words at the end ' + f"with the phrase \"FINAL ANSWER:\". " + f"The question is: {question}Let's think step by step." + ) + elif '90B' in self.model_name: + prompt = ( + f'You are provided a chart image and will be asked a question. ' + f'Follow these steps carefully:\n ' + f'Step 1: Analyze the question to understand what specific data or information is being asked for. ' + f'Focus on whether the question is asking for a specific number or category ' + f'from the chart image.\n ' + f'Step 2: Identify any numbers, categories, or groups mentioned in the question ' + f'and take note of them. Focus on detecting and matching them directly to the image. \n' + f'Step 3: Study the image carefully and find the relevant data corresponding to the categories ' + f'or numbers mentioned. Avoid unnecessary assumptions or calculations; ' + f'simply read the correct data from the image.\n ' + f'Step 4: Develop a clear plan to solve the question by locating the right data. ' + f'Focus only on the specific category or group that matches the question. 
\n' + f'Step 5: Use step-by-step reasoning to ensure you are referencing the correct numbers ' + f'or data points from the image, avoiding unnecessary extra steps or interpretations.\n ' + f"Step 6: Provide the final answer, starting with \"FINAL ANSWER:\" " + f'and using as few words as possible, ' + f'simply stating the number or data point requested. \n\n ' + f"The question is: {question}Let's think step by step." + ) + elif listinstr(['DocVQA'], dataset): + self.kwargs['max_new_tokens'] = 512 + prompt = ( + f'Read the text in the image carefully and answer the question ' + f'with the text as seen exactly in the image. ' + f'For yes/no questions, just respond Yes or No. ' + f'If the answer is numeric, just respond with the number and nothing else. ' + f'If the answer has multiple words, just respond with the words and absolutely nothing else. ' + f'Never respond in a sentence or a phrase.\n Question: {question}' + ) + else: + raise NotImplementedError(f'Dataset {dataset}) not supported.') + + message = [dict(type='text', value=prompt)] + message.extend([dict(type='image', value=s) for s in tgt_path]) + return message + + def generate_inner(self, message, dataset=None): + prompt, image_path = self.message_to_promptimg(message, dataset=dataset) + + image = Image.open(image_path) + messages = [ + {'role': 'user', 'content': [ + {'type': 'image'}, + {'type': 'text', 'text': prompt} + ]} + ] + input_text = self.processor.apply_chat_template(messages, add_generation_prompt=True) + inputs = self.processor(image, input_text, return_tensors='pt').to(self.device) + if not self.use_custom_prompt(dataset): + if dataset is not None and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']: + self.kwargs['max_new_tokens'] = 128 + else: + self.kwargs['max_new_tokens'] = 512 + output = self.model.generate(**inputs, **self.kwargs) + return self.processor.decode(output[0][inputs['input_ids'].shape[1]:]).replace('<|eot_id|>', '') diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/mgm.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/mgm.py new file mode 100644 index 0000000000000000000000000000000000000000..fd2a15be8886d176f7748b3797401eee9b4db5c6 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/mgm.py @@ -0,0 +1,158 @@ +import sys +import torch +import os.path as osp +import os +import warnings +from .base import BaseModel +from ..smp import * +from PIL import Image + +''' + Please follow the instructions to download ckpt. 
+ https://github.com/dvlab-research/MGM?tab=readme-ov-file#pretrained-weights +''' + + +class Mini_Gemini(BaseModel): + INSTALL_REQ = True + INTERLEAVE = False + + def __init__(self, model_path, root=None, conv_mode='llava_v1', **kwargs): + if root is None: + warnings.warn('Please set `root` to Mini_Gemini code directory, \ + which is cloned from here: "https://github.com/dvlab-research/MGM?tab=readme-ov-file" ') + raise ValueError + warnings.warn('Please follow the instructions of Mini_Gemini to put the ckpt file in the right place, \ + which can be found at https://github.com/dvlab-research/MGM?tab=readme-ov-file#structure') + assert model_path == 'YanweiLi/MGM-7B-HD', 'We only support MGM-7B-HD for now' + self.model_path = model_path + sys.path.append(root) + try: + from mgm.model.builder import load_pretrained_model + from mgm.mm_utils import get_model_name_from_path + except Exception as e: + logging.critical( + 'Please first install Mini_Gemini and set the root path to use Mini_Gemini, ' + 'which is cloned from here: "https://github.com/dvlab-research/MGM?tab=readme-ov-file" ' + ) + raise e + + VLMEvalKit_path = os.getcwd() + os.chdir(root) + warnings.warn('Please set `root` to Mini_Gemini code directory, \ + which is cloned from here: "https://github.com/dvlab-research/MGM?tab=readme-ov-file" ') + model_path = osp.join(root, 'work_dirs', 'MGM', 'MGM-7B-HD') + try: + model_name = get_model_name_from_path(model_path) + except Exception as e: + logging.critical( + 'Please follow the instructions of Mini_Gemini to put the ckpt file in the right place, ' + 'which can be found at https://github.com/dvlab-research/MGM?tab=readme-ov-file#structure' + ) + raise e + + tokenizer, model, image_processor, context_len = load_pretrained_model(model_path, None, model_name) + os.chdir(VLMEvalKit_path) + self.model = model + self.tokenizer = tokenizer + self.image_processor = image_processor + self.conv_mode = conv_mode + + kwargs_default = dict(temperature=float(0), num_beams=1, top_p=None, max_new_tokens=1024, use_cache=True) + kwargs_default.update(kwargs) + do_sample = kwargs_default['temperature'] > 0 + kwargs_default.update({'do_sample': do_sample}) + self.kwargs = kwargs_default + + def generate_inner(self, message, dataset=None): + try: + from mgm.constants import IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, \ + DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + from mgm.conversation import conv_templates + from mgm.mm_utils import tokenizer_image_token, process_images + except Exception as e: + logging.critical( + 'Please first install Mini_Gemini and set the root path to use Mini_Gemini, ' + 'which is cloned from here: "https://github.com/dvlab-research/MGM?tab=readme-ov-file" ' + ) + raise e + + prompt, image = self.message_to_promptimg(message, dataset=dataset) + image = Image.open(image) + prompt = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt + conv = conv_templates[self.conv_mode].copy() + conv.append_message(conv.roles[0], prompt) + conv.append_message(conv.roles[1], None) + prompt = conv.get_prompt() + + input_ids = tokenizer_image_token(prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt') + input_ids = input_ids.unsqueeze(0).cuda() + if hasattr(self.model.config, 'image_size_aux'): + if not hasattr(self.image_processor, 'image_size_raw'): + self.image_processor.image_size_raw = self.image_processor.crop_size.copy() + self.image_processor.crop_size['height'] = self.model.config.image_size_aux + self.image_processor.crop_size['width'] = 
self.model.config.image_size_aux + self.image_processor.size['shortest_edge'] = self.model.config.image_size_aux + image_tensor = process_images([image], self.image_processor, self.model.config)[0] + image_grid = getattr(self.model.config, 'image_grid', 1) + if hasattr(self.model.config, 'image_size_aux'): + raw_shape = [ + self.image_processor.image_size_raw['height'] * image_grid, + self.image_processor.image_size_raw['width'] * image_grid + ] + image_tensor_aux = image_tensor + image_tensor = torch.nn.functional.interpolate( + image_tensor[None], + size=raw_shape, + mode='bilinear', + align_corners=False + )[0] + else: + image_tensor_aux = [] + if image_grid >= 2: + raw_image = image_tensor.reshape( + 3, image_grid, self.image_processor.image_size_raw['height'], + image_grid, self.image_processor.image_size_raw['width'] + ) + raw_image = raw_image.permute(1, 3, 0, 2, 4) + raw_image = raw_image.reshape( + -1, 3, self.image_processor.image_size_raw['height'], self.image_processor.image_size_raw['width'] + ) + + if getattr(self.model.config, 'image_global', False): + global_image = image_tensor + if len(global_image.shape) == 3: + global_image = global_image[None] + global_image = torch.nn.functional.interpolate( + global_image, + size=[ + self.image_processor.image_size_raw['height'], + self.image_processor.image_size_raw['width'] + ], + mode='bilinear', + align_corners=False + ) + # [image_crops, image_global] + raw_image = torch.cat([raw_image, global_image], dim=0) + image_tensor = raw_image.contiguous() + + images = image_tensor[None].to(dtype=self.model.dtype, device='cuda', non_blocking=True) + if len(image_tensor_aux) > 0: + images_aux = image_tensor_aux[None].to(dtype=self.model.dtype, device='cuda', non_blocking=True) + else: + images_aux = None + + with torch.inference_mode(): + output_ids = self.model.generate( + input_ids, + images=images, + images_aux=images_aux, + # no_repeat_ngram_size=3, + bos_token_id=self.tokenizer.bos_token_id, # Begin of sequence token + eos_token_id=self.tokenizer.eos_token_id, # End of sequence token + pad_token_id=self.tokenizer.pad_token_id, # Pad token + **self.kwargs + ) + + outputs = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() + return outputs diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/minicpm_v.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/minicpm_v.py new file mode 100644 index 0000000000000000000000000000000000000000..971ae351633d6e7712e2a1117aec224d46cea4dd --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/minicpm_v.py @@ -0,0 +1,471 @@ +import math +import torch +import random +import numpy as np +from PIL import Image +from transformers import AutoModel, AutoTokenizer + +from .base import BaseModel +from ..smp import * +from ..dataset import DATASET_TYPE, DATASET_MODALITY + + +class MiniCPM_V(BaseModel): + + INSTALL_REQ = False + INTERLEAVE = False + + def __init__(self, model_path='openbmb/MiniCPM-V', **kwargs): + assert model_path is not None + self.model_path = model_path + print(f'load from {self.model_path}') + self.model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True) + self.model = self.model.to(dtype=torch.bfloat16) + self.model.eval().cuda() + self.kwargs = kwargs + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + torch.cuda.empty_cache() + self.num_beams = 1 if self.model_path == 'openbmb/MiniCPM-V' else 3 + + def use_custom_prompt(self, dataset): + assert dataset is not None + if listinstr(['MMDU', 'MME-RealWorld', 
'MME-RealWorld-CN'], dataset): + # For Multi-Turn we don't have custom prompt + return False + return False + + def build_prompt(self, line, dataset=None): + assert dataset is None or isinstance(dataset, str) + assert self.use_custom_prompt(dataset) + tgt_path = self.dump_image(line, dataset) + + question = line['question'] + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = 'Options:\n' + for key, item in options.items(): + options_prompt += f'{key}. {item}\n' + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + prompt = '' + if hint is not None: + prompt += f'Hint: {hint}\n' + prompt += f'{question}\n' + if len(options): + prompt += options_prompt + prompt = 'Study the image carefully and pick the option associated with the correct answer. \ + Focus solely on selecting the option and avoid including any other content.\n' + prompt + message = [dict(type='text', value=prompt)] + message.extend([dict(type='image', value=p) for p in tgt_path]) + + return message + + def generate_inner(self, message, dataset=None): + prompt, image_path = self.message_to_promptimg(message, dataset=dataset) + image = Image.open(image_path).convert('RGB') + msgs = [{'role': 'user', 'content': prompt}] + if DATASET_TYPE(dataset) == 'MCQ': + max_new_tokens = 20 + elif DATASET_TYPE(dataset) == 'Y/N': + max_new_tokens = 100 + else: + max_new_tokens = 1024 + + default_kwargs = dict( + max_new_tokens=max_new_tokens, + sampling=False, + num_beams=self.num_beams + ) + default_kwargs.update(self.kwargs) + res, _, _ = self.model.chat( + image=image, + msgs=msgs, + context=None, + tokenizer=self.tokenizer, + **default_kwargs + ) + return res + + +class MiniCPM_Llama3_V(BaseModel): + + INSTALL_REQ = False + INTERLEAVE = True + + def __init__(self, model_path='openbmb/MiniCPM-Llama3-V-2_5', **kwargs): + assert model_path is not None + self.model_path = model_path + print(f'load from {self.model_path}') + self.model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True) + self.model = self.model.to(dtype=torch.float16) + self.model.eval().cuda() + self.kwargs = kwargs + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + torch.cuda.empty_cache() + self.num_beams = 1 if self.model_path == 'openbmb/MiniCPM-V' else 3 + self.options_system_prompt = ('Carefully read the following question and select the letter corresponding ' + 'to the correct answer. Highlight the applicable choices without giving ' + 'explanations.') + self.wo_options_system_prompt = 'Carefully read the following question Answer the question directly.' + self.detail_system_prompt = 'Answer this question in detail.' + self.vqa_prompt = 'Answer the question using a single word or phrase.' + + def use_custom_prompt(self, dataset): + if listinstr(['MCQ', 'VQA'], DATASET_TYPE(dataset)): + return True + elif dataset is not None and listinstr(['HallusionBench'], dataset): + return True + return False + + def build_prompt(self, line, dataset=None): + if isinstance(line, int): + line = self.data.iloc[line] + + tgt_path = self.dump_image(line, dataset) + system_prompt = '' + + question = line['question'] + if DATASET_TYPE(dataset) == 'MCQ': + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = 'Options:\n' + for key, item in options.items(): + options_prompt += f'{key}. 
{item}\n' + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + prompt = '' + if hint is not None: + prompt += f'Hint: {hint}\n' + prompt += f'Question: {question}\n' + if len(options): + prompt += options_prompt + system_prompt = self.options_system_prompt + '\nPlease just indicate your choice.' + else: + system_prompt = self.wo_options_system_prompt + if 'MMMU' in dataset: # Corner Case + prompt = system_prompt + '\n' + prompt + system_prompt = '' + elif dataset is not None and listinstr(['HallusionBench'], dataset): + question = line['question'] + ' Yes or No?' + prompt = question + elif dataset is not None and listinstr(['MME'], dataset): + question = line['question'] + ' Yes or No?' + prompt = question + elif dataset is not None and listinstr(['OCRBench'], dataset): + system_prompt = self.vqa_prompt + question = line['question'] + prompt = question + elif DATASET_TYPE(dataset) == 'VQA': + if listinstr(['LLaVABench', 'MMLongBench_DOC'], dataset): + system_prompt = '' + prompt = question + elif listinstr(['MMVet'], dataset): + system_prompt = self.detail_system_prompt + prompt = question + else: + system_prompt = self.vqa_prompt + prompt = question + + msgs = [] + if system_prompt: + msgs.append(dict(type='text', value=system_prompt)) + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + msgs.append(dict(type='text', value=prompt)) + return msgs + + def generate_inner(self, message, dataset=None): + if DATASET_TYPE(dataset) == 'MCQ': + max_new_tokens = 200 + elif DATASET_TYPE(dataset) == 'Y/N': + max_new_tokens = 3 + else: + max_new_tokens = 1024 + + default_kwargs = dict( + max_new_tokens=max_new_tokens, + sampling=False, + num_beams=self.num_beams, + ) + default_kwargs.update(self.kwargs) + + content = [] + for x in message: + if x['type'] == 'text': + content.append(x['value']) + elif x['type'] == 'image': + image = Image.open(x['value']).convert('RGB') + content.append(image) + msgs = [{'role': 'user', 'content': content}] + + res = self.model.chat( + msgs=msgs, + context=None, + image=None, + tokenizer=self.tokenizer, + **default_kwargs + ) + + if isinstance(res, tuple) and len(res) > 0: + res = res[0] + return res + + def chat_inner(self, message, dataset=None): + max_new_tokens = 1024 + + default_kwargs = dict( + max_new_tokens=max_new_tokens, + sampling=False, + num_beams=self.num_beams, + ) + default_kwargs.update(self.kwargs) + + msgs = [] + for msg in message: + content = [] + if len(msg['content']) == 1 and msg['content'][0]['type'] == 'text': + msg_new = {'role': msg['role'], 'content': msg['content'][0]['value']} + msgs.append(msg_new) + continue + + for x in msg['content']: + if x['type'] == 'text': + content.append(x['value']) + elif x['type'] == 'image': + image = Image.open(x['value']).convert('RGB') + content.append(image) + msg_new = {'role': msg['role'], 'content': content} + msgs.append(msg_new) + + res = self.model.chat( + msgs=msgs, + context=None, + image=None, + tokenizer=self.tokenizer, + **default_kwargs) + + if isinstance(res, tuple) and len(res) > 0: + res = res[0] + return res + + +class MiniCPM_V_2_6(BaseModel): + INSTALL_REQ = False + INTERLEAVE = True + + def __init__(self, model_path='openbmb/MiniCPM-V', **kwargs): + random.seed(0) + np.random.seed(0) + torch.manual_seed(0) + torch.cuda.manual_seed_all(0) + + assert model_path is not None + self.model_path = model_path + print(f'load from path {self.model_path}') + self.model 
= AutoModel.from_pretrained(self.model_path, trust_remote_code=True) + self.model = self.model.to(dtype=torch.bfloat16) + self.model.eval().cuda() + + self.kwargs = kwargs + self.tokenizer = AutoTokenizer.from_pretrained(self.model_path, trust_remote_code=True) + torch.cuda.empty_cache() + self.num_beams = 1 if self.model_path == 'openbmb/MiniCPM-V' else 3 + + self.options_suffix_prompt = '''\nAnswer with the option's letter from the given choices directly.''' + self.wo_options_system_prompt = 'Carefully read the following question Answer the question directly.' + self.detail_system_prompt = 'Answer this question in detail.' + self.vqa_prompt = 'Answer the question using a single word or phrase.' + + self.multi_choice_cot_prompt = ('''Carefully read the following multichoice question, solve it step ''' + '''by step and finally pick the option associated with the correct ''' + '''answer in the format of "Answer: selected option\n\n''') + self.short_ans_cot_prompt = ('''Read the following question carefully, solve it step by step, and ''' + '''then output the final answer in the format of "Answer: single number ''' + '''or single word or phrase".\n\n''') + + def use_custom_prompt(self, dataset=None): + if dataset is None: + return False + if DATASET_TYPE(dataset) in ['MCQ', 'VQA', 'Y/N']: + return True + return False + + def use_cot(self, dataset=None): + if dataset is None: + return False + if listinstr(['MMMU', 'HallusionBench', 'OCRBench', 'ChartQA'], dataset): + return True + elif listinstr(['MathVista', 'MMVet', 'MMBench', 'MMStar', 'AI2D', 'RealWorldQA', + 'POPE', 'ScienceQA', 'TextVQA', 'DocVQA'], dataset): + return False + else: + return False + + def use_upsize(self, dataset=None): + if dataset is None: + return False + if listinstr(['MMVet', 'MMBench', 'MMStar', 'AI2D', 'OCRBench'], dataset): + return True + else: + return False + + def build_prompt(self, line, dataset=None): + if isinstance(line, int): + line = self.data.iloc[line] + + tgt_path = self.dump_image(line, dataset) + system_prompt, prompt = '', '' + + question = line['question'] + + if not self.use_cot(dataset): + if DATASET_TYPE(dataset) == 'MCQ': + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = 'Options:\n' + for key, item in options.items(): + options_prompt += f'{key}. {item}\n' + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + if hint is not None: + prompt += f'Hint: {hint}\n' + prompt += f'Question: {question}\n' + if len(options): + prompt += options_prompt + prompt += self.options_suffix_prompt + else: + system_prompt = self.wo_options_system_prompt + + if 'MMMU' in dataset: + if len(system_prompt) > 0: + prompt = system_prompt + '\n' + prompt + system_prompt = '' + elif dataset is not None and listinstr(['HallusionBench'], dataset): + question += ' Yes or No?' 
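+ # HallusionBench items are reformulated as explicit binary questions; the bare question (with ' Yes or No?') is used as the prompt and no system prompt is set in this branch.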
+ prompt = question + elif dataset is not None and listinstr(['OCRBench'], dataset): + system_prompt = self.vqa_prompt + prompt = question + elif DATASET_TYPE(dataset) == 'VQA': + if listinstr(['LLaVABench'], dataset): + system_prompt = '' + elif listinstr(['MMVet'], dataset): + system_prompt = self.detail_system_prompt + else: + system_prompt = self.vqa_prompt + prompt = question + else: + prompt = question + else: + has_options = True + if DATASET_TYPE(dataset) == 'MCQ': + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = '' + for key, item in options.items(): + options_prompt += f'{key}. {item}\n' + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + if hint is not None: + prompt += f'Hint: {hint}\n' + prompt += f'{question}\n' + + if len(options): + prompt += options_prompt + else: + has_options = False + + if 'MMMU' in dataset: + if len(system_prompt) > 0: + prompt = system_prompt + '\n' + prompt + system_prompt = '' + else: + prompt = question + + if DATASET_TYPE(dataset) in ['MCQ', 'Y/N', 'VQA']: + if DATASET_TYPE(dataset) == 'MCQ': + if has_options: + prompt = self.multi_choice_cot_prompt + prompt + else: + prompt = self.short_ans_cot_prompt + prompt + elif DATASET_TYPE(dataset) == 'Y/N': + prompt = self.short_ans_cot_prompt + prompt + else: + prompt = self.short_ans_cot_prompt + prompt + + msgs = [] + if system_prompt: + msgs.append(dict(type='text', value=system_prompt)) + if isinstance(tgt_path, list): + msgs.extend([dict(type='image', value=p) for p in tgt_path]) + else: + msgs = [dict(type='image', value=tgt_path)] + msgs.append(dict(type='text', value=prompt)) + + return msgs + + def generate_inner(self, message, dataset=None): + if DATASET_MODALITY(dataset) == 'VIDEO': + max_slice_nums = 1 + use_image_id = False + max_inp_length = 2048 * 10 + else: + max_slice_nums = None + use_image_id = True + max_inp_length = 8192 + + max_new_tokens = 2048 + default_kwargs = dict( + max_new_tokens=max_new_tokens, + sampling=False, + num_beams=self.num_beams, + ) + default_kwargs.update(self.kwargs) + + content = [] + + for x in message: + if x['type'] == 'text': + content.append(x['value']) + elif x['type'] == 'image': + image = Image.open(x['value']).convert('RGB') + if not self.use_upsize(dataset): + content.append(image) + else: + img_width, img_height = image.width, image.height + if (img_width * img_height) >= (1344 * 1344): + content.append(image) + else: + ratio = math.sqrt((1344 * 1344) / (img_width * img_height)) + max_img_width = int(img_width * ratio) + new_img_width = random.randint(img_width, max_img_width) + new_img_height = int(new_img_width / img_width * img_height) + resized_image = image.resize((new_img_width, new_img_height)) + content.append(resized_image) + msgs = [{'role': 'user', 'content': content}] + + res = self.model.chat( + image=None, + msgs=msgs, + context=None, + tokenizer=self.tokenizer, + max_inp_length=max_inp_length, + use_image_id=use_image_id, + max_slice_nums=max_slice_nums, + **default_kwargs + ) + + if isinstance(res, tuple) and len(res) > 0: + res = res[0] + + return res diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/mixsense.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/mixsense.py new file mode 100644 index 0000000000000000000000000000000000000000..154c494a630b4ea65a9155263767b6d5a29b7e6a --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/mixsense.py @@ -0,0 +1,46 @@ +import torch +import transformers +from transformers import 
AutoModelForCausalLM, AutoTokenizer +from PIL import Image +import warnings + +from .base import BaseModel +from ..smp import * + + +class LLama3Mixsense(BaseModel): + + INSTALL_REQ = False + INTERLEAVE = False + + def __init__(self, model_path='Zero-Vision/Llama-3-MixSenseV1_1', **kwargs): + assert model_path is not None + transformers.logging.set_verbosity_error() + transformers.logging.disable_progress_bar() + warnings.filterwarnings('ignore') + self.tokenizer = AutoTokenizer.from_pretrained( + model_path, trust_remote_code=True + ) + self.model = AutoModelForCausalLM.from_pretrained( + model_path, trust_remote_code=True + ).to('cuda').eval() + self.kwargs = kwargs + + def generate_inner(self, message, dataset=None): + prompt, image_path = self.message_to_promptimg(message) + input_ids = self.model.text_process(prompt, self.tokenizer).to(device='cuda') + image = Image.open(image_path).convert('RGB') + image_tensor = self.model.image_process([image]).to(dtype=self.model.dtype, device='cuda') + # generate + with torch.inference_mode(): + output_ids = self.model.generate( + input_ids, + images=image_tensor, + max_new_tokens=2048, + use_cache=True, + eos_token_id=[ + self.tokenizer.eos_token_id, + self.tokenizer.convert_tokens_to_ids(['<|eot_id|>'])[0], + ], + ) + return self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[0].strip() diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/mplug_owl2.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/mplug_owl2.py new file mode 100644 index 0000000000000000000000000000000000000000..30d6adc72dbdacc02c45c409d2890ce49988358c --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/mplug_owl2.py @@ -0,0 +1,126 @@ +import sys +import torch +from PIL import Image +from .base import BaseModel +from ..smp import * +from ..dataset import DATASET_TYPE + + +class mPLUG_Owl2(BaseModel): + + INSTALL_REQ = True + INTERLEAVE = False + + def __init__(self, model_path='MAGAer13/mplug-owl2-llama2-7b', **kwargs): + try: + from mplug_owl2.model.builder import load_pretrained_model + from mplug_owl2.mm_utils import get_model_name_from_path + except Exception as e: + logging.critical('Please install mPLUG_Owl2 before using mPLUG_Owl2. ') + raise e + + model_name = get_model_name_from_path(model_path) + tokenizer, model, image_processor, context_len = load_pretrained_model( + model_path, None, model_name, load_8bit=False, load_4bit=False, device='cpu') + + self.model = model.cuda() + self.device = self.model.device + self.image_processor = image_processor + tokenizer.padding_side = 'left' + tokenizer.pad_token_id = tokenizer.eos_token_id + self.tokenizer = tokenizer + self.context_len = context_len + + kwargs_default = dict( + max_new_tokens=512, do_sample=False, num_beams=1, + min_new_tokens=1, length_penalty=1, num_return_sequences=1) + kwargs_default.update(kwargs) + self.kwargs = kwargs_default + warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') + + def use_custom_prompt(self, dataset): + assert dataset is not None + if listinstr(['MMMU'], dataset): + return False + if DATASET_TYPE(dataset) == 'MCQ' or dataset == 'MMVet': + return True + return False + + def build_prompt(self, line, dataset=None): + assert dataset is None or isinstance(dataset, str) + assert self.use_custom_prompt(dataset) + tgt_path = self.dump_image(line, dataset) + question = line['question'] + if dataset == 'MMVet': + prompt = question + '\nAnswer the question directly. 
' + elif DATASET_TYPE(dataset) == 'MCQ': + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = '' + for key, item in options.items(): + options_prompt += f'{key}. {item}\n' + + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + prompt = f'Hint: {hint}\n' if hint is not None else '' + prompt += f'{question}\n' + prompt += ( + f'{options_prompt}\nAnswer with the option’s letter from the given choices directly. ' + if len(options) else 'Answer the question directly. ' + ) + else: + raise NotImplementedError + + message = [dict(type='text', value=prompt)] + message.extend([dict(type='image', value=s) for s in tgt_path]) + return message + + def generate_inner(self, message, dataset=None): + from mplug_owl2.constants import IMAGE_TOKEN_INDEX + from mplug_owl2.mm_utils import process_images, tokenizer_image_token + kwargs = cp.deepcopy(self.kwargs) + if dataset in ['MMVet', 'LLaVABench']: + kwargs['length_penalty'] = 0 + elif dataset is not None and DATASET_TYPE(dataset) == 'VQA': + kwargs['length_penalty'] = 0 + elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ': + kwargs['max_new_tokens'] = 10 + num_images = len([x for x in message if x['type'] == 'image']) + assert num_images >= 0 + prompt_full = 'USER: ' + images = [] + if num_images == 1: + prompt, image = self.message_to_promptimg(message, dataset=dataset) + prompt_full += f'<|image|>{prompt} \nASSISTANT: ' + images.append(image) + else: + for msg in message: + if msg['type'] == 'image': + images.append(msg['value']) + prompt_full += '<|image|>' + elif msg['type'] == 'text': + prompt_full += msg['value'] + prompt_full += '\nASSISTANT: ' + + def preproc_image(fname): + image = Image.open(fname).convert('RGB') + max_edge = max(image.size) + image = image.resize((max_edge, max_edge)) + return image + images = [preproc_image(fname) for fname in images] + image_tensor = process_images(images, self.image_processor) + image_tensor = image_tensor.to(self.device, dtype=torch.float16) + input_ids = tokenizer_image_token( + prompt_full, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors='pt').unsqueeze(0).to(self.device) + + with torch.inference_mode(): + output_ids = self.model.generate( + input_ids=input_ids, + images=image_tensor, + output_hidden_states=True, + use_cache=True, + **kwargs) + answer = self.tokenizer.decode(output_ids[0, input_ids.shape[1]:]).strip() + return answer.split('')[0] diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/nvlm.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/nvlm.py new file mode 100644 index 0000000000000000000000000000000000000000..619341aeb4d79380894da649ef4b694a0a15cf69 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/nvlm.py @@ -0,0 +1,148 @@ +import torch +from transformers import AutoTokenizer, AutoModel +import math +from PIL import Image +import torchvision.transforms as T +from torchvision.transforms.functional import InterpolationMode + +from .base import BaseModel +from ..smp import * +from ..dataset import DATASET_TYPE + + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) + + +def split_model(): + device_map = {} + + num_gpus = torch.cuda.device_count() + rank, world_size = get_rank_and_world_size() + num_gpus = num_gpus // world_size + num_layers = 80 + # Since the first GPU will be used for ViT, treat it as half a GPU. 
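+ # Illustrative arithmetic (assuming world_size == 1 and 8 visible GPUs): num_layers_per_gpu = ceil(80 / 7.5) = 11, + # and GPU 0 is then trimmed to ceil(11 * 0.5) = 6 layers, leaving headroom for the vision tower, mlp1 and the embedding/norm/lm_head modules assigned to rank 0 below.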
+ num_layers_per_gpu = math.ceil(num_layers / (num_gpus - 0.5)) + num_layers_per_gpu = [num_layers_per_gpu] * num_gpus + num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5) + layer_cnt = 0 + + for i, num_layer in enumerate(num_layers_per_gpu): + for j in range(num_layer): + device_map[f'language_model.model.layers.{layer_cnt}'] = rank + i * world_size + layer_cnt += 1 + + device_map['vision_model'] = rank + device_map['mlp1'] = rank + device_map['language_model.model.embed_tokens'] = rank + device_map['language_model.model.norm'] = rank + device_map['language_model.model.rotary_emb'] = rank + device_map['language_model.lm_head'] = rank + device_map[f'language_model.model.layers.{num_layers - 1}'] = rank + return device_map + + +def build_transform(input_size): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose([ + T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD) + ]) + return transform + + +def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): + best_ratio_diff = float('inf') + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + +def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False): + orig_width, orig_height = image.size + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set( + (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if + i * j <= max_num and i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, target_ratios, orig_width, orig_height, image_size) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ( + (i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size + ) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + +def load_image(image_file, input_size=448, max_num=12): + image = Image.open(image_file).convert('RGB') + transform = build_transform(input_size=input_size) + images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num) + pixel_values = [transform(image) for image in images] + pixel_values = torch.stack(pixel_values) + return pixel_values + + +class NVLM(BaseModel): + + INSTALL_REQ = False + 
INTERLEAVE = False + + def __init__(self, model_path='nvidia/NVLM-D-72B', **kwargs): + assert model_path is not None + self.model_path = model_path + self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False) + kwargs_default = dict(max_new_tokens=1024, do_sample=False) + kwargs_default.update(kwargs) + self.kwargs = kwargs_default + + self.model = AutoModel.from_pretrained( + model_path, + torch_dtype=torch.bfloat16, + low_cpu_mem_usage=True, + use_flash_attn=False, + trust_remote_code=True, + device_map=split_model()).eval() + + logging.info(f'Following kwargs received: {self.kwargs}, will use as generation config. ') + torch.cuda.empty_cache() + + def generate_inner(self, message, dataset=None): + prompt, image_path = self.message_to_promptimg(message, dataset=dataset) + pixel_values = load_image(image_path, max_num=6).to(torch.bfloat16).cuda() + response = self.model.chat(self.tokenizer, pixel_values, prompt, self.kwargs) + return response.strip() diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/omchat.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/omchat.py new file mode 100644 index 0000000000000000000000000000000000000000..c352d253e7e3ecad9d2eb38b63e5d90804c3fdf9 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/omchat.py @@ -0,0 +1,159 @@ +import torch +from PIL import Image +import re +from transformers import AutoModel, AutoProcessor + +from .base import BaseModel +from ..smp import * +from ..dataset import DATASET_TYPE + + +class OmChat(BaseModel): + + INSTALL_REQ = True + INTERLEAVE = True + + def __init__(self, model_path='omlab/omchat-v2.0-13B-single-beta_hf', **kwargs): + + # Recommend to install `transformers==4.44.0` + assert model_path is not None + self.model_path = model_path + print(f'load from {self.model_path}') + model = AutoModel.from_pretrained(self.model_path, trust_remote_code=True, torch_dtype=torch.float16) + self.model = model.cuda().eval() + self.kwargs = kwargs + self.processor = AutoProcessor.from_pretrained(self.model_path, trust_remote_code=True) + torch.cuda.empty_cache() + + # system prompt + self.default_system_prompt = 'You are a helpful assistant. Focus on accuracy and reliability in your response.' + self.new1_system_prompt = 'You are a helpful assistant.' + self.new2_system_prompt = ( + 'Read the following question carefully, ' + 'solve it step by step, ' + 'and then output the final answer in the format of ' + "'Answer: single number or single word or phrase'.\n\n" + ) + + # suffix_prompt for MCQ + self.mcq_suffix_prompt_en = 'Please select the correct answer from the options above. \n' + self.mcq_suffix_prompt_cn = '请直接回答选项字母。\n' + # suffix_prompt for Y/N + self.yorn_suffix_prompt = ' Please answer yes or no. Answer the question using a single word or phrase.' 
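+ # Illustrative example of the resulting Y/N prompt (hypothetical question text): + # "Is there a dog in the image? Please answer yes or no. Answer the question using a single word or phrase."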
+
+    def use_custom_prompt(self, dataset):
+        assert dataset is not None
+        if DATASET_TYPE(dataset) == 'MCQ' or DATASET_TYPE(dataset) == 'Y/N':
+            return True
+        return False
+
+    def build_prompt(self, line, dataset=None):
+        assert dataset is None or isinstance(dataset, str)
+        assert self.use_custom_prompt(dataset)
+        tgt_path = self.dump_image(line, dataset)
+
+        if isinstance(line, int):
+            line = self.data.iloc[line]
+
+        question = line['question']
+
+        if DATASET_TYPE(dataset) == 'MCQ':
+            hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
+            options = {
+                cand: line[cand]
+                for cand in string.ascii_uppercase
+                if cand in line and not pd.isna(line[cand])
+            }
+            options_prompt = 'Options:\n'
+            for key, item in options.items():
+                options_prompt += f'{key}. {item}\n'
+
+            prompt = ''
+            if hint is not None:
+                prompt += f'Hint: {hint}\n'
+            prompt += f'Question: {question}\n'
+            if len(options):
+                prompt += options_prompt
+                if not dataset.startswith('MMMU_'):
+                    if not cn_string(prompt):
+                        prompt += self.mcq_suffix_prompt_en
+                    else:
+                        prompt += self.mcq_suffix_prompt_cn
+
+        elif DATASET_TYPE(dataset) == 'Y/N':
+            prompt = question + self.yorn_suffix_prompt
+
+        print(DATASET_TYPE(dataset))
+        message = []
+        if isinstance(tgt_path, list):
+            message.extend([dict(type='image', value=p) for p in tgt_path])
+        else:
+            message = [dict(type='image', value=tgt_path)]
+        message.append(dict(type='text', value=prompt))
+
+        return message
+
+    def message_to_promptimg(self, message, dataset=None):
+        if dataset is None or listinstr(['MMMU'], dataset):
+            prompt = '\n'.join([
+                re.sub(r'<image \d+>', '', x['value'])
+                for x in message
+                if x['type'] == 'text'
+            ])
+            image = [x['value'] for x in message if x['type'] == 'image']
+        else:
+            prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
+            image = [x['value'] for x in message if x['type'] == 'image']
+        return prompt, image
+
+    def generate_inner(self, message, dataset=None):
+
+        def replace_last_dot(input_string):
+            if input_string.endswith('.'):
+                return input_string[:-1]
+            else:
+                return input_string
+
+        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
+        image = [Image.open(img_path).convert('RGB') for img_path in image_path]
+
+        default_kwargs = dict(
+            max_new_tokens=1024,
+            do_sample=False,
+            temperature=0.0,
+            top_p=1)
+
+        if dataset is not None and listinstr(['MathVista_MINI'], dataset):
+            system_prompt = self.new2_system_prompt
+        elif dataset is not None and listinstr(['MMMU_DEV_VAL', 'MMStar'], dataset):
+            system_prompt = self.new1_system_prompt
+        else:
+            system_prompt = self.default_system_prompt
+        inputs = self.processor(text=prompt, system_prompt=system_prompt, images=image, return_tensors='pt').to('cuda')
+        default_kwargs.update(self.kwargs)
+
+        with torch.inference_mode():
+            output_ids = self.model.generate(
+                **inputs,
+                eos_token_id=self.model.generation_config.eos_token_id,
+                **default_kwargs
+            )
+        res = self.processor.tokenizer.decode(output_ids[0, inputs.input_ids.shape[1]:]).strip()
+        if '<|im_end|>' in res:
+            res = res.split('<|im_end|>')[0].strip()
+
+        if dataset != 'MMMU_DEV_VAL':
+            if res.startswith('Answer: '):
+                res = res[len('Answer: '):]
+
+            match = re.search(r'\nThe answer is:(.+)', res)
+            if match:
+                res = match.group(1).strip()
+
+        # for OCRBench
+        doc_match = re.search(r'<doc>(.*?)<\/doc>', res)
+        if doc_match:
+            res = doc_match.group(1).strip()
+        res = replace_last_dot(res)
+
+        return res
diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/omnilmm.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/omnilmm.py
new file mode 100644
index 0000000000000000000000000000000000000000..12971cd7795d0bd0305186582ec6f9d6e0762a43
--- /dev/null
+++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/omnilmm.py
@@ -0,0 +1,183 @@
+import torch
+from PIL import Image
+from transformers import AutoTokenizer
+
+from .base import BaseModel
+from ..smp import *
+from ..dataset import DATASET_TYPE
+
+
+DEFAULT_IMAGE_TOKEN = '<image>'
+DEFAULT_IMAGE_PATCH_TOKEN = '<im_patch>'
+DEFAULT_IM_START_TOKEN = '<im_start>'
+DEFAULT_IM_END_TOKEN = '<im_end>'
+
+
+def init_omni_lmm(model_path):
+    from omnilmm.model.omnilmm import OmniLMMForCausalLM
+    from omnilmm.utils import disable_torch_init
+    from omnilmm.model.utils import build_transform
+
+    torch.backends.cuda.matmul.allow_tf32 = True
+    disable_torch_init()
+    tokenizer = AutoTokenizer.from_pretrained(model_path, model_max_length=2048)
+
+    model = OmniLMMForCausalLM.from_pretrained(
+        model_path, tune_clip=True, torch_dtype=torch.bfloat16, device_map='cpu'
+    )
+    model = model.to(device='cuda', dtype=torch.bfloat16)
+
+    image_processor = build_transform(
+        is_train=False, input_size=model.model.config.image_size, std_mode='OPENAI_CLIP'
+    )
+
+    mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False)
+    assert mm_use_im_start_end
+
+    tokenizer.add_tokens(
+        [DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN],
+        special_tokens=True,
+    )
+
+    vision_config = model.model.vision_config
+    vision_config.im_patch_token = tokenizer.convert_tokens_to_ids(
+        [DEFAULT_IMAGE_PATCH_TOKEN]
+    )[0]
+    vision_config.use_im_start_end = mm_use_im_start_end
+    vision_config.im_start_token, vision_config.im_end_token = (
+        tokenizer.convert_tokens_to_ids([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN])
+    )
+    image_token_len = model.model.config.num_query
+
+    return model, image_processor, image_token_len, tokenizer
+
+
+def expand_question_into_multimodal(
+    question_text, image_token_len, im_st_token, im_ed_token, im_patch_token
+):
+    if '<image>' in question_text[0]['content']:
+        question_text[0]['content'] = question_text[0]['content'].replace(
+            '<image>', im_st_token + im_patch_token * image_token_len + im_ed_token
+        )
+    else:
+        question_text[0]['content'] = (
+            im_st_token
+            + im_patch_token * image_token_len
+            + im_ed_token
+            + '\n'
+            + question_text[0]['content']
+        )
+    return question_text
+
+
+def wrap_question_for_omni_lmm(question, image_token_len, tokenizer):
+    from omnilmm.train.train_utils import omni_preprocess
+
+    question = expand_question_into_multimodal(
+        question,
+        image_token_len,
+        DEFAULT_IM_START_TOKEN,
+        DEFAULT_IM_END_TOKEN,
+        DEFAULT_IMAGE_PATCH_TOKEN,
+    )
+
+    conversation = question
+    data_dict = omni_preprocess(
+        sources=[conversation], tokenizer=tokenizer, generation=True
+    )
+
+    data_dict = dict(input_ids=data_dict['input_ids'][0], labels=data_dict['labels'][0])
+    return data_dict
+
+
+class OmniLMM12B(BaseModel):
+
+    INSTALL_REQ = True
+    INTERLEAVE = False
+
+    def __init__(self, model_path, root, **kwargs) -> None:
+        sys.path.append(root)
+        model, img_processor, image_token_len, tokenizer = init_omni_lmm(model_path)
+        self.model = model
+        self.image_token_len = image_token_len
+        self.image_transform = img_processor
+        self.tokenizer = tokenizer
+        self.model.eval()
+        default_kwargs = dict(
+            max_new_tokens=512,
+            do_sample=False,
+            output_scores=True,
+            return_dict_in_generate=True,
+            repetition_penalty=1.1,
+        )
+        default_kwargs.update(kwargs)
+        self.kwargs = default_kwargs
+        torch.cuda.empty_cache()
+
+    def generate_inner(self, message,
dataset=None): + prompt, image_path = self.message_to_promptimg(message, dataset=dataset) + try: + image = Image.open(image_path).convert('RGB') + except: + logger = get_logger('OmniLMM Inference') + logger.error('Image Decode Error') + return 'Image Decode Error' + + msgs = [dict(role='user', content=prompt)] + input_ids = wrap_question_for_omni_lmm( + msgs, self.image_token_len, self.tokenizer + )['input_ids'] + input_ids = torch.as_tensor(input_ids) + image = self.image_transform(image) + + with torch.inference_mode(): + output = self.model.generate_vllm( + input_ids=input_ids.unsqueeze(0).cuda(), + images=image.unsqueeze(0).half().cuda(), + **self.kwargs, + ) + + response = self.tokenizer.decode( + output.sequences[0], skip_special_tokens=True + ) + response = response.strip() + return response + + def use_custom_prompt(self, dataset): + assert dataset is not None + if DATASET_TYPE(dataset) == 'MCQ': + return True + return False + + def build_prompt(self, line, dataset=None): + assert dataset is None or isinstance(dataset, str) + assert self.use_custom_prompt(dataset) + tgt_path = self.dump_image(line, dataset) + + question = line['question'] + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + options_prompt = 'Options:\n' + for key, item in options.items(): + options_prompt += f'{key}. {item}\n' + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + prompt = '' + if hint is not None: + prompt += f'Hint: {hint}\n' + prompt += f'{question}\n' + if len(options): + prompt += options_prompt + prompt = ( + """ +Study the image carefully and pick the option associated with the correct answer. +Focus solely on selecting the option and avoid including any other content.\n +""" + + prompt + ) + + message = [dict(type='text', value=prompt)] + message.extend([dict(type='image', value=s) for s in tgt_path]) + return message diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/pandagpt.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/pandagpt.py new file mode 100644 index 0000000000000000000000000000000000000000..76bebe88e5aa823d61e6649325e92da187e110b6 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/pandagpt.py @@ -0,0 +1,63 @@ +import sys +import torch +import os.path as osp +import warnings +from .base import BaseModel +from ..smp import * + + +class PandaGPT(BaseModel): + + INSTALL_REQ = True + INTERLEAVE = False + + def __init__(self, name, root=None, **kwargs): + if root is None: + raise ValueError('Please set `root` to PandaGPT code directory, which is cloned from here: ') + + assert name == 'PandaGPT_13B' + self.name = name + sys.path.append(osp.join(root, 'code')) + try: + from model.openllama import OpenLLAMAPEFTModel + except Exception as e: + logging.critical( + 'Please first install PandaGPT and set the root path to use PandaGPT, ' + 'which is cloned from here: https://github.com/yxuansu/PandaGPT. 
' + ) + raise e + + self.args = { + 'model': 'openllama_peft', + 'imagebind_ckpt_path': osp.join(root, 'pretrained_ckpt/imagebind_ckpt'), + 'vicuna_ckpt_path': osp.join(root, 'pretrained_ckpt/vicuna_ckpt/13b_v0'), + 'delta_ckpt_path': osp.join(root, 'pretrained_ckpt/pandagpt_ckpt/13b/pytorch_model.pt'), + 'stage': 2, + 'max_tgt_len': 512, + 'lora_r': 32, + 'lora_alpha': 32, + 'lora_dropout': 0.1, + } + model = OpenLLAMAPEFTModel(**self.args) + delta_ckpt = torch.load(self.args['delta_ckpt_path'], map_location=torch.device('cpu')) + model.load_state_dict(delta_ckpt, strict=False) + torch.cuda.empty_cache() + self.model = model.eval().half().cuda() + kwargs_default = {'top_p': 0.9, 'do_sample': False, 'max_tgt_len': 128, 'temperature': 0.001} + kwargs_default.update(kwargs) + self.kwargs = kwargs_default + warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') + + def generate_inner(self, message, dataset=None): + prompt, image_path = self.message_to_promptimg(message, dataset=dataset) + struct = { + 'prompt': prompt, + 'image_paths': [image_path], + 'audio_paths': [], + 'video_paths': [], + 'thermal_paths': [], + 'modality_embeds': [] + } + struct.update(self.kwargs) + resp = self.model.generate(struct) + return resp diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/parrot.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/parrot.py new file mode 100644 index 0000000000000000000000000000000000000000..518f00b460eec68a2f719ed089ab152e96d5ce6e --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/parrot.py @@ -0,0 +1,216 @@ +import os + +import torch +from PIL import Image +from abc import abstractproperty +from .base import BaseModel +from ..dataset import DATASET_TYPE +from ..smp import * + + +class Parrot(BaseModel): + INSTALL_REQ = False + INTERLEAVE = False + + def __init__(self, model_path='AIDC-AI/Parrot-7B', **kwargs): + try: + from parrot.model.parrot_arch import ParrotMetaForCausalLM + from parrot.utils.constants import DEFAULT_IMAGE_TOKEN, BEGIN_LINE, END_LINE + from parrot.model.conversation_formatter import ConversationFormatter + from parrot.utils.mm_utils import process_images + except Exception as e: + logging.critical('Please install Parrot before using Parrot') + logging.critical('Please install Parrot from https://github.com/AIDC-AI/Parrot') + logging.critical('Using `pip install -e . 
--no-deps` in the Parrot directory') + logging.critical('Recommend to install transformers==4.39.0') + raise e + + self.process_images = process_images + self.ConversationFormatter = ConversationFormatter + self.DEFAULT_IMAGE_TOKEN = DEFAULT_IMAGE_TOKEN + self.BEGIN_LINE = BEGIN_LINE + self.END_LINE = END_LINE + + try: + model_name = 'parrot_qwen2' + model, tokenizer, conversation_formatter = ParrotMetaForCausalLM.build( + model_name, model_path, mm_vision_tower='openai/clip-vit-large-patch14-336' + ) + self.model = model.cuda() + self.vision_tower = self.model.get_vision_tower() + self.tokenizer = tokenizer + self.conversation_formatter = conversation_formatter + self.image_processor = self.model.get_vision_tower().image_processor + except Exception as e: + logging.critical('Error when loading Parrot model:') + raise e + + self.kwargs = dict( + do_sample=False, + num_beams=1, + max_new_tokens=512, + repetition_penalty=None, + use_cache=True, + eos_token_id=self.tokenizer.eos_token_id, + pad_token_id=self.tokenizer.pad_token_id + ) + if int(os.environ.get('LOCAL_RANK', '0')) == 0: + print(f'Following kwargs {self.kwargs} will be used as generation config.') + + self.count = 0 + + def use_custom_prompt(self, dataset): + if DATASET_TYPE(dataset) == 'Y/N' or DATASET_TYPE(dataset) == 'MCQ': + return True + return False + + def build_prompt(self, line, dataset=None): + assert self.use_custom_prompt(dataset) + assert isinstance(dataset, str) + tgt_path = self.dump_image(line, dataset) + + if DATASET_TYPE(dataset) == 'Y/N': + prompt = self.built_yorn_prompt(line, dataset) + elif DATASET_TYPE(dataset) == 'MCQ': + prompt = self.build_multi_choice_prompt(line, dataset) + else: + raise ValueError(f'Invalid dataset type: {DATASET_TYPE(dataset)}') + + message = [dict(type='text', value=prompt)] + message.extend([dict(type='image', value=p) for p in tgt_path]) + return message + + def built_yorn_prompt(self, line, dataset=None): + prompt = line['question'] + previous_suffixs = [' Please answer yes or no.', ' Yes or No', ' Answer in one sentence.'] + for previous_suffix in previous_suffixs: + if prompt.endswith(previous_suffix): + prompt = prompt[:-len(previous_suffix)] + break + prompt += '\n请直接回答Yes或No。请用单个词或短语回答问题。' if cn_string( + prompt) else '\nPlease strictly answer Yes or No. Answer the question using a single word or phrase.' + return prompt + + def build_multi_choice_prompt(self, line, dataset=None): + question = line['question'] + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + if hint is not None: + question = hint + '\n' + question + + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + for key, item in options.items(): + question += f'\n{key}. {item}' + prompt = question + + if len(options): + default_prompt = "\nAnswer with the option's letter from the given choices directly." + if dataset[-3:] == '_cn' or cn_string(prompt): + default_prompt = '\n请直接用给定选项中的选项字母回答。' + elif dataset[-3:] == '_pt': + default_prompt = '\nResponda diretamente com a letra da opção das escolhas dadas.' + elif dataset[-3:] == '_ar': + default_prompt = '\nأجب مباشرةً بحرف الخيار من الاختيارات المعطاة.' + elif dataset[-3:] == '_ru': + default_prompt = '\nОтветьте буквой варианта из предложенных вариантов напрямую.' + elif dataset[-3:] == '_tr': + default_prompt = '\nVerilen seçeneklerden doğrudan seçeneğin harfi ile cevap verin.' 
+ prompt += default_prompt + # prompt += ( + # '\n请直接回答选项字母。' if cn_string(prompt) else + # "\nAnswer with the option's letter from the given choices directly." + # ) + else: + prompt += '\n请用单个词或短语回答问题。' if cn_string( + prompt) else '\nAnswer the question using a single word or phrase.' + + return prompt + + def process_answer_prefix(self, answer, prefixes): + for prefix in prefixes: + if prefix in answer.lower(): + return answer[answer.lower().find(prefix) + len(prefix):] + return answer + + def generate_inner(self, message, dataset=None): + query, image_paths = self.prepare_inputs(message) + images_list = [Image.open(image_path).convert('RGB') for image_path in image_paths] + args = abstractproperty() + args.image_aspect_ratio = 'pad' + image_tensors = self.process_images(images_list, self.image_processor, args).cuda() + prompt, input_ids = self.conversation_formatter.format_query(query) + input_ids = input_ids.unsqueeze(0).cuda() + + with torch.inference_mode(): + kwargs = dict( + images=image_tensors, + ) + kwargs.update(self.kwargs) + output_ids = self.model.generate(input_ids, **kwargs) + + input_token_len = input_ids.shape[1] + n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() + if n_diff_input_output > 0: + print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') + response = self.tokenizer.batch_decode(output_ids[:, input_token_len:], + skip_special_tokens=True)[0].strip(string.whitespace) + answer = response + + if query.endswith("Answer with the option's letter from the given choices directly.") or query.endswith( + '请直接回答选项字母。'): + qtype = 'multiple-choice' + while True: + answer = answer.strip(string.punctuation + string.whitespace) + if len(answer) > 1: + if answer[0] in string.ascii_uppercase and answer[1] in string.whitespace + string.punctuation: + answer = answer[0] + break + elif answer[-1] in string.ascii_uppercase and answer[-2] in string.whitespace + string.punctuation: + answer = answer[-1] + break + elif listinstr(['answer is', 'answer:'], answer.lower()): + answer = self.process_answer_prefix(answer, ['answer is', 'answer:']) + answer = self.process_answer_prefix(answer, ['option']) + else: + break + else: + break + else: + qtype = 'open' + + if self.count % 50 == 0 and int(os.environ.get('LOCAL_RANK', '0')) == 0: + print(f'\n{self.BEGIN_LINE}') + print(f'image_paths: {image_paths}\n') + print(f'prompt: {prompt}\n') + print(f'qtype: {qtype}\n') + print(f'output: {response}\n') + print(f'answer: {answer}\n') + print(f'{self.END_LINE}\n', flush=True) + + self.count += 1 + + return answer + + def prepare_inputs(self, message): + prompt = '' + image_paths = [] + image_count = 0 + text_count = 0 + pure_text = '' + for msg in message: + if msg['type'] == 'text': + text_count += 1 + prompt += msg['value'] + pure_text += msg['value'] + elif msg['type'] == 'image': + image_count += 1 + prompt += self.DEFAULT_IMAGE_TOKEN + image_paths.append(msg['value']) + + if image_count == 1 and text_count == 1: + prompt = self.DEFAULT_IMAGE_TOKEN + '\n' + pure_text + + return prompt, image_paths diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/pixtral.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/pixtral.py new file mode 100644 index 0000000000000000000000000000000000000000..b26fe30aefc51cb6bc664d2fa3f14ec6350e54c8 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/pixtral.py @@ -0,0 +1,70 @@ +import torch +from PIL import Image +from .base import BaseModel +from ..smp import * +import warnings +from huggingface_hub import 
snapshot_download + + +class Pixtral(BaseModel): + + INSTALL_REQ = False + INTERLEAVE = True + + def __init__(self, model_path='mistralai/Pixtral-12B-2409', **kwargs): + + self.model_path = model_path + try: + from mistral_inference.transformer import Transformer + from mistral_common.tokens.tokenizers.mistral import MistralTokenizer + except ImportError as err: + logging.critical('Please install `mistral-inference` and `mistral_common`') + raise err + + if os.path.exists(model_path): + cache_path = model_path + else: + if get_cache_path(model_path) is None: + snapshot_download(repo_id=model_path) + cache_path = get_cache_path(self.model_path) + + self.tokenizer = MistralTokenizer.from_file(f'{cache_path}/tekken.json') + model = Transformer.from_folder(cache_path, device='cpu') + model.cuda() + self.model = model + self.max_tokens = 512 + + def generate_inner(self, message, dataset=None): + try: + from mistral_inference.generate import generate + from mistral_common.protocol.instruct.messages import UserMessage, TextChunk, ImageURLChunk + from mistral_common.protocol.instruct.request import ChatCompletionRequest + except ImportError as err: + logging.critical('Please install `mistral-inference` and `mistral_common`') + raise err + + msg_new = [] + for msg in message: + tp, val = msg['type'], msg['value'] + if tp == 'text': + msg_new.append(TextChunk(text=val)) + elif tp == 'image': + b64 = encode_image_file_to_base64(val) + image_url = f'data:image/jpeg;base64,{b64}' + msg_new.append(ImageURLChunk(image_url=image_url)) + + completion_request = ChatCompletionRequest(messages=[UserMessage(content=msg_new)]) + encoded = self.tokenizer.encode_chat_completion(completion_request) + images = encoded.images + tokens = encoded.tokens + + out_tokens, _ = generate( + [tokens], + self.model, + images=[images], + max_tokens=self.max_tokens, + temperature=0, + eos_id=self.tokenizer.instruct_tokenizer.tokenizer.eos_id) + + result = self.tokenizer.decode(out_tokens[0]) + return result diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/points.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/points.py new file mode 100644 index 0000000000000000000000000000000000000000..d79b1b18eff29d148ada31f0d69e830a91e076e1 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/points.py @@ -0,0 +1,138 @@ +from transformers import AutoModelForCausalLM, AutoTokenizer +from transformers import CLIPImageProcessor +import transformers +from PIL import Image +import torch +from .base import BaseModel +from ..dataset import DATASET_TYPE +from ..smp import cn_string, listinstr +import pandas as pd +import string +from typing import List + + +class POINTS(BaseModel): + """Official implementation of POINTS: Improving Your Vision-language Model with Affordable Strategies # noqa + + Paper link: https://arxiv.org/abs/2409.04828 + POINTS is a vision-language model developed by researchers at WeChat AI. This model represents the inaugural version in our + series of multimodal models, known as WePOINTS. + + Args: + model_path (str): The path or the name (the unique huggingface id) of the model. + """ + + def __init__(self, model_path: str, **kwargs) -> None: + version = transformers.__version__ + use_fast = True + if 'yi' in model_path.lower(): + assert version == '4.38.2', f'The version of transformers for Yi-1.5 should be 4.38.2, but got {version}.' 
# noqa + use_fast = False + self.tokenizer = AutoTokenizer.from_pretrained( + model_path, use_fast=use_fast) + self.model = AutoModelForCausalLM.from_pretrained(model_path, + trust_remote_code=True, # noqa + device_map='cuda' + ).to(torch.bfloat16) + self.image_processor = CLIPImageProcessor.from_pretrained( + model_path) + + def use_custom_prompt(self, dataset: str) -> bool: + """Whether to use custom prompt for the dataset. + + Args: + dataset (str): The name of the dataset. + + Returns: + bool: Whether to use custom prompt for the dataset. + """ + if DATASET_TYPE(dataset) == 'MCQ': + return True + return False + + def build_prompt(self, line: str, dataset: str) -> List[dict]: + """Build prompt for multi-choice dataset. + + Args: + line (str): one line of the dataset. + dataset (str): The name of the dataset. + + Returns: + List[dict]: A list of elements constructed for current line. + """ + assert self.use_custom_prompt(dataset) + assert isinstance(dataset, str) + tgt_path = self.dump_image(line, dataset) + + question = line['question'] + hint = line['hint'] if ( + 'hint' in line and not pd.isna(line['hint'])) else None + if hint is not None: + question = hint + '\n' + question + + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + for key, item in options.items(): + question += f'\n{key}. {item}' + prompt = question + + if len(options): + prompt += ( + '\n请直接回答选项字母。' if cn_string(prompt) else # noqa + "\nAnswer with the option\'s letter from the given choices directly." # noqa + ) + else: + prompt += '\n请直接回答问题。' if cn_string( # noqa + prompt) else '\nAnswer the question directly.' + message = [dict(type='image', value=s) for s in tgt_path] + message.append(dict(type='text', value=prompt)) + return message + + def generate_inner(self, message: List[dict], dataset: str = None) -> str: + """Generate response for the given message. + + Args: + message (List[dict]): A list of elements constructed for + current line. + dataset (str): The name of the dataset. + + Returns: + str: The generated response. + """ + prompt, image_path = self.message_to_promptimg(message) + catty = True # whether to use catty + if dataset == 'HallusionBench': + prompt = prompt + \ + ' Please answer yes or no. Answer the question using a single word or phrase.' # noqa + elif dataset == 'MMVet': + prompt = prompt + ' Answer this question in detail.' 
+ catty = False + else: + # use default setting + pass + + if dataset is None: + max_splits = 8 + elif listinstr(['MMBench', 'OCRBench'], dataset): + max_splits = 12 + else: + max_splits = 8 + + image = Image.open(image_path).convert('RGB') + generation_config = { + 'max_new_tokens': 1024, + 'temperature': 0.0, + 'top_p': 0.0, + 'num_beams': 1, + } + response = self.model.chat(image, + prompt, + self.tokenizer, + self.image_processor, + catty, + generation_config, + max_splits) + return response diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/qh_360vl.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/qh_360vl.py new file mode 100644 index 0000000000000000000000000000000000000000..a52e10302f69dfce8dbf7ea4d0cbfaba3f7b3dfa --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/qh_360vl.py @@ -0,0 +1,61 @@ +import torch +from transformers import AutoModelForCausalLM, AutoTokenizer +import warnings +import os.path as osp +from PIL import Image +from .base import BaseModel +from ..smp import * +from ..dataset import DATASET_TYPE + + +class QH_360VL(BaseModel): + + INSTALL_REQ = False + INTERLEAVE = False + + def __init__(self, model_path='qihoo360/360VL-70B', **kwargs): + assert model_path is not None + self.model_path = model_path + self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True) + self.model = AutoModelForCausalLM.from_pretrained(model_path, + torch_dtype=torch.float16, + low_cpu_mem_usage=True, + device_map='auto', + trust_remote_code=True).eval() + vision_tower = self.model.get_vision_tower() + vision_tower.load_model() + vision_tower.to(device='cuda', dtype=torch.float16) + self.image_processor = vision_tower.image_processor + self.tokenizer.pad_token = self.tokenizer.eos_token + self.kwargs = kwargs + warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. 
') + torch.cuda.empty_cache() + + def generate(self, message, dataset=None): + + prompt, image_path = self.message_to_promptimg(message, dataset=dataset) + print(prompt) + image = Image.open(image_path).convert('RGB') + terminators = [ + self.tokenizer.convert_tokens_to_ids('<|eot_id|>',) + ] + inputs = self.model.build_conversation_input_ids(self.tokenizer, + query=prompt, + image=image, + image_processor=self.image_processor) + input_ids = inputs['input_ids'].to(device='cuda', non_blocking=True) + images = inputs['image'].to(dtype=torch.float16, device='cuda', non_blocking=True) + + output_ids = self.model.generate(input_ids=input_ids, + images=images, + do_sample=False, + num_beams=1, + max_new_tokens=512, + eos_token_id=terminators, + use_cache=True) + + input_token_len = input_ids.shape[1] + outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], skip_special_tokens=True)[0] + response = outputs.strip() + + return response diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/transcore_m.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/transcore_m.py new file mode 100644 index 0000000000000000000000000000000000000000..e235d136812ed6f26fa320f0e11ddfa70ee43161 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/transcore_m.py @@ -0,0 +1,162 @@ +import sys +import torch +from abc import abstractproperty +from .base import BaseModel +from ..smp import * +from ..dataset import DATASET_TYPE +from transformers import AutoTokenizer, BitsAndBytesConfig + + +class TransCoreM(BaseModel): + + INSTALL_REQ = True + INTERLEAVE = False + + def load_pretrained_model(self, model_path, load_8bit=False, load_4bit=False, revision='main'): + from transcorem.model import TransCoreMQWenForCausalLM + from transcorem.constants import DEFAULT_IMAGE_PATCH_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN + import transcorem.config_param as config_param + kwargs = {'revision': revision} + if load_8bit: + kwargs['load_in_8bit'] = True + elif load_4bit: + kwargs['load_in_4bit'] = True + kwargs['quantization_config'] = BitsAndBytesConfig( + load_in_4bit=True, + bnb_4bit_compute_dtype=torch.float16, + bnb_4bit_use_double_quant=True, + bnb_4bit_quant_type='nf4' + ) + else: + kwargs['torch_dtype'] = torch.float16 + + config_param.model_path = model_path + tokenizer = AutoTokenizer.from_pretrained( + model_path, use_fast=False, revision=revision, trust_remote_code=True) + model = TransCoreMQWenForCausalLM.from_pretrained( + model_path, low_cpu_mem_usage=True, trust_remote_code=True, **kwargs) + + image_processor = None + mm_use_im_start_end = getattr(model.config, 'mm_use_im_start_end', False) + mm_use_im_patch_token = getattr(model.config, 'mm_use_im_patch_token', True) + if mm_use_im_patch_token: + tokenizer.add_tokens([DEFAULT_IMAGE_PATCH_TOKEN], special_tokens=True) + if mm_use_im_start_end: + tokenizer.add_tokens([DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN], special_tokens=True) + model.resize_token_embeddings(len(tokenizer)) + + vision_tower = model.get_vision_tower() + if not vision_tower.is_loaded: + vision_tower.load_model() + vision_tower.to(device='cpu', dtype=torch.float16) + image_processor = vision_tower.image_processor + + if hasattr(model.config, 'max_sequence_length'): + context_len = model.config.max_sequence_length + else: + context_len = 2048 + + return tokenizer, model, image_processor, context_len + + def __init__(self, + root=None, + revision='main', + **kwargs): + + self.root = root + self.revision = revision + sys.path.append(root) + + model_path = 'PCIResearch/TransCore-M' + assert 
osp.exists(model_path) or splitlen(model_path) == 2 + self.tokenizer, self.model, self.image_processor, self.context_len = self.load_pretrained_model( + model_path=model_path, revision=revision) + self.model = self.model.cuda() + print('==============conv_mode: transcorem_v1') + self.conv_mode = 'transcorem_v1' + + kwargs_default = dict(do_sample=False, temperature=0.0, max_new_tokens=512, top_p=None, num_beams=1) + kwargs_default.update(kwargs) + self.kwargs = kwargs_default + warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ') + + def use_custom_prompt(self, dataset): + assert dataset is not None + if DATASET_TYPE(dataset) == 'MCQ': + return True + return False + + def build_prompt(self, line, dataset=None): + assert dataset is None or isinstance(dataset, str) + assert self.use_custom_prompt(dataset) + tgt_path = self.dump_image(line, dataset) + + question = line['question'] + hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None + if hint is not None: + question = hint + '\n' + question + + options = { + cand: line[cand] + for cand in string.ascii_uppercase + if cand in line and not pd.isna(line[cand]) + } + for key, item in options.items(): + question += f'\n{key}. {item}' + prompt = question + + if len(options): + prompt += ( + '\n请直接回答选项字母。' if cn_string(prompt) else + "\nAnswer with the option's letter from the given choices directly." + ) + else: + prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.' + message = [dict(type='text', value=prompt)] + message.extend([dict(type='image', value=f) for f in tgt_path]) + return message + + def generate_inner(self, message, dataset=None): + from transcorem.mm_utils import highres_process_images, tokenizer_image_token, KeywordsStoppingCriteria + from transcorem.constants import ( + IMAGE_TOKEN_INDEX, DEFAULT_IMAGE_TOKEN, DEFAULT_IM_START_TOKEN, DEFAULT_IM_END_TOKEN) + from transcorem.conversation import conv_templates, SeparatorStyle + + prompt, image_path = self.message_to_promptimg(message, dataset=dataset) + image = Image.open(image_path).convert('RGB') + args = abstractproperty() + args.image_aspect_ratio = 'pad' + image_patches = highres_process_images(image, self.image_processor, args, base_reso=336) + image_patches = [patch.unsqueeze(0).to('cuda', dtype=torch.float16) for patch in image_patches] + if self.model.config.mm_use_im_start_end: + inp = DEFAULT_IM_START_TOKEN + DEFAULT_IMAGE_TOKEN + DEFAULT_IM_END_TOKEN + '\n' + prompt + else: + inp = DEFAULT_IMAGE_TOKEN + '\n' + prompt + + conv = conv_templates[self.conv_mode].copy() + conv.append_message(conv.roles[0], inp) + conv.append_message(conv.roles[1], None) + prompt_conv = conv.get_prompt() + input_ids = tokenizer_image_token(prompt_conv, self.tokenizer, IMAGE_TOKEN_INDEX, + return_tensors='pt').unsqueeze(0).cuda() + stop_str = conv.sep if conv.sep_style != SeparatorStyle.TWO else conv.sep2 + keywords = [stop_str] + stopping_criteria = KeywordsStoppingCriteria(keywords, self.tokenizer, input_ids) + with torch.inference_mode(): + output_ids = self.model.generate( + input_ids, + images=image_patches, + use_cache=True, + stopping_criteria=[stopping_criteria], + **self.kwargs) + + input_token_len = input_ids.shape[1] + n_diff_input_output = (input_ids != output_ids[:, :input_token_len]).sum().item() + if n_diff_input_output > 0: + print(f'[Warning] {n_diff_input_output} output_ids are not the same as the input_ids') + outputs = self.tokenizer.batch_decode(output_ids[:, input_token_len:], 
skip_special_tokens=True)[0] + outputs = outputs.strip() + if outputs.endswith(stop_str): + outputs = outputs[:-len(stop_str)] + outputs = outputs.strip() + return outputs diff --git a/vlmeval/VLMEvalKit_old/vlmeval/vlm/vintern_chat.py b/vlmeval/VLMEvalKit_old/vlmeval/vlm/vintern_chat.py new file mode 100644 index 0000000000000000000000000000000000000000..f53d82f25cf5e0dfa8b8421c42bcddef0e40f5f0 --- /dev/null +++ b/vlmeval/VLMEvalKit_old/vlmeval/vlm/vintern_chat.py @@ -0,0 +1,395 @@ +import torch +from transformers import AutoTokenizer, AutoConfig, AutoModel, CLIPImageProcessor +import warnings +from PIL import Image +from .base import BaseModel +from ..smp import * +from ..dataset import DATASET_TYPE, DATASET_MODALITY +import pandas as pd +import string +import torch.distributed as dist +import torchvision.transforms as T +import transformers + +from torchvision.transforms.functional import InterpolationMode +import re + + +IMAGENET_MEAN = (0.485, 0.456, 0.406) +IMAGENET_STD = (0.229, 0.224, 0.225) + + +def build_transform(input_size): + MEAN, STD = IMAGENET_MEAN, IMAGENET_STD + transform = T.Compose([ + T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img), + T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC), + T.ToTensor(), + T.Normalize(mean=MEAN, std=STD) + ]) + return transform + + +def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size): + best_ratio_diff = float('inf') + best_ratio = (1, 1) + area = width * height + for ratio in target_ratios: + target_aspect_ratio = ratio[0] / ratio[1] + ratio_diff = abs(aspect_ratio - target_aspect_ratio) + if ratio_diff < best_ratio_diff: + best_ratio_diff = ratio_diff + best_ratio = ratio + elif ratio_diff == best_ratio_diff: + if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]: + best_ratio = ratio + return best_ratio + + +def dynamic_preprocess(image, min_num=1, max_num=4, image_size=448, use_thumbnail=False): + orig_width, orig_height = image.size + aspect_ratio = orig_width / orig_height + + # calculate the existing image aspect ratio + target_ratios = set( + (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if + i * j <= max_num and i * j >= min_num) + target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1]) + + # find the closest aspect ratio to the target + target_aspect_ratio = find_closest_aspect_ratio( + aspect_ratio, target_ratios, orig_width, orig_height, image_size) + + # calculate the target width and height + target_width = image_size * target_aspect_ratio[0] + target_height = image_size * target_aspect_ratio[1] + blocks = target_aspect_ratio[0] * target_aspect_ratio[1] + + # resize the image + resized_img = image.resize((target_width, target_height)) + processed_images = [] + for i in range(blocks): + box = ( + (i % (target_width // image_size)) * image_size, + (i // (target_width // image_size)) * image_size, + ((i % (target_width // image_size)) + 1) * image_size, + ((i // (target_width // image_size)) + 1) * image_size + ) + # split the image + split_img = resized_img.crop(box) + processed_images.append(split_img) + assert len(processed_images) == blocks + if use_thumbnail and len(processed_images) != 1: + thumbnail_img = image.resize((image_size, image_size)) + processed_images.append(thumbnail_img) + return processed_images + + +def load_image(image_file, input_size=448, max_num=6, upscale=False): + image = Image.open(image_file).convert('RGB') + if upscale: + image = 
image.resize((image.width * 2, image.height * 2), Image.BILINEAR)
+    transform = build_transform(input_size=input_size)
+    images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
+    pixel_values = [transform(image) for image in images]
+    pixel_values = torch.stack(pixel_values)
+    return pixel_values
+
+
+class VinternChat(BaseModel):
+
+    INSTALL_REQ = False
+    INTERLEAVE = True
+
+    def __init__(self, model_path='5CD-AI/Vintern-3B-beta', load_in_8bit=False, **kwargs):
+        assert model_path is not None
+        assert version_cmp(transformers.__version__, '4.36.2', 'ge')
+
+        self.model_path = model_path
+        self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
+
+        # Regular expression to match the pattern 'Image' followed by a number, e.g. Image1
+        self.pattern = r'Image(\d+)'
+        # Replacement pattern to insert a hyphen between 'Image' and the number, e.g. Image-1
+        self.replacement = r'Image-\1'
+
+        # Convert InternVL2 response back to dataset format
+        # e.g. Image-1 -> Image1
+
+        # Regular expression to match the pattern 'Image-' followed by a number
+        self.reverse_pattern = r'Image-(\d+)'
+        # Replacement pattern to remove the hyphen (Image-1 -> Image1)
+        self.reverse_replacement = r'Image\1'
+
+        device = torch.cuda.current_device()
+        self.device = device
+        self.model = AutoModel.from_pretrained(
+            model_path,
+            torch_dtype=torch.bfloat16,
+            trust_remote_code=True,
+            load_in_8bit=load_in_8bit).eval()
+        if not load_in_8bit:
+            self.model = self.model.to(device)
+
+        self.image_size = self.model.config.vision_config.image_size
+        kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=3)
+        kwargs_default.update(kwargs)
+        self.kwargs = kwargs_default
+
+        warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
+
+    def use_custom_prompt(self, dataset):
+        if dataset is None:
+            return False
+        if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
+            # No custom prompt for multi-turn datasets
+            return False
+        if DATASET_MODALITY(dataset) == 'VIDEO':
+            # No custom prompt for video benchmarks here
+            return False
+        else:
+            return True
+
+    def build_multi_choice_prompt(self, line, dataset=None):
+        question = line['question']
+        hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
+        if hint is not None:
+            question = hint + '\n' + question
+
+        options = {
+            cand: line[cand]
+            for cand in string.ascii_uppercase
+            if cand in line and not pd.isna(line[cand])
+        }
+        for key, item in options.items():
+            question += f'\n{key}. {item}'
+        prompt = question
+
+        if len(options):
+            prompt += '\n请直接回答选项字母。' if cn_string(
+                prompt) else "\nAnswer with the option's letter from the given choices directly."
+        else:
+            prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
+
+        return prompt
+
+    def build_video_prompt(self, prompt, dataset=None, max_frames=64):
+        for start in range(0, max_frames, 8):
+            images_to_remove = ''.join([f'<Image-{i}>' for i in range(start + 1, start + 9)])
+            prompt = prompt.replace(images_to_remove, '')
+        for i in range(max_frames):
+            prompt = prompt.replace(f'Image-{i + 1}', f'Frame-{i + 1}')
+        if listinstr(['MMBench-Video'], dataset):
+            prompt = prompt.replace('\nAnswer:', '')
+        elif listinstr(['Video-MME'], dataset):
+            prompt = prompt.replace('\nAnswer:', '')
+            prompt += "\nAnswer with the option's letter from the given choices directly."
+        elif listinstr(['MVBench'], dataset):
+            prompt = prompt.replace('Best option:(', '')
+
+        return prompt
+
+    def build_prompt(self, line, dataset=None):
+        assert self.use_custom_prompt(dataset)
+        assert dataset is None or isinstance(dataset, str)
+        tgt_path = self.dump_image(line, dataset)
+
+        kwargs_default = dict(do_sample=False, max_new_tokens=1024, top_p=None, num_beams=3)
+
+        if listinstr(['MTVQA'], dataset):
+            kwargs_default["max_new_tokens"] = 256
+
+        if listinstr(['MMMU_DEV_VAL', 'MMMU_TEST'], dataset):
+            kwargs_default["num_beams"] = 1
+
+        self.kwargs = kwargs_default
+
+        if dataset is not None and DATASET_TYPE(dataset) == 'Y/N':
+            question = line['question']
+            if listinstr(['MME'], dataset):
+                prompt = question + ' Answer the question using a single word or phrase.'
+            elif listinstr(['HallusionBench'], dataset):
+                prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
+            else:
+                prompt = line['question']
+        elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
+            prompt = self.build_multi_choice_prompt(line, dataset)
+        elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
+            question = line['question']
+            if listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse'], dataset):
+                prompt = question
+            elif listinstr(['LLaVABench'], dataset):
+                prompt = question + '\nAnswer this question in detail.'
+            else:
+                prompt = question + '\nAnswer the question using a single word or phrase.'
+        else:
+            prompt = line['question']
+        message = [dict(type='text', value=prompt)]
+        message.extend([dict(type='image', value=s) for s in tgt_path])
+        return message
+
+    def set_max_num(self, dataset):
+        if dataset is None:
+            self.max_num = 1
+            return
+
+        # res_1_datasets = ['MMBench-Video', 'Video-MME', 'MVBench', 'Video']
+        res_12_datasets = ['ChartQA_TEST', 'MMMU_DEV_VAL', 'MMMU_TEST', 'MME-RealWorld',
+                           'MME-RealWorld', 'VCR_EN', 'VCR_ZH']
+        res_18_datasets = ['DocVQA_VAL', 'DocVQA_TEST']
+        res_24_datasets = ['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench', 'HRBench4K', 'HRBench8K']
+        if DATASET_MODALITY(dataset) == 'VIDEO':
+            self.max_num = 1
+        elif listinstr(res_12_datasets, dataset):
+            self.max_num = 6  # 12
+        elif listinstr(res_18_datasets, dataset):
+            self.max_num = 6  # 18
+        elif listinstr(res_24_datasets, dataset):
+            self.max_num = 6  # 24
+        elif listinstr(["MME"], dataset):
+            self.max_num = 6  # 24
+        else:
+            self.max_num = 6  # 6
+
+    def generate_v2(self, message, dataset=None):
+        image_num = len([x for x in message if x['type'] == 'image'])
+        if image_num == 1:
+            prompt = '<image>\n' + '\n'.join([x['value'] for x in message if x['type'] == 'text'])
+        else:
+            prompt, image_idx = '', 1
+            for x in message:
+                if x['type'] == 'text':
+                    prompt += x['value']
+                elif x['type'] == 'image':
+                    prompt += f'<Image-{image_idx}>'
+                    image_idx += 1
+            prompt = '\n'.join([f'Image-{i + 1}: <image>' for i in range(image_num)]) + '\n' + prompt
+
+        if dataset is not None and DATASET_MODALITY(dataset) == 'VIDEO':
+            prompt = self.build_video_prompt(prompt, dataset)
+
+        if image_num > 1:
+            image_path = [x['value'] for x in message if x['type'] == 'image']
+            num_patches_list = []
+            pixel_values_list = []
+            for image_idx, file_name in enumerate(image_path):
+                upscale_flag = image_idx == 0 and dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
+                curr_pixel_values = load_image(
+                    file_name, max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
+                num_patches_list.append(curr_pixel_values.size(0))
+                pixel_values_list.append(curr_pixel_values)
+            pixel_values = torch.cat(pixel_values_list, dim=0)
+        elif image_num == 1:
+            image_path = [x['value'] for x in message if x['type'] == 'image'][0]
+            upscale_flag = dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
+            pixel_values = load_image(
+                image_path, max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
+            num_patches_list = [pixel_values.size(0)]
+        else:
+            pixel_values = None
+            num_patches_list = []
+
+        with torch.no_grad():
+            response = self.model.chat(
+                self.tokenizer,
+                pixel_values=pixel_values,
+                num_patches_list=num_patches_list,
+                question=prompt,
+                generation_config=self.kwargs,
+                verbose=False
+            )
+        return response
+
+    def generate_inner(self, message, dataset=None):
+        self.set_max_num(dataset)
+        return self.generate_v2(message, dataset)
+
+    def build_history(self, message):
+        # Global variables shared with the nested helper below
+        image_path = []
+        image_cnt = 0
+
+        def concat_tilist(tilist):
+            nonlocal image_cnt  # Declare image_cnt as nonlocal to modify it
+            prompt = ''
+            for item in tilist:
+                # Substitute the pattern in the text
+                if item['type'] == 'text':
+                    prompt += re.sub(self.pattern, self.replacement, item['value'])
+                elif item['type'] == 'image':
+                    image_cnt += 1
+                    prompt += '<image>\n'
+                    image_path.append(item['value'])
+            return prompt
+
+        # Only previous messages
+        assert len(message) % 2 == 0
+        history = []
+        for i in range(len(message) // 2):
+            m1, m2 = message[2 * i], message[2 * i + 1]
+            assert m1['role'] == 'user' and m2['role'] == 'assistant'
+            history.append((concat_tilist(m1['content']), concat_tilist(m2['content'])))
+
+        return history, image_path, image_cnt
+
+    def chat_inner_v2(self, message, dataset=None):
+
+        image_cnt = 0
+        if len(message) > 1:
+            history, image_path, image_cnt = self.build_history(message[:-1])
+        else:
+            history, image_path, image_cnt = None, [], 1
+        current_msg = message[-1]
+        question = ''
+
+        # If message is just text in the conversation
+        if len(current_msg['content']) == 1 and current_msg['content'][0]['type'] == 'text':
+            question = current_msg['content'][0]['value']
+            question = re.sub(self.pattern, self.replacement, question)  # Fix pattern as per InternVL
+        else:
+            for msg in current_msg['content']:
+                if msg['type'] == 'text':
+                    question += re.sub(self.pattern, self.replacement, msg['value'])
+                elif msg['type'] == 'image':
+                    image_cnt += 1
+                    question += '<image>\n'
+                    image_path.append(msg['value'])
+
+        if image_cnt > 1:
+            num_patches_list = []
+            pixel_values_list = []
+            for image_idx, file_name in enumerate(image_path):
+                upscale_flag = image_idx == 0 and dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
+                curr_pixel_values = load_image(
+                    file_name, max_num=1, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
+                num_patches_list.append(curr_pixel_values.size(0))
+                pixel_values_list.append(curr_pixel_values)
+            pixel_values = torch.cat(pixel_values_list, dim=0)
+        elif image_cnt == 1:
+            upscale_flag = dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
+            pixel_values = load_image(
+                image_path, max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
+            num_patches_list = [pixel_values.size(0)]
+        else:
+            pixel_values = None
+            num_patches_list = []
+
+        response, history = self.model.chat(
+            self.tokenizer,
+            pixel_values=pixel_values,
+            num_patches_list=num_patches_list,
+            question=question,
+            generation_config=self.kwargs,
+            history=history,
+            return_history=True
+        )
+
+        response = re.sub(self.reverse_pattern, self.reverse_replacement, response)
+
+        return response
+
+    def chat_inner(self, message, dataset=None):
+        self.set_max_num(dataset)
+        kwargs_default = dict(do_sample=False, max_new_tokens=512, top_p=None, num_beams=3)
+        self.kwargs = kwargs_default
+        return self.chat_inner_v2(message, dataset)
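+
+    # chat_inner() is the multi-turn entry point: it shortens max_new_tokens to
+    # 512 (vs. 1024 on the single-turn generate_inner path) before delegating to
+    # chat_inner_v2() above.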