tuandunghcmut committed on
Commit dae9dfe · verified · Parent: 28829b5

Add files using upload-large-folder tool

This view is limited to 50 files because the commit contains too many changes; see the raw diff for the complete change set.
Files changed (50)
  1. VLMEvalKit/vlmeval/utils/__pycache__/matching_util.cpython-310.pyc +0 -0
  2. VLMEvalKit/vlmeval/vlm/__pycache__/__init__.cpython-310.pyc +0 -0
  3. VLMEvalKit/vlmeval/vlm/__pycache__/base.cpython-310.pyc +0 -0
  4. VLMEvalKit/vlmeval/vlm/__pycache__/cogvlm.cpython-310.pyc +0 -0
  5. VLMEvalKit/vlmeval/vlm/__pycache__/eagle_x.cpython-310.pyc +0 -0
  6. VLMEvalKit/vlmeval/vlm/__pycache__/h2ovl_mississippi.cpython-310.pyc +0 -0
  7. VLMEvalKit/vlmeval/vlm/__pycache__/idefics.cpython-310.pyc +0 -0
  8. VLMEvalKit/vlmeval/vlm/__pycache__/instructblip.cpython-310.pyc +0 -0
  9. VLMEvalKit/vlmeval/vlm/__pycache__/kosmos.cpython-310.pyc +0 -0
  10. VLMEvalKit/vlmeval/vlm/__pycache__/llama_vision.cpython-310.pyc +0 -0
  11. VLMEvalKit/vlmeval/vlm/__pycache__/mgm.cpython-310.pyc +0 -0
  12. VLMEvalKit/vlmeval/vlm/__pycache__/minigpt4.cpython-310.pyc +0 -0
  13. VLMEvalKit/vlmeval/vlm/__pycache__/mixsense.cpython-310.pyc +0 -0
  14. VLMEvalKit/vlmeval/vlm/__pycache__/molmo.cpython-310.pyc +0 -0
  15. VLMEvalKit/vlmeval/vlm/__pycache__/monkey.cpython-310.pyc +0 -0
  16. VLMEvalKit/vlmeval/vlm/__pycache__/moondream.cpython-310.pyc +0 -0
  17. VLMEvalKit/vlmeval/vlm/__pycache__/mplug_owl2.cpython-310.pyc +0 -0
  18. VLMEvalKit/vlmeval/vlm/__pycache__/nvlm.cpython-310.pyc +0 -0
  19. VLMEvalKit/vlmeval/vlm/__pycache__/omchat.cpython-310.pyc +0 -0
  20. VLMEvalKit/vlmeval/vlm/__pycache__/open_flamingo.cpython-310.pyc +0 -0
  21. VLMEvalKit/vlmeval/vlm/__pycache__/paligemma.cpython-310.pyc +0 -0
  22. VLMEvalKit/vlmeval/vlm/__pycache__/pandagpt.cpython-310.pyc +0 -0
  23. VLMEvalKit/vlmeval/vlm/__pycache__/parrot.cpython-310.pyc +0 -0
  24. VLMEvalKit/vlmeval/vlm/__pycache__/phi3_vision.cpython-310.pyc +0 -0
  25. VLMEvalKit/vlmeval/vlm/__pycache__/pixtral.cpython-310.pyc +0 -0
  26. VLMEvalKit/vlmeval/vlm/__pycache__/qh_360vl.cpython-310.pyc +0 -0
  27. VLMEvalKit/vlmeval/vlm/__pycache__/sail_vl.cpython-310.pyc +0 -0
  28. VLMEvalKit/vlmeval/vlm/__pycache__/slime.cpython-310.pyc +0 -0
  29. VLMEvalKit/vlmeval/vlm/__pycache__/transcore_m.cpython-310.pyc +0 -0
  30. VLMEvalKit/vlmeval/vlm/__pycache__/vila.cpython-310.pyc +0 -0
  31. VLMEvalKit/vlmeval/vlm/__pycache__/visualglm.cpython-310.pyc +0 -0
  32. VLMEvalKit/vlmeval/vlm/__pycache__/wemm.cpython-310.pyc +0 -0
  33. VLMEvalKit/vlmeval/vlm/__pycache__/yi_vl.cpython-310.pyc +0 -0
  34. VLMEvalKit/vlmeval/vlm/internvl/__init__.py +3 -0
  35. VLMEvalKit/vlmeval/vlm/internvl/__pycache__/__init__.cpython-310.pyc +0 -0
  36. VLMEvalKit/vlmeval/vlm/internvl/__pycache__/internvl_chat.cpython-310.pyc +0 -0
  37. VLMEvalKit/vlmeval/vlm/internvl/__pycache__/utils.cpython-310.pyc +0 -0
  38. VLMEvalKit/vlmeval/vlm/internvl/internvl_chat.py +353 -0
  39. VLMEvalKit/vlmeval/vlm/internvl/utils.py +349 -0
  40. VLMEvalKit/vlmeval/vlm/llava/__init__.py +4 -0
  41. VLMEvalKit/vlmeval/vlm/llava/__pycache__/__init__.cpython-310.pyc +0 -0
  42. VLMEvalKit/vlmeval/vlm/llava/__pycache__/llava.cpython-310.pyc +0 -0
  43. VLMEvalKit/vlmeval/vlm/llava/__pycache__/llava_xtuner.cpython-310.pyc +0 -0
  44. VLMEvalKit/vlmeval/vlm/llava/llava.py +897 -0
  45. VLMEvalKit/vlmeval/vlm/llava/llava_xtuner.py +239 -0
  46. VLMEvalKit/vlmeval/vlm/misc/blip2_instruct_vicuna13b.yaml +43 -0
  47. VLMEvalKit/vlmeval/vlm/misc/blip2_instruct_vicuna7b.yaml +43 -0
  48. VLMEvalKit/vlmeval/vlm/misc/minigpt4_13b_eval.yaml +37 -0
  49. VLMEvalKit/vlmeval/vlm/misc/minigpt4_7b_eval.yaml +38 -0
  50. VLMEvalKit/vlmeval/vlm/misc/minigptv2_eval.yaml +36 -0
VLMEvalKit/vlmeval/utils/__pycache__/matching_util.cpython-310.pyc ADDED
Binary file (2.01 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (3.17 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/base.cpython-310.pyc ADDED
Binary file (7.59 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/cogvlm.cpython-310.pyc ADDED
Binary file (4.6 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/eagle_x.cpython-310.pyc ADDED
Binary file (6.17 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/h2ovl_mississippi.cpython-310.pyc ADDED
Binary file (4.67 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/idefics.cpython-310.pyc ADDED
Binary file (8.54 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/instructblip.cpython-310.pyc ADDED
Binary file (2.13 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/kosmos.cpython-310.pyc ADDED
Binary file (4.13 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/llama_vision.cpython-310.pyc ADDED
Binary file (7.55 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/mgm.cpython-310.pyc ADDED
Binary file (4.75 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/minigpt4.cpython-310.pyc ADDED
Binary file (2.91 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/mixsense.cpython-310.pyc ADDED
Binary file (1.81 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/molmo.cpython-310.pyc ADDED
Binary file (2.31 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/monkey.cpython-310.pyc ADDED
Binary file (3.17 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/moondream.cpython-310.pyc ADDED
Binary file (5.25 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/mplug_owl2.cpython-310.pyc ADDED
Binary file (4.89 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/nvlm.cpython-310.pyc ADDED
Binary file (5.07 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/omchat.cpython-310.pyc ADDED
Binary file (5.51 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/open_flamingo.cpython-310.pyc ADDED
Binary file (3.11 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/paligemma.cpython-310.pyc ADDED
Binary file (1.76 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/pandagpt.cpython-310.pyc ADDED
Binary file (2.35 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/parrot.cpython-310.pyc ADDED
Binary file (7.52 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/phi3_vision.cpython-310.pyc ADDED
Binary file (4.48 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/pixtral.cpython-310.pyc ADDED
Binary file (2.45 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/qh_360vl.cpython-310.pyc ADDED
Binary file (2.24 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/sail_vl.cpython-310.pyc ADDED
Binary file (15.4 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/slime.cpython-310.pyc ADDED
Binary file (2.73 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/transcore_m.cpython-310.pyc ADDED
Binary file (6.05 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/vila.cpython-310.pyc ADDED
Binary file (3.77 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/visualglm.cpython-310.pyc ADDED
Binary file (1.47 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/wemm.cpython-310.pyc ADDED
Binary file (2.81 kB).
 
VLMEvalKit/vlmeval/vlm/__pycache__/yi_vl.cpython-310.pyc ADDED
Binary file (4.7 kB).
 
VLMEvalKit/vlmeval/vlm/internvl/__init__.py ADDED
@@ -0,0 +1,3 @@
1
+ from .internvl_chat import InternVLChat
2
+
3
+ __all__ = ['InternVLChat']
VLMEvalKit/vlmeval/vlm/internvl/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (244 Bytes).
 
VLMEvalKit/vlmeval/vlm/internvl/__pycache__/internvl_chat.cpython-310.pyc ADDED
Binary file (10.4 kB).
 
VLMEvalKit/vlmeval/vlm/internvl/__pycache__/utils.cpython-310.pyc ADDED
Binary file (11.4 kB).
 
VLMEvalKit/vlmeval/vlm/internvl/internvl_chat.py ADDED
@@ -0,0 +1,353 @@
1
+ import math
2
+ import pandas as pd
3
+ import random
4
+ import re
5
+ import string
6
+ import torch
7
+ import torch.distributed as dist
8
+ import torchvision.transforms as T
9
+ import transformers
10
+ import warnings
11
+ from PIL import Image
12
+ from torchvision.transforms.functional import InterpolationMode
13
+ from transformers import AutoTokenizer, AutoConfig, AutoModel, CLIPImageProcessor
14
+
15
+ from .utils import (build_multi_choice_prompt,
16
+ build_video_prompt,
17
+ build_mpo_prompt,
18
+ build_mcq_cot_prompt,
19
+ build_qa_cot_prompt,
20
+ mpo_post_processing,
21
+ reorganize_prompt,
22
+ split_model, load_image)
23
+ from .utils import mpo_prompt_with_final_answer, mpo_prompt_without_final_answer
24
+ from ..base import BaseModel
25
+ from ...dataset import DATASET_TYPE, DATASET_MODALITY
26
+ from ...smp import *
27
+
28
+
29
+ class InternVLChat(BaseModel):
30
+ INSTALL_REQ = False
31
+ INTERLEAVE = True
32
+
33
+ def __init__(self,
34
+ model_path='OpenGVLab/InternVL-Chat-V1-5',
35
+ load_in_8bit=False,
36
+ use_mpo_prompt=False,
37
+ version='V1.0',
38
+ **kwargs):
39
+
40
+ assert model_path is not None
41
+ assert version_cmp(transformers.__version__, '4.37.2', 'ge')
42
+
43
+ self.use_mpo_prompt = use_mpo_prompt
44
+ self.model_path = model_path
45
+ self.tokenizer = AutoTokenizer.from_pretrained(model_path, trust_remote_code=True, use_fast=False)
46
+
47
+ # Regular expression to match the pattern 'Image' followed by a number, e.g. Image1
48
+ self.pattern = r'Image(\d+)'
49
+ # Replacement pattern to insert a hyphen between 'Image' and the number, e.g. Image-1
50
+ self.replacement = r'Image-\1'
51
+
52
+ # Convert InternVL2 response to dataset format
53
+ # e.g. Image-1 -> Image1
54
+
55
+ # Regular expression to match the pattern 'Image-' followed by a number
56
+ self.reverse_pattern = r'Image-(\d+)'
57
+ # Replacement pattern to remove the hyphen (Image-1 -> Image1)
58
+ self.reverse_replacement = r'Image\1'
59
+
60
+ if auto_split_flag():
61
+ device_map, visible_devices = split_model(model_path=model_path)
62
+ self.device = visible_devices[0]
63
+ self.model = AutoModel.from_pretrained(
64
+ model_path,
65
+ torch_dtype=torch.bfloat16,
66
+ load_in_8bit=load_in_8bit,
67
+ trust_remote_code=True,
68
+ low_cpu_mem_usage=True,
69
+ device_map=device_map).eval()
70
+ else:
71
+ self.model = AutoModel.from_pretrained(
72
+ model_path,
73
+ torch_dtype=torch.bfloat16,
74
+ load_in_8bit=load_in_8bit,
75
+ trust_remote_code=True,
76
+ low_cpu_mem_usage=True).eval().cuda()
77
+ self.device = 'cuda'
78
+
79
+ self.image_size = self.model.config.vision_config.image_size
80
+ self.version = version
81
+ kwargs_default = dict(do_sample=False, max_new_tokens=4096, top_p=None)
82
+ kwargs_default.update(kwargs)
83
+ self.kwargs = kwargs_default
84
+
85
+ warnings.warn(f'Following kwargs received: {self.kwargs}, will use as generation config. ')
86
+
87
+ def use_custom_prompt(self, dataset):
88
+ assert dataset is not None
89
+ if listinstr(['MMDU', 'MME-RealWorld', 'MME-RealWorld-CN'], dataset):
90
+ # Multi-turn datasets do not use a custom prompt
91
+ return False
92
+ if DATASET_MODALITY(dataset) == 'VIDEO':
93
+ # Video benchmarks do not use a custom prompt here
94
+ return False
95
+ else:
96
+ return True
97
+
98
+ def build_prompt(self, line, dataset=None):
99
+ assert self.use_custom_prompt(dataset)
100
+ assert dataset is None or isinstance(dataset, str)
101
+ tgt_path = self.dump_image(line, dataset)
102
+
103
+ if dataset is not None and DATASET_TYPE(dataset) == 'Y/N':
104
+ question = line['question']
105
+ if listinstr(['MME'], dataset):
106
+ prompt = question + ' Answer the question using a single word or phrase.'
107
+ elif listinstr(['HallusionBench', 'AMBER'], dataset):
108
+ prompt = question + ' Please answer yes or no. Answer the question using a single word or phrase.'
109
+ else:
110
+ prompt = question
111
+ elif dataset is not None and DATASET_TYPE(dataset) == 'MCQ':
112
+ prompt = build_multi_choice_prompt(line, dataset)
113
+ if os.getenv('USE_COT') == '1':
114
+ prompt = build_mcq_cot_prompt(line, prompt)
115
+ elif dataset is not None and DATASET_TYPE(dataset) == 'VQA':
116
+ question = line['question']
117
+ if listinstr(['LLaVABench', 'WildVision'], dataset):
118
+ prompt = question + '\nAnswer this question in detail.'
119
+ elif listinstr(['OCRVQA', 'TextVQA', 'ChartQA', 'DocVQA', 'InfoVQA', 'OCRBench',
120
+ 'DUDE', 'SLIDEVQA', 'GQA', 'MMLongBench_DOC'], dataset):
121
+ prompt = question + '\nAnswer the question using a single word or phrase.'
122
+ elif listinstr(['MathVista', 'MathVision', 'VCR', 'MTVQA', 'MMVet', 'MathVerse',
123
+ 'MMDU', 'CRPE', 'MIA-Bench', 'MM-Math', 'DynaMath', 'QSpatial'], dataset):
124
+ prompt = question
125
+ if os.getenv('USE_COT') == '1':
126
+ prompt = build_qa_cot_prompt(line, prompt)
127
+ else:
128
+ prompt = question + '\nAnswer the question using a single word or phrase.'
129
+ else:
130
+ # VQA_ex_prompt: OlympiadBench, VizWiz
131
+ prompt = line['question']
132
+ if os.getenv('USE_COT') == '1':
133
+ prompt = build_qa_cot_prompt(line, prompt)
134
+
135
+ message = [dict(type='text', value=prompt)]
136
+ message.extend([dict(type='image', value=s) for s in tgt_path])
137
+
138
+ if self.use_mpo_prompt:
139
+ message = build_mpo_prompt(message, line, dataset)
140
+ return message
141
+
142
+ def set_max_num(self, dataset):
143
+ # The total limit on the number of images processed, set to avoid Out-of-Memory issues.
144
+ self.total_max_num = 64
145
+ if dataset is None:
146
+ self.max_num = 6
147
+ return None
148
+ res_12_datasets = ['ChartQA_TEST', 'MMMU_DEV_VAL', 'MMMU_TEST', 'MME-RealWorld',
149
+ 'VCR_EN', 'VCR_ZH', 'OCRVQA']
150
+ res_18_datasets = ['DocVQA_VAL', 'DocVQA_TEST', 'DUDE', 'MMLongBench_DOC', 'SLIDEVQA']
151
+ res_24_datasets = ['InfoVQA_VAL', 'InfoVQA_TEST', 'OCRBench', 'HRBench4K', 'HRBench8K']
152
+ if DATASET_MODALITY(dataset) == 'VIDEO':
153
+ self.max_num = 1
154
+ elif listinstr(res_12_datasets, dataset):
155
+ self.max_num = 12
156
+ elif listinstr(res_18_datasets, dataset):
157
+ self.max_num = 18
158
+ elif listinstr(res_24_datasets, dataset):
159
+ self.max_num = 24
160
+ else:
161
+ self.max_num = 6
162
+
163
+ def generate_v1_2(self, message, dataset=None):
164
+ self.INTERLEAVE = False
165
+ prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
166
+ image = Image.open(image_path).convert('RGB')
167
+ image = image.resize((self.image_size, self.image_size))
168
+ image_processor = CLIPImageProcessor.from_pretrained(self.model_path)
169
+ pixel_values = image_processor(images=image, return_tensors='pt').pixel_values
170
+ pixel_values = pixel_values.to(torch.bfloat16).to(self.device)
171
+ with torch.no_grad():
172
+ response = self.model.chat(self.tokenizer, pixel_values=pixel_values,
173
+ question=prompt, generation_config=self.kwargs)
174
+ return response
175
+
176
+ def generate_v1_5(self, message, dataset=None):
177
+ image_num = len([x for x in message if x['type'] == 'image'])
178
+ max_num = max(1, min(self.max_num, self.total_max_num // max(image_num, 1)))  # avoid div-by-zero for text-only messages
179
+ prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
180
+
181
+ if DATASET_MODALITY(dataset) == 'VIDEO':
182
+ prompt = build_video_prompt(prompt, dataset)
183
+
184
+ if image_num > 1:
185
+ image_path = [x['value'] for x in message if x['type'] == 'image']
186
+ pixel_values_list = []
187
+ for file_name in image_path:
188
+ pixel_values_list.append(load_image(file_name, max_num=max_num).to(self.device).to(torch.bfloat16))
189
+ pixel_values = torch.cat(pixel_values_list, dim=0)
190
+ elif image_num == 1:
191
+ image_path = [x['value'] for x in message if x['type'] == 'image'][0]
192
+ pixel_values = load_image(image_path, max_num=max_num).to(self.device).to(torch.bfloat16)
193
+ else:
194
+ pixel_values = None
195
+ with torch.no_grad():
196
+ response = self.model.chat(
197
+ self.tokenizer,
198
+ pixel_values=pixel_values,
199
+ question=prompt,
200
+ generation_config=self.kwargs,
201
+ verbose=True)
202
+ return response
203
+
204
+ def generate_v2(self, message, dataset=None):
205
+ image_num = len([x for x in message if x['type'] == 'image'])
206
+ max_num = max(1, min(self.max_num, self.total_max_num // max(image_num, 1)))  # avoid div-by-zero for text-only messages
207
+ prompt = reorganize_prompt(message, image_num, dataset=dataset)
208
+
209
+ if dataset is not None and DATASET_MODALITY(dataset) == 'VIDEO':
210
+ prompt = build_video_prompt(prompt, dataset)
211
+
212
+ if image_num > 1:
213
+ image_path = [x['value'] for x in message if x['type'] == 'image']
214
+ num_patches_list, pixel_values_list = [], []
215
+ for image_idx, file_name in enumerate(image_path):
216
+ upscale_flag = image_idx == 0 and dataset is not None and listinstr(['MMMU'], dataset)
217
+ curr_pixel_values = load_image(
218
+ file_name, max_num=max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
219
+ num_patches_list.append(curr_pixel_values.size(0))
220
+ pixel_values_list.append(curr_pixel_values)
221
+ pixel_values = torch.cat(pixel_values_list, dim=0)
222
+ elif image_num == 1:
223
+ image_path = [x['value'] for x in message if x['type'] == 'image'][0]
224
+ upscale_flag = dataset is not None and listinstr(['MMMU'], dataset)
225
+ pixel_values = load_image(
226
+ image_path, max_num=max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
227
+ num_patches_list = [pixel_values.size(0)]
228
+ else:
229
+ pixel_values = None
230
+ num_patches_list = []
231
+
232
+ with torch.no_grad():
233
+ response = self.model.chat(
234
+ self.tokenizer,
235
+ pixel_values=pixel_values,
236
+ num_patches_list=num_patches_list,
237
+ question=prompt,
238
+ generation_config=self.kwargs,
239
+ verbose=True
240
+ )
241
+
242
+ if self.use_mpo_prompt:
243
+ response = mpo_post_processing(response, dataset)
244
+ return response
245
+
246
+ def generate_inner(self, message, dataset=None):
247
+ self.set_max_num(dataset)
248
+ print(f'InternVL model version: {self.version}')
249
+ if self.version in ['V1.1', 'V1.2']:
250
+ return self.generate_v1_2(message, dataset)
251
+ elif self.version == 'V1.5':
252
+ return self.generate_v1_5(message, dataset)
253
+ elif self.version == 'V2.0':
254
+ return self.generate_v2(message, dataset)
255
+ else:
256
+ raise ValueError(f'Unsupported version: {self.version}')
257
+
258
+ def build_history(self, message):
259
+ # State shared with the nested helper below via closure
260
+ image_path = []
261
+ image_cnt = 0
262
+
263
+ def concat_tilist(tilist):
264
+ nonlocal image_cnt # Declare image_cnt as nonlocal to modify it
265
+ prompt = ''
266
+ for item in tilist:
267
+ # Substitute the pattern in the text
268
+ if item['type'] == 'text':
269
+ prompt += re.sub(self.pattern, self.replacement, item['value'])
270
+ elif item['type'] == 'image':
271
+ image_cnt += 1
272
+ prompt += '<image>\n'
273
+ image_path.append(item['value'])
274
+ return prompt
275
+
276
+ # Only previous messages
277
+ assert len(message) % 2 == 0
278
+ history = []
279
+ for i in range(len(message) // 2):
280
+ m1, m2 = message[2 * i], message[2 * i + 1]
281
+ assert m1['role'] == 'user' and m2['role'] == 'assistant'
282
+ history.append((concat_tilist(m1['content']), concat_tilist(m2['content'])))
283
+
284
+ return history, image_path, image_cnt
285
+
286
+ def chat_inner_v2(self, message, dataset=None):
287
+
288
+ if len(message) > 1:
289
+ history, image_path, image_cnt = self.build_history(message[:-1])
290
+ else:
291
+ history, image_path, image_cnt = None, [], 1
292
+ current_msg = message[-1]
293
+ question = ''
294
+
295
+ # If message is just text in the conversation
296
+ if len(current_msg['content']) == 1 and current_msg['content'][0]['type'] == 'text':
297
+ question = current_msg['content'][0]['value']
298
+ question = re.sub(self.pattern, self.replacement, question) # Fix pattern as per InternVL
299
+ else:
300
+ for msg in current_msg['content']:
301
+ if msg['type'] == 'text':
302
+ question += re.sub(self.pattern, self.replacement, msg['value'])
303
+ elif msg['type'] == 'image':
304
+ image_cnt += 1
305
+ question += '<image>\n'
306
+ image_path.append(msg['value'])
307
+
308
+ if image_cnt > 1:
309
+ num_patches_list = []
310
+ pixel_values_list = []
311
+ for image_idx, file_name in enumerate(image_path):
312
+ upscale_flag = image_idx == 0 and dataset is not None and listinstr(['MMMU_DEV_VAL'], dataset)
313
+ curr_pixel_values = load_image(
314
+ file_name, max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
315
+ num_patches_list.append(curr_pixel_values.size(0))
316
+ pixel_values_list.append(curr_pixel_values)
317
+ pixel_values = torch.cat(pixel_values_list, dim=0)
318
+ elif image_cnt == 1:
319
+ upscale_flag = listinstr(['MMMU_DEV_VAL'], dataset)
320
+ pixel_values = load_image(
321
+ image_path, max_num=self.max_num, upscale=upscale_flag).to(self.device).to(torch.bfloat16)
322
+ num_patches_list = [pixel_values.size(0)]
323
+ else:
324
+ pixel_values = None
325
+ num_patches_list = []
326
+
327
+ response, history = self.model.chat(
328
+ self.tokenizer,
329
+ pixel_values=pixel_values,
330
+ num_patches_list=num_patches_list,
331
+ question=question,
332
+ generation_config=self.kwargs,
333
+ history=history,
334
+ return_history=True
335
+ )
336
+
337
+ response = re.sub(self.reverse_pattern, self.reverse_replacement, response)
338
+
339
+ return response
340
+
341
+ def chat_inner(self, message, dataset=None):
342
+ self.set_max_num(dataset)
343
+
344
+ if self.version in ['V1.1', 'V1.2']:
345
+ raise ValueError(f'Unsupported version for Multi-Turn: {self.version}')
346
+ elif self.version == 'V1.5':
347
+ raise ValueError(f'Unsupported version for Multi-Turn: {self.version}')
348
+ elif self.version == 'V2.0':
349
+ kwargs_default = dict(do_sample=False, max_new_tokens=512, top_p=None, num_beams=1)
350
+ self.kwargs = kwargs_default
351
+ return self.chat_inner_v2(message, dataset)
352
+ else:
353
+ raise ValueError(f'Unsupported version for Multi-Turn: {self.version}')
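A minimal usage sketch of the wrapper defined above, assuming the VLMEvalKit package from this commit is installed; the checkpoint name and image path are placeholders rather than anything pinned by the commit. Generation kwargs passed to the constructor are stored in self.kwargs and forwarded to model.chat() as the generation config.
from vlmeval.vlm.internvl import InternVLChat
# Placeholder checkpoint and image; any InternVL checkpoint with remote code should fit this interface.
model = InternVLChat(model_path='OpenGVLab/InternVL2-8B', version='V2.0', max_new_tokens=256)
message = [
    dict(type='image', value='demo.jpg'),           # interleaved image/text items, as generate_v2 expects
    dict(type='text', value='Describe the image in one sentence.'),
]
print(model.generate_inner(message, dataset=None))  # version 'V2.0' dispatches to generate_v2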
VLMEvalKit/vlmeval/vlm/internvl/utils.py ADDED
@@ -0,0 +1,349 @@
1
+ import math
2
+ import pandas as pd
3
+ import random
4
+ import re
5
+ import string
6
+ import torch
7
+ import torch.distributed as dist
8
+ import torchvision.transforms as T
9
+ import transformers
10
+ import warnings
11
+ from PIL import Image
12
+ from torchvision.transforms.functional import InterpolationMode
13
+ from transformers import AutoTokenizer, AutoConfig, AutoModel, CLIPImageProcessor
14
+
15
+ from ..base import BaseModel
16
+ from ...dataset import DATASET_TYPE, DATASET_MODALITY
17
+ from ...smp import *
18
+
19
+ IMAGENET_MEAN = (0.485, 0.456, 0.406)
20
+ IMAGENET_STD = (0.229, 0.224, 0.225)
21
+
22
+
23
+ def build_transform(input_size):
24
+ MEAN, STD = IMAGENET_MEAN, IMAGENET_STD
25
+ transform = T.Compose([
26
+ T.Lambda(lambda img: img.convert('RGB') if img.mode != 'RGB' else img),
27
+ T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
28
+ T.ToTensor(),
29
+ T.Normalize(mean=MEAN, std=STD)
30
+ ])
31
+ return transform
32
+
33
+
34
+ def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
35
+ best_ratio_diff = float('inf')
36
+ best_ratio = (1, 1)
37
+ area = width * height
38
+ for ratio in target_ratios:
39
+ target_aspect_ratio = ratio[0] / ratio[1]
40
+ ratio_diff = abs(aspect_ratio - target_aspect_ratio)
41
+ if ratio_diff < best_ratio_diff:
42
+ best_ratio_diff = ratio_diff
43
+ best_ratio = ratio
44
+ elif ratio_diff == best_ratio_diff:
45
+ if area > 0.5 * image_size * image_size * ratio[0] * ratio[1]:
46
+ best_ratio = ratio
47
+ return best_ratio
48
+
49
+
50
+ def dynamic_preprocess(image, min_num=1, max_num=6, image_size=448, use_thumbnail=False):
51
+ orig_width, orig_height = image.size
52
+ aspect_ratio = orig_width / orig_height
53
+
54
+ # enumerate candidate tile grids (cols, rows) whose tile count lies in [min_num, max_num]
55
+ target_ratios = set(
56
+ (i, j) for n in range(min_num, max_num + 1) for i in range(1, n + 1) for j in range(1, n + 1) if
57
+ i * j <= max_num and i * j >= min_num)
58
+ target_ratios = sorted(target_ratios, key=lambda x: x[0] * x[1])
59
+
60
+ # find the closest aspect ratio to the target
61
+ target_aspect_ratio = find_closest_aspect_ratio(
62
+ aspect_ratio, target_ratios, orig_width, orig_height, image_size)
63
+
64
+ # calculate the target width and height
65
+ target_width = image_size * target_aspect_ratio[0]
66
+ target_height = image_size * target_aspect_ratio[1]
67
+ blocks = target_aspect_ratio[0] * target_aspect_ratio[1]
68
+
69
+ # resize the image
70
+ resized_img = image.resize((target_width, target_height))
71
+ processed_images = []
72
+ for i in range(blocks):
73
+ box = (
74
+ (i % (target_width // image_size)) * image_size,
75
+ (i // (target_width // image_size)) * image_size,
76
+ ((i % (target_width // image_size)) + 1) * image_size,
77
+ ((i // (target_width // image_size)) + 1) * image_size
78
+ )
79
+ # split the image
80
+ split_img = resized_img.crop(box)
81
+ processed_images.append(split_img)
82
+ assert len(processed_images) == blocks
83
+ if use_thumbnail and len(processed_images) != 1:
84
+ thumbnail_img = image.resize((image_size, image_size))
85
+ processed_images.append(thumbnail_img)
86
+ return processed_images
87
+
88
+
89
+ def load_image(image_file, input_size=448, max_num=6, upscale=False):
90
+ image = Image.open(image_file).convert('RGB')
91
+ if upscale:
92
+ image = image.resize((image.width * 2, image.height * 2), Image.BILINEAR)
93
+ transform = build_transform(input_size=input_size)
94
+ images = dynamic_preprocess(image, image_size=input_size, use_thumbnail=True, max_num=max_num)
95
+ pixel_values = [transform(image) for image in images]
96
+ pixel_values = torch.stack(pixel_values)
97
+ return pixel_values
98
+
99
+
100
+ def get_local_rank_and_local_world_size():
101
+ if not dist.is_available():
102
+ return 0, 1
103
+ if not dist.is_initialized():
104
+ return 0, 1
105
+
106
+ if 'SLURM_LOCALID' in os.environ:
107
+ local_rank = int(os.environ['SLURM_LOCALID'])
108
+ local_world_size = int(os.environ['SLURM_NTASKS_PER_NODE'])
109
+ return local_rank, local_world_size
110
+
111
+ if 'LOCAL_RANK' in os.environ and 'LOCAL_WORLD_SIZE' in os.environ:
112
+ return int(os.environ['LOCAL_RANK']), int(os.environ['LOCAL_WORLD_SIZE'])
113
+
114
+ raise NotImplementedError(
115
+ "Fail to get local_rank and local_world_size! "
116
+ "Please ensure that you set the environment variable "
117
+ "`LOCAL_RANK` and `LOCAL_WORLD_SIZE`"
118
+ )
119
+
120
+
121
+ def split_model(model_path):
122
+ num_gpus_per_node = 8
123
+ rank, world_size = get_rank_and_world_size()
124
+ try:
125
+ local_rank, local_world_size = get_local_rank_and_local_world_size()
126
+ except Exception:
127
+ local_rank = rank
128
+
129
+ if 'GPUS_PER_PROCESS' in os.environ:
130
+ gpus_per_process = int(os.environ['GPUS_PER_PROCESS'])
131
+ else:
132
+ gpus_per_process = 8 # default to use 8 GPUs for one model
133
+
134
+ start_gpu = local_rank * gpus_per_process
135
+ end_gpu = start_gpu + gpus_per_process
136
+
137
+ assert end_gpu <= num_gpus_per_node, f"Process {local_rank} tries to access GPU {end_gpu}, " \
138
+ f"but only {num_gpus_per_node} GPUs are available per node."
139
+
140
+ visible_devices = list(range(start_gpu, end_gpu))
141
+
142
+ device_map = {}
143
+ config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
144
+
145
+ num_gpus_for_vit = 0.5
146
+ num_layers = config.llm_config.num_hidden_layers
147
+ num_layers_per_gpu = math.ceil(num_layers / (len(visible_devices) - num_gpus_for_vit))
148
+ num_layers_per_gpu = [num_layers_per_gpu] * len(visible_devices)
149
+ num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
150
+
151
+ layer_cnt = 0
152
+ for i, num_layer in enumerate(num_layers_per_gpu):
153
+ for j in range(num_layer):
154
+ device_map[f'language_model.model.layers.{layer_cnt}'] = visible_devices[i]
155
+ layer_cnt += 1
156
+ device_map['vision_model'] = visible_devices[0]
157
+ device_map['mlp1'] = visible_devices[0]
158
+ device_map['language_model.model.tok_embeddings'] = visible_devices[0]
159
+ device_map['language_model.model.embed_tokens'] = visible_devices[0]
160
+ device_map['language_model.output'] = visible_devices[0]
161
+ device_map['language_model.model.norm'] = visible_devices[0]
162
+ device_map['language_model.lm_head'] = visible_devices[0]
163
+ device_map[f'language_model.model.layers.{num_layers - 1}'] = visible_devices[0]
164
+
165
+ return device_map, visible_devices
166
+
167
+
168
+ def split_model_old(model_name):
169
+ import math
170
+ device_map = {}
171
+ num_gpus = torch.cuda.device_count()
172
+ rank, world_size = get_rank_and_world_size()
173
+ num_gpus = num_gpus // world_size
174
+
175
+ num_layers_map = {
176
+ 'InternVL2-8B': 32,
177
+ 'InternVL2-26B': 48,
178
+ 'InternVL2-40B': 60,
179
+ 'InternVL2-Llama3-76B': 80
180
+ }
181
+
182
+ if model_name not in num_layers_map:
183
+ return 'cuda'
184
+ num_layers = num_layers_map[model_name]
185
+ # Since the first GPU will be used for ViT, treat it as 0.5 GPU.
186
+ num_layers_per_gpu = math.ceil(num_layers / (num_gpus - 0.5))
187
+ num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
188
+ num_layers_per_gpu[0] = math.ceil(num_layers_per_gpu[0] * 0.5)
189
+ layer_cnt = 0
190
+ for i, num_layer in enumerate(num_layers_per_gpu):
191
+ for j in range(num_layer):
192
+ device_map[f'language_model.model.layers.{layer_cnt}'] = rank + world_size * i
193
+ layer_cnt += 1
194
+ device_map['vision_model'] = rank
195
+ device_map['mlp1'] = rank
196
+ device_map['language_model.model.tok_embeddings'] = rank
197
+ device_map['language_model.model.embed_tokens'] = rank
198
+ device_map['language_model.output'] = rank
199
+ device_map['language_model.model.norm'] = rank
200
+ device_map['language_model.lm_head'] = rank
201
+ device_map['language_model.model.rotary_emb'] = rank
202
+ device_map[f'language_model.model.layers.{num_layers - 1}'] = rank
203
+ return device_map
204
+
205
+
206
+ def build_mcq_cot_prompt(line, prompt):
207
+ cot_prompt = (
208
+ "Answer the preceding multiple choice question. The last line of your response should follow "
209
+ "this format: 'Answer: \\boxed{$LETTER}' (without quotes), where LETTER is one of the options. "
210
+ "If you are uncertain or the problem is too complex, make a reasoned guess based on the "
211
+ "information provided. Avoid repeating steps indefinitely—provide your best guess even if "
212
+ "unsure. Think step by step logically, considering all relevant information before answering."
213
+ )
214
+ prompt = prompt.replace("Answer with the option's letter from the given choices directly.", '').strip()
215
+ prompt = prompt + '\n' + cot_prompt
216
+
217
+ return prompt
218
+
219
+
220
+ def build_qa_cot_prompt(line, prompt):
221
+ cot_prompt = (
222
+ "Answer the preceding question. The last line of your response should follow this format: "
223
+ "'Answer: \\boxed{$FINAL_ANSWER}' (without quotes), where 'FINAL_ANSWER' is your conclusion "
224
+ "based on the reasoning provided. If you are uncertain or the problem is too complex, make "
225
+ "a reasoned guess based on the information provided. Avoid repeating steps indefinitely—"
226
+ "provide your best guess even if unsure. Think step by step logically, considering all "
227
+ "relevant information before answering."
228
+ )
229
+ prompt = prompt + '\n' + cot_prompt
230
+
231
+ return prompt
232
+
233
+
234
+ def build_multi_choice_prompt(line, dataset=None):
235
+ question = line['question']
236
+ hint = line['hint'] if ('hint' in line and not pd.isna(line['hint'])) else None
237
+ if hint is not None:
238
+ question = hint + '\n' + question
239
+
240
+ options = {
241
+ cand: line[cand]
242
+ for cand in string.ascii_uppercase
243
+ if cand in line and not pd.isna(line[cand])
244
+ }
245
+ for key, item in options.items():
246
+ question += f'\n{key}. {item}'
247
+ prompt = question
248
+
249
+ if len(options):
250
+ prompt += '\n请直接回答选项字母。' if cn_string(
251
+ prompt) else "\nAnswer with the option's letter from the given choices directly."
252
+ else:
253
+ prompt += '\n请直接回答问题。' if cn_string(prompt) else '\nAnswer the question directly.'
254
+
255
+ return prompt
256
+
257
+
258
+ def build_video_prompt(prompt, dataset=None, max_frames=64):
259
+ for start in range(0, max_frames, 8):
260
+ images_to_remove = ''.join([f'<Image-{i}>' for i in range(start + 1, start + 9)])
261
+ prompt = prompt.replace(images_to_remove, '')
262
+ for i in range(max_frames):
263
+ prompt = prompt.replace(f'Image-{i + 1}', f'Frame-{i + 1}')
264
+ if listinstr(['MMBench-Video'], dataset):
265
+ prompt = prompt.replace('\nAnswer:', '')
266
+ elif listinstr(['Video-MME'], dataset):
267
+ prompt = prompt.replace('\nAnswer:', '')
268
+ prompt += "\nAnswer with the option's letter from the given choices directly."
269
+ elif listinstr(['MVBench'], dataset):
270
+ prompt = prompt.replace('Best option:(', '')
271
+
272
+ return prompt
273
+
274
+
275
+ def reorganize_prompt(message, image_num, dataset=None):
276
+ if dataset is not None and listinstr(['MUIRBench'], dataset):
277
+ prompt = '\n'.join([x['value'] for x in message if x['type'] == 'text'])
278
+ images_to_remove = ' '.join(['<image>'] * image_num)
279
+ prompt = prompt.replace(images_to_remove, '')
280
+ for i in range(image_num):
281
+ prompt = prompt.replace('<image>', f'<Image-{i + 1}>', 1)
282
+ prompt = ''.join([f'Image-{i + 1}: <image>\n' for i in range(image_num)]) + prompt
283
+ elif image_num == 1:
284
+ prompt = '<image>\n' + '\n'.join([x['value'] for x in message if x['type'] == 'text'])
285
+ else:
286
+ prompt, image_idx = '', 1
287
+ for x in message:
288
+ if x['type'] == 'text':
289
+ prompt += x['value']
290
+ elif x['type'] == 'image':
291
+ prompt += f'<Image-{image_idx}>'
292
+ image_idx += 1
293
+ prompt = ''.join([f'Image-{i + 1}: <image>\n' for i in range(image_num)]) + prompt
294
+ images_to_remove = ''.join([f'<Image-{i + 1}>' for i in range(image_num)])
295
+ prompt = prompt.replace(images_to_remove, '')
296
+ return prompt
297
+
298
+
299
+ mpo_prompt_with_final_answer = (
300
+ "Your task is to answer the question below. "
301
+ "Give step by step reasoning before you answer, and when you're ready to answer, "
302
+ "please use the format \"Final answer: ..\""
303
+ "\n\n"
304
+ "Question:"
305
+ "\n\n"
306
+ "{question}"
307
+ )
308
+
309
+ mpo_prompt_without_final_answer = (
310
+ "Your task is to answer the question below. "
311
+ "Give step by step reasoning. "
312
+ "\n\n"
313
+ "Question:"
314
+ "\n\n"
315
+ "{question}"
316
+ )
317
+
318
+
319
+ def mpo_post_processing(response, dataset):
320
+
321
+ def extract_answer(text):
322
+ match = re.search(r'(Final answer:|Answer:)\s*(.*)', text, re.IGNORECASE)
323
+ if match:
324
+ return match.group(2).strip()
325
+ return text
326
+
327
+ if dataset is not None and (DATASET_TYPE(dataset) in ['Y/N', 'MCQ'] or listinstr(['CRPE'], dataset)):
328
+ response = extract_answer(response).strip()
329
+ return response
330
+
331
+
332
+ def build_mpo_prompt(message, line, dataset):
333
+ if not listinstr(['LLaVABench'], dataset):
334
+
335
+ if listinstr(['MMVet'], dataset):
336
+ cot_prompt = mpo_prompt_without_final_answer
337
+ else:
338
+ cot_prompt = mpo_prompt_with_final_answer
339
+
340
+ question_orig = line['question']
341
+ if listinstr(['MathVerse', 'MathVision'], dataset):
342
+ question_orig = question_orig.split('Question:', 1)[-1].strip()
343
+ question_orig = question_orig.replace('Choices:\n', '').strip()
344
+
345
+ prompt = cot_prompt.format(question=question_orig)
346
+ else:
347
+ prompt = line['question']
348
+ message[0]['value'] = prompt
349
+ return message
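For reference, a hedged sketch of the tiling helpers defined above: dynamic_preprocess picks the grid of 448x448 tiles whose aspect ratio best matches the input (appending a thumbnail whenever more than one tile is used), and load_image stacks the normalized tiles into a [num_tiles, 3, 448, 448] tensor. The synthetic image and the /tmp path are purely illustrative.
from PIL import Image
from vlmeval.vlm.internvl.utils import dynamic_preprocess, load_image  # module added by this commit
img = Image.new('RGB', (1344, 448))            # synthetic 3:1 image
tiles = dynamic_preprocess(img, max_num=6, image_size=448, use_thumbnail=True)
print(len(tiles))                              # 4: a 3x1 grid of tiles plus the thumbnail
img.save('/tmp/demo.png')                      # load_image expects a file path, not a PIL image
pixel_values = load_image('/tmp/demo.png', max_num=6)
print(pixel_values.shape)                      # torch.Size([4, 3, 448, 448])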
VLMEvalKit/vlmeval/vlm/llava/__init__.py ADDED
@@ -0,0 +1,4 @@
1
+ from .llava import LLaVA, LLaVA_Next, LLaVA_Next2, LLaVA_OneVision, LLaVA_OneVision_HF
2
+ from .llava_xtuner import LLaVA_XTuner
3
+
4
+ __all__ = ['LLaVA', 'LLaVA_Next', 'LLaVA_XTuner', 'LLaVA_Next2', 'LLaVA_OneVision', 'LLaVA_OneVision_HF']
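The exports above cover several LLaVA variants: LLaVA wraps the original liuhaotian checkpoints, LLaVA_Next the llava-hf (transformers-format) checkpoints, LLaVA_Next2 and LLaVA_OneVision the lmms-lab releases, LLaVA_OneVision_HF presumably the transformers-format OneVision models, and LLaVA_XTuner the XTuner-trained variants. Below is a minimal, hedged sketch against the transformers-format wrapper; the image path is a placeholder and the model path is simply the default from llava.py further down.
from vlmeval.vlm.llava import LLaVA_Next
model = LLaVA_Next(model_path='llava-hf/llava-v1.6-vicuna-7b-hf')  # default path in llava.py below
message = [
    dict(type='image', value='demo.jpg'),   # placeholder image path
    dict(type='text', value='What is shown in this picture?'),
]
print(model.generate_inner(message))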
VLMEvalKit/vlmeval/vlm/llava/__pycache__/__init__.cpython-310.pyc ADDED
Binary file (402 Bytes).
 
VLMEvalKit/vlmeval/vlm/llava/__pycache__/llava.cpython-310.pyc ADDED
Binary file (22.1 kB).
 
VLMEvalKit/vlmeval/vlm/llava/__pycache__/llava_xtuner.cpython-310.pyc ADDED
Binary file (6.95 kB).
 
VLMEvalKit/vlmeval/vlm/llava/llava.py ADDED
@@ -0,0 +1,897 @@
1
+ import torch
2
+ from PIL import Image
3
+ from abc import abstractproperty
4
+ import sys
5
+ import os.path as osp
6
+ from ..base import BaseModel
7
+ from ...smp import *
8
+ from ...dataset import DATASET_TYPE, DATASET_MODALITY
9
+ import copy
10
+ import requests
11
+
12
+
13
+ class LLaVA(BaseModel):
14
+
15
+ INSTALL_REQ = True
16
+ INTERLEAVE = True
17
+
18
+ def __init__(self, model_path="liuhaotian/llava_v1.5_7b", **kwargs):
19
+ try:
20
+ from llava.model.builder import load_pretrained_model
21
+ from llava.mm_utils import get_model_name_from_path
22
+ except Exception as err:
23
+ logging.critical(
24
+ "Please install llava from https://github.com/haotian-liu/LLaVA"
25
+ )
26
+ raise err
27
+
28
+ assert osp.exists(model_path) or splitlen(model_path) == 2
29
+ self.system_prompt = (
30
+ "A chat between a curious human and an artificial intelligence assistant. "
31
+ "The assistant gives helpful, detailed, and polite answers to the human's questions. "
32
+ )
33
+ self.stop_str = "</s>"
34
+
35
+ if model_path == "Lin-Chen/ShareGPT4V-7B":
36
+ model_name = "llava-v1.5-7b"
37
+ elif model_path == "Lin-Chen/ShareGPT4V-13B":
38
+ model_name = "llava-v1.5-13b"
39
+ else:
40
+ model_name = get_model_name_from_path(model_path)
41
+
42
+ try:
43
+ self.tokenizer, self.model, self.image_processor, self.context_len = (
44
+ load_pretrained_model(
45
+ model_path=model_path,
46
+ model_base=None,
47
+ model_name=model_name,
48
+ device="cpu",
49
+ device_map="cpu",
50
+ )
51
+ )
52
+ except Exception as err:
53
+ if "ShareGPT4V" in model_path:
54
+ import llava
55
+
56
+ logging.critical(
57
+ "Please manually remove the encoder type check in "
58
+ f"{llava.__path__[0]}/model/multimodal_encoder/builder.py "
59
+ "Line 8 to use the ShareGPT4V model. "
60
+ )
61
+ else:
62
+ logging.critical("Unknown error when loading LLaVA model.")
63
+ raise err
64
+
65
+ self.model = self.model.cuda()
66
+ self.conv_mode = "llava_v1"
67
+
68
+ kwargs_default = dict(
69
+ do_sample=False,
70
+ temperature=0,
71
+ max_new_tokens=512,
72
+ top_p=None,
73
+ num_beams=1,
74
+ use_cache=True,
75
+ ) # noqa E501
76
+ kwargs_default.update(kwargs)
77
+ self.kwargs = kwargs_default
78
+ warnings.warn(
79
+ f"Following kwargs received: {self.kwargs}, will use as generation config. "
80
+ )
81
+
82
+ def use_custom_prompt(self, dataset):
83
+ assert dataset is not None
84
+ if DATASET_TYPE(dataset) == "MCQ":
85
+ return True
86
+ return False
87
+
88
+ def build_prompt(self, line, dataset=None):
89
+ assert self.use_custom_prompt(dataset)
90
+ assert dataset is None or isinstance(dataset, str)
91
+ tgt_path = self.dump_image(line, dataset)
92
+
93
+ question = line["question"]
94
+ hint = line["hint"] if ("hint" in line and not pd.isna(line["hint"])) else None
95
+ if hint is not None:
96
+ question = hint + "\n" + question
97
+
98
+ options = {
99
+ cand: line[cand]
100
+ for cand in string.ascii_uppercase
101
+ if cand in line and not pd.isna(line[cand])
102
+ }
103
+ for key, item in options.items():
104
+ question += f"\n{key}. {item}"
105
+ prompt = question
106
+
107
+ if len(options):
108
+ prompt += (
109
+ "\n请直接回答选项字母。"
110
+ if cn_string(prompt)
111
+ else "\nAnswer with the option's letter from the given choices directly."
112
+ )
113
+ else:
114
+ prompt += (
115
+ "\n请直接回答问题。"
116
+ if cn_string(prompt)
117
+ else "\nAnswer the question directly."
118
+ )
119
+
120
+ message = [dict(type="image", value=s) for s in tgt_path]
121
+ message.append(dict(type="text", value=prompt))
122
+ return message
123
+
124
+ def concat_tilist(self, message):
125
+ text, images = "", []
126
+ for item in message:
127
+ if item["type"] == "text":
128
+ text += item["value"]
129
+ elif item["type"] == "image":
130
+ text += " <image> "
131
+ images.append(item["value"])
132
+ return text, images
133
+
134
+ def chat_inner(self, message, dataset=None):
135
+ from llava.mm_utils import (
136
+ process_images,
137
+ tokenizer_image_token,
138
+ KeywordsStoppingCriteria,
139
+ )
140
+ from llava.constants import IMAGE_TOKEN_INDEX
141
+
142
+ prompt = self.system_prompt
143
+ images = []
144
+ for utter in message:
145
+ prompt += "USER: " if utter["role"] == "user" else "ASSISTANT: "
146
+ content, images_sub = self.concat_tilist(utter["content"])
147
+ prompt += content
148
+ images.extend(images_sub)
149
+ prompt += " " if utter["role"] == "user" else self.stop_str
150
+ assert message[-1]["role"] == "user", message
151
+ prompt += "ASSISTANT: "
152
+
153
+ images = [Image.open(s).convert("RGB") for s in images]
154
+ args = abstractproperty()
155
+ args.image_aspect_ratio = "pad"
156
+ image_tensor = process_images(images, self.image_processor, args).to(
157
+ "cuda", dtype=torch.float16
158
+ )
159
+
160
+ input_ids = (
161
+ tokenizer_image_token(
162
+ prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
163
+ )
164
+ .unsqueeze(0)
165
+ .cuda()
166
+ )
167
+ keywords = [self.stop_str]
168
+ stopping_criteria = KeywordsStoppingCriteria(
169
+ keywords, self.tokenizer, input_ids
170
+ )
171
+ with torch.inference_mode():
172
+ output_ids = self.model.generate(
173
+ input_ids,
174
+ images=image_tensor,
175
+ stopping_criteria=[stopping_criteria],
176
+ **self.kwargs,
177
+ )
178
+ output = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[
179
+ 0
180
+ ].strip()
181
+ return output
182
+
183
+ def generate_inner(self, message, dataset=None):
184
+ from llava.mm_utils import (
185
+ process_images,
186
+ tokenizer_image_token,
187
+ KeywordsStoppingCriteria,
188
+ )
189
+ from llava.constants import IMAGE_TOKEN_INDEX
190
+
191
+ # Support interleave text and image
192
+ content, images = self.concat_tilist(message)
193
+
194
+ images = [Image.open(s).convert("RGB") for s in images]
195
+ args = abstractproperty()
196
+ args.image_aspect_ratio = "pad"
197
+ if images:
198
+ image_tensor = process_images(images, self.image_processor, args).to(
199
+ "cuda", dtype=torch.float16
200
+ )
201
+ else:
202
+ image_tensor = None
203
+
204
+ prompt = self.system_prompt + "USER: " + content + " ASSISTANT: "
205
+
206
+ input_ids = (
207
+ tokenizer_image_token(
208
+ prompt, self.tokenizer, IMAGE_TOKEN_INDEX, return_tensors="pt"
209
+ )
210
+ .unsqueeze(0)
211
+ .cuda()
212
+ )
213
+ keywords = [self.stop_str]
214
+ stopping_criteria = KeywordsStoppingCriteria(
215
+ keywords, self.tokenizer, input_ids
216
+ )
217
+ with torch.inference_mode():
218
+ output_ids = self.model.generate(
219
+ input_ids,
220
+ images=image_tensor,
221
+ stopping_criteria=[stopping_criteria],
222
+ **self.kwargs,
223
+ )
224
+
225
+ output = self.tokenizer.batch_decode(output_ids, skip_special_tokens=True)[
226
+ 0
227
+ ].strip()
228
+ return output
229
+
230
+
231
+ class LLaVA_Next(BaseModel):
232
+
233
+ INSTALL_REQ = False
234
+ INTERLEAVE = True
235
+
236
+ def __init__(self, model_path="llava-hf/llava-v1.6-vicuna-7b-hf", **kwargs):
237
+ import transformers
238
+ from transformers import (
239
+ LlavaNextProcessor,
240
+ LlavaNextForConditionalGeneration,
241
+ AutoProcessor,
242
+ LlavaForConditionalGeneration,
243
+ )
244
+
245
+ self.model_path = model_path
246
+ if "34b" in model_path.lower():
247
+ self.processor = LlavaNextProcessor.from_pretrained(
248
+ self.model_path, use_fast=False
249
+ )
250
+ elif "interleave" in model_path.lower():
251
+ self.processor = AutoProcessor.from_pretrained(self.model_path)
252
+ else:
253
+ self.processor = LlavaNextProcessor.from_pretrained(self.model_path)
254
+ flash_attn_flag = False
255
+ try:
256
+ import flash_attn
257
+
258
+ flash_attn_flag = True
259
+ except ImportError:
260
+ pass
261
+
262
+ if flash_attn_flag:
263
+ if "interleave" in model_path.lower():
264
+ model = LlavaForConditionalGeneration.from_pretrained(
265
+ self.model_path,
266
+ torch_dtype=torch.float16,
267
+ low_cpu_mem_usage=True,
268
+ use_flash_attention_2=True,
269
+ )
270
+ else:
271
+ model = LlavaNextForConditionalGeneration.from_pretrained(
272
+ self.model_path,
273
+ torch_dtype=torch.float16,
274
+ low_cpu_mem_usage=True,
275
+ use_flash_attention_2=True,
276
+ )
277
+ else:
278
+ if "interleave" in model_path.lower():
279
+ model = LlavaForConditionalGeneration.from_pretrained(
280
+ self.model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
281
+ )
282
+ else:
283
+ model = LlavaNextForConditionalGeneration.from_pretrained(
284
+ self.model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
285
+ )
286
+
287
+ model = model.eval()
288
+ self.model = model.cuda()
289
+ kwargs_default = dict(
290
+ do_sample=False, temperature=0, max_new_tokens=512, top_p=None, num_beams=1
291
+ )
292
+ kwargs_default.update(kwargs)
293
+ self.kwargs = kwargs_default
294
+ warnings.warn(
295
+ f"Following kwargs received: {self.kwargs}, will use as generation config. "
296
+ )
297
+
298
+ def apply_prompt_template(self, prompt):
299
+ model_path = self.model_path.lower()
300
+ if "mistral" in model_path:
301
+ template = "[INST] PLACEHOLDER [/INST]"
302
+ elif "vicuna" in model_path:
303
+ template = (
304
+ "A chat between a curious human and an artificial intelligence assistant. "
305
+ "The assistant gives helpful, detailed, and polite answers to the human's questions. "
306
+ "USER: PLACEHOLDER ASSISTANT:"
307
+ )
308
+ elif "34b" in model_path:
309
+ template = (
310
+ "<|im_start|>system\nAnswer the questions.<|im_end|><|im_start|>user\nPLACEHOLDER<|im_end|>"
311
+ "<|im_start|>assistant\n"
312
+ )
313
+ else:
314
+ raise NotImplementedError(
315
+ f"Prompt template for {model_path} not implemented."
316
+ )
317
+
318
+ prompt = template.replace("PLACEHOLDER", f"<image>\n{prompt}")
319
+ return prompt
320
+
321
+ def output_process(self, answer):
322
+ if "<s>" in answer:
323
+ answer = answer.replace("<s>", "").strip()
324
+ if "[/INST]" in answer:
325
+ answer = answer.split("[/INST]")[1].strip()
326
+ elif "ASSISTANT:" in answer:
327
+ answer = answer.split("ASSISTANT:")[1].strip()
328
+ elif "assistant\n" in answer:
329
+ answer = answer.split("assistant\n")[1].strip()
330
+ elif "<|end_header_id|>\n\n" in answer:
331
+ answer = answer.split("<|end_header_id|>\n\n")[2].strip()
332
+
333
+ if "</s>" in answer:
334
+ answer = answer.split("</s>")[0].strip()
335
+ elif "<|im_end|>" in answer:
336
+ answer = answer.split("<|im_end|>")[0].strip()
337
+ elif "<|eot_id|>" in answer:
338
+ answer = answer.split("<|eot_id|>")[0].strip()
339
+ return answer
340
+
341
+ def use_custom_prompt(self, dataset):
342
+ assert dataset is not None
343
+ if DATASET_TYPE(dataset) == "MCQ":
344
+ return True
345
+ return False
346
+
347
+ def build_prompt(self, line, dataset=None):
348
+ assert self.use_custom_prompt(dataset)
349
+ assert dataset is None or isinstance(dataset, str)
350
+ tgt_path = self.dump_image(line, dataset)
351
+
352
+ question = line["question"]
353
+ hint = line["hint"] if ("hint" in line and not pd.isna(line["hint"])) else None
354
+ if hint is not None:
355
+ question = hint + "\n" + question
356
+
357
+ options = {
358
+ cand: line[cand]
359
+ for cand in string.ascii_uppercase
360
+ if cand in line and not pd.isna(line[cand])
361
+ }
362
+ for key, item in options.items():
363
+ question += f"\n{key}. {item}"
364
+ prompt = question
365
+
366
+ if len(options):
367
+ prompt += (
368
+ "\n请直接回答选项字母。"
369
+ if cn_string(prompt)
370
+ else "\nAnswer with the option's letter from the given choices directly."
371
+ )
372
+ else:
373
+ prompt += (
374
+ "\n请直接回答问题。"
375
+ if cn_string(prompt)
376
+ else "\nAnswer the question directly."
377
+ )
378
+ message = [dict(type="image", value=s) for s in tgt_path]
379
+ message.append(dict(type="text", value=prompt))
380
+ return message
381
+
382
+ def generate_inner(self, message, dataset=None):
383
+ content, images = [], []
384
+ for msg in message:
385
+ if msg["type"] == "text":
386
+ content.append({"type": msg["type"], "text": msg["value"]})
387
+ else:
388
+ content.append({"type": "image"})
389
+ images.append(Image.open(msg["value"]).convert("RGB"))
390
+ conversation = [
391
+ {
392
+ "role": "user",
393
+ "content": content,
394
+ }
395
+ ]
396
+ prompt = self.processor.apply_chat_template(
397
+ conversation, add_generation_prompt=True
398
+ )
399
+ inputs = self.processor(prompt, images, return_tensors="pt").to(
400
+ "cuda", torch.float16
401
+ )
402
+ output = self.model.generate(**inputs, **self.kwargs)
403
+ answer = self.processor.decode(output[0], skip_special_tokens=True)
404
+ answer = self.output_process(answer)
405
+ return answer
406
+
407
+
408
+ class LLaVA_Next2(BaseModel):
409
+ INSTALL_REQ = True
410
+ INTERLEAVE = True
411
+
412
+ DEFAULT_IMAGE_TOKEN = "<image>"
413
+ IMAGE_TOKEN_INDEX = -200
414
+
415
+ def __init__(self, model_path="lmms-lab/llama3-llava-next-8b", **kwargs):
416
+ assert model_path is not None
417
+ try:
418
+ from llava.model.builder import load_pretrained_model
419
+ from llava.conversation import conv_templates, SeparatorStyle
420
+ from llava.mm_utils import (
421
+ get_model_name_from_path,
422
+ tokenizer_image_token,
423
+ KeywordsStoppingCriteria,
424
+ )
425
+ except Exception as err:
426
+ logging.critical(
427
+ "Please `pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git`"
428
+ )
429
+ raise err
430
+
431
+ model_name = get_model_name_from_path(model_path)
432
+ tokenizer, model, image_processor, _ = load_pretrained_model(
433
+ model_path, None, model_name, device_map=None
434
+ )
435
+ model.cuda().eval()
436
+ model.tie_weights()
437
+
438
+ if "llama3" in model_path.lower():
439
+ conv_mode = "llava_llama_3"
440
+ elif "qwen" in model_path.lower():
441
+ conv_mode = "qwen_1_5"
442
+ self.conv_template = conv_mode
443
+ self.conv_templates = conv_templates
444
+ self.tokenizer = tokenizer
445
+ self.model = model
446
+ self.image_processor = image_processor
447
+ self.tokenizer_image_token = tokenizer_image_token
448
+ self.KeywordStoppingCriteria = KeywordsStoppingCriteria
449
+        self.SeparatorStyle = SeparatorStyle
+
+    def generate_inner(self, message, dataset=None):
+        content, images = "", []
+        for msg in message:
+            if msg["type"] == "text":
+                content += msg["value"]
+            else:
+                images.append(Image.open(msg["value"]).convert("RGB"))
+                content += self.DEFAULT_IMAGE_TOKEN + "\n"
+
+        preprocess = self.image_processor.preprocess
+        image_tokenizer = self.tokenizer_image_token
+        image_tensor = [
+            preprocess(f, return_tensors="pt")["pixel_values"][0].half().cuda()
+            for f in images
+        ]
+        image_tensor = torch.stack(image_tensor)
+
+        conv = copy.deepcopy(self.conv_templates[self.conv_template])
+        conv.append_message(conv.roles[0], content)
+        conv.append_message(conv.roles[1], None)
+        prompt_question = conv.get_prompt()
+
+        input_ids = image_tokenizer(
+            prompt_question, self.tokenizer, self.IMAGE_TOKEN_INDEX, return_tensors="pt"
+        )
+        input_ids = input_ids.unsqueeze(0).cuda()
+
+        stop_str = conv.sep if conv.sep_style != self.SeparatorStyle.TWO else conv.sep2
+        keywords = [stop_str]
+        stopping_criteria = self.KeywordStoppingCriteria(
+            keywords, self.tokenizer, input_ids
+        )
+
+        cont = self.model.generate(
+            input_ids,
+            images=image_tensor,
+            do_sample=False,
+            temperature=0,
+            max_new_tokens=512,
+            stopping_criteria=[stopping_criteria],
+        )
+        text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
+        return text_outputs
+
+
+class LLaVA_OneVision(BaseModel):
+    INSTALL_REQ = True
+    INTERLEAVE = True
+    VIDEO_LLM = True
+    DEFAULT_IMAGE_TOKEN = "<image>"
+    IMAGE_TOKEN_INDEX = -200
+
+    # Split the 72B variant across GPUs (adapted from the InternVL2-Llama3-76B splitting logic)
+    def split_model(self, model_path):
+        import math
+
+        device_map = {}
+        num_gpus = torch.cuda.device_count()
+        rank, world_size = get_rank_and_world_size()
+        num_gpus = num_gpus // world_size
+        if "72b" not in model_path.lower():
+            return None
+        # embed_tokens, vision_tower, mm_projector, lm_head are treated as 2 layers
+        num_layers = 80 + 8
+        num_layers_per_gpu = math.ceil(num_layers / num_gpus)
+        num_layers_per_gpu = [num_layers_per_gpu] * num_gpus
+        num_layers_per_gpu[0] -= 6
+        num_layers_per_gpu[-1] -= 2
+        layer_cnt = 0
+        for i, num_layer in enumerate(num_layers_per_gpu):
+            for j in range(num_layer):
+                device_map[f"model.layers.{layer_cnt}"] = rank + world_size * i
+                layer_cnt += 1
+        last_gpu = rank + world_size * (num_gpus - 1)
+        device_map["model.image_newline"] = rank
+        device_map["model.embed_tokens"] = rank
+        device_map["model.norm"] = rank
+        device_map["model.vision_tower"] = rank
+        device_map["model.vision_resampler"] = rank
+        device_map["model.mm_projector"] = rank
+        device_map["lm_head"] = last_gpu
+        return device_map
+
+    def __init__(self, model_path="lmms-lab/llava-onevision-qwen2-7b-si", **kwargs):
+        assert model_path is not None
+        try:
+            from llava.model.builder import load_pretrained_model
+            from llava.conversation import conv_templates, SeparatorStyle
+            from llava.mm_utils import (
+                get_model_name_from_path,
+                process_images,
+                tokenizer_image_token,
+                KeywordsStoppingCriteria,
+            )  # noqa: E501
+        except Exception as err:
+            logging.critical(
+                "Please `pip install git+https://github.com/LLaVA-VL/LLaVA-NeXT.git`"
+            )
+            raise err
+
+        video_kwargs_default = dict(
+            overwrite=True, mm_spatial_pool_mode="average", force_sample=True
+        )
+        video_kwargs_default.update(kwargs)
+        self.video_kwargs = video_kwargs_default
+
+        overwrite_config = None
+        if "video" in model_path.lower():
+            if self.video_kwargs["overwrite"]:
+                overwrite_config = {}
+                overwrite_config["mm_spatial_pool_mode"] = self.video_kwargs[
+                    "mm_spatial_pool_mode"
+                ]
+
+        rank, world_size = get_rank_and_world_size()
+        model_name = get_model_name_from_path(model_path)
+        device_map = self.split_model(model_path)
+
+        if device_map is None:
+            if auto_split_flag():
+                assert world_size == 1, 'Only world_size == 1 is supported when AUTO_SPLIT is set for non-72B LLaVA-OneVision'
+                logging.warning('Currently, we only support splitting the non-72B model across all GPUs.')
+                tokenizer, model, image_processor, _ = load_pretrained_model(
+                    model_path,
+                    None,
+                    model_name,
+                    device_map="auto",
+                    overwrite_config=overwrite_config,
+                )
+            else:
+                tokenizer, model, image_processor, _ = load_pretrained_model(
+                    model_path,
+                    None,
+                    model_name,
+                    device_map="cpu",
+                    overwrite_config=overwrite_config,
+                )
+                model.cuda()
+        else:
+            tokenizer, model, image_processor, _ = load_pretrained_model(
+                model_path,
+                None,
+                model_name,
+                device_map=device_map,
+                overwrite_config=overwrite_config,
+            )
+        model.eval()
+        model.tie_weights()
+
+        if "llava" in model_path.lower():
+            conv_mode = "qwen_1_5"
+        if 'llava-video' in model_path.lower():
+            self.nframe = 64
+        else:
+            self.nframe = 16
+            if "72b" in model_path.lower():
+                self.nframe = 32
+
+        if "video" in model_path.lower():
+            self.force_sample = self.video_kwargs["force_sample"]
+        else:
+            self.force_sample = False
+
+        self.conv_template = conv_mode
+        self.conv_templates = conv_templates
+        self.tokenizer = tokenizer
+        self.model = model
+        self.image_processor = image_processor
+        self.tokenizer_image_token = tokenizer_image_token
+        self.process_images = (
+            process_images  # Store process_images as a class attribute
+        )
+        self.KeywordStoppingCriteria = KeywordsStoppingCriteria
+        self.SeparatorStyle = SeparatorStyle
+
+    def generate_inner_image(self, message, dataset=None):
+        content, images = "", []
+        image_sizes = []  # Store image sizes
+
+        for msg in message:
+            if msg["type"] == "text":
+                content += msg["value"]
+            else:
+                img = Image.open(msg["value"]).convert("RGB")
+                images.append(img)
+                image_sizes.append(img.size)  # Store the size of each image
+                content += self.DEFAULT_IMAGE_TOKEN + "\n"
+
+        # Process images using the class attribute self.process_images
+        image_tensor = self.process_images(
+            images, self.image_processor, self.model.config
+        )
+        image_tensor = [
+            _image.to(dtype=torch.float16, device="cuda") for _image in image_tensor
+        ]
+
+        conv = copy.deepcopy(self.conv_templates[self.conv_template])
+        conv.append_message(conv.roles[0], content)
+        conv.append_message(conv.roles[1], None)
+        prompt_question = conv.get_prompt()
+
+        input_ids = self.tokenizer_image_token(
+            prompt_question, self.tokenizer, self.IMAGE_TOKEN_INDEX, return_tensors="pt"
+        )
+        input_ids = input_ids.unsqueeze(0).cuda()
+
+        stop_str = conv.sep if conv.sep_style != self.SeparatorStyle.TWO else conv.sep2
+        keywords = [stop_str]
+        stopping_criteria = self.KeywordStoppingCriteria(
+            keywords, self.tokenizer, input_ids
+        )
+
+        # Pass image sizes along with other parameters
+        cont = self.model.generate(
+            input_ids,
+            images=image_tensor,
+            image_sizes=image_sizes,  # Pass the image sizes here
+            do_sample=False,
+            temperature=0,
+            max_new_tokens=512,
+            stopping_criteria=[stopping_criteria],
+        )
+        text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
+        return text_outputs
+
+    def generate_inner_video(self, message, dataset=None):
+        content, text_content, visual_content, videos = "", "", "", []
+
+        for msg in message:
+            if msg["type"] == "text":
+                text_content += msg["value"]
+            else:
+                videos.append(msg["value"])
+                visual_content += self.DEFAULT_IMAGE_TOKEN + "\n"
+
+        if len(videos) > 1:
+            raise ValueError(
+                "LLaVA-OneVision does not support multiple videos as input."
+            )
+
+        video_frames, frame_time, video_time = self.load_video(
+            videos[0], self.nframe, self.force_sample
+        )
+
+        time_instruction = (
+            f"The video lasts for {video_time:.2f} seconds, "
+            f"and {len(video_frames)} frames are uniformly sampled from it. "
+            f"These frames are located at {frame_time}. "
+            f"Please answer the following questions related to this video.\n"
+        )
+
+        if self.force_sample:
+            content = visual_content + time_instruction + text_content
+        else:
+            content = visual_content + text_content
+
+        image_tensors = []
+        frames = (
+            self.image_processor.preprocess(video_frames, return_tensors="pt")[
+                "pixel_values"
+            ]
+            .half()
+            .cuda()
+        )
+        image_tensors.append(frames)
+
+        conv = copy.deepcopy(self.conv_templates[self.conv_template])
+        conv.append_message(conv.roles[0], content)
+        conv.append_message(conv.roles[1], None)
+        prompt_question = conv.get_prompt()
+
+        input_ids = self.tokenizer_image_token(
+            prompt_question, self.tokenizer, self.IMAGE_TOKEN_INDEX, return_tensors="pt"
+        )
+        input_ids = input_ids.unsqueeze(0).cuda()
+        image_sizes = [frame.size for frame in video_frames]
+        modalities = ["video"] * len(video_frames)
+
+        stop_str = conv.sep if conv.sep_style != self.SeparatorStyle.TWO else conv.sep2
+        keywords = [stop_str]
+        stopping_criteria = self.KeywordStoppingCriteria(
+            keywords, self.tokenizer, input_ids
+        )
+
+        # Pass image sizes along with other parameters
+        cont = self.model.generate(
+            input_ids,
+            images=image_tensors,
+            image_sizes=image_sizes,  # Pass the image sizes here
+            do_sample=False,
+            temperature=0,
+            max_new_tokens=512,
+            modalities=modalities,
+            stopping_criteria=[stopping_criteria],
+        )
+        text_outputs = self.tokenizer.batch_decode(cont, skip_special_tokens=True)[0]
+        return text_outputs
+
+    def load_video(self, video_path, max_frames_num, force_sample=False, fps=1):
+        from decord import VideoReader, cpu
+        import numpy as np
+
+        if max_frames_num == 0:
+            return np.zeros((1, 336, 336, 3))
+        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+        total_frame_num = len(vr)
+        video_time = total_frame_num / vr.get_avg_fps()
+        fps = round(vr.get_avg_fps() / fps)
+        frame_idx = [i for i in range(0, len(vr), fps)]
+        frame_time = [i / fps for i in frame_idx]
+        if len(frame_idx) > max_frames_num or force_sample:
+            sample_fps = max_frames_num
+            uniform_sampled_frames = np.linspace(
+                0, total_frame_num - 1, sample_fps, dtype=int
+            )
+            frame_idx = uniform_sampled_frames.tolist()
+            frame_time = [i / vr.get_avg_fps() for i in frame_idx]
+        frame_time = ",".join([f"{i:.2f}s" for i in frame_time])
+        spare_frames = vr.get_batch(frame_idx).asnumpy()
+        return spare_frames, frame_time, video_time
+
+    def generate_inner(self, message, dataset=None):
+        if DATASET_MODALITY(dataset) == 'VIDEO':
+            return self.generate_inner_video(message, dataset)
+        else:
+            return self.generate_inner_image(message, dataset)
+
+
+class LLaVA_OneVision_HF(BaseModel):
+    INSTALL_REQ = True
+    INTERLEAVE = True
+    VIDEO_LLM = True
+    DEFAULT_IMAGE_TOKEN = "<image>"
+    IMAGE_TOKEN_INDEX = -200
+
+    def __init__(self, model_path="llava-hf/llava-onevision-qwen2-0.5b-ov-hf", **kwargs):
+        from transformers import AutoProcessor, LlavaOnevisionForConditionalGeneration
+        assert model_path is not None, "Model path must be provided."
+        self.model = LlavaOnevisionForConditionalGeneration.from_pretrained(
+            model_path, torch_dtype=torch.float16, low_cpu_mem_usage=True
+        ).to(0)
+        self.processor = AutoProcessor.from_pretrained(model_path)
+
+        self.video_kwargs = kwargs.get("video_kwargs", {})
+        self.force_sample = self.video_kwargs.get("force_sample", False)
+        self.nframe = kwargs.get("nframe", 8)
+        self.fps = 1
+
+    def generate_inner_image(self, message, dataset=None):
+        content, images = "", []
+        image_sizes = []
+
+        for msg in message:
+            if msg["type"] == "text":
+                content += msg["value"]
+            elif msg["type"] == "image":
+                img = Image.open(msg["value"]).convert("RGB")
+                images.append(img)
+                image_sizes.append(img.size)
+                content += self.DEFAULT_IMAGE_TOKEN + "\n"
+
+        conversation = [
+            {
+                "role": "user",
+                "content": [
+                    {"type": "text", "text": content.split("\n", 1)[-1]},
+                    {"type": "image"},
+                ],
+            }
+        ]
+        prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
+        inputs = self.processor(images=images, text=prompt, return_tensors="pt").to(0, torch.float16)
+
+        output = self.model.generate(**inputs, max_new_tokens=100)
+        return self.processor.decode(output[0], skip_special_tokens=True)
+
+    def generate_inner_video(self, message, dataset=None):
+        content, text_content, visual_content, videos = "", "", "", []
+
+        for msg in message:
+            if msg["type"] == "text":
+                text_content += msg["value"]
+            elif msg["type"] == "video":
+                videos.append(msg["value"])
+                visual_content += self.DEFAULT_IMAGE_TOKEN + "\n"
+
+        if len(videos) > 1:
+            raise ValueError("LLaVA-OneVision does not support multiple videos as input.")
+
+        video_frames, frame_time, video_time = self.load_video(
+            videos[0], self.nframe, fps=1, force_sample=self.force_sample
+        )
+
+        time_instruction = (
+            f"The video lasts for {video_time:.2f} seconds, "
+            f"and {len(video_frames)} frames are uniformly sampled from it. "
+            f"These frames are located at {frame_time}. "
+            f"Please answer the following questions related to this video.\n"
+        )
+
+        content = visual_content + time_instruction + text_content
+        conversation = [
+            {
+                "role": "user",
+                "content": [{"type": "text", "text": content}, {"type": "video"}],
+            }
+        ]
+        prompt = self.processor.apply_chat_template(conversation, add_generation_prompt=True)
+
+        inputs = self.processor(videos=video_frames, text=prompt, return_tensors="pt").to(0, torch.float16)
+        output = self.model.generate(**inputs, max_new_tokens=512)
+        return self.processor.decode(output[0], skip_special_tokens=True)
+
+    def load_video(self, video_path, max_frames_num, fps=1, force_sample=False):
+        from decord import VideoReader, cpu
+        import numpy as np
+
+        vr = VideoReader(video_path, ctx=cpu(0), num_threads=1)
+        total_frame_num = len(vr)
+        avg_fps = vr.get_avg_fps()
+
+        if avg_fps == 0:
+            raise ValueError(f"Video '{video_path}' has an average FPS of 0, which is invalid.")
+        if fps <= 0:
+            raise ValueError("FPS argument must be greater than 0.")
+
+        effective_fps = round(avg_fps / fps)
+        frame_idx = list(range(0, total_frame_num, effective_fps))
+        frame_time = [i / avg_fps for i in frame_idx]
+
+        if len(frame_idx) > max_frames_num or force_sample:
+            uniform_sampled_frames = np.linspace(0, total_frame_num - 1, max_frames_num, dtype=int)
+            frame_idx = uniform_sampled_frames.tolist()
+            frame_time = [i / avg_fps for i in frame_idx]
+
+        frame_time_str = ", ".join([f"{t:.2f}s" for t in frame_time])
+        video_frames = vr.get_batch(frame_idx).asnumpy()
+        video_time = total_frame_num / avg_fps
+
+        return video_frames, frame_time_str, video_time
+
+    def generate_inner(self, message, dataset=None):
+        if DATASET_MODALITY(dataset) == "VIDEO":
+            return self.generate_inner_video(message, dataset)
+        else:
+            return self.generate_inner_image(message, dataset)
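A minimal usage sketch for the wrappers above (not part of the diff): they consume VLMEvalKit's message format, a list of {"type": ..., "value": ...} dicts, and return the decoded answer from generate_inner. The import path and the local image path below are illustrative assumptions.

    # Hypothetical import path; adjust to wherever the class is exported in your install.
    from vlmeval.vlm.llava import LLaVA_OneVision_HF

    model = LLaVA_OneVision_HF(model_path="llava-hf/llava-onevision-qwen2-0.5b-ov-hf")
    message = [
        {"type": "image", "value": "demo.jpg"},          # hypothetical local image
        {"type": "text", "value": "Describe this image."},
    ]
    print(model.generate_inner(message))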
VLMEvalKit/vlmeval/vlm/llava/llava_xtuner.py ADDED
@@ -0,0 +1,239 @@
+import os
+import os.path as osp
+import string
+import sys
+import warnings
+
+import pandas as pd
+import torch
+from huggingface_hub import snapshot_download
+from PIL import Image
+from transformers import (AutoModel, AutoModelForCausalLM, AutoTokenizer,
+                          CLIPImageProcessor, CLIPVisionModel,
+                          GenerationConfig, StoppingCriteriaList)
+
+from ..base import BaseModel
+from ...smp import *
+from ...dataset import DATASET_TYPE
+
+
+class LLaVA_XTuner(BaseModel):
+
+    INSTALL_REQ = True
+    INTERLEAVE = False
+
+    def __init__(self,
+                 llava_path,
+                 llm_path=None,
+                 visual_encoder_path='openai/clip-vit-large-patch14-336',
+                 visual_select_layer=-2,
+                 prompt_template=None,
+                 stop_words=[],
+                 torch_dtype=torch.float16):
+        try:
+            from peft import PeftModel
+            from xtuner.utils import PROMPT_TEMPLATE, StopWordStoppingCriteria
+        except Exception as err:
+            logging.critical(
+                'Please install xtuner with `pip install -U xtuner` before '
+                'using LLaVA_XTuner')
+            raise err
+
+        if not osp.isdir(llava_path):
+            cache_path = get_cache_path(llava_path)
+            if cache_path is not None:
+                llava_path = cache_path
+            else:
+                llava_path = snapshot_download(repo_id=llava_path)
+        assert osp.exists(llava_path) and osp.isdir(llava_path)
+
+        # build llm
+        if 'llm' in os.listdir(llava_path):
+            assert llm_path is None, (
+                "Please don't specify the `llm_path` since passed "
+                '`llava_path` contains a LLM!')
+            llm_path = osp.join(llava_path, 'llm')
+        else:
+            assert llm_path is not None, 'Please specify the `llm_path`!'
+
+        llm = AutoModelForCausalLM.from_pretrained(llm_path,
+                                                   trust_remote_code=True,
+                                                   torch_dtype=torch_dtype,
+                                                   device_map='cpu')
+        tokenizer = AutoTokenizer.from_pretrained(llm_path,
+                                                  trust_remote_code=True,
+                                                  encode_special_tokens=True)
+        print(f'Load LLM from {llm_path}')
+
+        # build visual_encoder
+        if 'visual_encoder' in os.listdir(llava_path):
+            assert visual_encoder_path is None, (
+                "Please don't specify the `visual_encoder_path` since passed "
+                '`llava_path` contains a visual encoder!')
+            visual_encoder_path = osp.join(llava_path, 'visual_encoder')
+        else:
+            assert visual_encoder_path is not None, (
+                'Please specify the `visual_encoder_path`!')
+        visual_encoder = CLIPVisionModel.from_pretrained(
+            visual_encoder_path, torch_dtype=torch_dtype, device_map='cpu')
+        image_processor = CLIPImageProcessor.from_pretrained(
+            visual_encoder_path)
+        print(f'Load visual_encoder from {visual_encoder_path}')
+
+        # load adapter
+        if 'llm_adapter' in os.listdir(llava_path):
+            adapter_path = osp.join(llava_path, 'llm_adapter')
+            llm = PeftModel.from_pretrained(llm,
+                                            adapter_path,
+                                            trust_remote_code=True,
+                                            device_map='cpu')
+            print(f'Load LLM adapter from {llava_path}')
+        if 'visual_encoder_adapter' in os.listdir(llava_path):
+            adapter_path = osp.join(llava_path, 'visual_encoder_adapter')
+            visual_encoder = PeftModel.from_pretrained(visual_encoder,
+                                                       adapter_path,
+                                                       trust_remote_code=True,
+                                                       device_map='cpu')
+            print(f'Load visual_encoder adapter from {llava_path}')
+
+        # build projector
+        projector_path = osp.join(llava_path, 'projector')
+        projector = AutoModel.from_pretrained(projector_path,
+                                              trust_remote_code=True,
+                                              torch_dtype=torch_dtype,
+                                              device_map='cpu')
+        print(f'Load projector from {llava_path}')
+
+        llm.eval()
+        visual_encoder.eval()
+        projector.eval()
+
+        self.llm = llm.cuda()
+        self.tokenizer = tokenizer
+        self.visual_encoder = visual_encoder.cuda()
+        self.image_processor = image_processor
+        self.projector = projector.cuda()
+        self.visual_select_layer = visual_select_layer
+        if prompt_template is not None:
+            # modified prompt template
+            if prompt_template == 'llama3_chat':
+                self.prompt_template = dict(
+                    SYSTEM=('<|start_header_id|>system<|end_header_id|>\n\n'
+                            '{system}<|eot_id|>'),
+                    INSTRUCTION=(
+                        '<|start_header_id|>user<|end_header_id|>\n\n{input}<|eot_id|>'
+                        '<|start_header_id|>assistant<|end_header_id|>\n\n'),
+                    SUFFIX='<|eot_id|>',
+                    SUFFIX_AS_EOS=True,
+                    STOP_WORDS=['<|eot_id|>'])
+            else:
+                self.prompt_template = PROMPT_TEMPLATE[prompt_template]
+            stop_words += self.prompt_template.get('STOP_WORDS', [])
+        else:
+            self.prompt_template = None
+
+        self.stop_criteria = StoppingCriteriaList()
+        for word in stop_words:
+            self.stop_criteria.append(
+                StopWordStoppingCriteria(self.tokenizer, word))
+
+    def build_gen_config(self, dataset):
+        gen_kwargs = dict(max_new_tokens=512,
+                          do_sample=True,
+                          temperature=1,
+                          num_beams=5,
+                          eos_token_id=self.tokenizer.eos_token_id,
+                          pad_token_id=self.tokenizer.pad_token_id
+                          if self.tokenizer.pad_token_id is not None else
+                          self.tokenizer.eos_token_id)
+        # For single word generation
+        if (dataset is not None
+                and DATASET_TYPE(dataset) in ['MCQ', 'Y/N']):
+            gen_kwargs.update(
+                dict(max_new_tokens=5, do_sample=False, num_beams=1))
+        return GenerationConfig(**gen_kwargs)
+
+    def use_custom_prompt(self, dataset):
+        assert dataset is not None
+        if DATASET_TYPE(dataset) == 'MCQ':
+            return True
+        return False
+
+    def build_prompt(self, line, dataset=None):
+        assert self.use_custom_prompt(dataset)
+        assert dataset is None or isinstance(dataset, str)
+        tgt_path = self.dump_image(line, dataset)
+
+        question = line['question']
+        hint = line['hint'] if ('hint' in line
+                                and not pd.isna(line['hint'])) else None
+        if hint is not None:
+            question = hint + '\n' + question
+
+        options = {
+            cand: line[cand]
+            for cand in string.ascii_uppercase
+            if cand in line and not pd.isna(line[cand])
+        }
+        for key, item in options.items():
+            question += f'\n{key}. {item}'
+
+        if not cn_string(question):
+            prompt = question + '\n' + ("Answer with the option's letter "
+                                        'from the given choices directly.')
+        else:
+            prompt = question + '\n' + '请直接回答选项字母。'
+
+        message = [dict(type='text', value=prompt)]
+        message.extend([dict(type='image', value=s) for s in tgt_path])
+        return message
+
+    def generate_inner(self, message, dataset=None):
+        from xtuner.dataset.utils import expand2square
+        from xtuner.model.utils import prepare_inputs_labels_for_multimodal
+        from xtuner.utils import DEFAULT_IMAGE_TOKEN, IMAGE_TOKEN_INDEX
+        prompt, image_path = self.message_to_promptimg(message, dataset=dataset)
+        prompt = prompt.replace('<image>', '')
+        image = Image.open(image_path).convert('RGB')
+        image = expand2square(
+            image,
+            tuple(int(x * 255) for x in self.image_processor.image_mean))
+        image = self.image_processor.preprocess(
+            image, return_tensors='pt')['pixel_values'][0]
+        image = image.cuda().unsqueeze(0)
+        visual_outputs = self.visual_encoder(image, output_hidden_states=True)
+        pixel_values = self.projector(
+            visual_outputs.hidden_states[self.visual_select_layer][:, 1:])
+
+        inputs = DEFAULT_IMAGE_TOKEN + '\n' + prompt
+
+        if self.prompt_template:
+            inputs = self.prompt_template['INSTRUCTION'].format(input=inputs)
+
+        chunk_encode = []
+        for idx, chunk in enumerate(inputs.split(DEFAULT_IMAGE_TOKEN)):
+            if idx == 0:
+                cur_encode = self.tokenizer(chunk)
+            else:
+                cur_encode = self.tokenizer(chunk, add_special_tokens=False)
+            chunk_encode.append(cur_encode)
+        assert len(chunk_encode) == 2
+        ids = []
+        for idx, cur_chunk_encode in enumerate(chunk_encode):
+            ids.extend(cur_chunk_encode['input_ids'])
+            if idx != len(chunk_encode) - 1:
+                ids.append(IMAGE_TOKEN_INDEX)
+        ids = torch.tensor(ids).cuda().unsqueeze(0)
+        mm_inputs = prepare_inputs_labels_for_multimodal(
+            llm=self.llm, input_ids=ids, pixel_values=pixel_values)
+
+        gen_config = self.build_gen_config(dataset)
+        generate_output = self.llm.generate(
+            **mm_inputs,
+            generation_config=gen_config,
+            streamer=None,
+            bos_token_id=self.tokenizer.bos_token_id,
+            stopping_criteria=self.stop_criteria)
+        predict = self.tokenizer.decode(generate_output[0],
+                                        skip_special_tokens=True).strip()
+        return predict
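A minimal usage sketch for LLaVA_XTuner (not part of the diff). The checkpoint layout follows what __init__ expects above (llm/, visual_encoder/, projector/ subfolders or a HF repo id); the local path and the prompt template key are illustrative assumptions.

    model = LLaVA_XTuner(
        llava_path="/path/to/llava_xtuner_checkpoint",  # hypothetical checkpoint directory
        prompt_template="vicuna",                        # any key defined in xtuner's PROMPT_TEMPLATE
    )
    message = [
        {"type": "image", "value": "demo.jpg"},          # hypothetical local image
        {"type": "text", "value": "What is shown in the picture?"},
    ]
    print(model.generate_inner(message))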
VLMEvalKit/vlmeval/vlm/misc/blip2_instruct_vicuna13b.yaml ADDED
@@ -0,0 +1,43 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: instruct_vicuna13b
+  load_finetuned: False
+  load_pretrained: True
+
+  pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna13b_trimmed.pth"
+  finetuned: ""
+
+  # vit encoder
+  image_size: 224
+  drop_path_rate: 0
+  use_grad_checkpoint: False
+  vit_precision: "fp16"
+  freeze_vit: True
+
+  # Q-Former
+  num_query_token: 32
+
+  # path to Vicuna checkpoint
+  llm_model: "Please set the path to your vicuna-13b-v1.1"
+
+  # generation configs
+  prompt: ""
+
+
+preprocess:
+  vis_processor:
+    train:
+      name: "blip2_image_train"
+      image_size: 224
+    eval:
+      name: "blip_image_eval"
+      image_size: 224
+  text_processor:
+    train:
+      name: "blip_caption"
+    eval:
+      name: "blip_caption"
VLMEvalKit/vlmeval/vlm/misc/blip2_instruct_vicuna7b.yaml ADDED
@@ -0,0 +1,43 @@
+ # Copyright (c) 2022, salesforce.com, inc.
+ # All rights reserved.
+ # SPDX-License-Identifier: BSD-3-Clause
+ # For full license text, see the LICENSE file in the repo root or https://opensource.org/licenses/BSD-3-Clause
+
+model:
+  arch: instruct_vicuna7b
+  load_finetuned: False
+  load_pretrained: True
+
+  pretrained: "https://storage.googleapis.com/sfr-vision-language-research/LAVIS/models/InstructBLIP/instruct_blip_vicuna7b_trimmed.pth"
+  finetuned: ""
+
+  # vit encoder
+  image_size: 224
+  drop_path_rate: 0
+  use_grad_checkpoint: False
+  vit_precision: "fp16"
+  freeze_vit: True
+
+  # Q-Former
+  num_query_token: 32
+
+  # path to Vicuna checkpoint
+  llm_model: "Please set the path to your vicuna-7b-v1.1"
+
+  # generation configs
+  prompt: ""
+
+
+preprocess:
+  vis_processor:
+    train:
+      name: "blip2_image_train"
+      image_size: 224
+    eval:
+      name: "blip_image_eval"
+      image_size: 224
+  text_processor:
+    train:
+      name: "blip_caption"
+    eval:
+      name: "blip_caption"
VLMEvalKit/vlmeval/vlm/misc/minigpt4_13b_eval.yaml ADDED
@@ -0,0 +1,37 @@
+model:
+  arch: minigpt4
+  model_type: pretrain_vicuna_7b
+  max_txt_len: 160
+  end_sym: "###"
+  low_resource: True
+  prompt_template: '###Human: {} ###Assistant: '
+  ckpt: "please set this value to the path of pretrained checkpoint"
+
+  # vit encoder
+  image_size: 224
+  drop_path_rate: 0
+  use_grad_checkpoint: False
+  vit_precision: "fp16"
+  freeze_vit: True
+  freeze_qformer: True
+
+  # Q-Former
+  num_query_token: 32
+
+  # generation configs
+  prompt: ""
+
+  llama_model: "please set this value to the path of vicuna-13b-v0"
+
+datasets:
+  cc_sbu_align:
+    vis_processor:
+      train:
+        name: "blip2_image_eval"
+        image_size: 224
+    text_processor:
+      train:
+        name: "blip_caption"
+
+run:
+  task: image_text_pretrain
VLMEvalKit/vlmeval/vlm/misc/minigpt4_7b_eval.yaml ADDED
@@ -0,0 +1,38 @@
+model:
+  arch: minigpt4
+  model_type: pretrain_vicuna_7b
+  max_txt_len: 160
+  end_sym: "###"
+  low_resource: True
+  prompt_template: '###Human: {} ###Assistant: '
+  ckpt: "please set this value to the path of pretrained checkpoint"
+
+  # vit encoder
+  image_size: 224
+  drop_path_rate: 0
+  use_grad_checkpoint: False
+  vit_precision: "fp16"
+  freeze_vit: True
+  freeze_qformer: True
+
+  # Q-Former
+  num_query_token: 32
+
+  # generation configs
+  prompt: ""
+
+  llama_model: "please set this value to the path of vicuna-7b-v0"
+
+
+datasets:
+  cc_sbu_align:
+    vis_processor:
+      train:
+        name: "blip2_image_eval"
+        image_size: 224
+    text_processor:
+      train:
+        name: "blip_caption"
+
+run:
+  task: image_text_pretrain
VLMEvalKit/vlmeval/vlm/misc/minigptv2_eval.yaml ADDED
@@ -0,0 +1,36 @@
+model:
+  arch: minigpt_v2
+  model_type: pretrain
+  max_txt_len: 160
+  end_sym: "</s>"
+  low_resource: True
+  prompt_template: '[INST] {} [/INST]'
+  ckpt: "please set this value to the path of pretrained checkpoint"
+  lora_r: 64
+  lora_alpha: 16
+
+  # vit encoder
+  image_size: 448
+  drop_path_rate: 0
+  use_grad_checkpoint: False
+  vit_precision: "fp16"
+  freeze_vit: True
+
+  # generation configs
+  prompt: ""
+
+  # LLM
+  llama_model: "please set this value to the path of llama2-chat-7b"
+
+datasets:
+  cc_sbu_align:
+    vis_processor:
+      train:
+        name: "blip2_image_eval"
+        image_size: 448
+    text_processor:
+      train:
+        name: "blip_caption"
+
+run:
+  task: image_text_pretrain
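The BLIP-2 and MiniGPT-4 configs above ship with placeholder values ("Please set the path ...") that must be filled in before evaluation. A minimal sketch of doing that programmatically with PyYAML (file name and checkpoint paths are illustrative assumptions):

    import yaml

    # Load one of the eval configs shown above and patch its placeholder paths.
    with open("minigpt4_7b_eval.yaml") as f:
        cfg = yaml.safe_load(f)
    cfg["model"]["ckpt"] = "/path/to/minigpt4_stage2_checkpoint.pth"   # hypothetical path
    cfg["model"]["llama_model"] = "/path/to/vicuna-7b-v0"              # hypothetical path
    with open("minigpt4_7b_eval.yaml", "w") as f:
        yaml.safe_dump(cfg, f, sort_keys=False)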