adjust image processing for batch output (#63)
Browse files
- adjust image processing for batch output (4c9f169e66625665bd1be7ac1b2847b8af50ac2c)
- add version in config.json (4e29d338dc2de73cc2d7dc7e9ef736adeb70cace)
- config.json +1 -0
- image_processing_minicpmv.py +1 -1
- processing_minicpmv.py +9 -12
config.json
CHANGED
|
@@ -1,5 +1,6 @@
|
|
| 1 |
{
|
| 2 |
"_name_or_path": "openbmb/MiniCPM-Llama3-V-2_5",
|
|
|
|
| 3 |
"architectures": [
|
| 4 |
"MiniCPMV"
|
| 5 |
],
|
|
|
|
| 1 |
{
|
| 2 |
"_name_or_path": "openbmb/MiniCPM-Llama3-V-2_5",
|
| 3 |
+
"version": "2.5",
|
| 4 |
"architectures": [
|
| 5 |
"MiniCPMV"
|
| 6 |
],
|
image_processing_minicpmv.py
CHANGED
|
@@ -396,7 +396,7 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
|
|
| 396 |
if tgt_sizes:
|
| 397 |
tgt_sizes = np.vstack(tgt_sizes)
|
| 398 |
return MiniCPMVBatchFeature(
|
| 399 |
-
data={"pixel_values": new_images, "image_sizes": image_sizes, "tgt_sizes": tgt_sizes}, tensor_type=return_tensors
|
| 400 |
)
|
| 401 |
|
| 402 |
AutoImageProcessor.register("MiniCPMVImageProcessor", MiniCPMVImageProcessor)
|
|
|
|
| 396 |
if tgt_sizes:
|
| 397 |
tgt_sizes = np.vstack(tgt_sizes)
|
| 398 |
return MiniCPMVBatchFeature(
|
| 399 |
+
data={"pixel_values": [new_images], "image_sizes": [image_sizes], "tgt_sizes": [tgt_sizes]}, tensor_type=return_tensors
|
| 400 |
)
|
| 401 |
|
| 402 |
AutoImageProcessor.register("MiniCPMVImageProcessor", MiniCPMVImageProcessor)
|
processing_minicpmv.py
CHANGED
|
@@ -61,14 +61,10 @@ class MiniCPMVProcessor(ProcessorMixin):
|
|
| 61 |
return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
|
| 62 |
) -> MiniCPMVBatchFeature:
|
| 63 |
"""
|
| 64 |
-
|
| 65 |
-
and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
|
| 66 |
-
the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
|
| 67 |
-
LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
|
| 68 |
-
of the above two methods for more information.
|
| 69 |
|
| 70 |
Args:
|
| 71 |
-
text (`str
|
| 72 |
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
| 73 |
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
| 74 |
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
|
@@ -176,19 +172,19 @@ class MiniCPMVProcessor(ProcessorMixin):
|
|
| 176 |
images, image_sizes, tgt_sizes = images["pixel_values"], images["image_sizes"], images["tgt_sizes"]
|
| 177 |
|
| 178 |
image_tags = re.findall(pattern, texts)
|
| 179 |
-
assert len(image_tags) == len(image_sizes)
|
| 180 |
text_chunks = texts.split(pattern)
|
| 181 |
final_texts = ""
|
| 182 |
for i in range(len(image_tags)):
|
| 183 |
-
final_texts = final_texts + text_chunks[i] + self.image_processor.get_slice_image_placeholder(image_sizes[i])
|
| 184 |
final_texts += text_chunks[-1]
|
| 185 |
input_ids, image_bounds = self._convert(final_texts, max_length)
|
| 186 |
return MiniCPMVBatchFeature(data={
|
| 187 |
"input_ids": input_ids,
|
| 188 |
-
"pixel_values":
|
| 189 |
-
"image_sizes":
|
| 190 |
"image_bound": [image_bounds],
|
| 191 |
-
"tgt_sizes":
|
| 192 |
})
|
| 193 |
|
| 194 |
@property
|
|
@@ -244,4 +240,5 @@ class MiniCPMVProcessor(ProcessorMixin):
|
|
| 244 |
else:
|
| 245 |
tensor[i, : len(item[key][0]), :] = item[key][0].clone()
|
| 246 |
|
| 247 |
-
return tensor
|
|
|
|
|
|
| 61 |
return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
|
| 62 |
) -> MiniCPMVBatchFeature:
|
| 63 |
"""
|
| 64 |
+
Only a single input is supported for now. Batched input is coming soon.
|
|
|
|
|
|
|
|
|
|
|
|
|
| 65 |
|
| 66 |
Args:
|
| 67 |
+
text (`str`):
|
| 68 |
The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
|
| 69 |
(pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
|
| 70 |
`is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
|
|
|
|
| 172 |
images, image_sizes, tgt_sizes = images["pixel_values"], images["image_sizes"], images["tgt_sizes"]
|
| 173 |
|
| 174 |
image_tags = re.findall(pattern, texts)
|
| 175 |
+
assert len(image_tags) == len(image_sizes[0])
|
| 176 |
text_chunks = texts.split(pattern)
|
| 177 |
final_texts = ""
|
| 178 |
for i in range(len(image_tags)):
|
| 179 |
+
final_texts = final_texts + text_chunks[i] + self.image_processor.get_slice_image_placeholder(image_sizes[0][i])
|
| 180 |
final_texts += text_chunks[-1]
|
| 181 |
input_ids, image_bounds = self._convert(final_texts, max_length)
|
| 182 |
return MiniCPMVBatchFeature(data={
|
| 183 |
"input_ids": input_ids,
|
| 184 |
+
"pixel_values": images,
|
| 185 |
+
"image_sizes": image_sizes,
|
| 186 |
"image_bound": [image_bounds],
|
| 187 |
+
"tgt_sizes": tgt_sizes
|
| 188 |
})
|
| 189 |
|
| 190 |
@property
|
|
|
|
| 240 |
else:
|
| 241 |
tensor[i, : len(item[key][0]), :] = item[key][0].clone()
|
| 242 |
|
| 243 |
+
return tensor
|
| 244 |
+
|