hezhihui committed
Commit: 4c9f169
Parent(s): 3b6aeff

adjust image processing for batch output

Files changed:
- image_processing_minicpmv.py (+1, -1)
- processing_minicpmv.py (+9, -12)
image_processing_minicpmv.py CHANGED

@@ -396,7 +396,7 @@ class MiniCPMVImageProcessor(BaseImageProcessor):
         if tgt_sizes:
             tgt_sizes = np.vstack(tgt_sizes)
         return MiniCPMVBatchFeature(
-            data={"pixel_values": new_images, "image_sizes": image_sizes, "tgt_sizes": tgt_sizes}, tensor_type=return_tensors
+            data={"pixel_values": [new_images], "image_sizes": [image_sizes], "tgt_sizes": [tgt_sizes]}, tensor_type=return_tensors
         )
 
 AutoImageProcessor.register("MiniCPMVImageProcessor", MiniCPMVImageProcessor)
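In effect, the image processor now nests every output field one level deeper, so a single image comes back as a batch of size one. A minimal sketch of what a caller would see after this change, assuming the processor is loaded through `AutoImageProcessor` with `trust_remote_code=True`; the checkpoint id "openbmb/MiniCPM-V" and the file name "example.jpg" are illustrative placeholders, not taken from this commit:

# Sketch only: the model id and image path below are assumptions.
from PIL import Image
from transformers import AutoImageProcessor

image_processor = AutoImageProcessor.from_pretrained(
    "openbmb/MiniCPM-V", trust_remote_code=True
)
image = Image.open("example.jpg").convert("RGB")

features = image_processor([image], return_tensors=None)

# After this commit each field is wrapped in an outer list, so index 0
# selects the single item in the batch:
pixel_values = features["pixel_values"][0]  # per-image slice tensors
image_sizes = features["image_sizes"][0]    # original image sizes
tgt_sizes = features["tgt_sizes"][0]        # np.vstack-ed target patch-grid sizes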
processing_minicpmv.py CHANGED

@@ -61,14 +61,10 @@ class MiniCPMVProcessor(ProcessorMixin):
         return_tensors: Optional[Union[str, TensorType]] = TensorType.PYTORCH,
     ) -> MiniCPMVBatchFeature:
         """
-        Main method to prepare for the model one or several sequences(s) and image(s). This method forwards the `text`
-        and `kwargs` arguments to LlamaTokenizerFast's [`~LlamaTokenizerFast.__call__`] if `text` is not `None` to encode
-        the text. To prepare the image(s), this method forwards the `images` and `kwrags` arguments to
-        LlavaNextImageProcessor's [`~LlavaNextImageProcessor.__call__`] if `images` is not `None`. Please refer to the doctsring
-        of the above two methods for more information.
+        Only support for single input for now. Batched input is coming soon.
 
         Args:
-            text (`str`, `List[str]`, `List[List[str]]`):
+            text (`str`):
                 The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                 (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                 `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
 
@@ -176,19 +172,19 @@ class MiniCPMVProcessor(ProcessorMixin):
         images, image_sizes, tgt_sizes = images["pixel_values"], images["image_sizes"], images["tgt_sizes"]
 
         image_tags = re.findall(pattern, texts)
-        assert len(image_tags) == len(image_sizes)
+        assert len(image_tags) == len(image_sizes[0])
         text_chunks = texts.split(pattern)
         final_texts = ""
         for i in range(len(image_tags)):
-            final_texts = final_texts + text_chunks[i] + self.image_processor.get_slice_image_placeholder(image_sizes[i])
+            final_texts = final_texts + text_chunks[i] + self.image_processor.get_slice_image_placeholder(image_sizes[0][i])
         final_texts += text_chunks[-1]
         input_ids, image_bounds = self._convert(final_texts, max_length)
         return MiniCPMVBatchFeature(data={
             "input_ids": input_ids,
-            "pixel_values": [images],
-            "image_sizes": [image_sizes],
+            "pixel_values": images,
+            "image_sizes": image_sizes,
             "image_bound": [image_bounds],
-            "tgt_sizes": [tgt_sizes]
+            "tgt_sizes": tgt_sizes
         })
 
     @property
 
@@ -244,4 +240,5 @@ class MiniCPMVProcessor(ProcessorMixin):
             else:
                 tensor[i, : len(item[key][0]), :] = item[key][0].clone()
 
-        return tensor
+        return tensor
+
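Taken together, the two files now agree on the layout: the image processor emits batch-wrapped fields, and the processor reads batch slot 0, which is why the docstring restricts the call to a single input for now. A hedged end-to-end sketch; the model id, the image-tag placeholder "(<image>./</image>)", and the exact call signature are assumptions, not taken from this diff:

# Sketch only: identifiers below are assumptions, not from this commit.
from PIL import Image
from transformers import AutoProcessor

processor = AutoProcessor.from_pretrained(
    "openbmb/MiniCPM-V", trust_remote_code=True
)
image = Image.open("example.jpg").convert("RGB")

# One image tag in the text must match one entry in image_sizes[0],
# per the assert added in this commit.
text = "(<image>./</image>)\nDescribe this picture."

inputs = processor(text=text, images=[image], return_tensors="pt")
print(inputs["input_ids"])    # token ids with slice placeholders expanded
print(inputs["image_bound"])  # [image_bounds] for the single sequence

Because the assertion compares the tags found in one text string against image_sizes[0], a batch of several texts would not pass it; that matches the "single input for now" note added to the docstring.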