Update model files
- .vscode/settings.json +3 -0
- README.md +1 -8
- processing_keye_vl_1_5.py +10 -2
.vscode/settings.json
ADDED
@@ -0,0 +1,3 @@
+{
+    "kwaipilot.settings.proxy": "https://kinsight.corp.kuaishou.com"
+}
README.md
CHANGED
@@ -1,10 +1,3 @@
----
-license: apache-2.0
-language:
-- zh
-- en
-pipeline_tag: image-text-to-text
----
 # Kwai Keye-VL
 
 
@@ -12,7 +5,7 @@ pipeline_tag: image-text-to-text
     <img src="asset/keye_logo_2.png" width="100%" alt="Kwai Keye-VL Logo">
 </div>
 
-<font size=
+<font size=7><div align='center' >
 [[🍎 Home Page](https://kwai-keye.github.io/)]
 [[📖 Technique Report](https://arxiv.org/abs/2507.01949)]
 [[📊 Keye-VL-8B-Preview](https://huggingface.co/Kwai-Keye/Keye-VL-8B-Preview) ]
processing_keye_vl_1_5.py
CHANGED
@@ -234,10 +234,14 @@ class KeyeVL1_5Processor(ProcessorMixin):
                 mode="bilinear",
                 antialias=True,
             ).float()
+            do_resize = False
+        else:
+            slow_frames = slow_frames.float()
+            do_resize = True
         # Tensor(N, C, H, W) -> Tuple[Tensor(1, C, H, W)]
         # slow_frames = list(slow_frames.split(1, dim=0)); do not split here, it is done inside the model
         slow_video_inputs = self.image_processor(
-            images=None, videos=[slow_frames], **output_kwargs["images_kwargs"], do_resize=
+            images=None, videos=[slow_frames], **output_kwargs["images_kwargs"], do_resize=do_resize)
         slow_video_grid_thw = slow_video_inputs["video_grid_thw"]
         batch_slow_frames.append(slow_video_inputs)
         # # number of tokens per frame of the current video
@@ -255,10 +259,14 @@ class KeyeVL1_5Processor(ProcessorMixin):
                 mode="bilinear",
                 antialias=True,
             ).float()
+            do_fast_resize = False
+        else:
+            fast_frames = fast_frames.float()
+            do_fast_resize = True
         # Tensor(N, C, H, W) -> Tuple[Tensor(1, C, H, W)]
         # fast_frames = list(fast_frames.split(1, dim=0))
         fast_video_inputs = self.image_processor(
-            images=None, videos=[fast_frames], **output_kwargs["images_kwargs"], do_resize=
+            images=None, videos=[fast_frames], **output_kwargs["images_kwargs"], do_resize=do_fast_resize)
         fast_video_grid_thw = fast_video_inputs["video_grid_thw"]
         batch_fast_frames.append(fast_video_inputs)
         # # total number of tokens for the current video