Update model files
Files changed:
- .vscode/settings.json (+3, -0)
- README.md (+1, -8)
- processing_keye_vl_1_5.py (+10, -2)
.vscode/settings.json
ADDED
@@ -0,0 +1,3 @@
+{
+    "kwaipilot.settings.proxy": "https://kinsight.corp.kuaishou.com"
+}
README.md
CHANGED
@@ -1,10 +1,3 @@
----
-license: apache-2.0
-language:
-- zh
-- en
-pipeline_tag: image-text-to-text
----
 # Kwai Keye-VL
 
 
@@ -12,7 +5,7 @@ pipeline_tag: image-text-to-text
 <img src="asset/keye_logo_2.png" width="100%" alt="Kwai Keye-VL Logo">
 </div>
 
-<font size=
+<font size=7><div align='center' >
 [[🍎 Home Page](https://kwai-keye.github.io/)]
 [[📖 Technique Report](https://arxiv.org/abs/2507.01949)]
 [[📊 Keye-VL-8B-Preview](https://huggingface.co/Kwai-Keye/Keye-VL-8B-Preview) ]
processing_keye_vl_1_5.py
CHANGED
@@ -234,10 +234,14 @@ class KeyeVL1_5Processor(ProcessorMixin):
                     mode="bilinear",
                     antialias=True,
                 ).float()
+                do_resize = False
+            else:
+                slow_frames = slow_frames.float()
+                do_resize = True
             # Tensor(N, C, H, W) -> Tuple[Tensor(1, C, H, W)]
             # slow_frames = list(slow_frames.split(1, dim=0)); don't split here, it is done inside the model
             slow_video_inputs = self.image_processor(
-                images=None, videos=[slow_frames], **output_kwargs["images_kwargs"], do_resize=
+                images=None, videos=[slow_frames], **output_kwargs["images_kwargs"], do_resize=do_resize)
             slow_video_grid_thw = slow_video_inputs["video_grid_thw"]
             batch_slow_frames.append(slow_video_inputs)
             # # number of tokens per frame of the current video
@@ -255,10 +259,14 @@ class KeyeVL1_5Processor(ProcessorMixin):
                     mode="bilinear",
                     antialias=True,
                 ).float()
+                do_fast_resize = False
+            else:
+                fast_frames = fast_frames.float()
+                do_fast_resize = True
             # Tensor(N, C, H, W) -> Tuple[Tensor(1, C, H, W)]
             # fast_frames = list(fast_frames.split(1, dim=0))
             fast_video_inputs = self.image_processor(
-                images=None, videos=[fast_frames], **output_kwargs["images_kwargs"], do_resize=
+                images=None, videos=[fast_frames], **output_kwargs["images_kwargs"], do_resize=do_fast_resize)
             fast_video_grid_thw = fast_video_inputs["video_grid_thw"]
             batch_fast_frames.append(fast_video_inputs)
             # # total number of tokens for the current video
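
In effect, both hunks make the processor tell self.image_processor whether it still needs to resize the frames: when the slow/fast frames were already interpolated to the target size, do_resize is passed as False; otherwise the frames are only cast to float and the image processor resizes as usual. Below is a minimal sketch of that control flow, assuming a hypothetical standalone helper and a target_size argument (neither is part of the actual KeyeVL1_5Processor API):

# Sketch only: mirrors the do_resize handoff above with a hypothetical helper.
import torch
import torch.nn.functional as F

def preprocess_clip(frames: torch.Tensor, image_processor, images_kwargs: dict,
                    target_size: tuple[int, int] | None = None):
    """frames: Tensor(N, C, H, W) for one video clip."""
    if target_size is not None:
        # Frames are resized here with bilinear interpolation, so the image
        # processor must not resize them a second time.
        frames = F.interpolate(
            frames, size=target_size, mode="bilinear", antialias=True
        ).float()
        do_resize = False
    else:
        # No manual resize: cast to float and let the image processor resize.
        frames = frames.float()
        do_resize = True
    # The whole clip stays a single (N, C, H, W) tensor; splitting into
    # per-frame tensors is left to the model, as noted in the comments above.
    return image_processor(
        images=None, videos=[frames], **images_kwargs, do_resize=do_resize
    )

Passing the flag this way avoids resizing twice on the already-interpolated path while preserving the image processor's own resizing for raw frames.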