Copy from moonshotai/Kimi-VL-A3B-Thinking
Files changed:
- .gitattributes +5 -0
- README.md +143 -0
- chat_template.jinja +31 -0
- config.json +75 -0
- configuration_kimi_vl.py +284 -0
- generation_config.json +9 -0
- image_processing_kimi_vl.py +126 -0
- model.safetensors.index.json +0 -0
- modeling_kimi_vl.py +0 -0
- preprocessor_config.json +20 -0
- processing_kimi_vl.py +170 -0
- tiktoken.model +3 -0
- tokenization_moonshot.py +302 -0
- tokenizer_config.json +134 -0
.gitattributes
CHANGED
@@ -33,3 +33,8 @@ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
 *.zip filter=lfs diff=lfs merge=lfs -text
 *.zst filter=lfs diff=lfs merge=lfs -text
 *tfevents* filter=lfs diff=lfs merge=lfs -text
+arch.png filter=lfs diff=lfs merge=lfs -text
+instruct_perf.png filter=lfs diff=lfs merge=lfs -text
+thinking_perf.png filter=lfs diff=lfs merge=lfs -text
+kimi-vl-general-perf.png filter=lfs diff=lfs merge=lfs -text
+figures/*.png filter=lfs diff=lfs merge=lfs -text
README.md
ADDED
@@ -0,0 +1,143 @@
---
base_model:
- moonshotai/Kimi-VL-A3B-Instruct
license: mit
pipeline_tag: image-text-to-text
library_name: transformers
---

<div align="center">
<img width="30%" src="figures/logo.png">
</div>

<div align="center">
<a href="https://arxiv.org/abs/2504.07491">
<b>📄 Tech Report</b>
</a> |
<a href="https://github.com/MoonshotAI/Kimi-VL">
<b>📄 Github</b>
</a> |
<a href="https://huggingface.co/spaces/moonshotai/Kimi-VL-A3B-Thinking/">💬 Chat Web</a>
</div>

## 1. Introduction

We present **Kimi-VL**, an efficient open-source Mixture-of-Experts (MoE) vision-language model (VLM) that offers **advanced multimodal reasoning, long-context understanding, and strong agent capabilities**—all while activating only **2.8B** parameters in its language decoder (Kimi-VL-A3B).

Kimi-VL demonstrates strong performance across challenging domains:
as a general-purpose VLM, Kimi-VL excels in multi-turn agent interaction tasks (e.g., OSWorld), achieving state-of-the-art results comparable to flagship models.
Furthermore, it exhibits remarkable capabilities across diverse challenging vision-language tasks, including college-level image and video comprehension, optical character recognition (OCR), mathematical reasoning, and multi-image understanding.

In comparative evaluations, it effectively competes with cutting-edge efficient VLMs such as GPT-4o-mini, Qwen2.5-VL-7B, and Gemma-3-12B-IT, while surpassing GPT-4o in several specialized domains.

Kimi-VL also advances the Pareto frontier of multimodal models in processing long contexts and perceiving clearly: equipped with a 128K extended context window, Kimi-VL can process long and diverse inputs, achieving impressive scores of 64.5 on LongVideoBench and 35.1 on MMLongBench-Doc; its native-resolution vision encoder, MoonViT, further allows it to see and understand ultra-high-resolution visual inputs, achieving 83.2 on InfoVQA and 34.5 on ScreenSpot-Pro, while maintaining lower computational cost for common visual inputs and general tasks.

Building on this foundation, we introduce an advanced long-thinking variant: **Kimi-VL-Thinking**. Developed through long chain-of-thought (CoT) supervised fine-tuning (SFT) and reinforcement learning (RL), this model exhibits strong long-horizon reasoning capabilities. It achieves scores of 61.7 on MMMU, 36.8 on MathVision, and 71.3 on MathVista while maintaining the compact 2.8B activated LLM parameter footprint, setting a new standard for efficient yet capable multimodal **thinking** models.

More information can be found in our technical report: [Kimi-VL Technical Report](https://arxiv.org/abs/2504.07491).

## 2. Architecture

The model consists of an MoE language model, a native-resolution visual encoder (MoonViT), and an MLP projector, as illustrated in the following image.

<div align="center">
<img width="90%" src="figures/arch.png">
</div>

## 3. Model Variants

🤗 For general multimodal perception and understanding, OCR, long video and long document understanding, video perception, and agent use cases, we recommend `Kimi-VL-A3B-Instruct` for efficient inference; for advanced text and multimodal reasoning (e.g. math), please consider using `Kimi-VL-A3B-Thinking`.

<div align="center">

| **Model** | **#Total Params** | **#Activated Params** | **Context Length** | **Download Link** |
| :------------: | :------------: | :------------: | :------------: | :------------: |
| Kimi-VL-A3B-Instruct | 16B | 3B | 128K | [🤗 Hugging Face](https://huggingface.co/moonshotai/Kimi-VL-A3B-Instruct) |
| Kimi-VL-A3B-Thinking | 16B | 3B | 128K | [🤗 Hugging Face](https://huggingface.co/moonshotai/Kimi-VL-A3B-Thinking) |

</div>

> [!Note]
> Recommended parameter settings:
> - For **Thinking models**, it is recommended to use `Temperature = 0.6`.
> - For **Instruct models**, it is recommended to use `Temperature = 0.2`.

## 4. Performance

With effective long-thinking abilities, Kimi-VL-A3B-Thinking can match the performance of 30B/70B frontier open-source VLMs on the MathVision benchmark:

<div align="center">
<img width="100%" src="figures/thinking_perf.png">
</div>

Full comparison on MMMU, MathVision, and MathVista-mini:

<div align="center">

| Benchmark (Metric) | GPT-4o | GPT-4o-mini | Qwen2.5-VL-72B | Qwen2.5-VL-7B | Gemma-3-27B | Gemma-3-12B | o1-1217 | QVQ-72B | Kimi-k1.5 | Kimi-VL-Thinking-A3B |
|---------------------------------|--------|-------------|----------------|---------------|-------------|-------------|---------|----------|-----------|----------------------|
| *Thinking Model?* | | | | | | | ✅ | ✅ | ✅ | ✅ |
| MathVision (full) (Pass@1) | 30.4 | - | 38.1 | 25.1 | 35.5 | 32.1 | - | 35.9 | 38.6 | 36.8 |
| MathVista (mini) (Pass@1) | 63.8 | 56.7 | 74.8 | 68.2 | 62.3 | 56.4 | 71.0 | 71.4 | 74.9 | 71.3 |
| MMMU (val) (Pass@1) | 69.1 | 60.0 | 74.8 | 58.6 | 64.8 | 59.6 | 77.3 | 70.3 | 70.0 | 61.7 |

</div>

### Inference with 🤗 Hugging Face Transformers

Below we show how to run the model at inference time with the `transformers` library. It is recommended to use python=3.10, torch>=2.1.0, and transformers=4.48.2 as the development environment.

```python
from PIL import Image
from transformers import AutoModelForCausalLM, AutoProcessor

model_path = "moonshotai/Kimi-VL-A3B-Thinking"
model = AutoModelForCausalLM.from_pretrained(
    model_path,
    torch_dtype="auto",
    device_map="auto",
    trust_remote_code=True,
)
processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

image_paths = ["./figures/demo1.png", "./figures/demo2.png"]
images = [Image.open(path) for path in image_paths]
messages = [
    {
        "role": "user",
        "content": [
            {"type": "image", "image": image_path} for image_path in image_paths
        ] + [{"type": "text", "text": "Please infer step by step who this manuscript belongs to and what it records"}],
    },
]
text = processor.apply_chat_template(messages, add_generation_prompt=True, return_tensors="pt")
inputs = processor(images=images, text=text, return_tensors="pt", padding=True, truncation=True).to(model.device)
generated_ids = model.generate(**inputs, max_new_tokens=2048)
generated_ids_trimmed = [
    out_ids[len(in_ids):] for in_ids, out_ids in zip(inputs.input_ids, generated_ids)
]
response = processor.batch_decode(
    generated_ids_trimmed, skip_special_tokens=True, clean_up_tokenization_spaces=False
)[0]
print(response)
```

### Inference with VLLM

We have submitted a merge request [#16387](https://github.com/vllm-project/vllm/pull/16387) to vLLM. Until it is merged, you are welcome to deploy Kimi-VL from the branch corresponding to that PR; a rough sketch of offline usage follows.

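The snippet below is only a sketch under the assumption that the PR branch is installed and exposes Kimi-VL through vLLM's standard offline API; the vLLM calls shown are standard, but Kimi-VL support itself is not merged yet, and multimodal inputs would additionally require vLLM's multi-modal request format.

```python
# Hypothetical sketch: text-only offline generation with vLLM, assuming the Kimi-VL PR branch is installed.
from vllm import LLM, SamplingParams

llm = LLM(model="moonshotai/Kimi-VL-A3B-Thinking", trust_remote_code=True)
params = SamplingParams(temperature=0.6, max_tokens=2048)  # 0.6 as recommended for Thinking models
outputs = llm.generate(["Describe the Kimi-VL architecture in one sentence."], params)
print(outputs[0].outputs[0].text)
```
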
## 5. Citation

```
@misc{kimiteam2025kimivltechnicalreport,
      title={{Kimi-VL} Technical Report},
      author={Kimi Team and Angang Du and Bohong Yin and Bowei Xing and Bowen Qu and Bowen Wang and Cheng Chen and Chenlin Zhang and Chenzhuang Du and Chu Wei and Congcong Wang and Dehao Zhang and Dikang Du and Dongliang Wang and Enming Yuan and Enzhe Lu and Fang Li and Flood Sung and Guangda Wei and Guokun Lai and Han Zhu and Hao Ding and Hao Hu and Hao Yang and Hao Zhang and Haoning Wu and Haotian Yao and Haoyu Lu and Heng Wang and Hongcheng Gao and Huabin Zheng and Jiaming Li and Jianlin Su and Jianzhou Wang and Jiaqi Deng and Jiezhong Qiu and Jin Xie and Jinhong Wang and Jingyuan Liu and Junjie Yan and Kun Ouyang and Liang Chen and Lin Sui and Longhui Yu and Mengfan Dong and Mengnan Dong and Nuo Xu and Pengyu Cheng and Qizheng Gu and Runjie Zhou and Shaowei Liu and Sihan Cao and Tao Yu and Tianhui Song and Tongtong Bai and Wei Song and Weiran He and Weixiao Huang and Weixin Xu and Xiaokun Yuan and Xingcheng Yao and Xingzhe Wu and Xinxing Zu and Xinyu Zhou and Xinyuan Wang and Y. Charles and Yan Zhong and Yang Li and Yangyang Hu and Yanru Chen and Yejie Wang and Yibo Liu and Yibo Miao and Yidao Qin and Yimin Chen and Yiping Bao and Yiqin Wang and Yongsheng Kang and Yuanxin Liu and Yulun Du and Yuxin Wu and Yuzhi Wang and Yuzi Yan and Zaida Zhou and Zhaowei Li and Zhejun Jiang and Zheng Zhang and Zhilin Yang and Zhiqi Huang and Zihao Huang and Zijia Zhao and Ziwei Chen},
      year={2025},
      eprint={2504.07491},
      archivePrefix={arXiv},
      primaryClass={cs.CV},
      url={https://arxiv.org/abs/2504.07491},
}
```
chat_template.jinja
ADDED
@@ -0,0 +1,31 @@
{%- for message in messages -%}
    {%- if loop.first and messages[0]['role'] != 'system' -%}
        {{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}
    {%- endif -%}
    {%- if message['role'] == 'system' -%}
        {{'<|im_system|>'}}
    {%- endif -%}
    {%- if message['role'] == 'user' -%}
        {{'<|im_user|>'}}
    {%- endif -%}
    {%- if message['role'] == 'assistant' -%}
        {{'<|im_assistant|>'}}
    {%- endif -%}
    {{- message['role'] -}}
    {{'<|im_middle|>'}}
    {%- if message['content'] is string -%}
        {{- message['content'] + '<|im_end|>' -}}
    {%- else -%}
        {%- for content in message['content'] -%}
            {%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}
                {{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}
            {%- else -%}
                {{content['text']}}
            {%- endif -%}
        {%- endfor -%}
        {{'<|im_end|>'}}
    {%- endif -%}
{%- endfor -%}
{%- if add_generation_prompt -%}
    {{'<|im_assistant|>assistant<|im_middle|>'}}
{%- endif -%}
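For intuition, here is a minimal sketch of what this template renders for a hypothetical single-turn conversation, using a processor loaded as in the README example:

```python
# Minimal sketch: render the chat template for a hypothetical one-turn conversation.
messages = [{"role": "user", "content": [{"type": "text", "text": "Hello"}]}]
prompt = processor.apply_chat_template(messages, add_generation_prompt=True)
# Expected rendering (no system message, so the default one is injected):
# <|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|><|im_user|>user<|im_middle|>Hello<|im_end|><|im_assistant|>assistant<|im_middle|>
```
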
config.json
ADDED
@@ -0,0 +1,75 @@
{
  "architectures": [
    "KimiVLForConditionalGeneration"
  ],
  "auto_map": {
    "AutoConfig": "configuration_kimi_vl.KimiVLConfig",
    "AutoModel": "modeling_kimi_vl.KimiVLForConditionalGeneration",
    "AutoModelForCausalLM": "modeling_kimi_vl.KimiVLForConditionalGeneration"
  },
  "vision_config": {
    "model_type": "moonvit",
    "patch_size": 14,
    "num_attention_heads": 16,
    "num_hidden_layers": 27,
    "hidden_size": 1152,
    "intermediate_size": 4304,
    "init_pos_emb_height": 64,
    "init_pos_emb_width": 64,
    "merge_kernel_size": [
      2,
      2
    ],
    "torch_dtype": "bfloat16"
  },
  "text_config": {
    "vocab_size": 163840,
    "max_position_embeddings": 131072,
    "hidden_size": 2048,
    "intermediate_size": 11264,
    "moe_intermediate_size": 1408,
    "num_hidden_layers": 27,
    "num_attention_heads": 16,
    "n_shared_experts": 2,
    "n_routed_experts": 64,
    "ep_size": 1,
    "routed_scaling_factor": 2.446,
    "kv_lora_rank": 512,
    "q_lora_rank": null,
    "qk_rope_head_dim": 64,
    "v_head_dim": 128,
    "qk_nope_head_dim": 128,
    "topk_method": "noaux_tc",
    "n_group": 1,
    "topk_group": 1,
    "num_experts_per_tok": 6,
    "moe_layer_freq": 1,
    "first_k_dense_replace": 1,
    "norm_topk_prob": true,
    "scoring_func": "sigmoid",
    "aux_loss_alpha": 0.001,
    "seq_aux": true,
    "num_key_value_heads": 16,
    "hidden_act": "silu",
    "initializer_range": 0.02,
    "rms_norm_eps": 1e-05,
    "pretraining_tp": 1,
    "use_cache": true,
    "rope_theta": 800000.0,
    "rope_scaling": null,
    "attention_bias": false,
    "attention_dropout": 0.0,
    "bos_token_id": 163584,
    "pad_token_id": 163839,
    "eos_token_id": 163585,
    "torch_dtype": "bfloat16",
    "tie_word_embeddings": false
  },
  "ignore_index": -100,
  "media_placeholder_token_id": 163605,
  "torch_dtype": "bfloat16",
  "transformers_version": "4.50.3",
  "tie_word_embeddings": false,
  "vocab_size": 163840,
  "model_type": "kimi_vl"
}
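One way to inspect these values programmatically is a minimal sketch like the following (assumes the upstream repo id and that `trust_remote_code` is acceptable in your environment; field names are taken from the config above):

```python
# Minimal sketch: load the config and read a few of the MoE / vision fields shown above.
from transformers import AutoConfig

config = AutoConfig.from_pretrained("moonshotai/Kimi-VL-A3B-Thinking", trust_remote_code=True)
print(config.text_config.n_routed_experts)     # 64 routed experts
print(config.text_config.num_experts_per_tok)  # 6 experts activated per token
print(config.vision_config.patch_size)         # 14
```
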
configuration_kimi_vl.py
ADDED
@@ -0,0 +1,284 @@
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging
from typing import Optional, Union

logger = logging.get_logger(__name__)

DEEPSEEK_PRETRAINED_CONFIG_ARCHIVE_MAP = {}


class DeepseekV3Config(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`DeepseekV3Model`]. It is used to instantiate a DeepSeek
    model according to the specified arguments, defining the model architecture. Instantiating a configuration with the
    defaults will yield a configuration similar to that of DeepSeek-V3.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Copied from https://huggingface.co/deepseek-ai/DeepSeek-V3/blob/main/configuration_deepseek.py

    Args:
        vocab_size (`int`, *optional*, defaults to 129280):
            Vocabulary size of the DeepSeek model. Defines the number of different tokens that can be represented by the
            `inputs_ids` passed when calling [`DeepseekV3Model`].
        hidden_size (`int`, *optional*, defaults to 4096):
            Dimension of the hidden representations.
        intermediate_size (`int`, *optional*, defaults to 11008):
            Dimension of the MLP representations.
        moe_intermediate_size (`int`, *optional*, defaults to 1407):
            Dimension of the MoE representations.
        num_hidden_layers (`int`, *optional*, defaults to 32):
            Number of hidden layers in the Transformer decoder.
        num_nextn_predict_layers (`int`, *optional*, defaults to 1):
            Number of next-n predict layers in the DeepSeekV3 model.
        num_attention_heads (`int`, *optional*, defaults to 32):
            Number of attention heads for each attention layer in the Transformer decoder.
        n_shared_experts (`int`, *optional*, defaults to None):
            Number of shared experts; None means a dense model.
        n_routed_experts (`int`, *optional*, defaults to None):
            Number of routed experts; None means a dense model.
        routed_scaling_factor (`float`, *optional*, defaults to 1.0):
            Scaling factor for routed experts.
        topk_method (`str`, *optional*, defaults to `greedy`):
            Top-k method used in the routed gate.
        n_group (`int`, *optional*, defaults to None):
            Number of groups for routed experts.
        topk_group (`int`, *optional*, defaults to None):
            Number of selected groups for each token (ensuring that the experts selected for each token are only within `topk_group` groups).
        num_experts_per_tok (`int`, *optional*, defaults to None):
            Number of selected experts; None means a dense model.
        moe_layer_freq (`int`, *optional*, defaults to 1):
            The frequency of the MoE layer: one expert layer for every `moe_layer_freq - 1` dense layers.
        first_k_dense_replace (`int`, *optional*, defaults to 0):
            Number of dense layers in the shallow part of the model
            (embed -> dense -> dense -> ... -> dense -> moe -> moe ... -> lm_head, with `first_k_dense_replace` leading dense layers).
        norm_topk_prob (`bool`, *optional*, defaults to False):
            Whether to normalize the weights of the routed experts.
        scoring_func (`str`, *optional*, defaults to 'softmax'):
            Method of computing expert weights.
        aux_loss_alpha (`float`, *optional*, defaults to 0.001):
            Auxiliary loss weight coefficient.
        seq_aux (`bool`, *optional*, defaults to True):
            Whether to compute the auxiliary loss for each individual sample.
        num_key_value_heads (`int`, *optional*):
            This is the number of key_value heads that should be used to implement Grouped Query Attention. If
            `num_key_value_heads=num_attention_heads`, the model will use Multi Head Attention (MHA); if
            `num_key_value_heads=1`, the model will use Multi Query Attention (MQA); otherwise GQA is used. When
            converting a multi-head checkpoint to a GQA checkpoint, each group key and value head should be constructed
            by meanpooling all the original heads within that group. For more details checkout [this
            paper](https://arxiv.org/pdf/2305.13245.pdf). If it is not specified, will default to
            `num_attention_heads`.
        hidden_act (`str` or `function`, *optional*, defaults to `"silu"`):
            The non-linear activation function (function or string) in the decoder.
        max_position_embeddings (`int`, *optional*, defaults to 2048):
            The maximum sequence length that this model might ever be used with.
        initializer_range (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        rms_norm_eps (`float`, *optional*, defaults to 1e-06):
            The epsilon used by the rms normalization layers.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if `config.is_decoder=True`.
        pad_token_id (`int`, *optional*):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 1):
            Beginning of stream token id.
        eos_token_id (`int`, *optional*, defaults to 2):
            End of stream token id.
        pretraining_tp (`int`, *optional*, defaults to 1):
            Experimental feature. Tensor parallelism rank used during pretraining. Please refer to [this
            document](https://huggingface.co/docs/transformers/parallelism) to understand more about it. This value is
            necessary to ensure exact reproducibility of the pretraining results. Please refer to [this
            issue](https://github.com/pytorch/pytorch/issues/76232).
        tie_word_embeddings (`bool`, *optional*, defaults to `False`):
            Whether to tie weight embeddings.
        rope_theta (`float`, *optional*, defaults to 10000.0):
            The base period of the RoPE embeddings.
        rope_scaling (`Dict`, *optional*):
            Dictionary containing the scaling configuration for the RoPE embeddings. Currently supports two scaling
            strategies: linear and dynamic. Their scaling factor must be a float greater than 1. The expected format is
            `{"type": strategy name, "factor": scaling factor}`. When using this flag, don't update
            `max_position_embeddings` to the expected new maximum.
        attention_bias (`bool`, *optional*, defaults to `False`):
            Whether to use a bias in the query, key, value and output projection layers during self-attention.
        attention_dropout (`float`, *optional*, defaults to 0.0):
            The dropout ratio for the attention probabilities.

    ```python
    >>> from transformers import DeepseekV3Model, DeepseekV3Config

    >>> # Initializing a Deepseek-V3 style configuration
    >>> configuration = DeepseekV3Config()

    >>> # Accessing the model configuration
    >>> configuration = model.config
    ```"""

    model_type = "deepseek_v3"
    keys_to_ignore_at_inference = ["past_key_values"]

    def __init__(
        self,
        vocab_size=129280,
        hidden_size=7168,
        intermediate_size=18432,
        moe_intermediate_size=2048,
        num_hidden_layers=61,
        num_nextn_predict_layers=1,
        num_attention_heads=128,
        num_key_value_heads=128,
        n_shared_experts=1,
        n_routed_experts=256,
        ep_size=1,
        routed_scaling_factor=2.5,
        kv_lora_rank=512,
        q_lora_rank=1536,
        qk_rope_head_dim=64,
        v_head_dim=128,
        qk_nope_head_dim=128,
        topk_method="noaux_tc",
        n_group=8,
        topk_group=4,
        num_experts_per_tok=8,
        moe_layer_freq=1,
        first_k_dense_replace=3,
        norm_topk_prob=True,
        scoring_func="sigmoid",
        aux_loss_alpha=0.001,
        seq_aux=True,
        hidden_act="silu",
        max_position_embeddings=4096,
        initializer_range=0.02,
        rms_norm_eps=1e-6,
        use_cache=True,
        pad_token_id=None,
        bos_token_id=0,
        eos_token_id=1,
        pretraining_tp=1,
        tie_word_embeddings=False,
        rope_theta=10000.0,
        rope_scaling=None,
        attention_bias=False,
        attention_dropout=0.0,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.max_position_embeddings = max_position_embeddings
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        self.moe_intermediate_size = moe_intermediate_size
        self.num_hidden_layers = num_hidden_layers
        self.num_nextn_predict_layers = num_nextn_predict_layers
        self.num_attention_heads = num_attention_heads
        self.n_shared_experts = n_shared_experts
        self.n_routed_experts = n_routed_experts
        self.ep_size = ep_size
        self.routed_scaling_factor = routed_scaling_factor
        self.kv_lora_rank = kv_lora_rank
        self.q_lora_rank = q_lora_rank
        self.qk_rope_head_dim = qk_rope_head_dim
        self.v_head_dim = v_head_dim
        self.qk_nope_head_dim = qk_nope_head_dim
        self.topk_method = topk_method
        self.n_group = n_group
        self.topk_group = topk_group
        self.num_experts_per_tok = num_experts_per_tok
        self.moe_layer_freq = moe_layer_freq
        self.first_k_dense_replace = first_k_dense_replace
        self.norm_topk_prob = norm_topk_prob
        self.scoring_func = scoring_func
        self.aux_loss_alpha = aux_loss_alpha
        self.seq_aux = seq_aux
        # for backward compatibility
        if num_key_value_heads is None:
            num_key_value_heads = num_attention_heads

        self.num_key_value_heads = num_key_value_heads
        self.hidden_act = hidden_act
        self.initializer_range = initializer_range
        self.rms_norm_eps = rms_norm_eps
        self.pretraining_tp = pretraining_tp
        self.use_cache = use_cache
        self.rope_theta = rope_theta
        self.rope_scaling = rope_scaling
        self.attention_bias = attention_bias
        self.attention_dropout = attention_dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            tie_word_embeddings=tie_word_embeddings,
            **kwargs,
        )


class MoonViTConfig(PretrainedConfig):
    model_type = "moonvit"

    def __init__(
        self,
        patch_size: int = 14,
        init_pos_emb_height: int = 64,
        init_pos_emb_width: int = 64,
        num_attention_heads: int = 16,
        num_hidden_layers: int = 27,
        hidden_size: int = 1152,
        intermediate_size: int = 4304,
        merge_kernel_size: tuple[int, int] = (2, 2),
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.patch_size = patch_size
        # Positional embedding config
        self.init_pos_emb_height = init_pos_emb_height
        self.init_pos_emb_width = init_pos_emb_width
        # Transformer config
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_size = hidden_size
        self.intermediate_size = intermediate_size
        # Patch merger config
        self.merge_kernel_size = merge_kernel_size


class KimiVLConfig(PretrainedConfig):
    model_type = "kimi_vl"

    def __init__(
        self,
        vision_config: Optional[Union[dict, MoonViTConfig]] = None,
        text_config: Optional[Union[dict, DeepseekV3Config]] = None,
        ignore_index: int = -100,
        media_placeholder_token_id: int = 163605,
        pad_token_id: int = 0,
        **kwargs,
    ):
        if vision_config is None:
            vision_config = MoonViTConfig()
        elif isinstance(vision_config, dict):
            vision_config = MoonViTConfig(**vision_config)
        self.vision_config = vision_config

        if text_config is None:
            text_config = DeepseekV3Config()
        elif isinstance(text_config, dict):
            text_config = DeepseekV3Config(**text_config)
        self.text_config = text_config

        self.ignore_index = ignore_index
        self.media_placeholder_token_id = media_placeholder_token_id

        attn_implementation = kwargs.get("attn_implementation")
        if attn_implementation is not None:
            if attn_implementation in ["eager", "flash_attention_2"]:
                self._attn_implementation = attn_implementation
                self.vision_config._attn_implementation = attn_implementation
                self.text_config._attn_implementation = attn_implementation
            else:
                raise ValueError(
                    f"Invalid attention implementation: {attn_implementation}"
                )

        super().__init__(pad_token_id=pad_token_id, **kwargs)
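A small sketch of how the nested configs fit together (assumes this file is importable, e.g. when running from the repository root; the values passed are illustrative, not a full configuration):

```python
# Minimal sketch: KimiVLConfig converts nested dicts into MoonViTConfig / DeepseekV3Config,
# mirroring the "vision_config" / "text_config" blocks in config.json.
from configuration_kimi_vl import KimiVLConfig

cfg = KimiVLConfig(
    vision_config={"patch_size": 14, "hidden_size": 1152},
    text_config={"hidden_size": 2048, "num_hidden_layers": 27, "n_routed_experts": 64},
)
print(type(cfg.vision_config).__name__)  # MoonViTConfig
print(cfg.text_config.n_routed_experts)  # 64
```
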
generation_config.json
ADDED
@@ -0,0 +1,9 @@
{
  "bos_token_id": 163584,
  "pad_token_id": 163838,
  "eos_token_id": [
    163585
  ],
  "do_sample": true,
  "temperature": 0.6
}
image_processing_kimi_vl.py
ADDED
@@ -0,0 +1,126 @@
"""Image processor class for KimiVL."""

import math
import numpy as np
from PIL import Image
from typing import Optional, Union

import torch
from torchvision.transforms import functional as TF
from transformers.image_utils import ImageInput, make_list_of_images, valid_images
from transformers.image_processing_utils import BaseImageProcessor, BatchFeature
from transformers.utils import TensorType


OPENAI_DATASET_MEAN = (0.48145466, 0.4578275, 0.40821073)
OPENAI_DATASET_STD = (0.26862954, 0.26130258, 0.27577711)


class KimiVLImageProcessor(BaseImageProcessor):
    model_type = "kimi_vl"

    def __init__(
        self,
        patch_size: int = 14,
        pad_input: bool = False,
        image_mean: tuple[float, float, float] = OPENAI_DATASET_MEAN,
        image_std: tuple[float, float, float] = OPENAI_DATASET_STD,
        in_token_limit: int = 4096,
        merge_kernel_size: list[int, int] = [2, 2],
        **kwargs,
    ):
        super().__init__(**kwargs)
        self.in_token_limit = in_token_limit
        self.patch_size = patch_size
        self.pad_input = pad_input
        self.image_mean = image_mean
        self.image_std = image_std
        self.merge_kernel_size = merge_kernel_size

    def rescale(
        self, image: Image.Image, merge_kernel_size: list[int, int] = [2, 2]
    ) -> Image.Image:
        w, h = image.size
        patch_size = self.patch_size

        if (w // patch_size) * (h // patch_size) > self.in_token_limit:
            scale = math.sqrt(self.in_token_limit / ((w // patch_size) * (h // patch_size)))
            new_w, new_h = int(w * scale), int(h * scale)
            image = image.resize((new_w, new_h), Image.Resampling.BICUBIC)
        if self.pad_input:
            new_w, new_h = image.size
            pad_size_h = merge_kernel_size[0] * patch_size
            pad_size_w = merge_kernel_size[1] * patch_size

            pad_h = (pad_size_h - new_h % pad_size_h) % pad_size_h
            pad_w = (pad_size_w - new_w % pad_size_w) % pad_size_w

            image = TF.pad(image, (0, 0, pad_w, pad_h))
        else:
            new_w, new_h = image.size
            new_w = new_w - new_w % patch_size
            new_h = new_h - new_h % patch_size
            image = TF.center_crop(image, (new_h, new_w))

        w, h = image.size
        if w // patch_size >= 512 or h // patch_size >= 512:
            raise ValueError("Exceed pos emb")

        return image

    def to_tensor(self, image: Image.Image) -> torch.Tensor:
        return TF.to_tensor(image.convert("RGB"))

    def normalize(self, image: torch.Tensor) -> torch.Tensor:
        return TF.normalize(image, self.image_mean, self.image_std)

    def patchify(self, image: torch.Tensor) -> tuple[torch.Tensor, list[int, int]]:
        patch_size = self.patch_size
        C, H, W = image.shape
        patches = image.reshape(C, H // patch_size, patch_size, W // patch_size, patch_size)
        patches = patches.permute(1, 3, 0, 2, 4)
        patches = patches.contiguous().view(-1, C, patch_size, patch_size)
        grid_hw = (H // patch_size, W // patch_size)
        return patches, grid_hw

    def _preprocess(self, image: ImageInput) -> tuple[torch.Tensor, list[int, int]]:
        """
        Preprocess image and patchify it.

        Args:
            image (`ImageInput`):
                Image to preprocess. Expects pixel values ranging from 0 to 255. If pixel values range from 0 to 1, set `do_rescale=False`.

        Returns:
            patches: torch.Tensor
            grid_hw: list[int, int]
        """
        image = self.rescale(image, self.merge_kernel_size)
        image = self.to_tensor(image)
        image = self.normalize(image)
        patches, grid_hw = self.patchify(image)
        return patches, grid_hw

    def preprocess(
        self,
        images: ImageInput,
        return_tensors: Optional[Union[str, TensorType]] = None,
    ) -> BatchFeature:
        images = make_list_of_images(images)

        if not valid_images(images):
            raise ValueError(
                "Invalid image type. Must be of type PIL.Image.Image, numpy.ndarray, "
                "torch.Tensor, tf.Tensor or jax.ndarray."
            )

        pixel_values, image_grid_hws = [], []
        for image in images:
            patches, image_grid_hw = self._preprocess(image)
            pixel_values.append(patches)
            image_grid_hws.append(image_grid_hw)
        pixel_values = torch.concat(pixel_values, dim=0)
        image_grid_hws = np.array(image_grid_hws)
        data = {"pixel_values": pixel_values, "image_grid_hws": image_grid_hws}

        return BatchFeature(data=data, tensor_type=return_tensors)
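For intuition, a minimal sketch of running this processor on a single image (the 336x224 input size is hypothetical, and the sketch assumes the file is importable from the repository root):

```python
# Minimal sketch: patchify a 336x224 image into 14x14 patches.
from PIL import Image
from image_processing_kimi_vl import KimiVLImageProcessor

processor = KimiVLImageProcessor(patch_size=14, pad_input=True)
image = Image.new("RGB", (336, 224))  # width=336, height=224 (illustrative blank image)
out = processor.preprocess(image, return_tensors="pt")
print(out["pixel_values"].shape)      # torch.Size([384, 3, 14, 14]) -> 16 * 24 patches
print(out["image_grid_hws"])          # tensor([[16, 24]]) -> (H // 14, W // 14)
```
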
model.safetensors.index.json
ADDED
The diff for this file is too large to render; see the raw diff.
modeling_kimi_vl.py
ADDED
The diff for this file is too large to render; see the raw diff.
preprocessor_config.json
ADDED
@@ -0,0 +1,20 @@
{
  "auto_map": {
    "AutoImageProcessor": "image_processing_kimi_vl.KimiVLImageProcessor",
    "AutoProcessor": "processing_kimi_vl.KimiVLProcessor"
  },
  "in_token_limit": 4096,
  "patch_size": 14,
  "num_pooled_tokens": 1024,
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "pad_input": true
}
processing_kimi_vl.py
ADDED
@@ -0,0 +1,170 @@
# coding=utf-8
# Copyright 2025 The Moonshot Team and HuggingFace Inc. team. All rights reserved.
#
# The code is based on the Qwen2VL processor (qwen2_vl/processing_qwen2_vl.py), but modified for KimiVL.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Processor class for KimiVL.
"""

from typing import List, Union

from transformers.feature_extraction_utils import BatchFeature
from transformers.image_utils import ImageInput
from transformers.processing_utils import ProcessingKwargs, ProcessorMixin, Unpack, _validate_images_text_input_order
from transformers.tokenization_utils_base import PreTokenizedInput, TextInput
from transformers.utils import logging


logger = logging.get_logger(__name__)


class KimiVLProcessorKwargs(ProcessingKwargs, total=False):
    _defaults = {
        "text_kwargs": {
            "padding": False,
        },
        "images_kwargs": {},
    }


class KimiVLProcessor(ProcessorMixin):
    r"""
    Constructs a KimiVL processor which wraps a KimiVL image processor and a tokenizer into a single processor.

    [`KimiVLProcessor`] offers all the functionalities of [`KimiVLImageProcessor`] and [`TikTokenTokenizer`]. See the
    [`~KimiVLProcessor.__call__`] and [`~KimiVLProcessor.decode`] for more information.

    Args:
        image_processor ([`KimiVLImageProcessor`], *optional*):
            The image processor is a required input.
        tokenizer ([`TikTokenTokenizer`], *optional*):
            The tokenizer is a required input.
        chat_template (`str`, *optional*): A Jinja template which will be used to convert lists of messages
            in a chat into a tokenizable string.
    """

    attributes = ["image_processor", "tokenizer"]
    valid_kwargs = ["chat_template"]
    image_processor_class = "AutoImageProcessor"
    tokenizer_class = "AutoTokenizer"

    def __init__(
        self,
        image_processor=None,
        tokenizer=None,
        chat_template=None,
        **kwargs,
    ):
        self.image_token = "<|media_pad|>"
        super().__init__(image_processor, tokenizer, chat_template=chat_template)

    def __call__(
        self,
        images: ImageInput = None,
        text: Union[TextInput, PreTokenizedInput, List[TextInput], List[PreTokenizedInput]] = None,
        **kwargs: Unpack[KimiVLProcessorKwargs],
    ) -> BatchFeature:
        """
        Main method to prepare one or several sequence(s) and image(s) for the model. This method forwards the `text`
        and `kwargs` arguments to TikTokenTokenizer's [`~TikTokenTokenizer.__call__`] if `text` is not `None` to encode
        the text. To prepare the image(s), this method forwards the `images` and `kwargs` arguments to
        the image processor's [`~KimiVLImageProcessor.__call__`] if `images` is not `None`. Please refer to the docstring
        of the above two methods for more information.

        Args:
            images (`PIL.Image.Image`, `np.ndarray`, `torch.Tensor`, `List[PIL.Image.Image]`, `List[np.ndarray]`, `List[torch.Tensor]`):
                The image or batch of images to be prepared. Each image can be a PIL image, NumPy array or PyTorch
                tensor. Both channels-first and channels-last formats are supported.
            text (`str`, `List[str]`, `List[List[str]]`):
                The sequence or batch of sequences to be encoded. Each sequence can be a string or a list of strings
                (pretokenized string). If the sequences are provided as list of strings (pretokenized), you must set
                `is_split_into_words=True` (to lift the ambiguity with a batch of sequences).
            return_tensors (`str` or [`~utils.TensorType`], *optional*):
                If set, will return tensors of a particular framework. Acceptable values are:
                - `'tf'`: Return TensorFlow `tf.constant` objects.
                - `'pt'`: Return PyTorch `torch.Tensor` objects.
                - `'np'`: Return NumPy `np.ndarray` objects.
                - `'jax'`: Return JAX `jnp.ndarray` objects.

        Returns:
            [`BatchFeature`]: A [`BatchFeature`] with the following fields:

            - **input_ids** -- List of token ids to be fed to a model. Returned when `text` is not `None`.
            - **attention_mask** -- List of indices specifying which tokens should be attended to by the model (when
              `return_attention_mask=True` or if *"attention_mask"* is in `self.model_input_names` and if `text` is not
              `None`).
            - **pixel_values** -- Pixel values to be fed to a model. Returned when `images` is not `None`.
        """
        if images is None and text is None:
            raise ValueError("You have to specify at least one of `images` or `text`.")

        # check if images and text inputs are reversed for BC
        images, text = _validate_images_text_input_order(images, text)

        output_kwargs = self._merge_kwargs(
            KimiVLProcessorKwargs,
            tokenizer_init_kwargs=self.tokenizer.init_kwargs,
            **kwargs,
        )
        if images is not None:
            image_inputs = self.image_processor(images, **output_kwargs["images_kwargs"])
            image_grid_hws = image_inputs["image_grid_hws"]
        else:
            image_inputs = {}
            image_grid_hws = None

        if isinstance(text, str):
            text = [text]
        elif not isinstance(text, list) and not isinstance(text[0], str):
            raise ValueError("Invalid input text. Please provide a string, or a list of strings")

        if image_grid_hws is not None:
            merge_length = self.image_processor.merge_kernel_size[0] * self.image_processor.merge_kernel_size[1]
            index = 0
            for i in range(len(text)):
                while self.image_token in text[i]:
                    text[i] = text[i].replace(
                        self.image_token,
                        "<|placeholder|>" * (image_grid_hws[index].prod() // merge_length),
                        1,
                    )
                    index += 1
                text[i] = text[i].replace("<|placeholder|>", self.image_token)

        text_inputs = self.tokenizer(text, **output_kwargs["text_kwargs"])
        return BatchFeature(data={**text_inputs, **image_inputs})

    def batch_decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.batch_decode`]. Please
        refer to the docstring of this method for more information.
        """
        return self.tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        """
        This method forwards all its arguments to the tokenizer's [`~PreTrainedTokenizer.decode`]. Please refer to
        the docstring of this method for more information.
        """
        return self.tokenizer.decode(*args, **kwargs)

    @property
    def model_input_names(self):
        tokenizer_input_names = self.tokenizer.model_input_names
        image_processor_input_names = self.image_processor.model_input_names
        return list(dict.fromkeys(tokenizer_input_names + image_processor_input_names))


__all__ = ["KimiVLProcessorKwargs"]
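A worked example of the placeholder expansion in `__call__` above (the numbers are illustrative and follow the image processor sketch earlier; `merge_kernel_size` comes from the image processor default and the `vision_config` block in config.json):

```python
# Minimal sketch: how many <|media_pad|> tokens one image contributes to the prompt.
image_grid_hw = (16, 24)     # H and W measured in 14x14 patches, e.g. a 336x224 image
merge_kernel_size = (2, 2)   # patch-merger kernel
merge_length = merge_kernel_size[0] * merge_kernel_size[1]
num_image_tokens = (image_grid_hw[0] * image_grid_hw[1]) // merge_length
print(num_image_tokens)      # 96: the single <|media_pad|> placeholder expands to 96 tokens
```
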
tiktoken.model
ADDED
@@ -0,0 +1,3 @@
version https://git-lfs.github.com/spec/v1
oid sha256:b6c497a7469b33ced9c38afb1ad6e47f03f5e5dc05f15930799210ec050c5103
size 2795286
tokenization_moonshot.py
ADDED
@@ -0,0 +1,302 @@
import os
import tiktoken

from logging import getLogger
from pathlib import Path
from typing import (
    cast,
    Tuple,
    Dict,
    Iterator,
    List,
    Union,
    Optional,
)
from shutil import copyfile
from tiktoken.load import load_tiktoken_bpe
from tokenizers import AddedToken
from transformers.tokenization_utils import PreTrainedTokenizer
from transformers.utils import to_py_obj
from transformers.models.gpt2.tokenization_gpt2 import bytes_to_unicode


logger = getLogger(__name__)
VOCAB_FILES_NAMES = {"vocab_file": "tiktoken.model"}
SPIECE_UNDERLINE = "▁"


class TikTokenTokenizer(PreTrainedTokenizer):
    """
    Tokenizing and encoding/decoding text using the Tiktoken tokenizer. See megatron/tokenizer/tiktoken_tokenizer.py.

    This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
    this superclass for more information regarding those methods.

    Args:
        vocab_file (`str`):
            The path to the Tiktoken model file.
        bos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|begin_of_text|>"`):
            The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
        eos_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|end_of_text|>"`):
            The end of sequence token.
        unk_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_249|>"`):
            The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
            token instead. The second to last item in special_tokens.
        pad_token (`str` or `tokenizers.AddedToken`, *optional*, defaults to `"<|reserved_special_token_250|>"`):
            The token used for padding, for example when batching sequences of different lengths.
        additional_special_tokens (list of `str`, *optional*):
            A tuple or a list of additional tokens, which will be marked as `special`, meaning that they will be
            skipped when decoding if `skip_special_tokens` is set to `True`.
    """

    vocab_files_names = VOCAB_FILES_NAMES

    model_input_names = ["input_ids", "attention_mask"]

    special_tokens: Dict[str, int]

    num_reserved_special_tokens = 256

    pat_str = "|".join(
        [
            r"""[\p{Han}]+""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""[^\r\n\p{L}\p{N}]?[\p{Lu}\p{Lt}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]+[\p{Ll}\p{Lm}\p{Lo}\p{M}&&[^\p{Han}]]*(?i:'s|'t|'re|'ve|'m|'ll|'d)?""",
            r"""\p{N}{1,3}""",
            r""" ?[^\s\p{L}\p{N}]+[\r\n]*""",
            r"""\s*[\r\n]+""",
            r"""\s+(?!\S)""",
            r"""\s+""",
        ]
    )

    def __init__(
        self,
        vocab_file,
        bos_token: Union[str, AddedToken] = "[BOS]",
        eos_token: Union[str, AddedToken] = "[EOS]",
        unk_token: Union[str, AddedToken] = "[UNK]",
        pad_token: Union[str, AddedToken] = "[PAD]",
        additional_special_tokens: Optional[List[str]] = None,
        added_tokens_decoder: Optional[dict] = None,
        **kwargs,
    ):
        assert os.path.isfile(vocab_file), vocab_file
        if additional_special_tokens is None:
            additional_special_tokens = [
                "<|im_end|>",
                "<|im_middle|>",
                "<|im_user|>",
                "<|im_assistant|>",
                "<|im_system|>",
            ]
        special_tokens_mapping = {
            i: added_tokens_decoder[i].content for i in added_tokens_decoder
        }

        self.vocab_file = vocab_file
        mergeable_ranks = load_tiktoken_bpe(vocab_file)
        num_base_tokens = len(mergeable_ranks)
        self.special_tokens = {
            special_tokens_mapping.get(i, f"<|reserved_token_{i}|>"): i
            for i in range(
                num_base_tokens, num_base_tokens + self.num_reserved_special_tokens + 2
            )
        }

        self.model = tiktoken.Encoding(
            name=Path(vocab_file).name,
            pat_str=self.pat_str,
            mergeable_ranks=mergeable_ranks,
            special_tokens=self.special_tokens,
        )

        self.n_words: int = self.model.n_vocab
        # BOS / EOS token IDs
        self.bos_id: int = self.special_tokens[str(bos_token)]
        self.eos_id: int = self.special_tokens[str(eos_token)]

        self.pad_id: int = self.special_tokens[str(pad_token)]
        self.unk_id: int = self.special_tokens[str(unk_token)]

        self.byte_encoder = bytes_to_unicode()
        self.byte_decoder = {v: k for k, v in self.byte_encoder.items()}

        self.decoder = {}
        for i in range(self.n_words):
            # Taken from https://gist.github.com/xenova/a452a6474428de0182b17605a98631ee
            decoding = "".join(
                [
                    self.byte_encoder[ord(char)]
                    for char in self.model.decode_single_token_bytes(i).decode(
                        "latin-1"
                    )
                ]
            )
            self.decoder[i] = decoding

        self.encoder = {}
        for i in range(self.n_words):
            if i in self.decoder:
                self.encoder[self.decoder[i]] = i

        super().__init__(
            bos_token=bos_token,
            eos_token=eos_token,
            unk_token=unk_token,
            pad_token=pad_token,
            additional_special_tokens=additional_special_tokens,
            **kwargs,
        )
        self.all_special_ids_set = set(self.all_special_ids)

    def encode(
        self, text: str, allow_special_tokens: bool = True, **kwargs
    ) -> List[int]:
        """
        Encodes a string into a list of token IDs.

        Args:
            text (str): The input string to be encoded.

        Returns:
            list[int]: A list of token IDs.
        """
        # If there are other args, we should call super().encode because there is a lot of code
        # to handle those args. super().encode eventually calls _tokenize and _convert_token_to_id.
        if len(kwargs) > 0:
            return super().encode(text, **kwargs)

        assert type(text) is str

        # The tiktoken tokenizer can handle <=400k chars without
        # pyo3_runtime.PanicException.
        TIKTOKEN_MAX_ENCODE_CHARS = 400_000

        # https://github.com/openai/tiktoken/issues/195
        # Here we iterate over subsequences and split if we exceed the limit
        # of max consecutive non-whitespace or whitespace characters.
        MAX_NO_WHITESPACES_CHARS = 25_000

        substrs = (
            substr
            for i in range(0, len(text), TIKTOKEN_MAX_ENCODE_CHARS)
            for substr in self._split_whitespaces_or_nonwhitespaces(
                text[i : i + TIKTOKEN_MAX_ENCODE_CHARS], MAX_NO_WHITESPACES_CHARS
            )
        )
        t: List[int] = []
        for substr in substrs:
            if allow_special_tokens:
                t.extend(
                    # we should treat special tokens as common tokens
                    self.model.encode(
                        substr,
                        allowed_special="all",
                    )
                )
            else:
                t.extend(
                    # we should treat special tokens as common tokens
                    self.model.encode(
                        substr,
                        disallowed_special=(),
                    )
                )
        return t

    def decode(self, token_ids: Union[int, List[int]], **kwargs) -> str:
        """
        Decodes a list of token IDs into a string.

        Args:
            token_ids (List[int]): The list of token IDs to be decoded.

        Returns:
            str: The decoded string.
        """
        # If there are other args, we should call super().decode because there is a lot of code
        # to handle those args. super().decode eventually calls convert_tokens_to_string and _convert_id_to_token.
        if len(kwargs) > 0:
            return super().decode(token_ids, **kwargs)

        token_ids = to_py_obj(token_ids)

        if type(token_ids) is int:
            token_ids = [token_ids]

        return self.model.decode(cast(List[int], token_ids))

    @staticmethod
    def _split_whitespaces_or_nonwhitespaces(
        s: str, max_consecutive_slice_len: int
    ) -> Iterator[str]:
        """
        Splits the string `s` so that each substring contains no more than `max_consecutive_slice_len`
        consecutive whitespaces or consecutive non-whitespaces.
        """
        current_slice_len = 0
        current_slice_is_space = s[0].isspace() if len(s) > 0 else False
        slice_start = 0

        for i in range(len(s)):
            is_now_space = s[i].isspace()

            if current_slice_is_space ^ is_now_space:
                current_slice_len = 1
                current_slice_is_space = is_now_space
            else:
                current_slice_len += 1
                if current_slice_len > max_consecutive_slice_len:
                    yield s[slice_start:i]
                    slice_start = i
                    current_slice_len = 1
        yield s[slice_start:]

    """ ----- Below are the abstract methods required by PreTrainedTokenizer ----- """

    @property
    def vocab_size(self) -> int:
        return self.n_words

    def get_vocab(self) -> Dict[str, int]:
        return self.encoder

    def _tokenize(self, text: str, **kwargs) -> List[str]:
        return [self.decoder[t] for t in self.encode(text)]

    def _convert_token_to_id(self, token: str) -> int:
        return self.encoder.get(token, self.unk_id)

    def _convert_id_to_token(self, index: int) -> str:
        return self.decoder.get(index)

    @staticmethod
    def clean_up_tokenization(out_string: str) -> str:
        return out_string

    def convert_tokens_to_string(self, tokens: List[str]) -> str:
        text = "".join(tokens).replace(SPIECE_UNDERLINE, "")
        text = bytearray([self.byte_decoder[c] for c in text]).decode(
            "utf-8", "replace"
        )
        return text

    def save_vocabulary(
        self, save_directory: str, filename_prefix: Optional[str] = None
    ) -> Tuple[str]:
        if not os.path.isdir(save_directory):
            logger.error(f"Vocabulary path ({save_directory}) should be a directory")
            return
        out_vocab_file = os.path.join(
            save_directory,
            (filename_prefix + "-" if filename_prefix else "")
            + VOCAB_FILES_NAMES["vocab_file"],
        )

        if os.path.abspath(self.vocab_file) != os.path.abspath(
            out_vocab_file
        ) and os.path.isfile(self.vocab_file):
            copyfile(self.vocab_file, out_vocab_file)

        return (out_vocab_file,)
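A minimal sketch of loading this tokenizer through the `auto_map` entry in tokenizer_config.json (assumes the upstream repo id, that `tiktoken` is installed, and that `trust_remote_code` is acceptable):

```python
# Minimal sketch: round-trip a string through the custom TikTokenTokenizer.
from transformers import AutoTokenizer

tok = AutoTokenizer.from_pretrained("moonshotai/Kimi-VL-A3B-Thinking", trust_remote_code=True)
ids = tok.encode("Hello, Kimi-VL!")
print(ids)              # token IDs from the tiktoken BPE vocabulary
print(tok.decode(ids))  # "Hello, Kimi-VL!"
```
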
tokenizer_config.json
ADDED
@@ -0,0 +1,134 @@
+
{
|
2 |
+
"added_tokens_decoder": {
|
3 |
+
"163584": {
|
4 |
+
"content": "[BOS]",
|
5 |
+
"lstrip": false,
|
6 |
+
"normalized": false,
|
7 |
+
"rstrip": false,
|
8 |
+
"single_word": false,
|
9 |
+
"special": true
|
10 |
+
},
|
11 |
+
"163585": {
|
12 |
+
"content": "[EOS]",
|
13 |
+
"lstrip": false,
|
14 |
+
"normalized": false,
|
15 |
+
"rstrip": false,
|
16 |
+
"single_word": false,
|
17 |
+
"special": true
|
18 |
+
},
|
19 |
+
"163586": {
|
20 |
+
"content": "<|im_end|>",
|
21 |
+
"lstrip": false,
|
22 |
+
"normalized": false,
|
23 |
+
"rstrip": false,
|
24 |
+
"single_word": false,
|
25 |
+
"special": true
|
26 |
+
},
|
27 |
+
"163601": {
|
28 |
+
"content": "<|im_middle|>",
|
29 |
+
"lstrip": false,
|
30 |
+
"normalized": false,
|
31 |
+
"rstrip": false,
|
32 |
+
"single_word": false,
|
33 |
+
"special": true
|
34 |
+
},
|
35 |
+
"163587": {
|
36 |
+
"content": "<|im_user|>",
|
37 |
+
"lstrip": false,
|
38 |
+
"normalized": false,
|
39 |
+
"rstrip": false,
|
40 |
+
"single_word": false,
|
41 |
+
"special": true
|
42 |
+
},
|
43 |
+
"163588": {
|
44 |
+
"content": "<|im_assistant|>",
|
45 |
+
"lstrip": false,
|
46 |
+
"normalized": false,
|
47 |
+
"rstrip": false,
|
48 |
+
"single_word": false,
|
49 |
+
"special": true
|
50 |
+
},
|
51 |
+
"163594": {
|
52 |
+
"content": "<|im_system|>",
|
53 |
+
"lstrip": false,
|
54 |
+
"normalized": false,
|
55 |
+
"rstrip": false,
|
56 |
+
"single_word": false,
|
57 |
+
"special": true
|
58 |
+
},
|
59 |
+
"163602": {
|
60 |
+
"content": "<|media_start|>",
|
61 |
+
"lstrip": false,
|
62 |
+
"normalized": false,
|
63 |
+
"rstrip": false,
|
64 |
+
"single_word": false,
|
65 |
+
"special": true
|
66 |
+
},
|
67 |
+
"163603": {
|
68 |
+
"content": "<|media_content|>",
|
69 |
+
"lstrip": false,
|
70 |
+
"normalized": false,
|
71 |
+
"rstrip": false,
|
72 |
+
"single_word": false,
|
73 |
+
"special": true
|
74 |
+
},
|
75 |
+
"163604": {
|
76 |
+
"content": "<|media_end|>",
|
77 |
+
"lstrip": false,
|
78 |
+
"normalized": false,
|
79 |
+
"rstrip": false,
|
80 |
+
"single_word": false,
|
81 |
+
"special": true
|
82 |
+
},
|
83 |
+
"163605": {
|
84 |
+
"content": "<|media_pad|>",
|
85 |
+
"lstrip": false,
|
86 |
+
"normalized": false,
|
87 |
+
"rstrip": false,
|
88 |
+
"single_word": false,
|
89 |
+
"special": true
|
90 |
+
},
|
91 |
+
"163838": {
|
92 |
+
"content": "[PAD]",
|
93 |
+
"lstrip": false,
|
94 |
+
"normalized": false,
|
95 |
+
"rstrip": false,
|
96 |
+
"single_word": false,
|
97 |
+
"special": true
|
98 |
+
},
|
99 |
+
"163839": {
|
100 |
+
"content": "[UNK]",
|
101 |
+
"lstrip": false,
|
102 |
+
"normalized": false,
|
103 |
+
"rstrip": false,
|
104 |
+
"single_word": false,
|
105 |
+
"special": true
|
106 |
+
}
|
107 |
+
},
|
108 |
+
"additional_special_tokens": [
|
109 |
+
"<|im_end|>",
|
110 |
+
"<|im_user|>",
|
111 |
+
"<|im_assistant|>",
|
112 |
+
"<|im_system|>",
|
113 |
+
"<|im_middle|>",
|
114 |
+
"<|media_start|>",
|
115 |
+
"<|media_content|>",
|
116 |
+
"<|media_end|>",
|
117 |
+
"<|media_pad|>"
|
118 |
+
],
|
119 |
+
"bos_token": "[BOS]",
|
120 |
+
"clean_up_tokenization_spaces": false,
|
121 |
+
"eos_token": "[EOS]",
|
122 |
+
"extra_special_tokens": {},
|
123 |
+
"model_max_length": 1048576,
|
124 |
+
"pad_token": "[PAD]",
|
125 |
+
"unk_token": "[UNK]",
|
126 |
+
"tokenizer_class": "TikTokenTokenizer",
|
127 |
+
"chat_template": "{%- for message in messages -%}{%- if loop.first and messages[0]['role'] != 'system' -%}{{'<|im_system|>system<|im_middle|>You are a helpful assistant<|im_end|>'}}{%- endif -%}{%- if message['role'] == 'system' -%}{{'<|im_system|>'}}{%- endif -%}{%- if message['role'] == 'user' -%}{{'<|im_user|>'}}{%- endif -%}{%- if message['role'] == 'assistant' -%}{{'<|im_assistant|>'}}{%- endif -%}{{- message['role'] -}}{{'<|im_middle|>'}}{%- if message['content'] is string -%}{{- message['content'] + '<|im_end|>' -}}{%- else -%}{%- for content in message['content'] -%}{%- if content['type'] == 'image' or 'image' in content or 'image_url' in content -%}{{'<|media_start|>image<|media_content|><|media_pad|><|media_end|>'}}{%- else -%}{{content['text']}}{%- endif -%}{%- endfor -%}{{'<|im_end|>'}}{%- endif -%}{%- endfor -%}{%- if add_generation_prompt -%}{{'<|im_assistant|>assistant<|im_middle|>'}}{%- endif -%}",
|
128 |
+
"auto_map": {
|
129 |
+
"AutoTokenizer": [
|
130 |
+
"tokenization_moonshot.TikTokenTokenizer",
|
131 |
+
null
|
132 |
+
]
|
133 |
+
}
|
134 |
+
}
|