miaoyibo commited on
Commit
bf41562
·
1 Parent(s): 29c1df8
kimi_vl/__init__.py ADDED
File without changes
kimi_vl/serve/__init__.py ADDED
File without changes
kimi_vl/serve/assets/Kelpy-Codos.js ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Copyright (c) 2023-2024 DeepSeek.
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ * this software and associated documentation files (the "Software"), to deal in
6
+ * the Software without restriction, including without limitation the rights to
7
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+ * the Software, and to permit persons to whom the Software is furnished to do so,
9
+ * subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in all
12
+ * copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20
+ */
21
+
22
+ // ==UserScript==
23
+ // @name Kelpy Codos
24
+ // @namespace https://github.com/Keldos-Li/Kelpy-Codos
25
+ // @version 1.0.5
26
+ // @author Keldos; https://keldos.me/
27
+ // @description Add copy button to PRE tags before CODE tag, for Chuanhu ChatGPT especially.
28
+ // Based on Chuanhu ChatGPT version: ac04408 (2023-3-22)
29
+ // @license GPL-3.0
30
+ // @grant none
31
+ // ==/UserScript==
32
+
33
(function () {
  "use strict";

  // Add a "copy" button to the <code> child of a <pre> element.
  function addCopyButton(pre) {
    var code = pre.querySelector("code");
    if (!code) {
      return; // No <code> element found, so do not add a button.
    }
    var firstChild = code.firstChild;
    if (!firstChild) {
      return; // The <code> element has no child nodes, so do not add a button.
    }
    var button = document.createElement("button");
    button.textContent = "\uD83D\uDCCE"; // Use the paperclip symbol as the "copy" button label.
    button.style.position = "relative";
    button.style.float = "right";
    button.style.fontSize = "1em"; // Optional: adjust button size.
    button.style.background = "none"; // Optional: remove background color.
    button.style.border = "none"; // Optional: remove border.
    button.style.cursor = "pointer"; // Optional: show pointer cursor.
    button.addEventListener("click", function () {
      // Select the code contents, then use the (deprecated but widely
      // supported) execCommand clipboard path.
      var range = document.createRange();
      range.selectNodeContents(code);
      range.setStartBefore(firstChild); // Start the range before the first child node (skips the button itself).
      var selection = window.getSelection();
      selection.removeAllRanges();
      selection.addRange(range);

      try {
        var success = document.execCommand("copy");
        if (success) {
          button.textContent = "\u2714";
          setTimeout(function () {
            button.textContent = "\uD83D\uDCCE"; // Restore the "copy" label after 2s.
          }, 2000);
        } else {
          button.textContent = "\u2716";
        }
      } catch (e) {
        console.error(e);
        button.textContent = "\u2716";
      }

      selection.removeAllRanges();
    });
    code.insertBefore(button, firstChild); // Insert the button before the first child element.
  }

  // MutationObserver callback: attach a copy button to newly inserted <pre> nodes.
  function handleNewElements(mutationsList, observer) {
    for (var mutation of mutationsList) {
      if (mutation.type === "childList") {
        for (var node of mutation.addedNodes) {
          if (node.nodeName === "PRE") {
            addCopyButton(node);
          }
        }
      }
    }
  }

  // Watch the whole document for dynamically added code blocks.
  var observer = new MutationObserver(handleNewElements);
  observer.observe(document.documentElement, {
    childList: true,
    subtree: true,
  });

  // Handle <pre> blocks that already exist at script load time.
  document.querySelectorAll("pre").forEach(addCopyButton);
})();
kimi_vl/serve/assets/avatar.png ADDED
kimi_vl/serve/assets/custom.css ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Copyright (c) 2023-2024 DeepSeek.
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ * this software and associated documentation files (the "Software"), to deal in
6
+ * the Software without restriction, including without limitation the rights to
7
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+ * the Software, and to permit persons to whom the Software is furnished to do so,
9
+ * subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in all
12
+ * copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20
+ */
21
+
22
+ :root {
23
+ --chatbot-color-light: #f3f3f3;
24
+ --chatbot-color-dark: #121111;
25
+ }
26
+
27
+ /* status_display */
28
+ #status_display {
29
+ display: flex;
30
+ min-height: 2.5em;
31
+ align-items: flex-end;
32
+ justify-content: flex-end;
33
+ }
34
+ #status_display p {
35
+ font-size: 0.85em;
36
+ font-family: monospace;
37
+ color: var(--body-text-color-subdued);
38
+ }
39
+
40
+ /* usage_display */
41
+ #usage_display {
42
+ height: 1em;
43
+ }
44
+ #usage_display p {
45
+ padding: 0 1em;
46
+ font-size: 0.85em;
47
+ font-family: monospace;
48
+ color: var(--body-text-color-subdued);
49
+ }
50
+ /* list */
51
+ ol:not(.options),
52
+ ul:not(.options) {
53
+ padding-inline-start: 2em !important;
54
+ }
55
+
56
+ /* Thank @Keldos-Li for fixing it */
57
+ /* Light mode (default) */
58
+ #deepseek_chatbot {
59
+ background-color: var(--chatbot-color-light) !important;
60
+ color: #000000 !important;
61
+ }
62
+ [data-testid="bot"] {
63
+ background-color: #ffffff !important;
64
+ }
65
+ [data-testid="user"] {
66
+ background-color: #95ec69 !important;
67
+ }
68
+
69
+ /* Dark mode */
70
+ .dark #deepseek_chatbot {
71
+ background-color: var(--chatbot-color-dark) !important;
72
+ color: #ffffff !important;
73
+ }
74
+ .dark [data-testid="bot"] {
75
+ background-color: #2c2c2c !important;
76
+ }
77
+ .dark [data-testid="user"] {
78
+ background-color: #26b561 !important;
79
+ }
80
+
81
+ #deepseek_chatbot {
82
+ height: 100%;
83
+ min-height: 800px;
84
+ flex-grow: 1;
85
+ overflow: auto;
86
+ }
87
+
88
+ [class*="message"] {
89
+ border-radius: var(--radius-xl) !important;
90
+ border: none;
91
+ padding: var(--spacing-xl) !important;
92
+ font-size: var(--text-md) !important;
93
+ line-height: var(--line-md) !important;
94
+ min-height: calc(var(--text-md) * var(--line-md) + 2 * var(--spacing-xl));
95
+ min-width: calc(var(--text-md) * var(--line-md) + 2 * var(--spacing-xl));
96
+ }
97
+ [data-testid="bot"] {
98
+ max-width: 85%;
99
+ border-bottom-left-radius: 0 !important;
100
+ }
101
+ [data-testid="user"] {
102
+ max-width: 85%;
103
+ width: auto !important;
104
+ border-bottom-right-radius: 0 !important;
105
+ }
106
+ /* Table */
107
+ table {
108
+ margin: 1em 0;
109
+ border-collapse: collapse;
110
+ empty-cells: show;
111
+ }
112
+ td,
113
+ th {
114
+ border: 1.2px solid var(--border-color-primary) !important;
115
+ padding: 0.2em;
116
+ }
117
+ thead {
118
+ background-color: rgba(175, 184, 193, 0.2);
119
+ }
120
+ thead th {
121
+ padding: 0.5em 0.2em;
122
+ }
123
+ /* Inline code */
124
+ #deepseek_chatbot code {
125
+ display: inline;
126
+ white-space: break-spaces;
127
+ border-radius: 6px;
128
+ margin: 0 2px 0 2px;
129
+ padding: 0.2em 0.4em 0.1em 0.4em;
130
+ background-color: rgba(175, 184, 193, 0.2);
131
+ }
132
+ /* Code block */
133
+ #deepseek_chatbot pre code {
134
+ display: block;
135
+ overflow: auto;
136
+ white-space: pre;
137
+ background-color: #1c1d1e !important;
138
+ border-radius: 10px;
139
+ padding: 1.4em 1.2em 0em 1.4em;
140
+ margin: 1.2em 2em 1.2em 0.5em;
141
+ color: #fdf8f8;
142
+ box-shadow: 6px 6px 16px hsla(0, 0%, 0%, 0.2);
143
+ }
144
+ /* Highlight */
145
+ #deepseek_chatbot .highlight {
146
+ background-color: transparent;
147
+ }
148
+ #deepseek_chatbot .highlight .hll {
149
+ background-color: #49483e;
150
+ }
151
+ #deepseek_chatbot .highlight .c {
152
+ color: #75715e;
153
+ } /* Comment */
154
+ #deepseek_chatbot .highlight .err {
155
+ color: #960050;
156
+ background-color: #1e0010;
157
+ } /* Error */
158
+ #deepseek_chatbot .highlight .k {
159
+ color: #66d9ef;
160
+ } /* Keyword */
161
+ #deepseek_chatbot .highlight .l {
162
+ color: #ae81ff;
163
+ } /* Literal */
164
+ #deepseek_chatbot .highlight .n {
165
+ color: #f8f8f2;
166
+ } /* Name */
167
+ #deepseek_chatbot .highlight .o {
168
+ color: #f92672;
169
+ } /* Operator */
170
+ #deepseek_chatbot .highlight .p {
171
+ color: #f8f8f2;
172
+ } /* Punctuation */
173
+ #deepseek_chatbot .highlight .ch {
174
+ color: #75715e;
175
+ } /* Comment.Hashbang */
176
+ #deepseek_chatbot .highlight .cm {
177
+ color: #75715e;
178
+ } /* Comment.Multiline */
179
+ #deepseek_chatbot .highlight .cp {
180
+ color: #75715e;
181
+ } /* Comment.Preproc */
182
+ #deepseek_chatbot .highlight .cpf {
183
+ color: #75715e;
184
+ } /* Comment.PreprocFile */
185
+ #deepseek_chatbot .highlight .c1 {
186
+ color: #75715e;
187
+ } /* Comment.Single */
188
+ #deepseek_chatbot .highlight .cs {
189
+ color: #75715e;
190
+ } /* Comment.Special */
191
+ #deepseek_chatbot .highlight .gd {
192
+ color: #f92672;
193
+ } /* Generic.Deleted */
194
+ #deepseek_chatbot .highlight .ge {
195
+ font-style: italic;
196
+ } /* Generic.Emph */
197
+ #deepseek_chatbot .highlight .gi {
198
+ color: #a6e22e;
199
+ } /* Generic.Inserted */
200
+ #deepseek_chatbot .highlight .gs {
201
+ font-weight: bold;
202
+ } /* Generic.Strong */
203
+ #deepseek_chatbot .highlight .gu {
204
+ color: #75715e;
205
+ } /* Generic.Subheading */
206
+ #deepseek_chatbot .highlight .kc {
207
+ color: #66d9ef;
208
+ } /* Keyword.Constant */
209
+ #deepseek_chatbot .highlight .kd {
210
+ color: #66d9ef;
211
+ } /* Keyword.Declaration */
212
+ #deepseek_chatbot .highlight .kn {
213
+ color: #f92672;
214
+ } /* Keyword.Namespace */
215
+ #deepseek_chatbot .highlight .kp {
216
+ color: #66d9ef;
217
+ } /* Keyword.Pseudo */
218
+ #deepseek_chatbot .highlight .kr {
219
+ color: #66d9ef;
220
+ } /* Keyword.Reserved */
221
+ #deepseek_chatbot .highlight .kt {
222
+ color: #66d9ef;
223
+ } /* Keyword.Type */
224
+ #deepseek_chatbot .highlight .ld {
225
+ color: #e6db74;
226
+ } /* Literal.Date */
227
+ #deepseek_chatbot .highlight .m {
228
+ color: #ae81ff;
229
+ } /* Literal.Number */
230
+ #deepseek_chatbot .highlight .s {
231
+ color: #e6db74;
232
+ } /* Literal.String */
233
+ #deepseek_chatbot .highlight .na {
234
+ color: #a6e22e;
235
+ } /* Name.Attribute */
236
+ #deepseek_chatbot .highlight .nb {
237
+ color: #f8f8f2;
238
+ } /* Name.Builtin */
239
+ #deepseek_chatbot .highlight .nc {
240
+ color: #a6e22e;
241
+ } /* Name.Class */
242
+ #deepseek_chatbot .highlight .no {
243
+ color: #66d9ef;
244
+ } /* Name.Constant */
245
+ #deepseek_chatbot .highlight .nd {
246
+ color: #a6e22e;
247
+ } /* Name.Decorator */
248
+ #deepseek_chatbot .highlight .ni {
249
+ color: #f8f8f2;
250
+ } /* Name.Entity */
251
+ #deepseek_chatbot .highlight .ne {
252
+ color: #a6e22e;
253
+ } /* Name.Exception */
254
+ #deepseek_chatbot .highlight .nf {
255
+ color: #a6e22e;
256
+ } /* Name.Function */
257
+ #deepseek_chatbot .highlight .nl {
258
+ color: #f8f8f2;
259
+ } /* Name.Label */
260
+ #deepseek_chatbot .highlight .nn {
261
+ color: #f8f8f2;
262
+ } /* Name.Namespace */
263
+ #deepseek_chatbot .highlight .nx {
264
+ color: #a6e22e;
265
+ } /* Name.Other */
266
+ #deepseek_chatbot .highlight .py {
267
+ color: #f8f8f2;
268
+ } /* Name.Property */
269
+ #deepseek_chatbot .highlight .nt {
270
+ color: #f92672;
271
+ } /* Name.Tag */
272
+ #deepseek_chatbot .highlight .nv {
273
+ color: #f8f8f2;
274
+ } /* Name.Variable */
275
+ #deepseek_chatbot .highlight .ow {
276
+ color: #f92672;
277
+ } /* Operator.Word */
278
+ #deepseek_chatbot .highlight .w {
279
+ color: #f8f8f2;
280
+ } /* Text.Whitespace */
281
+ #deepseek_chatbot .highlight .mb {
282
+ color: #ae81ff;
283
+ } /* Literal.Number.Bin */
284
+ #deepseek_chatbot .highlight .mf {
285
+ color: #ae81ff;
286
+ } /* Literal.Number.Float */
287
+ #deepseek_chatbot .highlight .mh {
288
+ color: #ae81ff;
289
+ } /* Literal.Number.Hex */
290
+ #deepseek_chatbot .highlight .mi {
291
+ color: #ae81ff;
292
+ } /* Literal.Number.Integer */
293
+ #deepseek_chatbot .highlight .mo {
294
+ color: #ae81ff;
295
+ } /* Literal.Number.Oct */
296
+ #deepseek_chatbot .highlight .sa {
297
+ color: #e6db74;
298
+ } /* Literal.String.Affix */
299
+ #deepseek_chatbot .highlight .sb {
300
+ color: #e6db74;
301
+ } /* Literal.String.Backtick */
302
+ #deepseek_chatbot .highlight .sc {
303
+ color: #e6db74;
304
+ } /* Literal.String.Char */
305
+ #deepseek_chatbot .highlight .dl {
306
+ color: #e6db74;
307
+ } /* Literal.String.Delimiter */
308
+ #deepseek_chatbot .highlight .sd {
309
+ color: #e6db74;
310
+ } /* Literal.String.Doc */
311
+ #deepseek_chatbot .highlight .s2 {
312
+ color: #e6db74;
313
+ } /* Literal.String.Double */
314
+ #deepseek_chatbot .highlight .se {
315
+ color: #ae81ff;
316
+ } /* Literal.String.Escape */
317
+ #deepseek_chatbot .highlight .sh {
318
+ color: #e6db74;
319
+ } /* Literal.String.Heredoc */
320
+ #deepseek_chatbot .highlight .si {
321
+ color: #e6db74;
322
+ } /* Literal.String.Interpol */
323
+ #deepseek_chatbot .highlight .sx {
324
+ color: #e6db74;
325
+ } /* Literal.String.Other */
326
+ #deepseek_chatbot .highlight .sr {
327
+ color: #e6db74;
328
+ } /* Literal.String.Regex */
329
+ #deepseek_chatbot .highlight .s1 {
330
+ color: #e6db74;
331
+ } /* Literal.String.Single */
332
+ #deepseek_chatbot .highlight .ss {
333
+ color: #e6db74;
334
+ } /* Literal.String.Symbol */
335
+ #deepseek_chatbot .highlight .bp {
336
+ color: #f8f8f2;
337
+ } /* Name.Builtin.Pseudo */
338
+ #deepseek_chatbot .highlight .fm {
339
+ color: #a6e22e;
340
+ } /* Name.Function.Magic */
341
+ #deepseek_chatbot .highlight .vc {
342
+ color: #f8f8f2;
343
+ } /* Name.Variable.Class */
344
+ #deepseek_chatbot .highlight .vg {
345
+ color: #f8f8f2;
346
+ } /* Name.Variable.Global */
347
+ #deepseek_chatbot .highlight .vi {
348
+ color: #f8f8f2;
349
+ } /* Name.Variable.Instance */
350
+ #deepseek_chatbot .highlight .vm {
351
+ color: #f8f8f2;
352
+ } /* Name.Variable.Magic */
353
+ #deepseek_chatbot .highlight .il {
354
+ color: #ae81ff;
355
+ } /* Literal.Number.Integer.Long */
kimi_vl/serve/assets/custom.js ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Copyright (c) 2023-2024 DeepSeek.
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ * this software and associated documentation files (the "Software"), to deal in
6
+ * the Software without restriction, including without limitation the rights to
7
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+ * the Software, and to permit persons to whom the Software is furnished to do so,
9
+ * subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in all
12
+ * copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20
+ */
21
+
22
+ // custom javascript here
kimi_vl/serve/assets/favicon.ico ADDED
kimi_vl/serve/chat_utils.py ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ From https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
3
+ """
4
+
5
+ import dataclasses
6
+ import logging
7
+ import copy
8
+ from enum import IntEnum, auto
9
+ from typing import Dict, List
10
+ import base64
11
+
12
+ import gradio as gr
13
+ import torch
14
+
15
+ from .utils import pil_to_base64
16
+
17
+ IMAGE_TOKEN = "<image>"
18
+ logger = logging.getLogger("gradio_logger")
19
+
20
+
21
class SeparatorStyle(IntEnum):
    """Separator styles used when rendering a Conversation into a prompt string."""

    PLAIN = auto()      # bare concatenation of messages with sep/sep2
    ALIGNMENT = auto()  # like PLAIN, but user turns are replaced by '<image>\n'
    KIMI_VL = auto()    # Kimi-VL chat format: optional system prompt + role-aware separators
27
+
28
+
29
+ @dataclasses.dataclass
30
+ class Conversation:
31
+ """A class that manages prompt templates and keeps all conversation history."""
32
+
33
+ # The name of this template
34
+ name: str
35
+ # The template of the system prompt
36
+ system_template: str = "{system_message}"
37
+ # The system message
38
+ system_message: str = ""
39
+ # The names of two roles
40
+ roles: List[str] = (("USER", "ASSISTANT"),)
41
+ # All messages. Each item is (role, message).
42
+ messages: List[List[str]] = ()
43
+ # The number of few shot examples
44
+ offset: int = 0
45
+ # The separator style and configurations
46
+ sep_style: SeparatorStyle = SeparatorStyle.PLAIN
47
+ sep: str = "\n"
48
+ sep2: str = None
49
+ # Stop criteria (the default one is EOS token)
50
+ stop_str: str = None
51
+ # Stops generation if meeting any token in this list
52
+ stop_token_ids: List[int] = None
53
+
54
+ def get_prompt(self) -> str:
55
+ """Get the prompt for generation."""
56
+ system_prompt = self.system_template.format(system_message=self.system_message)
57
+ if self.sep_style == SeparatorStyle.PLAIN:
58
+ seps = [self.sep, self.sep2]
59
+ ret = ""
60
+ for i, (role, message) in enumerate(self.messages):
61
+ if message:
62
+ if type(message) is tuple:
63
+ message = message[0]
64
+ if i % 2 == 0:
65
+ ret += message + seps[i % 2]
66
+ else:
67
+ ret += message + seps[i % 2]
68
+ else:
69
+ ret += ""
70
+ return ret
71
+ elif self.sep_style == SeparatorStyle.ALIGNMENT:
72
+ seps = [self.sep, self.sep2]
73
+ ret = ""
74
+ for i, (role, message) in enumerate(self.messages):
75
+ if message:
76
+ if type(message) is tuple:
77
+ message, _, _ = message
78
+ if i % 2 == 0:
79
+ ret += '<image>\n' + seps[i % 2]
80
+ else:
81
+ ret += message + seps[i % 2]
82
+ else:
83
+ ret += ""
84
+ return ret
85
+ elif self.sep_style == SeparatorStyle.KIMI_VL:
86
+ seps = [self.sep, self.sep2]
87
+ if system_prompt == "" or system_prompt is None:
88
+ ret = ""
89
+ else:
90
+ ret = system_prompt + seps[0]
91
+ for i, (role, message) in enumerate(self.messages):
92
+ if message:
93
+ if type(message) is tuple:
94
+ message = message[0]
95
+
96
+ if role == "user":
97
+ ret += message + self.sep
98
+ else:
99
+ if self.sep2 is not None:
100
+ ret += message + self.sep2
101
+ else:
102
+ ret += message
103
+ else:
104
+ ret = ret
105
+ return ret
106
+ else:
107
+ raise ValueError(f"Invalid style: {self.sep_style}")
108
+
109
+ def set_system_message(self, system_message: str):
110
+ """Set the system message."""
111
+ self.system_message = system_message
112
+
113
+ def append_message(self, role: str, message: str):
114
+ """Append a new message."""
115
+ self.messages.append([role, message])
116
+
117
+ def update_last_message(self, message: str):
118
+ """Update the last output.
119
+
120
+ The last message is typically set to be None when constructing the prompt,
121
+ so we need to update it in-place after getting the response from a model.
122
+ """
123
+ self.messages[-1][1] = message
124
+
125
+ def reset_message(self):
126
+ """Reset a new message."""
127
+ self.messages = []
128
+
129
+ def to_gradio_chatbot(self):
130
+ """Convert the conversation to gradio chatbot format."""
131
+ ret = []
132
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
133
+ if i % 2 == 0:
134
+ ret.append([msg, None])
135
+ else:
136
+ ret[-1][-1] = msg
137
+ return ret
138
+
139
+ def to_openai_api_messages(self):
140
+ """Convert the conversation to OpenAI chat completion format."""
141
+ system_prompt = self.system_template.format(system_message=self.system_message)
142
+ ret = [{"role": "system", "content": system_prompt}]
143
+
144
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
145
+ if i % 2 == 0:
146
+ ret.append({"role": "user", "content": msg})
147
+ else:
148
+ if msg is not None:
149
+ ret.append({"role": "assistant", "content": msg})
150
+ return ret
151
+
152
+ def copy(self):
153
+ return Conversation(
154
+ name=self.name,
155
+ system_template=self.system_template,
156
+ system_message=self.system_message,
157
+ roles=self.roles,
158
+ messages=[[x, y] for x, y in self.messages],
159
+ offset=self.offset,
160
+ sep_style=self.sep_style,
161
+ sep=self.sep,
162
+ sep2=self.sep2,
163
+ stop_str=self.stop_str,
164
+ stop_token_ids=self.stop_token_ids,
165
+ )
166
+
167
+ def dict(self):
168
+ return {
169
+ "template_name": self.name,
170
+ "system_message": self.system_message,
171
+ "roles": self.roles,
172
+ "messages": self.messages,
173
+ "offset": self.offset,
174
+ }
175
+
176
+
177
# A global registry for all conversation templates
conv_templates: Dict[str, Conversation] = {}


def register_conv_template(template: Conversation, override: bool = False):
    """Register a new conversation template.

    Args:
        template: the template to store, keyed by ``template.name``.
        override: if True, silently replace any existing template of the same name.

    Raises:
        ValueError: if the name is already registered and ``override`` is False.
    """
    if not override and template.name in conv_templates:
        # Was an `assert` originally; a real exception still fires under `python -O`.
        raise ValueError(f"{template.name} has been registered.")

    conv_templates[template.name] = template


def get_conv_template(name: str) -> Conversation:
    """Get a mutable copy of a registered conversation template by name."""
    return conv_templates[name].copy()
192
+
193
+
194
# Plain template: raw concatenation, no roles and no system prompt.
register_conv_template(
    Conversation(
        name="plain",
        system_template="",
        system_message="",
        roles=("", ""),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.PLAIN,
        sep="",
        sep2="",
        stop_token_ids=[100001],
        stop_str=['</s>'],
    )
)


# Alignment template: user turns are rendered as a bare '<image>' token.
register_conv_template(
    Conversation(
        name="alignment",
        system_template="",
        system_message="",
        roles=("", ""),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.ALIGNMENT,
        sep="",
        sep2="",
        stop_token_ids=[100001],
        stop_str=['</s>'],
    )
)

# Kimi-VL chat template: user/assistant roles separated by <|im_end|>.
register_conv_template(
    Conversation(
        name="kimi-vl",
        system_template="{system_message}",
        system_message="You are a helpful assistant",
        roles=("user", "assistant"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.KIMI_VL,
        sep="<|im_end|>",
        sep2=None,
        stop_token_ids=None,
        stop_str=["<|im_end|>"],
    )
)
242
+
243
+
244
def new_chat_template(sft_format: str = "kimi-vl"):
    """Create a fresh Conversation from the registered template named *sft_format*."""
    return get_conv_template(sft_format)


def get_prompt(conv: Conversation) -> str:
    """Get the prompt for generation."""
    return conv.get_prompt()
251
+
252
+
253
def generate_prompt_with_history(text, images, history, processor, max_length=2048):
    """
    Generate a prompt with the chat history, dropping the oldest rounds until it fits.

    Args:
        text (str): The text prompt.
        images (list[PIL.Image.Image]): The image prompt.
        history (list): List of previous conversation messages.
        processor (KimiVLProcessor): The chat processor used for encoding the prompt.
        max_length (int): The maximum prompt length in tokens.

    Returns:
        Conversation | None: a copy of the conversation including the new user turn,
        or None if no prompt could be made to fit within ``max_length``.
    """
    user_role_ind = 0
    bot_role_ind = 1

    # Initialize conversation from the registered Kimi-VL template.
    conversation = new_chat_template(sft_format="kimi-vl")

    if history:
        conversation.messages = history

    if images is not None and len(images) > 0:
        # Use the module logger instead of a stray debug print.
        logger.info("prompt = %s, len(images) = %d", text, len(images))
        # Store the user turn as (text, images) so downstream code can recover both.
        text = (text, images)

    conversation.append_message(conversation.roles[user_role_ind], text)
    conversation.append_message(conversation.roles[bot_role_ind], "")

    # Create a copy of the conversation to avoid history truncation in the UI.
    conversation_copy = conversation.copy()
    logger.info("=" * 80)
    logger.info(get_prompt(conversation))

    rounds = len(conversation.messages) // 2

    for _ in range(rounds):
        current_prompt = get_prompt(conversation)
        assert isinstance(current_prompt, str) and len(current_prompt) > 0, f"current_prompt = {current_prompt}"
        # Tokenize and check the prompt length against the budget.
        if torch.tensor(processor.tokenizer.encode(current_prompt)).size(-1) <= max_length:
            return conversation_copy

        if len(conversation.messages) % 2 != 0:
            # NOTE(review): gr.Error only surfaces in the UI when *raised*;
            # constructing it (as the original code did) is a no-op — confirm intent.
            gr.Error("The messages between user and assistant are not paired.")
            return None

        try:
            for _ in range(2):  # pop out two messages in a row (one user/assistant round)
                conversation.messages.pop(0)
        except IndexError:
            gr.Error("Input text processing failed, unable to respond in this round.")
            return None

    gr.Error("Prompt could not be generated within max_length limit.")
    return None
320
+
321
+
322
def convert_conversation_to_prompts(conversation: Conversation):
    """
    Convert the conversation to prompts.

    Returns a flat list of role/content dicts (user entries also carry their
    images) plus the most recently uploaded image, if any.
    """
    conv_prompts = []
    last_image = None

    messages = conversation.messages
    # Walk the messages two at a time: each user turn is followed by a bot turn.
    for idx in range(0, len(messages), 2):
        user_role, user_content = messages[idx]
        bot_role, bot_content = messages[idx + 1]

        if isinstance(user_content, tuple):
            # A (text, images) pair: unpack and remember the newest image.
            text, images = user_content
            last_image = images[-1]
        else:
            text, images = user_content, []

        conv_prompts.append({"role": user_role, "content": text, "images": images})
        conv_prompts.append({"role": bot_role, "content": bot_content})

    return conv_prompts, last_image
342
+
343
+
344
def to_gradio_chatbot(conversation: Conversation) -> list:
    """Convert the conversation to gradio chatbot format.

    User turns stored as (text, images) tuples get each image inlined as a
    base64 <img> tag prepended to the text; assistant turns fill the second
    slot of each [user, assistant] pair.
    """
    ret = []
    for i, (_, msg) in enumerate(conversation.messages[conversation.offset :]):
        if i % 2 == 0:
            # Even index: user turn (possibly with attached images).
            if type(msg) is tuple:
                # Deep-copy so we don't mutate the stored conversation message.
                msg, images = copy.deepcopy(msg)

                if isinstance(images, list):
                    img_str = ""
                    for j, image in enumerate(images):
                        if isinstance(image, str):
                            # Image given as a file path: read and inline it as base64.
                            with open(image, "rb") as f:
                                data = f.read()
                            img_b64_str = base64.b64encode(data).decode()
                            image_str = (
                                f'<img src="data:image/png;base64,{img_b64_str}" '
                                f'alt="user upload image" style="max-width: 300px; height: auto;" />'
                            )
                        else:
                            # PIL image: helper resizes and inlines it.
                            image_str = pil_to_base64(image, f"user upload image_{j}", max_size=800, min_size=400)

                        img_str += image_str
                    msg = img_str + msg
                else:
                    # Non-list image payload: render the text as-is.
                    pass

            ret.append([msg, None])
        else:
            # Odd index: assistant reply for the preceding user turn.
            ret[-1][-1] = msg
    return ret
375
+
376
+
377
def to_gradio_history(conversation: Conversation):
    """Convert the conversation to gradio history format (messages past the offset)."""
    messages = conversation.messages
    return messages[conversation.offset :]
kimi_vl/serve/examples.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import base64
4
+ from PIL import Image
5
+
6
# (image_paths, prompt) example pairs shown in the demo UI.
EXAMPLES_LIST = [
    [
        ["images/demo1.jpeg"],
        "Where am I?",
    ],
    [
        ["images/demo2.jpeg", "images/demo3.jpeg"],
        "Based on the abstract and introduction above, write a concise and elegant Twitter post that highlights key points and figures without sounding overly promotional. Use English, include emojis and hashtags.",
    ],
    [
        ["images/demo6.jpeg"],
        "Create a role play modeled after this cat."
    ],
    # multi-frame example
    [
        ["images/demo4.jpeg", "images/demo5.jpeg"],
        "Please infer step by step who this manuscript belongs to and what it records."
    ]
]
25
+
26
+
27
def display_example(image_list, root_dir: str = None):
    """Render a list of image paths as an HTML strip of inline base64 thumbnails.

    Args:
        image_list: image file paths (relative to *root_dir* when given).
        root_dir: optional directory prepended to each path.

    Returns:
        str: an HTML fragment with one <img> tag per input image.
    """
    images_html = ""
    # The original used enumerate() but never used the index; iterate directly.
    for img_path in image_list:
        if root_dir is not None:
            img_path = os.path.join(root_dir, img_path)

        image = Image.open(img_path)
        buffered = io.BytesIO()
        # NOTE(review): `quality` is ignored by the PNG writer; kept for parity.
        image.save(buffered, format="PNG", quality=100)
        img_b64_str = base64.b64encode(buffered.getvalue()).decode()
        img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="{img_path}" style="height:80px; margin-right: 10px;" />'
        images_html += img_str

    result_html = f"""
    <div style="display: flex; align-items: center; margin-bottom: 10px;">
        <div style="flex: 1; margin-right: 10px;">{images_html}</div>
    </div>
    """

    return result_html
47
+
48
+
49
def get_examples(root_dir: str = None):
    """Build example rows: [image_paths, rendered_html_preview, prompt_text]."""
    return [
        [images, display_example(images, root_dir), texts]
        for images, texts in EXAMPLES_LIST
    ]
kimi_vl/serve/frontend.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from typing import List, Tuple
4
+
5
+ import gradio as gr
6
+
7
+ from kimi_vl.serve.utils import convert_asis, convert_mdtext, detect_converted_mark
8
+
9
+ ROOT_PATH = os.path.dirname(os.path.abspath(__file__))
10
+
11
+
12
# Custom gradio theme: green primary palette (#02C160 family), WeChat-style
# blue secondary hue, gray neutrals, small corner radius.  `.set(...)` then
# overrides individual component colors; the commented-out entries record
# values that were tried and rejected.
small_and_beautiful_theme = gr.themes.Soft(
    primary_hue=gr.themes.Color(
        c50="#EBFAF2",
        c100="#CFF3E1",
        c200="#A8EAC8",
        c300="#77DEA9",
        c400="#3FD086",
        c500="#02C160",
        c600="#06AE56",
        c700="#05974E",
        c800="#057F45",
        c900="#04673D",
        c950="#2E5541",
        name="small_and_beautiful",
    ),
    # All secondary shades collapse to one blue: the theme uses it as a flat
    # accent regardless of shade requested.
    secondary_hue=gr.themes.Color(
        c50="#576b95",
        c100="#576b95",
        c200="#576b95",
        c300="#576b95",
        c400="#576b95",
        c500="#576b95",
        c600="#576b95",
        c700="#576b95",
        c800="#576b95",
        c900="#576b95",
        c950="#576b95",
    ),
    neutral_hue=gr.themes.Color(
        name="gray",
        c50="#f6f7f8",
        # c100="#f3f4f6",
        c100="#F2F2F2",
        c200="#e5e7eb",
        c300="#d1d5db",
        c400="#B2B2B2",
        c500="#808080",
        c600="#636363",
        c700="#515151",
        c800="#393939",
        # c900="#272727",
        c900="#2B2B2B",
        c950="#171717",
    ),
    radius_size=gr.themes.sizes.radius_sm,
).set(
    # button_primary_background_fill="*primary_500",
    button_primary_background_fill_dark="*primary_600",
    # button_primary_background_fill_hover="*primary_400",
    # button_primary_border_color="*primary_500",
    button_primary_border_color_dark="*primary_600",
    button_primary_text_color="white",
    button_primary_text_color_dark="white",
    button_secondary_background_fill="*neutral_100",
    button_secondary_background_fill_hover="*neutral_50",
    button_secondary_background_fill_dark="*neutral_900",
    button_secondary_text_color="*neutral_800",
    button_secondary_text_color_dark="white",
    # background_fill_primary="#F7F7F7",
    # background_fill_primary_dark="#1F1F1F",
    # block_title_text_color="*primary_500",
    block_title_background_fill_dark="*primary_900",
    block_label_background_fill_dark="*primary_900",
    input_background_fill="#F6F6F6",
    # chatbot_code_background_color_dark="*neutral_950",
)
78
+
79
+
80
def compact_text_chunks(self, prompt, text_chunks: List[str]) -> List[str]:
    """Merge non-empty chunks into one numbered body, then re-split it for the prompt."""
    logging.debug("Compacting text chunks...🚀🚀🚀")
    parts = []
    for chunk in text_chunks:
        chunk = chunk.strip()
        if chunk:
            parts.append(chunk)
    body = "\n\n".join(f"[{idx + 1}] {part}" for idx, part in enumerate(parts))
    # resplit based on self.max_chunk_overlap
    splitter = self.get_text_splitter_given_prompt(prompt, 1, padding=1)
    return splitter.split_text(body)
88
+
89
+
90
def postprocess(y: List[Tuple[str | None, str | None]]) -> List[Tuple[str | None, str | None]]:
    """
    Convert chat history pairs to HTML.

    Parameters:
        y: List of (message, response) tuples, each possibly Markdown.
    Returns:
        List of (message, response) tuples rendered as HTML; entries that
        already carry the converted mark are passed through unchanged.
    """
    if not y:
        return []
    converted = []
    for user, bot in y:
        rendered_user = user if detect_converted_mark(user) else convert_asis(user)
        rendered_bot = bot if detect_converted_mark(bot) else convert_mdtext(bot)
        converted.append((rendered_user, rendered_bot))
    return converted
108
+
109
+
110
# Read the two JS bundles shipped with the app once at import time; they are
# injected into every rendered page by reload_javascript() below.
custom_js_path = os.path.join(ROOT_PATH, "assets/custom.js")
kelpy_codos_path = os.path.join(ROOT_PATH, "assets/Kelpy-Codos.js")

with (
    open(custom_js_path, "r", encoding="utf-8") as f,
    open(kelpy_codos_path, "r", encoding="utf-8") as f2,
):
    customJS = f.read()
    kelpyCodos = f2.read()


def reload_javascript():
    """Monkey-patch gradio's TemplateResponse so every HTML response gets the
    custom <script> tags spliced in just before </html>.

    NOTE(review): relies on the private `gr.routes.templates` attribute —
    newer gradio versions may not expose it; confirm against the pinned version.
    """
    print("Reloading javascript...")
    js = f"<script>{customJS}</script><script>{kelpyCodos}</script>"

    def template_response(*args, **kwargs):
        # Delegate to the original class, then rewrite the body bytes.
        res = GradioTemplateResponseOriginal(*args, **kwargs)
        res.body = res.body.replace(b"</html>", f"{js}</html>".encode("utf8"))
        res.init_headers()
        return res

    gr.routes.templates.TemplateResponse = template_response


# Keep a reference to the unpatched class so template_response can delegate
# to it even after the attribute above is replaced.
GradioTemplateResponseOriginal = gr.routes.templates.TemplateResponse
kimi_vl/serve/gradio_utils.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio utils for the Kimi-VL application.
3
+ """
4
+
5
+ import functools
6
+ from typing import Callable
7
+ import traceback
8
+
9
+ import gradio as gr
10
+
11
+
12
+ IMAGE_TOKEN = "<image>"
13
+
14
+
15
def transfer_input(input_text, input_images):
    """Hand the submitted text/images to the pipeline and clear the input widgets."""
    cleared_textbox = gr.update(value="")
    cleared_uploader = gr.update(value=None)
    visible_stop_button = gr.Button(visible=True)
    return (input_text, input_images, cleared_textbox, cleared_uploader, visible_stop_button)
20
+
21
+
22
def delete_last_conversation(chatbot, history):
    """Drop the most recent exchange from both the chatbot view and the raw history.

    Args:
        chatbot: Rendered (user, bot) pairs shown in the UI.
        history: Flat list of raw messages, expected to alternate user/assistant
            (even length).

    Returns:
        (chatbot, history, status message) tuple.
    """
    if len(history) % 2 != 0:
        # Malformed history: surface an error object and bail out unchanged.
        gr.Error("history length is not even")
        return (chatbot, history, "Delete Done")

    if chatbot:
        chatbot.pop()

    if history and len(history) % 2 == 0:
        # Remove the trailing user/assistant pair.
        history.pop()
        history.pop()

    return (chatbot, history, "Delete Done")
50
+
51
+
52
def reset_state():
    """Clear chatbot, history and image state; return a status message."""
    return [], [], None, "Reset Done"


def reset_textbox():
    """Empty the text input widget."""
    return gr.update(value=""), ""


def cancel_outputing():
    """Return the status message shown when streaming output is cancelled."""
    return "Stop Done"
62
+
63
+
64
class State:
    """Mutable flag used to request interruption of an in-flight generation."""

    # Class-level default; toggled via interrupt()/recover().
    interrupted = False

    def interrupt(self):
        """Request that the current generation stop."""
        self.interrupted = True

    def recover(self):
        """Clear the interruption request."""
        self.interrupted = False


# Single module-wide instance shared by the serving loop.
shared_state = State()
75
+
76
+
77
def wrap_gen_fn(gen_fn: Callable):
    """Decorate a generator so unexpected failures surface as gr.Error."""

    @functools.wraps(gen_fn)
    def safe_generator(prompt, *args, **kwargs):
        try:
            yield from gen_fn(prompt, *args, **kwargs)
        except gr.Error:
            # Already a user-facing error: log the traceback and re-raise as-is.
            traceback.print_exc()
            raise
        except Exception as exc:
            traceback.print_exc()
            raise gr.Error(f"Failed to generate text: {exc}") from exc

    return safe_generator
kimi_vl/serve/inference.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import re
3
+ from threading import Thread
4
+ from typing import List, Optional
5
+
6
+ import torch
7
+ import spaces
8
+ from transformers import (
9
+ AutoModelForCausalLM,
10
+ AutoProcessor,
11
+ AutoConfig,
12
+ StoppingCriteria,
13
+ StoppingCriteriaList,
14
+ TextIteratorStreamer,
15
+ )
16
+
17
+ from .chat_utils import Conversation, get_conv_template
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def load_model(model_path: str = "moonshotai/Kimi-VL-A3B-Thinking"):
    """Load the Kimi-VL model and its processor, forcing flash-attention 2.

    Returns:
        (model, processor) ready for generation.
    """
    # Patch the config before instantiation so every sub-module (vision tower
    # and language model alike) picks up flash_attention_2.
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    for sub_config in (config, config.vision_config, config.text_config):
        sub_config._attn_implementation = "flash_attention_2"
    print("Successfully set the attn_implementation to flash_attention_2")

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        config=config,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True,
    )
    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True)
    return model, processor
40
+
41
+
42
class StoppingCriteriaSub(StoppingCriteria):
    """Stop generation once the sequence tail matches any stop-word id sequence.

    Args:
        stops: List of 1-D LongTensors, each one tokenized stop word.
        encounters: Unused; kept for backward compatibility with callers.
    """

    def __init__(self, stops=None, encounters=1):
        super().__init__()
        # Fix: avoid the mutable-default-argument pitfall, and do not assume a
        # CUDA device at construction time — tensors are moved to the input's
        # device lazily in __call__, so CPU-only runs work too.
        self.stops = list(stops) if stops is not None else []

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        for stop in self.stops:
            # Sequence shorter than the stop word cannot match yet.
            if input_ids.shape[-1] < len(stop):
                continue
            tail = input_ids[0][-len(stop):]
            if torch.all(stop.to(input_ids.device) == tail).item():
                return True

        return False
55
+
56
+
57
def format_messages(
    conversations: list[Conversation],
    system_prompt: Optional[str] = "",
    sft_format: Optional[str] = "kimi-vl",
):
    """
    Fold role/content messages into a conversation template for the model.

    Returns:
        The populated conversation object.
    """
    conversation = get_conv_template(sft_format)
    conversation.set_system_message(system_prompt)
    for message in conversations:
        conversation.append_message(message["role"], message["content"])
    return conversation
70
+
71
+
72
def preprocess(
    messages: list[dict],
    processor,
    sft_format: Optional[str] = "kimi-vl",
):
    """
    Build model inputs (token ids + pixel values) from chat messages.

    Keeps only the last 3 messages and at most the last 2 images per user
    turn; assistant turns have their ◁think▷…◁/think▷ reasoning stripped
    before being fed back as context.
    """
    # Accumulators: `results` is the chat-template message list, `images` the
    # flat image list passed to the processor alongside the text.
    results = []
    images = []

    # Conversation template is used here only for its role names.
    converstion = get_conv_template(sft_format)
    # NOTE(review): this keeps the last 3 *messages*, not 3 full rounds —
    # confirm whether a round (user+assistant pair) limit was intended.
    latest_messages = messages[-3:]
    for mid, message in enumerate(latest_messages):
        if message["role"] == converstion.roles[0] or message["role"] == "user":
            record = {
                "role": message["role"],
                "content": [],
            }
            if "images" in message:
                per_round_images = message["images"]
                # Cap images per turn at the 2 most recent ones.
                if len(per_round_images) > 2:
                    per_round_images = per_round_images[-2:]
                    print(f"Only use the last 2 images in the {mid}-th round")

                images.extend(per_round_images)
                for image in per_round_images:
                    record["content"].append(
                        {
                            "type": "image",
                            "image": image,
                        }
                    )
            if 'content' in message:
                record["content"].append(
                    {
                        "type": "text",
                        "text": str(message["content"]).strip(),
                    }
                )
            results.append(record)
        elif message["role"] == converstion.roles[1] or message["role"] == "assistant":
            formatted_answer = message["content"].strip()
            # A raw reply looks like "◁think▷…internal reasoning…◁/think▷answer".
            # Keep only the text after the closing marker so the model's
            # reasoning is not replayed as conversation context.
            # FIXME: this is a hack to remove the thinking texts
            think_end_token = '◁/think▷'
            formatted_answer = formatted_answer.split(think_end_token)[-1]
            results.append(
                {
                    "role": message["role"],
                    "content": [
                        {
                            "type": "text",
                            "text": formatted_answer,
                        }
                    ],
                }
            )
            # Assistant replies must never contain image placeholder tokens.
            assert (
                formatted_answer.count(processor.image_token) == 0
            ), f"there should be no {processor.image_token} in the assistant's reply, but got {messages}"
            converstion.append_message(converstion.roles[1], formatted_answer)

    text = processor.apply_chat_template(results, add_generation_prompt=True)
    print(f"raw text = {text}")
    # The processor expects None (not an empty list) when there are no images.
    if len(images) == 0:
        images = None

    inputs = processor(
        images=images,
        text=[text],
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    return inputs
154
+
155
+
156
@torch.no_grad()
@torch.inference_mode()
def kimi_vl_generate(
    model: torch.nn.Module,
    processor: AutoProcessor,
    conversations: list[Conversation],
    stop_words: list,
    max_length: int = 256,
    temperature: float = 1.0,
    top_p: float = 1.0,
    chunk_size: int = -1,
):
    """Stream generated text for the given conversation.

    Thin wrapper: converts `conversations` to model inputs via preprocess()
    and delegates to generate(), which yields decoded text fragments.
    All sampling parameters are forwarded unchanged.
    """
    # convert conversation to inputs
    print(f"conversations = {conversations}")
    inputs = preprocess(conversations, processor=processor)
    inputs = inputs.to(model.device)

    return generate(
        model,
        processor,
        inputs,
        max_gen_len=max_length,
        temperature=temperature,
        top_p=top_p,
        stop_words=stop_words,
        chunk_size=chunk_size,
    )
183
+
184
+
185
def generate(
    model,
    processor,
    inputs,
    max_gen_len: int = 256,
    temperature: float = 0,
    top_p: float = 0.95,
    stop_words: Optional[List[str]] = None,
    chunk_size: int = -1,
):
    """Stream the text output from the multimodality model with prompt and image inputs.

    Args:
        model: Loaded causal LM.
        processor: Processor whose tokenizer encodes stop words and decodes output.
        inputs: Pre-tokenized model inputs (e.g. from preprocess()).
        max_gen_len: Maximum number of new tokens to generate.
        temperature: 0 means greedy decoding; >0 enables sampling.
        top_p: Nucleus-sampling threshold (only used when sampling).
        stop_words: Strings whose token sequences terminate generation.
        chunk_size: Unused; kept for interface compatibility.

    Yields:
        Decoded text fragments as they stream from the model.
    """
    # Fix: the original used a mutable default argument (stop_words=[]).
    stop_words = stop_words if stop_words is not None else []
    tokenizer = processor.tokenizer
    stop_words_ids = [torch.tensor(tokenizer.encode(stop_word)) for stop_word in stop_words]
    stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)

    kwargs = dict(
        **inputs,
        max_new_tokens=max_gen_len,
        use_cache=True,
        streamer=streamer,
        stopping_criteria=stopping_criteria,
    )

    # Sampling mode is decided solely by temperature (the original set a
    # redundant do_sample=True that was always overwritten below).
    if temperature > 0:
        kwargs.update(
            {
                "do_sample": True,
                "top_p": top_p,
                "temperature": temperature,
            }
        )
    else:
        kwargs["do_sample"] = False

    # Run generation on a worker thread so this generator can consume the
    # streamer concurrently.
    thread = Thread(target=model.generate, kwargs=kwargs)
    thread.start()

    yield from streamer
kimi_vl/serve/utils.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import html
4
+ import logging
5
+ import io
6
+ import os
7
+ import re
8
+ import base64
9
+ import time
10
+ from PIL import Image, ImageDraw, ImageFont
11
+
12
+ import mdtex2html
13
+ from markdown import markdown
14
+ from pygments import highlight
15
+ from pygments.formatters import HtmlFormatter
16
+ from pygments.lexers import ClassNotFound, get_lexer_by_name, guess_lexer
17
+
18
+
19
# Sentinel appended to HTML that has already been converted, so the
# postprocessing step can skip re-converting it.
ALREADY_CONVERTED_MARK = "<!-- ALREADY CONVERTED BY PARSER. -->"
# RGB colors cycled through when drawing detection boxes (index % 3).
BOX2COLOR = {
    0: (255, 0, 0),
    1: (0, 255, 0),
    2: (0, 0, 255),
}
# Resize bounds (pixels) used by pil_to_base64.
# NOTE(review): MIN_IMAGE_SIZE == MAX_IMAGE_SIZE looks suspicious — confirm
# whether a smaller minimum was intended.
MAX_IMAGE_SIZE = 1024
MIN_IMAGE_SIZE = 1024
# Module logger; handlers are attached by configure_logger().
logger = logging.getLogger("gradio_logger")
28
+
29
+
30
def configure_logger(log_dir: str = "logs"):
    """Configure and return the shared "gradio_logger".

    Creates `log_dir` if needed and attaches one timestamped file handler and
    one console handler, both at INFO level (the logger itself is DEBUG).

    Fix: guard against duplicate handlers — the original attached a fresh pair
    on every call, so repeated calls duplicated every log line.
    """
    logger = logging.getLogger("gradio_logger")
    logger.setLevel(logging.DEBUG)

    if logger.handlers:
        # Already configured (e.g. called twice at startup): reuse as-is.
        return logger

    timestr = time.strftime("%Y%m%d-%H%M%S")
    os.makedirs(log_dir, exist_ok=True)
    file_handler = logging.FileHandler(f"{log_dir}/{timestr}_gradio_log.log")
    console_handler = logging.StreamHandler()

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    console_handler.setFormatter(formatter)
    file_handler.setFormatter(formatter)

    console_handler.setLevel(logging.INFO)
    file_handler.setLevel(logging.INFO)

    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

    return logger
50
+
51
+
52
def strip_stop_words(x, stop_words):
    """Truncate `x` at the first stop word it contains (in stop_words order),
    then strip surrounding whitespace."""
    for stop_word in stop_words:
        cut = x.find(stop_word)
        if cut != -1:
            return x[:cut].strip()
    return x.strip()
57
+
58
+
59
def format_output(history, text, x):
    """Append (text, x) to history; return a markdown-rendered copy and the raw history."""
    updated_history = history + [[text, x]]
    rendered = [[user_msg, convert_to_markdown(reply)] for user_msg, reply in updated_history]
    return rendered, updated_history
63
+
64
+
65
def markdown_to_html_with_syntax_highlight(md_str):  # deprecated
    """Render markdown to HTML, pygments-highlighting fenced code blocks first.

    Marked deprecated upstream; convert_mdtext is the live rendering path.
    """

    def replacer(match):
        # The fence header may omit the language; fall back to plain text.
        lang = match.group(1) or "text"
        code = match.group(2)

        try:
            lexer = get_lexer_by_name(lang, stripall=True)
        except ValueError:
            # pygments raises ClassNotFound (a ValueError subclass) for
            # unknown language names.
            lexer = get_lexer_by_name("text", stripall=True)

        formatter = HtmlFormatter()
        highlighted_code = highlight(code, lexer, formatter)

        return f'<pre><code class="{lang}">{highlighted_code}</code></pre>'

    code_block_pattern = r"```(\w+)?\n([\s\S]+?)\n```"
    md_str = re.sub(code_block_pattern, replacer, md_str, flags=re.MULTILINE)

    html_str = markdown(md_str)
    return html_str
85
+
86
+
87
def normalize_markdown(md_text: str) -> str:  # deprecated
    """Insert blank lines around list blocks so markdown renders them correctly,
    and drop blank lines inside a list unless they terminate it."""
    list_item = re.compile(r"^(\d+\.|-|\*|\+)\s")
    src_lines = md_text.split("\n")
    out_lines = []
    in_list = False

    for idx, raw_line in enumerate(src_lines):
        if list_item.match(raw_line.strip()):
            # Entering a list after non-blank text: add a separating blank line.
            if not in_list and idx > 0 and src_lines[idx - 1].strip() != "":
                out_lines.append("")
            in_list = True
            out_lines.append(raw_line)
        elif in_list and raw_line.strip() == "":
            # Blank line inside a list: keep it only when the list ends here.
            if idx < len(src_lines) - 1 and not list_item.match(src_lines[idx + 1].strip()):
                out_lines.append(raw_line)
        else:
            in_list = False
            out_lines.append(raw_line)

    return "\n".join(out_lines)
107
+
108
+
109
def convert_mdtext(md_text):
    """Convert model markdown output to HTML.

    Non-code segments go through mdtex2html (LaTeX-aware) unless they contain
    inline code (backticks would be mangled, so plain markdown is used);
    fenced code blocks are re-wrapped and syntax-highlighted.  The result is
    suffixed with ALREADY_CONVERTED_MARK so it is never converted twice.
    """
    code_block_pattern = re.compile(r"```(.*?)(?:```|$)", re.DOTALL)
    inline_code_pattern = re.compile(r"`(.*?)`", re.DOTALL)
    code_blocks = code_block_pattern.findall(md_text)
    # split(...)[::2] keeps the text between/around the fenced blocks.
    non_code_parts = code_block_pattern.split(md_text)[::2]

    result = []
    # The trailing "" pads code_blocks so the final non-code part is processed.
    for non_code, code in zip(non_code_parts, code_blocks + [""]):
        if non_code.strip():
            non_code = normalize_markdown(non_code)
            if inline_code_pattern.search(non_code):
                result.append(markdown(non_code, extensions=["tables"]))
            else:
                result.append(mdtex2html.convert(non_code, extensions=["tables"]))
        if code.strip():
            # Re-fence the captured block so the highlighter sees a full block.
            code = f"\n```{code}\n\n```"
            code = markdown_to_html_with_syntax_highlight(code)
            result.append(code)
    result = "".join(result)
    result += ALREADY_CONVERTED_MARK
    return result
130
+
131
+
132
def convert_asis(userinput):
    """Wrap raw user input in an escaped, pre-wrap paragraph and tag it as converted."""
    escaped = html.escape(userinput)
    return f'<p style="white-space:pre-wrap;">{escaped}</p>{ALREADY_CONVERTED_MARK}'
134
+
135
+
136
def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
    """Return True if `s` ends with a stop word OR with a proper prefix of one.

    The prefix check lets a streaming caller hold back text that might be the
    beginning of a stop word still being generated.  Fix: the original only
    matched complete stop words, contradicting the function's name.
    """
    for stop_word in stop_words:
        if s.endswith(stop_word):
            return True
        for i in range(1, len(stop_word)):
            if s.endswith(stop_word[:i]):
                return True
    return False
138
+
139
+
140
def detect_converted_mark(userinput):
    """Return True if `userinput` already carries the ALREADY_CONVERTED_MARK sentinel.

    Fix: tolerate None — postprocess() is typed to allow None message slots,
    and the original raised AttributeError on them.
    """
    return bool(userinput) and userinput.endswith(ALREADY_CONVERTED_MARK)
142
+
143
+
144
def detect_language(code):
    """Split a fenced code-block body into (language tag, remaining code).

    The first line is treated as the language tag unless the block starts
    with a newline (i.e. no tag was given).
    """
    if code.startswith("\n"):
        first_line = ""
    else:
        first_line = code.strip().split("\n", 1)[0]
    if first_line:
        return first_line.lower(), code[len(first_line):].lstrip()
    return "", code
149
+
150
+
151
def convert_to_markdown(text):
    """Escape chat text for markdown rendering while leaving fenced code intact.

    Outside code fences: `$` is HTML-escaped (prevents accidental MathJax),
    leading tabs/spaces become entities, a leading `#` is backslash-escaped,
    and each line gets trailing whitespace before the newline.
    NOTE(review): the exact trailing whitespace in the f-string below may have
    been two spaces (markdown hard break) before this copy — confirm upstream.
    """
    text = text.replace("$", "&#36;")
    text = text.replace("\r\n", "\n")

    def replace_leading_tabs_and_spaces(line):
        # Convert only the leading run of tabs/spaces; stop at first other char.
        new_line = []

        for char in line:
            if char == "\t":
                new_line.append("&#9;")
            elif char == " ":
                new_line.append("&nbsp;")
            else:
                break
        return "".join(new_line) + line[len(new_line):]

    markdown_text = ""
    lines = text.split("\n")
    in_code_block = False

    for line in lines:
        if in_code_block is False and line.startswith("```"):
            in_code_block = True
            markdown_text += f"{line}\n"
        elif in_code_block is True and line.startswith("```"):
            in_code_block = False
            markdown_text += f"{line}\n"
        elif in_code_block:
            # Inside a fence: pass through verbatim.
            markdown_text += f"{line}\n"
        else:
            line = replace_leading_tabs_and_spaces(line)
            # Escape a leading '#' so chat text cannot become a heading.
            line = re.sub(r"^(#)", r"\\\1", line)
            markdown_text += f"{line} \n"

    return markdown_text
186
+
187
+
188
def add_language_tag(text):
    """Annotate untagged fenced code blocks with a language guessed by pygments."""

    def guess_language(code_block):
        # guess_lexer raises ClassNotFound when it cannot identify the code.
        try:
            return guess_lexer(code_block).name.lower()
        except ClassNotFound:
            return ""

    fence_pattern = re.compile(r"(```)(\w*\n[^`]+```)", re.MULTILINE)

    def tag_block(match):
        block_body = match.group(2)
        if block_body.startswith("\n"):
            # No language tag present: try to guess one.
            language = guess_language(block_body)
            return f"```{language}{block_body}```" if language else f"```\n{block_body}```"
        # Already tagged: re-emit with a closing fence appended.
        return match.group(1) + block_body + "```"

    return fence_pattern.sub(tag_block, text)
208
+
209
+
210
def is_variable_assigned(var_name: str) -> bool:
    """Return True if `var_name` is bound as a local variable in the caller's frame.

    Fix: the original tested `var_name in locals()` inside this function, whose
    only local is the parameter itself — it could never observe the caller's
    variables.  Inspect the calling frame instead.
    """
    import inspect

    caller = inspect.currentframe().f_back
    try:
        return var_name in caller.f_locals
    finally:
        # Break the reference cycle that holding a frame object creates.
        del caller
212
+
213
+
214
def pil_to_base64(
    image: Image.Image,
    alt: str = "user upload image",
    resize: bool = True,
    max_size: int = MAX_IMAGE_SIZE,
    min_size: int = MIN_IMAGE_SIZE,
    format: str = "JPEG",
    quality: int = 95,
) -> str:
    """
    Encode a PIL image as an inline <img> tag carrying a base64 data URI.

    Args:
        image: Source image.
        alt: Alt text for the generated tag.
        resize: If True, downscale so the short edge obeys min_size/max_size
            while preserving aspect ratio.
        format: Encoding passed to PIL (e.g. "JPEG", "PNG").
        quality: Encoder quality (meaningful for JPEG).

    Returns:
        A complete `<img>` HTML tag.
    """

    if resize:
        max_hw, min_hw = max(image.size), min(image.size)
        aspect_ratio = max_hw / min_hw
        shortest_edge = int(min(max_size / aspect_ratio, min_size, min_hw))
        longest_edge = int(shortest_edge * aspect_ratio)
        W, H = image.size
        if H > W:
            H, W = longest_edge, shortest_edge
        else:
            H, W = shortest_edge, longest_edge
        image = image.resize((W, H))

    buffered = io.BytesIO()
    image.save(buffered, format=format, quality=quality)
    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
    # Fix: the MIME type must match the actual encoding — the original
    # hard-coded image/png while encoding JPEG by default.
    mime = f"image/{format.lower()}"
    img_str = f'<img src="data:{mime};base64,{img_b64_str}" alt="{alt}" />'

    return img_str
245
+
246
+
247
def parse_ref_bbox(response, image: Image.Image):
    """Draw the <|ref|>/<|det|> grounding annotations from `response` onto a copy of `image`.

    Returns:
        The annotated image, or None when no boxes are present or parsing fails
        (failures are logged, not raised).
    """
    try:
        import ast

        image = image.copy()
        # PIL's Image.size is (width, height); the original unpacked these into
        # swapped names (the math still came out right by a double inversion).
        image_w, image_h = image.size
        draw = ImageDraw.Draw(image)

        refs = re.findall(r'<\|ref\|>.*?<\|/ref\|>', response)
        dets = re.findall(r'<\|det\|>.*?<\|/det\|>', response)
        assert len(refs) == len(dets)

        if len(refs) == 0:
            return None

        boxes, labels = [], []
        for det, ref in zip(dets, refs):
            det = det.replace('<|det|>', '').replace('<|/det|>', '')
            label = ref.replace('<|ref|>', '').replace('<|/ref|>', '')
            det = det[1:-1]  # strip the outer brackets of the box list
            for onebox in re.findall(r'\[.*?\]', det):
                # Fix: literal_eval instead of eval — this text comes from
                # model output and must never be executed as code.
                boxes.append(ast.literal_eval(onebox))
                labels.append(label)

        # Fix: `font` was previously undefined (the truetype call was commented
        # out), so every call raised NameError and returned None via except.
        font = ImageFont.load_default()

        for indice, (box, label) in enumerate(zip(boxes, labels)):
            # Coordinates are normalized to [0, 999]: x scales by width, y by height.
            box = (
                int(box[0] / 999 * image_w),
                int(box[1] / 999 * image_h),
                int(box[2] / 999 * image_w),
                int(box[3] / 999 * image_h),
            )

            box_color = BOX2COLOR[indice % len(BOX2COLOR.keys())]
            box_width = 3
            draw.rectangle(box, outline=box_color, width=box_width)

            text_x = box[0]
            text_y = box[1] - 20
            text_color = box_color
            draw.text((text_x, text_y), label, font=font, fill=text_color)

        return image
    except Exception as e:
        logger.error(f"Error parsing reference bounding boxes: {e}")
        return None