miaoyibo commited on
Commit
bf41562
·
1 Parent(s): 29c1df8
kimi_vl/__init__.py ADDED
File without changes
kimi_vl/serve/__init__.py ADDED
File without changes
kimi_vl/serve/assets/Kelpy-Codos.js ADDED
@@ -0,0 +1,100 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Copyright (c) 2023-2024 DeepSeek.
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ * this software and associated documentation files (the "Software"), to deal in
6
+ * the Software without restriction, including without limitation the rights to
7
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+ * the Software, and to permit persons to whom the Software is furnished to do so,
9
+ * subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in all
12
+ * copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20
+ */
21
+
22
+ // ==UserScript==
23
+ // @name Kelpy Codos
24
+ // @namespace https://github.com/Keldos-Li/Kelpy-Codos
25
+ // @version 1.0.5
26
+ // @author Keldos; https://keldos.me/
27
+ // @description Add copy button to PRE tags before CODE tag, for Chuanhu ChatGPT especially.
28
+ // Based on Chuanhu ChatGPT version: ac04408 (2023-3-22)
29
+ // @license GPL-3.0
30
+ // @grant none
31
+ // ==/UserScript==
32
+
33
(function () {
  "use strict";

  // Add a "copy" button to the <code> child of a <pre> element.
  function addCopyButton(pre) {
    var code = pre.querySelector("code");
    if (!code) {
      return; // No <code> element found, so do not add a button.
    }
    var firstChild = code.firstChild;
    if (!firstChild) {
      return; // The <code> element has no child nodes, so do not add a button.
    }
    var button = document.createElement("button");
    button.textContent = "\uD83D\uDCCE"; // Use the paperclip symbol as the "copy" button label.
    button.style.position = "relative";
    button.style.float = "right";
    button.style.fontSize = "1em"; // Optional: adjust button size.
    button.style.background = "none"; // Optional: remove background color.
    button.style.border = "none"; // Optional: remove border.
    button.style.cursor = "pointer"; // Optional: show pointer cursor.
    button.addEventListener("click", function () {
      // Select the code contents, then use the (deprecated but widely
      // supported) execCommand clipboard path.
      var range = document.createRange();
      range.selectNodeContents(code);
      range.setStartBefore(firstChild); // Start the range before the first child node (skips the button itself).
      var selection = window.getSelection();
      selection.removeAllRanges();
      selection.addRange(range);

      try {
        var success = document.execCommand("copy");
        if (success) {
          button.textContent = "\u2714";
          setTimeout(function () {
            button.textContent = "\uD83D\uDCCE"; // Restore the "copy" label after 2s.
          }, 2000);
        } else {
          button.textContent = "\u2716";
        }
      } catch (e) {
        console.error(e);
        button.textContent = "\u2716";
      }

      selection.removeAllRanges();
    });
    code.insertBefore(button, firstChild); // Insert the button before the first child element.
  }

  // MutationObserver callback: attach a copy button to newly inserted <pre> nodes.
  function handleNewElements(mutationsList, observer) {
    for (var mutation of mutationsList) {
      if (mutation.type === "childList") {
        for (var node of mutation.addedNodes) {
          if (node.nodeName === "PRE") {
            addCopyButton(node);
          }
        }
      }
    }
  }

  // Watch the whole document for dynamically added code blocks.
  var observer = new MutationObserver(handleNewElements);
  observer.observe(document.documentElement, {
    childList: true,
    subtree: true,
  });

  // Handle <pre> blocks that already exist at script load time.
  document.querySelectorAll("pre").forEach(addCopyButton);
})();
kimi_vl/serve/assets/avatar.png ADDED
kimi_vl/serve/assets/custom.css ADDED
@@ -0,0 +1,355 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Copyright (c) 2023-2024 DeepSeek.
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ * this software and associated documentation files (the "Software"), to deal in
6
+ * the Software without restriction, including without limitation the rights to
7
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+ * the Software, and to permit persons to whom the Software is furnished to do so,
9
+ * subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in all
12
+ * copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20
+ */
21
+
22
+ :root {
23
+ --chatbot-color-light: #f3f3f3;
24
+ --chatbot-color-dark: #121111;
25
+ }
26
+
27
+ /* status_display */
28
+ #status_display {
29
+ display: flex;
30
+ min-height: 2.5em;
31
+ align-items: flex-end;
32
+ justify-content: flex-end;
33
+ }
34
+ #status_display p {
35
+ font-size: 0.85em;
36
+ font-family: monospace;
37
+ color: var(--body-text-color-subdued);
38
+ }
39
+
40
+ /* usage_display */
41
+ #usage_display {
42
+ height: 1em;
43
+ }
44
+ #usage_display p {
45
+ padding: 0 1em;
46
+ font-size: 0.85em;
47
+ font-family: monospace;
48
+ color: var(--body-text-color-subdued);
49
+ }
50
+ /* list */
51
+ ol:not(.options),
52
+ ul:not(.options) {
53
+ padding-inline-start: 2em !important;
54
+ }
55
+
56
+ /* Thank @Keldos-Li for fixing it */
57
+ /* Light mode (default) */
58
+ #deepseek_chatbot {
59
+ background-color: var(--chatbot-color-light) !important;
60
+ color: #000000 !important;
61
+ }
62
+ [data-testid="bot"] {
63
+ background-color: #ffffff !important;
64
+ }
65
+ [data-testid="user"] {
66
+ background-color: #95ec69 !important;
67
+ }
68
+
69
+ /* Dark mode */
70
+ .dark #deepseek_chatbot {
71
+ background-color: var(--chatbot-color-dark) !important;
72
+ color: #ffffff !important;
73
+ }
74
+ .dark [data-testid="bot"] {
75
+ background-color: #2c2c2c !important;
76
+ }
77
+ .dark [data-testid="user"] {
78
+ background-color: #26b561 !important;
79
+ }
80
+
81
+ #deepseek_chatbot {
82
+ height: 100%;
83
+ min-height: 800px;
84
+ flex-grow: 1;
85
+ overflow: auto;
86
+ }
87
+
88
+ [class*="message"] {
89
+ border-radius: var(--radius-xl) !important;
90
+ border: none;
91
+ padding: var(--spacing-xl) !important;
92
+ font-size: var(--text-md) !important;
93
+ line-height: var(--line-md) !important;
94
+ min-height: calc(var(--text-md) * var(--line-md) + 2 * var(--spacing-xl));
95
+ min-width: calc(var(--text-md) * var(--line-md) + 2 * var(--spacing-xl));
96
+ }
97
+ [data-testid="bot"] {
98
+ max-width: 85%;
99
+ border-bottom-left-radius: 0 !important;
100
+ }
101
+ [data-testid="user"] {
102
+ max-width: 85%;
103
+ width: auto !important;
104
+ border-bottom-right-radius: 0 !important;
105
+ }
106
+ /* Table */
107
+ table {
108
+ margin: 1em 0;
109
+ border-collapse: collapse;
110
+ empty-cells: show;
111
+ }
112
+ td,
113
+ th {
114
+ border: 1.2px solid var(--border-color-primary) !important;
115
+ padding: 0.2em;
116
+ }
117
+ thead {
118
+ background-color: rgba(175, 184, 193, 0.2);
119
+ }
120
+ thead th {
121
+ padding: 0.5em 0.2em;
122
+ }
123
+ /* Inline code */
124
+ #deepseek_chatbot code {
125
+ display: inline;
126
+ white-space: break-spaces;
127
+ border-radius: 6px;
128
+ margin: 0 2px 0 2px;
129
+ padding: 0.2em 0.4em 0.1em 0.4em;
130
+ background-color: rgba(175, 184, 193, 0.2);
131
+ }
132
+ /* Code block */
133
+ #deepseek_chatbot pre code {
134
+ display: block;
135
+ overflow: auto;
136
+ white-space: pre;
137
+ background-color: #1c1d1e !important;
138
+ border-radius: 10px;
139
+ padding: 1.4em 1.2em 0em 1.4em;
140
+ margin: 1.2em 2em 1.2em 0.5em;
141
+ color: #fdf8f8;
142
+ box-shadow: 6px 6px 16px hsla(0, 0%, 0%, 0.2);
143
+ }
144
+ /* Highlight */
145
+ #deepseek_chatbot .highlight {
146
+ background-color: transparent;
147
+ }
148
+ #deepseek_chatbot .highlight .hll {
149
+ background-color: #49483e;
150
+ }
151
+ #deepseek_chatbot .highlight .c {
152
+ color: #75715e;
153
+ } /* Comment */
154
+ #deepseek_chatbot .highlight .err {
155
+ color: #960050;
156
+ background-color: #1e0010;
157
+ } /* Error */
158
+ #deepseek_chatbot .highlight .k {
159
+ color: #66d9ef;
160
+ } /* Keyword */
161
+ #deepseek_chatbot .highlight .l {
162
+ color: #ae81ff;
163
+ } /* Literal */
164
+ #deepseek_chatbot .highlight .n {
165
+ color: #f8f8f2;
166
+ } /* Name */
167
+ #deepseek_chatbot .highlight .o {
168
+ color: #f92672;
169
+ } /* Operator */
170
+ #deepseek_chatbot .highlight .p {
171
+ color: #f8f8f2;
172
+ } /* Punctuation */
173
+ #deepseek_chatbot .highlight .ch {
174
+ color: #75715e;
175
+ } /* Comment.Hashbang */
176
+ #deepseek_chatbot .highlight .cm {
177
+ color: #75715e;
178
+ } /* Comment.Multiline */
179
+ #deepseek_chatbot .highlight .cp {
180
+ color: #75715e;
181
+ } /* Comment.Preproc */
182
+ #deepseek_chatbot .highlight .cpf {
183
+ color: #75715e;
184
+ } /* Comment.PreprocFile */
185
+ #deepseek_chatbot .highlight .c1 {
186
+ color: #75715e;
187
+ } /* Comment.Single */
188
+ #deepseek_chatbot .highlight .cs {
189
+ color: #75715e;
190
+ } /* Comment.Special */
191
+ #deepseek_chatbot .highlight .gd {
192
+ color: #f92672;
193
+ } /* Generic.Deleted */
194
+ #deepseek_chatbot .highlight .ge {
195
+ font-style: italic;
196
+ } /* Generic.Emph */
197
+ #deepseek_chatbot .highlight .gi {
198
+ color: #a6e22e;
199
+ } /* Generic.Inserted */
200
+ #deepseek_chatbot .highlight .gs {
201
+ font-weight: bold;
202
+ } /* Generic.Strong */
203
+ #deepseek_chatbot .highlight .gu {
204
+ color: #75715e;
205
+ } /* Generic.Subheading */
206
+ #deepseek_chatbot .highlight .kc {
207
+ color: #66d9ef;
208
+ } /* Keyword.Constant */
209
+ #deepseek_chatbot .highlight .kd {
210
+ color: #66d9ef;
211
+ } /* Keyword.Declaration */
212
+ #deepseek_chatbot .highlight .kn {
213
+ color: #f92672;
214
+ } /* Keyword.Namespace */
215
+ #deepseek_chatbot .highlight .kp {
216
+ color: #66d9ef;
217
+ } /* Keyword.Pseudo */
218
+ #deepseek_chatbot .highlight .kr {
219
+ color: #66d9ef;
220
+ } /* Keyword.Reserved */
221
+ #deepseek_chatbot .highlight .kt {
222
+ color: #66d9ef;
223
+ } /* Keyword.Type */
224
+ #deepseek_chatbot .highlight .ld {
225
+ color: #e6db74;
226
+ } /* Literal.Date */
227
+ #deepseek_chatbot .highlight .m {
228
+ color: #ae81ff;
229
+ } /* Literal.Number */
230
+ #deepseek_chatbot .highlight .s {
231
+ color: #e6db74;
232
+ } /* Literal.String */
233
+ #deepseek_chatbot .highlight .na {
234
+ color: #a6e22e;
235
+ } /* Name.Attribute */
236
+ #deepseek_chatbot .highlight .nb {
237
+ color: #f8f8f2;
238
+ } /* Name.Builtin */
239
+ #deepseek_chatbot .highlight .nc {
240
+ color: #a6e22e;
241
+ } /* Name.Class */
242
+ #deepseek_chatbot .highlight .no {
243
+ color: #66d9ef;
244
+ } /* Name.Constant */
245
+ #deepseek_chatbot .highlight .nd {
246
+ color: #a6e22e;
247
+ } /* Name.Decorator */
248
+ #deepseek_chatbot .highlight .ni {
249
+ color: #f8f8f2;
250
+ } /* Name.Entity */
251
+ #deepseek_chatbot .highlight .ne {
252
+ color: #a6e22e;
253
+ } /* Name.Exception */
254
+ #deepseek_chatbot .highlight .nf {
255
+ color: #a6e22e;
256
+ } /* Name.Function */
257
+ #deepseek_chatbot .highlight .nl {
258
+ color: #f8f8f2;
259
+ } /* Name.Label */
260
+ #deepseek_chatbot .highlight .nn {
261
+ color: #f8f8f2;
262
+ } /* Name.Namespace */
263
+ #deepseek_chatbot .highlight .nx {
264
+ color: #a6e22e;
265
+ } /* Name.Other */
266
+ #deepseek_chatbot .highlight .py {
267
+ color: #f8f8f2;
268
+ } /* Name.Property */
269
+ #deepseek_chatbot .highlight .nt {
270
+ color: #f92672;
271
+ } /* Name.Tag */
272
+ #deepseek_chatbot .highlight .nv {
273
+ color: #f8f8f2;
274
+ } /* Name.Variable */
275
+ #deepseek_chatbot .highlight .ow {
276
+ color: #f92672;
277
+ } /* Operator.Word */
278
+ #deepseek_chatbot .highlight .w {
279
+ color: #f8f8f2;
280
+ } /* Text.Whitespace */
281
+ #deepseek_chatbot .highlight .mb {
282
+ color: #ae81ff;
283
+ } /* Literal.Number.Bin */
284
+ #deepseek_chatbot .highlight .mf {
285
+ color: #ae81ff;
286
+ } /* Literal.Number.Float */
287
+ #deepseek_chatbot .highlight .mh {
288
+ color: #ae81ff;
289
+ } /* Literal.Number.Hex */
290
+ #deepseek_chatbot .highlight .mi {
291
+ color: #ae81ff;
292
+ } /* Literal.Number.Integer */
293
+ #deepseek_chatbot .highlight .mo {
294
+ color: #ae81ff;
295
+ } /* Literal.Number.Oct */
296
+ #deepseek_chatbot .highlight .sa {
297
+ color: #e6db74;
298
+ } /* Literal.String.Affix */
299
+ #deepseek_chatbot .highlight .sb {
300
+ color: #e6db74;
301
+ } /* Literal.String.Backtick */
302
+ #deepseek_chatbot .highlight .sc {
303
+ color: #e6db74;
304
+ } /* Literal.String.Char */
305
+ #deepseek_chatbot .highlight .dl {
306
+ color: #e6db74;
307
+ } /* Literal.String.Delimiter */
308
+ #deepseek_chatbot .highlight .sd {
309
+ color: #e6db74;
310
+ } /* Literal.String.Doc */
311
+ #deepseek_chatbot .highlight .s2 {
312
+ color: #e6db74;
313
+ } /* Literal.String.Double */
314
+ #deepseek_chatbot .highlight .se {
315
+ color: #ae81ff;
316
+ } /* Literal.String.Escape */
317
+ #deepseek_chatbot .highlight .sh {
318
+ color: #e6db74;
319
+ } /* Literal.String.Heredoc */
320
+ #deepseek_chatbot .highlight .si {
321
+ color: #e6db74;
322
+ } /* Literal.String.Interpol */
323
+ #deepseek_chatbot .highlight .sx {
324
+ color: #e6db74;
325
+ } /* Literal.String.Other */
326
+ #deepseek_chatbot .highlight .sr {
327
+ color: #e6db74;
328
+ } /* Literal.String.Regex */
329
+ #deepseek_chatbot .highlight .s1 {
330
+ color: #e6db74;
331
+ } /* Literal.String.Single */
332
+ #deepseek_chatbot .highlight .ss {
333
+ color: #e6db74;
334
+ } /* Literal.String.Symbol */
335
+ #deepseek_chatbot .highlight .bp {
336
+ color: #f8f8f2;
337
+ } /* Name.Builtin.Pseudo */
338
+ #deepseek_chatbot .highlight .fm {
339
+ color: #a6e22e;
340
+ } /* Name.Function.Magic */
341
+ #deepseek_chatbot .highlight .vc {
342
+ color: #f8f8f2;
343
+ } /* Name.Variable.Class */
344
+ #deepseek_chatbot .highlight .vg {
345
+ color: #f8f8f2;
346
+ } /* Name.Variable.Global */
347
+ #deepseek_chatbot .highlight .vi {
348
+ color: #f8f8f2;
349
+ } /* Name.Variable.Instance */
350
+ #deepseek_chatbot .highlight .vm {
351
+ color: #f8f8f2;
352
+ } /* Name.Variable.Magic */
353
+ #deepseek_chatbot .highlight .il {
354
+ color: #ae81ff;
355
+ } /* Literal.Number.Integer.Long */
kimi_vl/serve/assets/custom.js ADDED
@@ -0,0 +1,22 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ /**
2
+ * Copyright (c) 2023-2024 DeepSeek.
3
+ *
4
+ * Permission is hereby granted, free of charge, to any person obtaining a copy of
5
+ * this software and associated documentation files (the "Software"), to deal in
6
+ * the Software without restriction, including without limitation the rights to
7
+ * use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of
8
+ * the Software, and to permit persons to whom the Software is furnished to do so,
9
+ * subject to the following conditions:
10
+ *
11
+ * The above copyright notice and this permission notice shall be included in all
12
+ * copies or substantial portions of the Software.
13
+ *
14
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
15
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS
16
+ * FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR
17
+ * COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER
18
+ * IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN
19
+ * CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
20
+ */
21
+
22
+ // custom javascript here
kimi_vl/serve/assets/favicon.ico ADDED
kimi_vl/serve/chat_utils.py ADDED
@@ -0,0 +1,379 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ From https://github.com/lm-sys/FastChat/blob/main/fastchat/conversation.py
3
+ """
4
+
5
+ import dataclasses
6
+ import logging
7
+ import copy
8
+ from enum import IntEnum, auto
9
+ from typing import Dict, List
10
+ import base64
11
+
12
+ import gradio as gr
13
+ import torch
14
+
15
+ from .utils import pil_to_base64
16
+
17
+ IMAGE_TOKEN = "<image>"
18
+ logger = logging.getLogger("gradio_logger")
19
+
20
+
21
class SeparatorStyle(IntEnum):
    """Separator styles used when rendering a Conversation into a prompt string."""

    PLAIN = auto()      # bare concatenation of messages with sep/sep2
    ALIGNMENT = auto()  # like PLAIN, but user turns are replaced by '<image>\n'
    KIMI_VL = auto()    # Kimi-VL chat format: optional system prompt + role-aware separators
27
+
28
+
29
+ @dataclasses.dataclass
30
+ class Conversation:
31
+ """A class that manages prompt templates and keeps all conversation history."""
32
+
33
+ # The name of this template
34
+ name: str
35
+ # The template of the system prompt
36
+ system_template: str = "{system_message}"
37
+ # The system message
38
+ system_message: str = ""
39
+ # The names of two roles
40
+ roles: List[str] = (("USER", "ASSISTANT"),)
41
+ # All messages. Each item is (role, message).
42
+ messages: List[List[str]] = ()
43
+ # The number of few shot examples
44
+ offset: int = 0
45
+ # The separator style and configurations
46
+ sep_style: SeparatorStyle = SeparatorStyle.PLAIN
47
+ sep: str = "\n"
48
+ sep2: str = None
49
+ # Stop criteria (the default one is EOS token)
50
+ stop_str: str = None
51
+ # Stops generation if meeting any token in this list
52
+ stop_token_ids: List[int] = None
53
+
54
+ def get_prompt(self) -> str:
55
+ """Get the prompt for generation."""
56
+ system_prompt = self.system_template.format(system_message=self.system_message)
57
+ if self.sep_style == SeparatorStyle.PLAIN:
58
+ seps = [self.sep, self.sep2]
59
+ ret = ""
60
+ for i, (role, message) in enumerate(self.messages):
61
+ if message:
62
+ if type(message) is tuple:
63
+ message = message[0]
64
+ if i % 2 == 0:
65
+ ret += message + seps[i % 2]
66
+ else:
67
+ ret += message + seps[i % 2]
68
+ else:
69
+ ret += ""
70
+ return ret
71
+ elif self.sep_style == SeparatorStyle.ALIGNMENT:
72
+ seps = [self.sep, self.sep2]
73
+ ret = ""
74
+ for i, (role, message) in enumerate(self.messages):
75
+ if message:
76
+ if type(message) is tuple:
77
+ message, _, _ = message
78
+ if i % 2 == 0:
79
+ ret += '<image>\n' + seps[i % 2]
80
+ else:
81
+ ret += message + seps[i % 2]
82
+ else:
83
+ ret += ""
84
+ return ret
85
+ elif self.sep_style == SeparatorStyle.KIMI_VL:
86
+ seps = [self.sep, self.sep2]
87
+ if system_prompt == "" or system_prompt is None:
88
+ ret = ""
89
+ else:
90
+ ret = system_prompt + seps[0]
91
+ for i, (role, message) in enumerate(self.messages):
92
+ if message:
93
+ if type(message) is tuple:
94
+ message = message[0]
95
+
96
+ if role == "user":
97
+ ret += message + self.sep
98
+ else:
99
+ if self.sep2 is not None:
100
+ ret += message + self.sep2
101
+ else:
102
+ ret += message
103
+ else:
104
+ ret = ret
105
+ return ret
106
+ else:
107
+ raise ValueError(f"Invalid style: {self.sep_style}")
108
+
109
+ def set_system_message(self, system_message: str):
110
+ """Set the system message."""
111
+ self.system_message = system_message
112
+
113
+ def append_message(self, role: str, message: str):
114
+ """Append a new message."""
115
+ self.messages.append([role, message])
116
+
117
+ def update_last_message(self, message: str):
118
+ """Update the last output.
119
+
120
+ The last message is typically set to be None when constructing the prompt,
121
+ so we need to update it in-place after getting the response from a model.
122
+ """
123
+ self.messages[-1][1] = message
124
+
125
+ def reset_message(self):
126
+ """Reset a new message."""
127
+ self.messages = []
128
+
129
+ def to_gradio_chatbot(self):
130
+ """Convert the conversation to gradio chatbot format."""
131
+ ret = []
132
+ for i, (role, msg) in enumerate(self.messages[self.offset :]):
133
+ if i % 2 == 0:
134
+ ret.append([msg, None])
135
+ else:
136
+ ret[-1][-1] = msg
137
+ return ret
138
+
139
+ def to_openai_api_messages(self):
140
+ """Convert the conversation to OpenAI chat completion format."""
141
+ system_prompt = self.system_template.format(system_message=self.system_message)
142
+ ret = [{"role": "system", "content": system_prompt}]
143
+
144
+ for i, (_, msg) in enumerate(self.messages[self.offset :]):
145
+ if i % 2 == 0:
146
+ ret.append({"role": "user", "content": msg})
147
+ else:
148
+ if msg is not None:
149
+ ret.append({"role": "assistant", "content": msg})
150
+ return ret
151
+
152
+ def copy(self):
153
+ return Conversation(
154
+ name=self.name,
155
+ system_template=self.system_template,
156
+ system_message=self.system_message,
157
+ roles=self.roles,
158
+ messages=[[x, y] for x, y in self.messages],
159
+ offset=self.offset,
160
+ sep_style=self.sep_style,
161
+ sep=self.sep,
162
+ sep2=self.sep2,
163
+ stop_str=self.stop_str,
164
+ stop_token_ids=self.stop_token_ids,
165
+ )
166
+
167
+ def dict(self):
168
+ return {
169
+ "template_name": self.name,
170
+ "system_message": self.system_message,
171
+ "roles": self.roles,
172
+ "messages": self.messages,
173
+ "offset": self.offset,
174
+ }
175
+
176
+
177
# A global registry for all conversation templates
conv_templates: Dict[str, Conversation] = {}


def register_conv_template(template: Conversation, override: bool = False):
    """Register a new conversation template.

    Args:
        template: the template to store, keyed by ``template.name``.
        override: if True, silently replace any existing template of the same name.

    Raises:
        ValueError: if the name is already registered and ``override`` is False.
    """
    if not override and template.name in conv_templates:
        # Was an `assert` originally; a real exception still fires under `python -O`.
        raise ValueError(f"{template.name} has been registered.")

    conv_templates[template.name] = template


def get_conv_template(name: str) -> Conversation:
    """Get a mutable copy of a registered conversation template by name."""
    return conv_templates[name].copy()
192
+
193
+
194
# Plain template: raw concatenation, no roles and no system prompt.
register_conv_template(
    Conversation(
        name="plain",
        system_template="",
        system_message="",
        roles=("", ""),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.PLAIN,
        sep="",
        sep2="",
        stop_token_ids=[100001],
        stop_str=['</s>'],
    )
)


# Alignment template: user turns are rendered as a bare '<image>' token.
register_conv_template(
    Conversation(
        name="alignment",
        system_template="",
        system_message="",
        roles=("", ""),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.ALIGNMENT,
        sep="",
        sep2="",
        stop_token_ids=[100001],
        stop_str=['</s>'],
    )
)

# Kimi-VL chat template: user/assistant roles separated by <|im_end|>.
register_conv_template(
    Conversation(
        name="kimi-vl",
        system_template="{system_message}",
        system_message="You are a helpful assistant",
        roles=("user", "assistant"),
        messages=(),
        offset=0,
        sep_style=SeparatorStyle.KIMI_VL,
        sep="<|im_end|>",
        sep2=None,
        stop_token_ids=None,
        stop_str=["<|im_end|>"],
    )
)
242
+
243
+
244
def new_chat_template(sft_format: str = "kimi-vl"):
    """Create a fresh Conversation from the registered template named *sft_format*."""
    return get_conv_template(sft_format)


def get_prompt(conv: Conversation) -> str:
    """Get the prompt for generation."""
    return conv.get_prompt()
251
+
252
+
253
def generate_prompt_with_history(text, images, history, processor, max_length=2048):
    """
    Generate a prompt with the chat history, dropping the oldest rounds until it fits.

    Args:
        text (str): The text prompt.
        images (list[PIL.Image.Image]): The image prompt.
        history (list): List of previous conversation messages.
        processor (KimiVLProcessor): The chat processor used for encoding the prompt.
        max_length (int): The maximum prompt length in tokens.

    Returns:
        Conversation | None: a copy of the conversation including the new user turn,
        or None if no prompt could be made to fit within ``max_length``.
    """
    user_role_ind = 0
    bot_role_ind = 1

    # Initialize conversation from the registered Kimi-VL template.
    conversation = new_chat_template(sft_format="kimi-vl")

    if history:
        conversation.messages = history

    if images is not None and len(images) > 0:
        # Use the module logger instead of a stray debug print.
        logger.info("prompt = %s, len(images) = %d", text, len(images))
        # Store the user turn as (text, images) so downstream code can recover both.
        text = (text, images)

    conversation.append_message(conversation.roles[user_role_ind], text)
    conversation.append_message(conversation.roles[bot_role_ind], "")

    # Create a copy of the conversation to avoid history truncation in the UI.
    conversation_copy = conversation.copy()
    logger.info("=" * 80)
    logger.info(get_prompt(conversation))

    rounds = len(conversation.messages) // 2

    for _ in range(rounds):
        current_prompt = get_prompt(conversation)
        assert isinstance(current_prompt, str) and len(current_prompt) > 0, f"current_prompt = {current_prompt}"
        # Tokenize and check the prompt length against the budget.
        if torch.tensor(processor.tokenizer.encode(current_prompt)).size(-1) <= max_length:
            return conversation_copy

        if len(conversation.messages) % 2 != 0:
            # NOTE(review): gr.Error only surfaces in the UI when *raised*;
            # constructing it (as the original code did) is a no-op — confirm intent.
            gr.Error("The messages between user and assistant are not paired.")
            return None

        try:
            for _ in range(2):  # pop out two messages in a row (one user/assistant round)
                conversation.messages.pop(0)
        except IndexError:
            gr.Error("Input text processing failed, unable to respond in this round.")
            return None

    gr.Error("Prompt could not be generated within max_length limit.")
    return None
320
+
321
+
322
def convert_conversation_to_prompts(conversation: Conversation):
    """
    Convert the conversation to prompts.

    Returns a flat list of role/content dicts (user entries also carry their
    images) plus the most recently uploaded image, if any.
    """
    conv_prompts = []
    last_image = None

    messages = conversation.messages
    # Walk the messages two at a time: each user turn is followed by a bot turn.
    for idx in range(0, len(messages), 2):
        user_role, user_content = messages[idx]
        bot_role, bot_content = messages[idx + 1]

        if isinstance(user_content, tuple):
            # A (text, images) pair: unpack and remember the newest image.
            text, images = user_content
            last_image = images[-1]
        else:
            text, images = user_content, []

        conv_prompts.append({"role": user_role, "content": text, "images": images})
        conv_prompts.append({"role": bot_role, "content": bot_content})

    return conv_prompts, last_image
342
+
343
+
344
def to_gradio_chatbot(conversation: Conversation) -> list:
    """Convert the conversation to gradio chatbot format.

    User turns stored as (text, images) tuples get each image inlined as a
    base64 <img> tag prepended to the text; assistant turns fill the second
    slot of each [user, assistant] pair.
    """
    ret = []
    for i, (_, msg) in enumerate(conversation.messages[conversation.offset :]):
        if i % 2 == 0:
            # Even index: user turn (possibly with attached images).
            if type(msg) is tuple:
                # Deep-copy so we don't mutate the stored conversation message.
                msg, images = copy.deepcopy(msg)

                if isinstance(images, list):
                    img_str = ""
                    for j, image in enumerate(images):
                        if isinstance(image, str):
                            # Image given as a file path: read and inline it as base64.
                            with open(image, "rb") as f:
                                data = f.read()
                            img_b64_str = base64.b64encode(data).decode()
                            image_str = (
                                f'<img src="data:image/png;base64,{img_b64_str}" '
                                f'alt="user upload image" style="max-width: 300px; height: auto;" />'
                            )
                        else:
                            # PIL image: helper resizes and inlines it.
                            image_str = pil_to_base64(image, f"user upload image_{j}", max_size=800, min_size=400)

                        img_str += image_str
                    msg = img_str + msg
                else:
                    # Non-list image payload: render the text as-is.
                    pass

            ret.append([msg, None])
        else:
            # Odd index: assistant reply for the preceding user turn.
            ret[-1][-1] = msg
    return ret
375
+
376
+
377
def to_gradio_history(conversation: Conversation):
    """Convert the conversation to gradio history format (messages past the offset)."""
    messages = conversation.messages
    return messages[conversation.offset :]
kimi_vl/serve/examples.py ADDED
@@ -0,0 +1,54 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import os
2
+ import io
3
+ import base64
4
+ from PIL import Image
5
+
6
# (image_paths, prompt) example pairs shown in the demo UI.
EXAMPLES_LIST = [
    [
        ["images/demo1.jpeg"],
        "Where am I?",
    ],
    [
        ["images/demo2.jpeg", "images/demo3.jpeg"],
        "Based on the abstract and introduction above, write a concise and elegant Twitter post that highlights key points and figures without sounding overly promotional. Use English, include emojis and hashtags.",
    ],
    [
        ["images/demo6.jpeg"],
        "Create a role play modeled after this cat."
    ],
    # multi-frame example
    [
        ["images/demo4.jpeg", "images/demo5.jpeg"],
        "Please infer step by step who this manuscript belongs to and what it records."
    ]
]
25
+
26
+
27
def display_example(image_list, root_dir: str = None):
    """Render a list of image paths as an HTML strip of inline base64 thumbnails.

    Args:
        image_list: image file paths (relative to *root_dir* when given).
        root_dir: optional directory prepended to each path.

    Returns:
        str: an HTML fragment with one <img> tag per input image.
    """
    images_html = ""
    # The original used enumerate() but never used the index; iterate directly.
    for img_path in image_list:
        if root_dir is not None:
            img_path = os.path.join(root_dir, img_path)

        image = Image.open(img_path)
        buffered = io.BytesIO()
        # NOTE(review): `quality` is ignored by the PNG writer; kept for parity.
        image.save(buffered, format="PNG", quality=100)
        img_b64_str = base64.b64encode(buffered.getvalue()).decode()
        img_str = f'<img src="data:image/png;base64,{img_b64_str}" alt="{img_path}" style="height:80px; margin-right: 10px;" />'
        images_html += img_str

    result_html = f"""
    <div style="display: flex; align-items: center; margin-bottom: 10px;">
        <div style="flex: 1; margin-right: 10px;">{images_html}</div>
    </div>
    """

    return result_html
47
+
48
+
49
def get_examples(root_dir: str = None):
    """Build example rows: [image_paths, rendered_html_preview, prompt_text]."""
    return [
        [images, display_example(images, root_dir), texts]
        for images, texts in EXAMPLES_LIST
    ]
kimi_vl/serve/frontend.py ADDED
@@ -0,0 +1,134 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import os
3
+ from typing import List, Tuple
4
+
5
+ import gradio as gr
6
+
7
+ from kimi_vl.serve.utils import convert_asis, convert_mdtext, detect_converted_mark
8
+
9
+ ROOT_PATH = os.path.dirname(os.path.abspath(__file__))
10
+
11
+
12
# Custom gradio theme: green primary palette (#02C160 family), WeChat-style
# blue secondary hue, gray neutrals, small corner radius.  `.set(...)` then
# overrides individual component colors; the commented-out entries record
# values that were tried and rejected.
small_and_beautiful_theme = gr.themes.Soft(
    primary_hue=gr.themes.Color(
        c50="#EBFAF2",
        c100="#CFF3E1",
        c200="#A8EAC8",
        c300="#77DEA9",
        c400="#3FD086",
        c500="#02C160",
        c600="#06AE56",
        c700="#05974E",
        c800="#057F45",
        c900="#04673D",
        c950="#2E5541",
        name="small_and_beautiful",
    ),
    # All secondary shades collapse to one blue: the theme uses it as a flat
    # accent regardless of shade requested.
    secondary_hue=gr.themes.Color(
        c50="#576b95",
        c100="#576b95",
        c200="#576b95",
        c300="#576b95",
        c400="#576b95",
        c500="#576b95",
        c600="#576b95",
        c700="#576b95",
        c800="#576b95",
        c900="#576b95",
        c950="#576b95",
    ),
    neutral_hue=gr.themes.Color(
        name="gray",
        c50="#f6f7f8",
        # c100="#f3f4f6",
        c100="#F2F2F2",
        c200="#e5e7eb",
        c300="#d1d5db",
        c400="#B2B2B2",
        c500="#808080",
        c600="#636363",
        c700="#515151",
        c800="#393939",
        # c900="#272727",
        c900="#2B2B2B",
        c950="#171717",
    ),
    radius_size=gr.themes.sizes.radius_sm,
).set(
    # button_primary_background_fill="*primary_500",
    button_primary_background_fill_dark="*primary_600",
    # button_primary_background_fill_hover="*primary_400",
    # button_primary_border_color="*primary_500",
    button_primary_border_color_dark="*primary_600",
    button_primary_text_color="white",
    button_primary_text_color_dark="white",
    button_secondary_background_fill="*neutral_100",
    button_secondary_background_fill_hover="*neutral_50",
    button_secondary_background_fill_dark="*neutral_900",
    button_secondary_text_color="*neutral_800",
    button_secondary_text_color_dark="white",
    # background_fill_primary="#F7F7F7",
    # background_fill_primary_dark="#1F1F1F",
    # block_title_text_color="*primary_500",
    block_title_background_fill_dark="*primary_900",
    block_label_background_fill_dark="*primary_900",
    input_background_fill="#F6F6F6",
    # chatbot_code_background_color_dark="*neutral_950",
)
78
+
79
+
80
def compact_text_chunks(self, prompt, text_chunks: List[str]) -> List[str]:
    """Merge non-empty chunks into one numbered body, then re-split it for the prompt."""
    logging.debug("Compacting text chunks...🚀🚀🚀")
    parts = []
    for chunk in text_chunks:
        chunk = chunk.strip()
        if chunk:
            parts.append(chunk)
    body = "\n\n".join(f"[{idx + 1}] {part}" for idx, part in enumerate(parts))
    # resplit based on self.max_chunk_overlap
    splitter = self.get_text_splitter_given_prompt(prompt, 1, padding=1)
    return splitter.split_text(body)
88
+
89
+
90
def postprocess(y: List[Tuple[str | None, str | None]]) -> List[Tuple[str | None, str | None]]:
    """
    Convert chat history pairs to HTML.

    Parameters:
        y: List of (message, response) tuples, each possibly Markdown.
    Returns:
        List of (message, response) tuples rendered as HTML; entries that
        already carry the converted mark are passed through unchanged.
    """
    if not y:
        return []
    converted = []
    for user, bot in y:
        rendered_user = user if detect_converted_mark(user) else convert_asis(user)
        rendered_bot = bot if detect_converted_mark(bot) else convert_mdtext(bot)
        converted.append((rendered_user, rendered_bot))
    return converted
108
+
109
+
110
# Read the two JS bundles shipped with the app once at import time; they are
# injected into every rendered page by reload_javascript() below.
custom_js_path = os.path.join(ROOT_PATH, "assets/custom.js")
kelpy_codos_path = os.path.join(ROOT_PATH, "assets/Kelpy-Codos.js")

with (
    open(custom_js_path, "r", encoding="utf-8") as f,
    open(kelpy_codos_path, "r", encoding="utf-8") as f2,
):
    customJS = f.read()
    kelpyCodos = f2.read()


def reload_javascript():
    """Monkey-patch gradio's TemplateResponse so every HTML response gets the
    custom <script> tags spliced in just before </html>.

    NOTE(review): relies on the private `gr.routes.templates` attribute —
    newer gradio versions may not expose it; confirm against the pinned version.
    """
    print("Reloading javascript...")
    js = f"<script>{customJS}</script><script>{kelpyCodos}</script>"

    def template_response(*args, **kwargs):
        # Delegate to the original class, then rewrite the body bytes.
        res = GradioTemplateResponseOriginal(*args, **kwargs)
        res.body = res.body.replace(b"</html>", f"{js}</html>".encode("utf8"))
        res.init_headers()
        return res

    gr.routes.templates.TemplateResponse = template_response


# Keep a reference to the unpatched class so template_response can delegate
# to it even after the attribute above is replaced.
GradioTemplateResponseOriginal = gr.routes.templates.TemplateResponse
kimi_vl/serve/gradio_utils.py ADDED
@@ -0,0 +1,93 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ """
2
+ Gradio utils for the Kimi-VL application.
3
+ """
4
+
5
+ import functools
6
+ from typing import Callable
7
+ import traceback
8
+
9
+ import gradio as gr
10
+
11
+
12
+ IMAGE_TOKEN = "<image>"
13
+
14
+
15
def transfer_input(input_text, input_images):
    """Hand the submitted text/images to the pipeline and clear the input widgets."""
    cleared_textbox = gr.update(value="")
    cleared_uploader = gr.update(value=None)
    visible_stop_button = gr.Button(visible=True)
    return (input_text, input_images, cleared_textbox, cleared_uploader, visible_stop_button)
20
+
21
+
22
def delete_last_conversation(chatbot, history):
    """Drop the most recent exchange from both the chatbot view and the raw history.

    Args:
        chatbot: Rendered (user, bot) pairs shown in the UI.
        history: Flat list of raw messages, expected to alternate user/assistant
            (even length).

    Returns:
        (chatbot, history, status message) tuple.
    """
    if len(history) % 2 != 0:
        # Malformed history: surface an error object and bail out unchanged.
        gr.Error("history length is not even")
        return (chatbot, history, "Delete Done")

    if chatbot:
        chatbot.pop()

    if history and len(history) % 2 == 0:
        # Remove the trailing user/assistant pair.
        history.pop()
        history.pop()

    return (chatbot, history, "Delete Done")
50
+
51
+
52
def reset_state():
    """Clear chatbot, history and image state; return a status message."""
    return [], [], None, "Reset Done"


def reset_textbox():
    """Empty the text input widget."""
    return gr.update(value=""), ""


def cancel_outputing():
    """Return the status message shown when streaming output is cancelled."""
    return "Stop Done"
62
+
63
+
64
class State:
    """Mutable flag used to request interruption of an in-flight generation."""

    # Class-level default; toggled via interrupt()/recover().
    interrupted = False

    def interrupt(self):
        """Request that the current generation stop."""
        self.interrupted = True

    def recover(self):
        """Clear the interruption request."""
        self.interrupted = False


# Single module-wide instance shared by the serving loop.
shared_state = State()
75
+
76
+
77
def wrap_gen_fn(gen_fn: Callable):
    """Decorate a generator so unexpected failures surface as gr.Error."""

    @functools.wraps(gen_fn)
    def safe_generator(prompt, *args, **kwargs):
        try:
            yield from gen_fn(prompt, *args, **kwargs)
        except gr.Error:
            # Already a user-facing error: log the traceback and re-raise as-is.
            traceback.print_exc()
            raise
        except Exception as exc:
            traceback.print_exc()
            raise gr.Error(f"Failed to generate text: {exc}") from exc

    return safe_generator
kimi_vl/serve/inference.py ADDED
@@ -0,0 +1,224 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ import logging
2
+ import re
3
+ from threading import Thread
4
+ from typing import List, Optional
5
+
6
+ import torch
7
+ import spaces
8
+ from transformers import (
9
+ AutoModelForCausalLM,
10
+ AutoProcessor,
11
+ AutoConfig,
12
+ StoppingCriteria,
13
+ StoppingCriteriaList,
14
+ TextIteratorStreamer,
15
+ )
16
+
17
+ from .chat_utils import Conversation, get_conv_template
18
+
19
+ logger = logging.getLogger(__name__)
20
+
21
+
22
def load_model(model_path: str = "moonshotai/Kimi-VL-A3B-Thinking"):
    """Load the Kimi-VL model and its processor, forcing flash-attention 2.

    Returns:
        (model, processor) ready for generation.
    """
    # Patch the config before instantiation so every sub-module (vision tower
    # and language model alike) picks up flash_attention_2.
    config = AutoConfig.from_pretrained(model_path, trust_remote_code=True)
    for sub_config in (config, config.vision_config, config.text_config):
        sub_config._attn_implementation = "flash_attention_2"
    print("Successfully set the attn_implementation to flash_attention_2")

    model = AutoModelForCausalLM.from_pretrained(
        model_path,
        config=config,
        torch_dtype="auto",
        device_map="auto",
        trust_remote_code=True,
    )
    processor = AutoProcessor.from_pretrained(model_path, config=config, trust_remote_code=True)
    return model, processor
40
+
41
+
42
class StoppingCriteriaSub(StoppingCriteria):
    """Stop generation once the sequence tail matches any stop-word id sequence.

    Args:
        stops: List of 1-D LongTensors, each one tokenized stop word.
        encounters: Unused; kept for backward compatibility with callers.
    """

    def __init__(self, stops=None, encounters=1):
        super().__init__()
        # Fix: avoid the mutable-default-argument pitfall, and do not assume a
        # CUDA device at construction time — tensors are moved to the input's
        # device lazily in __call__, so CPU-only runs work too.
        self.stops = list(stops) if stops is not None else []

    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs):
        for stop in self.stops:
            # Sequence shorter than the stop word cannot match yet.
            if input_ids.shape[-1] < len(stop):
                continue
            tail = input_ids[0][-len(stop):]
            if torch.all(stop.to(input_ids.device) == tail).item():
                return True

        return False
55
+
56
+
57
def format_messages(
    conversations: list[Conversation],
    system_prompt: Optional[str] = "",
    sft_format: Optional[str] = "kimi-vl",
):
    """
    Fold role/content messages into a conversation template for the model.

    Returns:
        The populated conversation object.
    """
    conversation = get_conv_template(sft_format)
    conversation.set_system_message(system_prompt)
    for message in conversations:
        conversation.append_message(message["role"], message["content"])
    return conversation
70
+
71
+
72
def preprocess(
    messages: list[dict],
    processor,
    sft_format: Optional[str] = "kimi-vl",
):
    """
    Build model inputs (token ids + pixel values) from chat messages.

    Keeps only the last 3 messages and at most the last 2 images per user
    turn; assistant turns have their ◁think▷…◁/think▷ reasoning stripped
    before being fed back as context.
    """
    # Accumulators: `results` is the chat-template message list, `images` the
    # flat image list passed to the processor alongside the text.
    results = []
    images = []

    # Conversation template is used here only for its role names.
    converstion = get_conv_template(sft_format)
    # NOTE(review): this keeps the last 3 *messages*, not 3 full rounds —
    # confirm whether a round (user+assistant pair) limit was intended.
    latest_messages = messages[-3:]
    for mid, message in enumerate(latest_messages):
        if message["role"] == converstion.roles[0] or message["role"] == "user":
            record = {
                "role": message["role"],
                "content": [],
            }
            if "images" in message:
                per_round_images = message["images"]
                # Cap images per turn at the 2 most recent ones.
                if len(per_round_images) > 2:
                    per_round_images = per_round_images[-2:]
                    print(f"Only use the last 2 images in the {mid}-th round")

                images.extend(per_round_images)
                for image in per_round_images:
                    record["content"].append(
                        {
                            "type": "image",
                            "image": image,
                        }
                    )
            if 'content' in message:
                record["content"].append(
                    {
                        "type": "text",
                        "text": str(message["content"]).strip(),
                    }
                )
            results.append(record)
        elif message["role"] == converstion.roles[1] or message["role"] == "assistant":
            formatted_answer = message["content"].strip()
            # A raw reply looks like "◁think▷…internal reasoning…◁/think▷answer".
            # Keep only the text after the closing marker so the model's
            # reasoning is not replayed as conversation context.
            # FIXME: this is a hack to remove the thinking texts
            think_end_token = '◁/think▷'
            formatted_answer = formatted_answer.split(think_end_token)[-1]
            results.append(
                {
                    "role": message["role"],
                    "content": [
                        {
                            "type": "text",
                            "text": formatted_answer,
                        }
                    ],
                }
            )
            # Assistant replies must never contain image placeholder tokens.
            assert (
                formatted_answer.count(processor.image_token) == 0
            ), f"there should be no {processor.image_token} in the assistant's reply, but got {messages}"
            converstion.append_message(converstion.roles[1], formatted_answer)

    text = processor.apply_chat_template(results, add_generation_prompt=True)
    print(f"raw text = {text}")
    # The processor expects None (not an empty list) when there are no images.
    if len(images) == 0:
        images = None

    inputs = processor(
        images=images,
        text=[text],
        return_tensors="pt",
        padding=True,
        truncation=True,
    )
    return inputs
154
+
155
+
156
@torch.no_grad()
@torch.inference_mode()
def kimi_vl_generate(
    model: torch.nn.Module,
    processor: AutoProcessor,
    conversations: list[Conversation],
    stop_words: list,
    max_length: int = 256,
    temperature: float = 1.0,
    top_p: float = 1.0,
    chunk_size: int = -1,
):
    """Stream generated text for the given conversation.

    Thin wrapper: converts `conversations` to model inputs via preprocess()
    and delegates to generate(), which yields decoded text fragments.
    All sampling parameters are forwarded unchanged.
    """
    # convert conversation to inputs
    print(f"conversations = {conversations}")
    inputs = preprocess(conversations, processor=processor)
    inputs = inputs.to(model.device)

    return generate(
        model,
        processor,
        inputs,
        max_gen_len=max_length,
        temperature=temperature,
        top_p=top_p,
        stop_words=stop_words,
        chunk_size=chunk_size,
    )
183
+
184
+
185
def generate(
    model,
    processor,
    inputs,
    max_gen_len: int = 256,
    temperature: float = 0,
    top_p: float = 0.95,
    stop_words: Optional[List[str]] = None,
    chunk_size: int = -1,
):
    """Stream the text output from the multimodality model with prompt and image inputs.

    Args:
        model: Loaded causal LM.
        processor: Processor whose tokenizer encodes stop words and decodes output.
        inputs: Pre-tokenized model inputs (e.g. from preprocess()).
        max_gen_len: Maximum number of new tokens to generate.
        temperature: 0 means greedy decoding; >0 enables sampling.
        top_p: Nucleus-sampling threshold (only used when sampling).
        stop_words: Strings whose token sequences terminate generation.
        chunk_size: Unused; kept for interface compatibility.

    Yields:
        Decoded text fragments as they stream from the model.
    """
    # Fix: the original used a mutable default argument (stop_words=[]).
    stop_words = stop_words if stop_words is not None else []
    tokenizer = processor.tokenizer
    stop_words_ids = [torch.tensor(tokenizer.encode(stop_word)) for stop_word in stop_words]
    stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)

    kwargs = dict(
        **inputs,
        max_new_tokens=max_gen_len,
        use_cache=True,
        streamer=streamer,
        stopping_criteria=stopping_criteria,
    )

    # Sampling mode is decided solely by temperature (the original set a
    # redundant do_sample=True that was always overwritten below).
    if temperature > 0:
        kwargs.update(
            {
                "do_sample": True,
                "top_p": top_p,
                "temperature": temperature,
            }
        )
    else:
        kwargs["do_sample"] = False

    # Run generation on a worker thread so this generator can consume the
    # streamer concurrently.
    thread = Thread(target=model.generate, kwargs=kwargs)
    thread.start()

    yield from streamer
kimi_vl/serve/utils.py ADDED
@@ -0,0 +1,290 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ from __future__ import annotations
2
+
3
+ import html
4
+ import logging
5
+ import io
6
+ import os
7
+ import re
8
+ import base64
9
+ import time
10
+ from PIL import Image, ImageDraw, ImageFont
11
+
12
+ import mdtex2html
13
+ from markdown import markdown
14
+ from pygments import highlight
15
+ from pygments.formatters import HtmlFormatter
16
+ from pygments.lexers import ClassNotFound, get_lexer_by_name, guess_lexer
17
+
18
+
19
# Sentinel appended to HTML that has already been converted, so the
# postprocessing step can skip re-converting it.
ALREADY_CONVERTED_MARK = "<!-- ALREADY CONVERTED BY PARSER. -->"
# RGB colors cycled through when drawing detection boxes (index % 3).
BOX2COLOR = {
    0: (255, 0, 0),
    1: (0, 255, 0),
    2: (0, 0, 255),
}
# Resize bounds (pixels) used by pil_to_base64.
# NOTE(review): MIN_IMAGE_SIZE == MAX_IMAGE_SIZE looks suspicious — confirm
# whether a smaller minimum was intended.
MAX_IMAGE_SIZE = 1024
MIN_IMAGE_SIZE = 1024
# Module logger; handlers are attached by configure_logger().
logger = logging.getLogger("gradio_logger")
28
+
29
+
30
def configure_logger(log_dir: str = "logs"):
    """Configure and return the shared "gradio_logger".

    Creates `log_dir` if needed and attaches one timestamped file handler and
    one console handler, both at INFO level (the logger itself is DEBUG).

    Fix: guard against duplicate handlers — the original attached a fresh pair
    on every call, so repeated calls duplicated every log line.
    """
    logger = logging.getLogger("gradio_logger")
    logger.setLevel(logging.DEBUG)

    if logger.handlers:
        # Already configured (e.g. called twice at startup): reuse as-is.
        return logger

    timestr = time.strftime("%Y%m%d-%H%M%S")
    os.makedirs(log_dir, exist_ok=True)
    file_handler = logging.FileHandler(f"{log_dir}/{timestr}_gradio_log.log")
    console_handler = logging.StreamHandler()

    formatter = logging.Formatter("%(asctime)s - %(name)s - %(levelname)s - %(message)s")
    console_handler.setFormatter(formatter)
    file_handler.setFormatter(formatter)

    console_handler.setLevel(logging.INFO)
    file_handler.setLevel(logging.INFO)

    logger.addHandler(console_handler)
    logger.addHandler(file_handler)

    return logger
50
+
51
+
52
def strip_stop_words(x, stop_words):
    """Truncate `x` at the first stop word it contains (in stop_words order),
    then strip surrounding whitespace."""
    for stop_word in stop_words:
        cut = x.find(stop_word)
        if cut != -1:
            return x[:cut].strip()
    return x.strip()
57
+
58
+
59
def format_output(history, text, x):
    """Append (text, x) to history; return a markdown-rendered copy and the raw history."""
    updated_history = history + [[text, x]]
    rendered = [[user_msg, convert_to_markdown(reply)] for user_msg, reply in updated_history]
    return rendered, updated_history
63
+
64
+
65
def markdown_to_html_with_syntax_highlight(md_str):  # deprecated
    """Render markdown to HTML, pygments-highlighting fenced code blocks first.

    Marked deprecated upstream; convert_mdtext is the live rendering path.
    """

    def replacer(match):
        # The fence header may omit the language; fall back to plain text.
        lang = match.group(1) or "text"
        code = match.group(2)

        try:
            lexer = get_lexer_by_name(lang, stripall=True)
        except ValueError:
            # pygments raises ClassNotFound (a ValueError subclass) for
            # unknown language names.
            lexer = get_lexer_by_name("text", stripall=True)

        formatter = HtmlFormatter()
        highlighted_code = highlight(code, lexer, formatter)

        return f'<pre><code class="{lang}">{highlighted_code}</code></pre>'

    code_block_pattern = r"```(\w+)?\n([\s\S]+?)\n```"
    md_str = re.sub(code_block_pattern, replacer, md_str, flags=re.MULTILINE)

    html_str = markdown(md_str)
    return html_str
85
+
86
+
87
def normalize_markdown(md_text: str) -> str:  # deprecated
    """Insert blank lines around list blocks so markdown renders them correctly,
    and drop blank lines inside a list unless they terminate it."""
    list_item = re.compile(r"^(\d+\.|-|\*|\+)\s")
    src_lines = md_text.split("\n")
    out_lines = []
    in_list = False

    for idx, raw_line in enumerate(src_lines):
        if list_item.match(raw_line.strip()):
            # Entering a list after non-blank text: add a separating blank line.
            if not in_list and idx > 0 and src_lines[idx - 1].strip() != "":
                out_lines.append("")
            in_list = True
            out_lines.append(raw_line)
        elif in_list and raw_line.strip() == "":
            # Blank line inside a list: keep it only when the list ends here.
            if idx < len(src_lines) - 1 and not list_item.match(src_lines[idx + 1].strip()):
                out_lines.append(raw_line)
        else:
            in_list = False
            out_lines.append(raw_line)

    return "\n".join(out_lines)
107
+
108
+
109
def convert_mdtext(md_text):
    """Convert model markdown output to HTML.

    Non-code segments go through mdtex2html (LaTeX-aware) unless they contain
    inline code (backticks would be mangled, so plain markdown is used);
    fenced code blocks are re-wrapped and syntax-highlighted.  The result is
    suffixed with ALREADY_CONVERTED_MARK so it is never converted twice.
    """
    code_block_pattern = re.compile(r"```(.*?)(?:```|$)", re.DOTALL)
    inline_code_pattern = re.compile(r"`(.*?)`", re.DOTALL)
    code_blocks = code_block_pattern.findall(md_text)
    # split(...)[::2] keeps the text between/around the fenced blocks.
    non_code_parts = code_block_pattern.split(md_text)[::2]

    result = []
    # The trailing "" pads code_blocks so the final non-code part is processed.
    for non_code, code in zip(non_code_parts, code_blocks + [""]):
        if non_code.strip():
            non_code = normalize_markdown(non_code)
            if inline_code_pattern.search(non_code):
                result.append(markdown(non_code, extensions=["tables"]))
            else:
                result.append(mdtex2html.convert(non_code, extensions=["tables"]))
        if code.strip():
            # Re-fence the captured block so the highlighter sees a full block.
            code = f"\n```{code}\n\n```"
            code = markdown_to_html_with_syntax_highlight(code)
            result.append(code)
    result = "".join(result)
    result += ALREADY_CONVERTED_MARK
    return result
130
+
131
+
132
def convert_asis(userinput):
    """Wrap raw user input in an escaped, pre-wrap paragraph and tag it as converted."""
    escaped = html.escape(userinput)
    return f'<p style="white-space:pre-wrap;">{escaped}</p>{ALREADY_CONVERTED_MARK}'
134
+
135
+
136
def is_stop_word_or_prefix(s: str, stop_words: list) -> bool:
    """Return True if `s` ends with a stop word OR with a proper prefix of one.

    The prefix check lets a streaming caller hold back text that might be the
    beginning of a stop word still being generated.  Fix: the original only
    matched complete stop words, contradicting the function's name.
    """
    for stop_word in stop_words:
        if s.endswith(stop_word):
            return True
        for i in range(1, len(stop_word)):
            if s.endswith(stop_word[:i]):
                return True
    return False
138
+
139
+
140
def detect_converted_mark(userinput):
    """Return True if `userinput` already carries the ALREADY_CONVERTED_MARK sentinel.

    Fix: tolerate None — postprocess() is typed to allow None message slots,
    and the original raised AttributeError on them.
    """
    return bool(userinput) and userinput.endswith(ALREADY_CONVERTED_MARK)
142
+
143
+
144
def detect_language(code):
    """Split a fenced code-block body into (language tag, remaining code).

    The first line is treated as the language tag unless the block starts
    with a newline (i.e. no tag was given).
    """
    if code.startswith("\n"):
        first_line = ""
    else:
        first_line = code.strip().split("\n", 1)[0]
    if first_line:
        return first_line.lower(), code[len(first_line):].lstrip()
    return "", code
149
+
150
+
151
def convert_to_markdown(text):
    """Escape chat text for markdown rendering while leaving fenced code intact.

    Outside code fences: `$` is HTML-escaped (prevents accidental MathJax),
    leading tabs/spaces become entities, a leading `#` is backslash-escaped,
    and each line gets trailing whitespace before the newline.
    NOTE(review): the exact trailing whitespace in the f-string below may have
    been two spaces (markdown hard break) before this copy — confirm upstream.
    """
    text = text.replace("$", "&#36;")
    text = text.replace("\r\n", "\n")

    def replace_leading_tabs_and_spaces(line):
        # Convert only the leading run of tabs/spaces; stop at first other char.
        new_line = []

        for char in line:
            if char == "\t":
                new_line.append("&#9;")
            elif char == " ":
                new_line.append("&nbsp;")
            else:
                break
        return "".join(new_line) + line[len(new_line):]

    markdown_text = ""
    lines = text.split("\n")
    in_code_block = False

    for line in lines:
        if in_code_block is False and line.startswith("```"):
            in_code_block = True
            markdown_text += f"{line}\n"
        elif in_code_block is True and line.startswith("```"):
            in_code_block = False
            markdown_text += f"{line}\n"
        elif in_code_block:
            # Inside a fence: pass through verbatim.
            markdown_text += f"{line}\n"
        else:
            line = replace_leading_tabs_and_spaces(line)
            # Escape a leading '#' so chat text cannot become a heading.
            line = re.sub(r"^(#)", r"\\\1", line)
            markdown_text += f"{line} \n"

    return markdown_text
186
+
187
+
188
def add_language_tag(text):
    """Annotate untagged fenced code blocks with a language guessed by pygments."""

    def guess_language(code_block):
        # guess_lexer raises ClassNotFound when it cannot identify the code.
        try:
            return guess_lexer(code_block).name.lower()
        except ClassNotFound:
            return ""

    fence_pattern = re.compile(r"(```)(\w*\n[^`]+```)", re.MULTILINE)

    def tag_block(match):
        block_body = match.group(2)
        if block_body.startswith("\n"):
            # No language tag present: try to guess one.
            language = guess_language(block_body)
            return f"```{language}{block_body}```" if language else f"```\n{block_body}```"
        # Already tagged: re-emit with a closing fence appended.
        return match.group(1) + block_body + "```"

    return fence_pattern.sub(tag_block, text)
208
+
209
+
210
def is_variable_assigned(var_name: str) -> bool:
    """Return True if `var_name` is bound as a local variable in the caller's frame.

    Fix: the original tested `var_name in locals()` inside this function, whose
    only local is the parameter itself — it could never observe the caller's
    variables.  Inspect the calling frame instead.
    """
    import inspect

    caller = inspect.currentframe().f_back
    try:
        return var_name in caller.f_locals
    finally:
        # Break the reference cycle that holding a frame object creates.
        del caller
212
+
213
+
214
def pil_to_base64(
    image: Image.Image,
    alt: str = "user upload image",
    resize: bool = True,
    max_size: int = MAX_IMAGE_SIZE,
    min_size: int = MIN_IMAGE_SIZE,
    format: str = "JPEG",
    quality: int = 95,
) -> str:
    """
    Encode a PIL image as an inline <img> tag carrying a base64 data URI.

    Args:
        image: Source image.
        alt: Alt text for the generated tag.
        resize: If True, downscale so the short edge obeys min_size/max_size
            while preserving aspect ratio.
        format: Encoding passed to PIL (e.g. "JPEG", "PNG").
        quality: Encoder quality (meaningful for JPEG).

    Returns:
        A complete `<img>` HTML tag.
    """

    if resize:
        max_hw, min_hw = max(image.size), min(image.size)
        aspect_ratio = max_hw / min_hw
        shortest_edge = int(min(max_size / aspect_ratio, min_size, min_hw))
        longest_edge = int(shortest_edge * aspect_ratio)
        W, H = image.size
        if H > W:
            H, W = longest_edge, shortest_edge
        else:
            H, W = shortest_edge, longest_edge
        image = image.resize((W, H))

    buffered = io.BytesIO()
    image.save(buffered, format=format, quality=quality)
    img_b64_str = base64.b64encode(buffered.getvalue()).decode()
    # Fix: the MIME type must match the actual encoding — the original
    # hard-coded image/png while encoding JPEG by default.
    mime = f"image/{format.lower()}"
    img_str = f'<img src="data:{mime};base64,{img_b64_str}" alt="{alt}" />'

    return img_str
245
+
246
+
247
def parse_ref_bbox(response, image: Image.Image):
    """Draw the <|ref|>/<|det|> grounding annotations from `response` onto a copy of `image`.

    Returns:
        The annotated image, or None when no boxes are present or parsing fails
        (failures are logged, not raised).
    """
    try:
        import ast

        image = image.copy()
        # PIL's Image.size is (width, height); the original unpacked these into
        # swapped names (the math still came out right by a double inversion).
        image_w, image_h = image.size
        draw = ImageDraw.Draw(image)

        refs = re.findall(r'<\|ref\|>.*?<\|/ref\|>', response)
        dets = re.findall(r'<\|det\|>.*?<\|/det\|>', response)
        assert len(refs) == len(dets)

        if len(refs) == 0:
            return None

        boxes, labels = [], []
        for det, ref in zip(dets, refs):
            det = det.replace('<|det|>', '').replace('<|/det|>', '')
            label = ref.replace('<|ref|>', '').replace('<|/ref|>', '')
            det = det[1:-1]  # strip the outer brackets of the box list
            for onebox in re.findall(r'\[.*?\]', det):
                # Fix: literal_eval instead of eval — this text comes from
                # model output and must never be executed as code.
                boxes.append(ast.literal_eval(onebox))
                labels.append(label)

        # Fix: `font` was previously undefined (the truetype call was commented
        # out), so every call raised NameError and returned None via except.
        font = ImageFont.load_default()

        for indice, (box, label) in enumerate(zip(boxes, labels)):
            # Coordinates are normalized to [0, 999]: x scales by width, y by height.
            box = (
                int(box[0] / 999 * image_w),
                int(box[1] / 999 * image_h),
                int(box[2] / 999 * image_w),
                int(box[3] / 999 * image_h),
            )

            box_color = BOX2COLOR[indice % len(BOX2COLOR.keys())]
            box_width = 3
            draw.rectangle(box, outline=box_color, width=box_width)

            text_x = box[0]
            text_y = box[1] - 20
            text_color = box_color
            draw.text((text_x, text_y), label, font=font, fill=text_color)

        return image
    except Exception as e:
        logger.error(f"Error parsing reference bounding boxes: {e}")
        return None