Spaces: Running on L40S
miaoyibo committed
Commit · 8cf3ee6
1 Parent(s): 4079598
- app.py +27 -2
- kimi_vl/serve/chat_utils.py +1 -1
- kimi_vl/serve/inference.py +2 -84
app.py
CHANGED
@@ -1,10 +1,10 @@
 import argparse
 import gradio as gr
 import os
-os.environ["HF_HOME"] = "/mnt/moonfs/miaoyibo-ksyun/hf_home"
 from PIL import Image
 import spaces
 import copy
+import time
 
 from kimi_vl.serve.frontend import reload_javascript
 from kimi_vl.serve.utils import (
@@ -137,7 +137,32 @@ def predict(
         yield [[text, "No Model Found"]], [], "No Model Found"
         return
 
-
+
+    prompt = "Give me a short introduction to large language model."
+    messages = [
+        {"role": "system", "content": "You are a helpful assistant."},
+        {"role": "user", "content": prompt}
+    ]
+    text = tokenizer.apply_chat_template(
+        messages,
+        tokenize=False,
+        add_generation_prompt=True
+    )
+    model_inputs = tokenizer([text], return_tensors="pt").to(model.device)
+
+    generated_ids = model.generate(
+        **model_inputs,
+        max_new_tokens=512
+    )
+    generated_ids = [
+        output_ids[len(input_ids):] for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
+    ]
+
+    response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
+
+    print(response)
+    time.sleep(2600)
+
 
     if images is None:
         images = []
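
Note: for readers who want to try the added block outside this Space, the sketch below reproduces the same flow (render the chat template, generate, then decode only the newly generated tokens) as a self-contained script. The model id is a hypothetical stand-in, and the trailing time.sleep(2600) is omitted; in app.py the model and tokenizer come from the Space's own loading code.

# Self-contained sketch of the generation flow added to predict() above.
# Assumption: "Qwen/Qwen2-0.5B-Instruct" is only a stand-in model id, not the
# model this Space actually loads.
from transformers import AutoModelForCausalLM, AutoTokenizer

model_id = "Qwen/Qwen2-0.5B-Instruct"
tokenizer = AutoTokenizer.from_pretrained(model_id)
model = AutoModelForCausalLM.from_pretrained(model_id)

prompt = "Give me a short introduction to large language model."
messages = [
    {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt},
]

# Render the chat messages to a plain-text prompt, keeping the generation suffix.
text = tokenizer.apply_chat_template(messages, tokenize=False, add_generation_prompt=True)
model_inputs = tokenizer([text], return_tensors="pt").to(model.device)

# Generate, then slice off the prompt tokens so only new tokens are decoded.
generated_ids = model.generate(**model_inputs, max_new_tokens=512)
generated_ids = [
    output_ids[len(input_ids):]
    for input_ids, output_ids in zip(model_inputs.input_ids, generated_ids)
]
response = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)[0]
print(response)
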
kimi_vl/serve/chat_utils.py
CHANGED
@@ -267,7 +267,7 @@ def generate_prompt_with_history(text, images, history, processor, max_length=20
     bot_role_ind = 1
 
     # Initialize conversation
-    conversation = new_chat_template(sft_format="
+    conversation = new_chat_template(sft_format="plain")
 
     if history:
         conversation.messages = history
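
Note: the single change above switches the conversation template to the "plain" format. A rough usage sketch follows, assuming only what other files in this commit rely on (a conversation object exposing .messages, .roles, and .append_message); the conversation class internals are not part of this diff.

# Illustrative only; the conversation object's behaviour is assumed, not shown in this diff.
from kimi_vl.serve.chat_utils import new_chat_template

conversation = new_chat_template(sft_format="plain")  # format selected by this commit
history = []                                          # prior turns, if any
if history:
    conversation.messages = history
conversation.append_message(conversation.roles[0], "Hello!")  # roles[0] is the user role elsewhere in this repo
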
kimi_vl/serve/inference.py
CHANGED
@@ -71,88 +71,6 @@ def format_messages(
     return converstion
 
 
-def preprocess(
-    messages: list[dict],
-    processor,
-    sft_format: Optional[str] = "kimi-vl",
-):
-    """
-    Build messages from the conversations and images.
-    """
-    # get images from conversations
-    results = []
-    images = []
-
-    # get texts from conversations
-    converstion = get_conv_template(sft_format)
-    # only use the last 3 round of messages
-    latest_messages = messages[-3:]
-    for mid, message in enumerate(latest_messages):
-        if message["role"] == converstion.roles[0] or message["role"] == "user":
-            record = {
-                "role": message["role"],
-                "content": [],
-            }
-            if "images" in message:
-                per_round_images = message["images"]
-                if len(per_round_images) > 2:
-                    per_round_images = per_round_images[-2:]
-                    print(f"Only use the last 2 images in the {mid}-th round")
-
-                images.extend(per_round_images)
-                for image in per_round_images:
-                    record["content"].append(
-                        {
-                            "type": "image",
-                            "image": image,
-                        }
-                    )
-            if 'content' in message:
-                record["content"].append(
-                    {
-                        "type": "text",
-                        "text": str(message["content"]).strip(),
-                    }
-                )
-            results.append(record)
-        elif message["role"] == converstion.roles[1] or message["role"] == "assistant":
-            formatted_answer = message["content"].strip()
-            # ◁think▷The user said "你好" ("hello"), a very simple greeting usually used to open a conversation. I need to judge the user's intent. Possibility one: the user is just greeting politely and wants to start a conversation; possibility two: the user may have a more specific need, such as asking about my capabilities or needing help. Since the user has given no further information, I should stay open while guiding them to explain what they need.
-            # My reply should be friendly and open, not overly formal or cold. I should avoid assuming the user's specific needs and instead give a relaxed response that encourages the conversation to continue.◁/think▷Hello! Nice to meet you. Is there anything I can help you with?
-            # delete all the texts between ◁think▷ and ◁/think▷
-            # FIXME: this is a hack to remove the thinking texts
-            # formatted_answer = re.sub(r"◁think▷.*◁/think▷", "", formatted_answer)
-            think_end_token = '◁/think▷'
-            formatted_answer = formatted_answer.split(think_end_token)[-1]
-            results.append(
-                {
-                    "role": message["role"],
-                    "content": [
-                        {
-                            "type": "text",
-                            "text": formatted_answer,
-                        }
-                    ],
-                }
-            )
-            assert (
-                formatted_answer.count(processor.image_token) == 0
-            ), f"there should be no {processor.image_token} in the assistant's reply, but got {messages}"
-            converstion.append_message(converstion.roles[1], formatted_answer)
-
-    text = processor.apply_chat_template(results, add_generation_prompt=True)
-    print(f"raw text = {text}")
-    if len(images) == 0:
-        images = None
-
-    inputs = processor(
-        images=images,
-        text=[text],
-        return_tensors="pt",
-        padding=True,
-        truncation=True,
-    )
-    return inputs
 
 
 @torch.no_grad()
@@ -176,6 +94,7 @@ def kimi_dev_generate(
 
     return generate(
         model,
+        tokenizer,
         inputs,
         max_gen_len=max_length,
         temperature=temperature,
@@ -187,7 +106,7 @@ def kimi_dev_generate(
 
 def generate(
     model,
-
+    tokenizer,
     inputs,
     max_gen_len: int = 256,
     temperature: float = 0,
@@ -196,7 +115,6 @@ def generate(
     chunk_size: int = -1,
 ):
     """Stream the text output from the multimodality model with prompt and image inputs."""
-    tokenizer = processor.tokenizer
     stop_words_ids = [torch.tensor(tokenizer.encode(stop_word)) for stop_word in stop_words]
     stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
     streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)
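
Note: because generate() now receives the tokenizer explicitly instead of pulling it from a processor, the streaming path can be driven with just a language model and a tokenizer. The sketch below is a hedged reconstruction of that pattern using the transformers primitives visible in the hunk (TextIteratorStreamer, StoppingCriteriaList); the StoppingCriteriaSub class here is a plausible stand-in for the repo's own class, not a copy of it.

# Hedged sketch of the streaming generation pattern used by generate() above.
from threading import Thread

import torch
from transformers import StoppingCriteria, StoppingCriteriaList, TextIteratorStreamer


class StoppingCriteriaSub(StoppingCriteria):
    """Stop when the generated suffix matches any of the stop-word id sequences."""

    def __init__(self, stops):
        super().__init__()
        self.stops = stops

    def __call__(self, input_ids, scores, **kwargs):
        for stop in self.stops:
            stop = stop.to(input_ids.device)
            if input_ids.shape[1] >= stop.shape[0] and torch.equal(input_ids[0, -stop.shape[0]:], stop):
                return True
        return False


def stream_generate(model, tokenizer, inputs, stop_words, max_gen_len=256):
    # Build stopping criteria from the tokenizer that is now passed in explicitly.
    stop_words_ids = [torch.tensor(tokenizer.encode(w)) for w in stop_words]
    stopping_criteria = StoppingCriteriaList([StoppingCriteriaSub(stops=stop_words_ids)])
    streamer = TextIteratorStreamer(tokenizer, skip_prompt=True)

    # Run generation in a background thread; the streamer yields decoded text chunks.
    kwargs = dict(**inputs, max_new_tokens=max_gen_len, streamer=streamer, stopping_criteria=stopping_criteria)
    Thread(target=model.generate, kwargs=kwargs).start()
    for chunk in streamer:
        yield chunk

In the repo, kimi_dev_generate() forwards its tokenizer into this call, which is exactly what the two "+ tokenizer," lines above add.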