duzx16
committed on
Commit
·
d493e51
1
Parent(s):
dba7772
Update apply_chat_template
Browse files - tokenization_chatglm.py +6 -6
tokenization_chatglm.py
CHANGED
|
@@ -63,22 +63,22 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
|
|
| 63 |
vocab.update(self.added_tokens_encoder)
|
| 64 |
return vocab
|
| 65 |
|
| 66 |
-
def convert_tokens_to_string(self, tokens: List[Union[bytes, str
|
| 67 |
"""
|
| 68 |
Converts a sequence of tokens in a single string.
|
| 69 |
"""
|
| 70 |
text = ""
|
| 71 |
temp = b""
|
| 72 |
for t in tokens:
|
| 73 |
-
if isinstance(t, int):
|
| 74 |
-
t = chr(t)
|
| 75 |
if isinstance(t, str):
|
| 76 |
if temp:
|
| 77 |
text += temp.decode("utf-8", errors="replace")
|
|
|
|
|
|
|
| 78 |
elif isinstance(t, bytes):
|
| 79 |
temp += t
|
| 80 |
else:
|
| 81 |
-
raise TypeError("token should only be of type
|
| 82 |
if temp:
|
| 83 |
text += temp.decode("utf-8", errors="replace")
|
| 84 |
return text
|
|
@@ -168,7 +168,8 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
|
|
| 168 |
for item in conversation:
|
| 169 |
if item.get("tools"):
|
| 170 |
tools = item["tools"]
|
| 171 |
-
content = "你是一个名为
|
|
|
|
| 172 |
for tool in tools:
|
| 173 |
if tool["type"] == "function":
|
| 174 |
function = tool["function"]
|
|
@@ -203,7 +204,6 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
|
|
| 203 |
input_ids.extend([self.convert_tokens_to_ids("<|assistant|>")])
|
| 204 |
else:
|
| 205 |
input_message += "<|assistant|>"
|
| 206 |
-
|
| 207 |
return input_ids if tokenize else input_message
|
| 208 |
|
| 209 |
# Main logic to handle different conversation formats
|
|
|
|
| 63 |
vocab.update(self.added_tokens_encoder)
|
| 64 |
return vocab
|
| 65 |
|
| 66 |
+
def convert_tokens_to_string(self, tokens: List[Union[bytes, str]]) -> str:
|
| 67 |
"""
|
| 68 |
Converts a sequence of tokens in a single string.
|
| 69 |
"""
|
| 70 |
text = ""
|
| 71 |
temp = b""
|
| 72 |
for t in tokens:
|
|
|
|
|
|
|
| 73 |
if isinstance(t, str):
|
| 74 |
if temp:
|
| 75 |
text += temp.decode("utf-8", errors="replace")
|
| 76 |
+
temp = b""
|
| 77 |
+
text += t
|
| 78 |
elif isinstance(t, bytes):
|
| 79 |
temp += t
|
| 80 |
else:
|
| 81 |
+
raise TypeError("token should only be of type types or str")
|
| 82 |
if temp:
|
| 83 |
text += temp.decode("utf-8", errors="replace")
|
| 84 |
return text
|
|
|
|
| 168 |
for item in conversation:
|
| 169 |
if item.get("tools"):
|
| 170 |
tools = item["tools"]
|
| 171 |
+
content = "你是一个名为 GhatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。"
|
| 172 |
+
content += "\n\n# 可用工具"
|
| 173 |
for tool in tools:
|
| 174 |
if tool["type"] == "function":
|
| 175 |
function = tool["function"]
|
|
|
|
| 204 |
input_ids.extend([self.convert_tokens_to_ids("<|assistant|>")])
|
| 205 |
else:
|
| 206 |
input_message += "<|assistant|>"
|
|
|
|
| 207 |
return input_ids if tokenize else input_message
|
| 208 |
|
| 209 |
# Main logic to handle different conversation formats
|