zR committed on
Commit
4f82091
1 Parent(s): aae8bd7

fix padding

Browse files
Files changed (1) hide show
  1. tokenization_chatglm.py +4 -103
tokenization_chatglm.py CHANGED
@@ -1,12 +1,10 @@
1
  import regex as re
2
  import base64
3
  import os
4
- import json
5
  import tiktoken
6
- from torch import TensorType
7
- from typing import List, Optional, Union, Dict, Any
8
  from transformers import PreTrainedTokenizer
9
- from transformers.utils import logging, PaddingStrategy
10
  from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
11
 
12
 
@@ -17,16 +15,13 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
17
  def __init__(
18
  self,
19
  vocab_file,
20
- padding_side="left",
21
  clean_up_tokenization_spaces=False,
22
- encode_special_tokens=False,
23
  **kwargs
24
  ):
25
  self.name = "GLM4Tokenizer"
26
  self.vocab_file = vocab_file
27
  pat_str = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
28
  self.pat_str = re.compile(pat_str)
29
- self.encode_special_tokens = encode_special_tokens
30
 
31
  mergeable_ranks = {}
32
  with open(vocab_file) as f:
@@ -48,7 +43,6 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
48
  self.n_words = len(self.decoder)
49
 
50
  super().__init__(
51
- padding_side=padding_side,
52
  clean_up_tokenization_spaces=clean_up_tokenization_spaces,
53
  **kwargs
54
  )
@@ -141,99 +135,6 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
141
  else:
142
  return str(f"<|{role}|>{metadata}\n{message}")
143
 
144
- # Use Jinja Template in tokenizer_config.json
145
- # def apply_chat_template(
146
- # self,
147
- # conversation: Union[List[Dict[str, str]], List[List[Dict[str, str]]], "Conversation"],
148
- # add_generation_prompt: bool = False,
149
- # tokenize: bool = True,
150
- # padding: bool = False,
151
- # truncation: bool = False,
152
- # max_length: Optional[int] = None,
153
- # return_tensors: Optional[Union[str, TensorType]] = None,
154
- # return_dict: bool = False,
155
- # tokenizer_kwargs: Optional[Dict[str, Any]] = None,
156
- # add_special_tokens: bool = True,
157
- # **kwargs,
158
- # ) -> Union[str, List[int], List[str], List[List[int]], BatchEncoding]:
159
- #
160
- # if return_dict and not tokenize:
161
- # raise ValueError(
162
- # "`return_dict=True` is incompatible with `tokenize=False`, because there is no dict "
163
- # "of tokenizer outputs to return."
164
- # )
165
- #
166
- # def handle_single_conversation(conversation):
167
- # input_ids = self.get_prefix_tokens() if add_special_tokens else []
168
- # input_message = "[gMASK]<sop>" if add_special_tokens else ""
169
- # for item in conversation:
170
- # if item.get("tools"):
171
- # tools = item["tools"]
172
- # content = "你是一个名为 GhatGLM 的人工智能助手。你是基于智谱AI训练的语言模型 GLM-4 模型开发的,你的任务是针对用户的问题和要求提供适当的答复和支持。"
173
- # content += "\n\n# 可用工具"
174
- # for tool in tools:
175
- # if tool["type"] == "function":
176
- # function = tool["function"]
177
- # content += f"\n\n## {function['name']}\n\n{json.dumps(function, ensure_ascii=False, indent=4)}"
178
- # content += "\n在调用上述函数时,请使用 Json 格式表示调用的参数。"
179
- # elif tool["type"] == "python":
180
- # content += "\n\n## python\n\n当你向 `python` 发送包含 Python 代码的消息时,该代码将会在一个有状态的 Jupyter notebook 环境中执行。\n`python` 返回代码执行的输出,或在执行 60 秒后返回超时。\n`/mnt/data` 将会持久化存储你的文件。在此会话中,`python` 无法访问互联网。不要使用 `python` 进行任何网络请求或者在线 API 调用,这些在线内容的访问将不会成功。"
181
- # elif tool["type"] == "simple_browser":
182
- # content += "\n\n## simple_browser\n\n你可以使用 `simple_browser` 工具。该工具支持以下函数:\n`search(query: str, recency_days: int)`:使用搜索引擎进行查询并显示结果,可以使用 `recency_days` 参数控制搜索内容的时效性。\n`mclick(ids: list[int])`:获取一系列指定 id 的页面内容。每次调用时,须选择3-10个页面。选择多个角度的页面,同时尽可能选择可信任的信息来源。考虑到部分页面是无法加载的,你也可以多打开一些可能有用的页面而不用担心内容过多。\n`open_url(url: str)`:打开指定的 URL。\n\n使用 `【{引用 id}†{引用文本}】` 来引用内容。\n\n操作步骤:1. 使用 `search` 来获得信息列表; 2. 使用 `mclick` 来获取指定 ID 页面的内容; 3. 根据获得的内容进行回复。在回复中应当引用信息来源。\n 如果用户提供了 URL,也可以用 `open_url` 直接打开页面。\n如果初次搜索结果没有找到合适的信息,也可以再次使用 `search` 进行搜索。"
183
- # elif tool["type"] == "cogview":
184
- # content += "\n\n## cogview\n\n如果用户的请求中包含了对图像的描述,你可以使用 `cogview` 来生成图像并展示给用户。你需要向 `cogview` 发送图像描述,规则:\n- 发送给 `cogview` 的消息必须使用英语。用户的中文描述必须完全翻译为英语。\n- 应当尽可能详细地描述图像生成的需求,需求描述约 100 英文单词。\n- 保持用户原始描述的意图。不要虚构内容或者没见过的人物。\n- 如无特殊说明,所在地为中国,持有中国立场并遵循中国社会主义价值观。"
185
- # else:
186
- # raise NotImplementedError(f"Unknown tool type {tool['type']}")
187
- # input = self.build_single_message("system", "", content, tokenize=tokenize)
188
- # if tokenize:
189
- # input_ids.extend(input)
190
- # else:
191
- # input_message += input
192
- # if item["content"]:
193
- # input = self.build_single_message(
194
- # item["role"],
195
- # item.get("metadata", ""),
196
- # item["content"],
197
- # tokenize=tokenize
198
- # )
199
- # if tokenize:
200
- # input_ids.extend(input)
201
- # else:
202
- # input_message += input
203
- # if add_generation_prompt:
204
- # if tokenize:
205
- # input_ids.extend([self.convert_tokens_to_ids("<|assistant|>")])
206
- # else:
207
- # input_message += "<|assistant|>"
208
- # return input_ids if tokenize else input_message
209
- #
210
- # # Main logic to handle different conversation formats
211
- # if isinstance(conversation, list) and all(isinstance(i, dict) for i in conversation):
212
- # result = handle_single_conversation(conversation)
213
- # elif isinstance(conversation, list) and all(isinstance(i, list) for i in conversation):
214
- # result = [handle_single_conversation(c) for c in conversation]
215
- # elif hasattr(conversation, "messages"):
216
- # result = handle_single_conversation(conversation.messages)
217
- # else:
218
- # raise ValueError("Invalid conversation format")
219
- #
220
- # if tokenize:
221
- # output = self.batch_encode_plus(
222
- # [result] if isinstance(result[0], int) else result,
223
- # padding=padding,
224
- # truncation=truncation,
225
- # max_length=max_length,
226
- # return_tensors=return_tensors,
227
- # is_split_into_words=True,
228
- # add_special_tokens=False
229
- # )
230
- # if return_dict:
231
- # return output
232
- # else:
233
- # return output["input_ids"]
234
- # else:
235
- # return result
236
-
237
  def build_inputs_with_special_tokens(
238
  self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
239
  ) -> List[int]:
@@ -263,6 +164,7 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
263
  self,
264
  encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
265
  max_length: Optional[int] = None,
 
266
  padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
267
  pad_to_multiple_of: Optional[int] = None,
268
  return_attention_mask: Optional[bool] = None,
@@ -291,7 +193,6 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
291
  (optional) Set to False to avoid returning attention mask (default: set to model specifics)
292
  """
293
  # Load from model defaults
294
- assert self.padding_side == "left"
295
 
296
  required_input = encoded_inputs[self.model_input_names[0]]
297
  seq_length = len(required_input)
@@ -320,4 +221,4 @@ class ChatGLM4Tokenizer(PreTrainedTokenizer):
320
  encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
321
  encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
322
 
323
- return encoded_inputs
 
1
  import regex as re
2
  import base64
3
  import os
 
4
  import tiktoken
5
+ from typing import List, Optional, Union, Dict
 
6
  from transformers import PreTrainedTokenizer
7
+ from transformers.utils import PaddingStrategy
8
  from transformers.tokenization_utils_base import EncodedInput, BatchEncoding
9
 
10
 
 
15
  def __init__(
16
  self,
17
  vocab_file,
 
18
  clean_up_tokenization_spaces=False,
 
19
  **kwargs
20
  ):
21
  self.name = "GLM4Tokenizer"
22
  self.vocab_file = vocab_file
23
  pat_str = "(?i:'s|'t|'re|'ve|'m|'ll|'d)|[^\\r\\n\\p{L}\\p{N}]?\\p{L}+|\\p{N}{1,3}| ?[^\\s\\p{L}\\p{N}]+[\\r\\n]*|\\s*[\\r\\n]+|\\s+(?!\\S)|\\s+"
24
  self.pat_str = re.compile(pat_str)
 
25
 
26
  mergeable_ranks = {}
27
  with open(vocab_file) as f:
 
43
  self.n_words = len(self.decoder)
44
 
45
  super().__init__(
 
46
  clean_up_tokenization_spaces=clean_up_tokenization_spaces,
47
  **kwargs
48
  )
 
135
  else:
136
  return str(f"<|{role}|>{metadata}\n{message}")
137
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
  def build_inputs_with_special_tokens(
139
  self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
140
  ) -> List[int]:
 
164
  self,
165
  encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
166
  max_length: Optional[int] = None,
167
+ padding_side: str = "left",
168
  padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
169
  pad_to_multiple_of: Optional[int] = None,
170
  return_attention_mask: Optional[bool] = None,
 
193
  (optional) Set to False to avoid returning attention mask (default: set to model specifics)
194
  """
195
  # Load from model defaults
 
196
 
197
  required_input = encoded_inputs[self.model_input_names[0]]
198
  seq_length = len(required_input)
 
221
  encoded_inputs["position_ids"] = [0] * difference + encoded_inputs["position_ids"]
222
  encoded_inputs[self.model_input_names[0]] = [self.pad_token_id] * difference + required_input
223
 
224
+ return encoded_inputs