# bygpt-jp-multi-lm-head-6.5B-alpha / tokenization_utf8_like_byte_v3.py
# Copyright 2021 T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization class for model ByT5."""
import warnings
from typing import (
Dict,
List,
Optional,
Union,
Tuple
)
import json
import os
import copy
import ast
import torch
import numpy as np
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.tokenization_utils_base import (
BatchEncoding,
EncodedInput,
PaddingStrategy,
TruncationStrategy
)
from transformers.utils import logging
logger = logging.get_logger(__name__)
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
LARGE_INTEGER = int(1e20)
def make_serializeable(obj):
if isinstance(obj, dict):
return {str(k): make_serializeable(v) for k, v in obj.items()}
if isinstance(obj, list):
return [make_serializeable(v) for v in obj]
if isinstance(obj, tuple):
return make_serializeable(list(obj))
return obj
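# JSON cannot serialize tuple keys (the byte-patch ids used below), so they are
# stringified first. Illustrative example, assuming the default special-token layout:
#   make_serializeable({(192, 128, 64, 0): "<|pad|>"}) -> {"(192, 128, 64, 0)": "<|pad|>"}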
class ByteLMTokenizerV3(PreTrainedTokenizer):
"""Byte tokenizer with completely seperate space for special tokens.
tok.pad Parameters
----------
PreTrainedTokenizer : _type_
_description_
Returns
-------
_type_
_description_
Raises
------
ValueError
_description_
ValueError
_description_
"""
model_input_names: list[str] = ["input_ids", "attention_mask"]
reserve_sizes: list[int] = [59, 0, 0, 0]
byte_head_ints: list[int] = [
int("11000000", base=2),
int("10000000", base=2),
int("01000000", base=2),
int("00000000", base=2),
]
byte_n_free_bits: list[int] = [6, 6, 6, 6]
patch_padding: bool
reserve_token_list: list[tuple[int, ...]]
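# Layout sketch (default configuration): each character is a patch of 4 ids whose
# heads are 0b11, 0b10, 0b01, 0b00 in the top two bits, each with 6 payload bits.
# The first byte reserves its lowest 59 payload values for special tokens, so text
# bytes at position 0 run from 192 + 59 = 251 to 255. Illustrative encoding of "a"
# (U+0061 = 97 = 1 * 64 + 33), assuming this default layout:
#   position 3: 97 % 64 = 33            -> id 33  (head 0b00000000)
#   position 2: (97 // 64) % 64 = 1     -> id 65  (head 0b01000000)
#   positions 1, 0: exhausted           -> padding ids 128 and 251
#   => "a" is the patch [251, 128, 65, 33]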
def __init__(
self,
patch_padding=True,
pad_token="<|pad|>",
eos_token="<|end_of_text|>",
bos_token="<|begin_of_text|>",
cls_token="<|cls|>",
sep_token="<|sep|>",
mask_token="<|mask|>",
vision_start_token="<|vision_start|>", # for vlm
vision_br_token="<|vision_br|>", # for vlm
vision_end_token="<|vision_end|>", # for vlm
start_header_id_token="<|start_header_id|>", # for it
end_header_id_token="<|end_header_id|>", # for it
eor_id="<|end_of_role|>", # for it
extra_ids=47,
**kwargs,
) -> None:
assert np.prod(
[
2**n_free_bits - reserve_size
for reserve_size, n_free_bits in zip(
self.reserve_sizes, self.byte_n_free_bits
)
]
) >= int(
"110000", base=16
), "Not enough positions for all unicode. Too many reserve size."
self.patch_padding = patch_padding
# list up all reserve tokens
self._list_up_reserve_tokens()
_bos_token = (
AddedToken(bos_token, lstrip=False, rstrip=False)
if isinstance(bos_token, str)
else bos_token
)
_eos_token = (
AddedToken(eos_token, lstrip=False, rstrip=False)
if isinstance(eos_token, str)
else eos_token
)
_pad_token = (
AddedToken(pad_token, lstrip=False, rstrip=False)
if isinstance(pad_token, str)
else pad_token
)
_cls_token = (
AddedToken(cls_token, lstrip=False, rstrip=False)
if isinstance(cls_token, str)
else cls_token
)
_sep_token = (
AddedToken(sep_token, lstrip=False, rstrip=False)
if isinstance(sep_token, str)
else sep_token
)
_mask_token = (
AddedToken(mask_token, lstrip=False, rstrip=False)
if isinstance(mask_token, str)
else mask_token
)
_vision_start_token = (
AddedToken(vision_start_token, lstrip=False, rstrip=False)
if isinstance(vision_start_token, str)
else vision_start_token
)
_vision_br_token = (
AddedToken(vision_br_token, lstrip=False, rstrip=False)
if isinstance(vision_br_token, str)
else vision_br_token
)
_vision_end_token = (
AddedToken(vision_end_token, lstrip=False, rstrip=False)
if isinstance(vision_end_token, str)
else vision_end_token
)
_start_header_id_token = (
AddedToken(start_header_id_token, lstrip=False, rstrip=False)
if isinstance(start_header_id_token, str)
else start_header_id_token
)
_end_header_id_token = (
AddedToken(end_header_id_token, lstrip=False, rstrip=False)
if isinstance(end_header_id_token, str)
else end_header_id_token
)
_eor_id = (
AddedToken(eor_id, lstrip=False, rstrip=False)
if isinstance(eor_id, str)
else eor_id
)
self.offset = 0
self._added_tokens_decoder = {
self.reserve_token_list[i]: special_token
for i, special_token in enumerate(
[
_pad_token,
_eos_token,
_bos_token,
_cls_token,
_sep_token,
_mask_token,
_vision_start_token,
_vision_br_token,
_vision_end_token,
_start_header_id_token,
_end_header_id_token,
_eor_id,
]
)
}
offset = len(self._added_tokens_decoder)
extra_tokens = {
self.reserve_token_list[j + offset]: AddedToken(
f"<|extra_id_{i}|>", lstrip=False, rstrip=False
)
for j, i in enumerate(range(extra_ids))
}
self._added_tokens_decoder.update(extra_tokens)
super().__init__(
bos_token=_bos_token,
eos_token=_eos_token,
pad_token=_pad_token,
cls_token=_cls_token,
sep_token=_sep_token,
mask_token=_mask_token,
vision_start_token=_vision_start_token,
vision_br_token=_vision_br_token,
vision_end_token=_vision_end_token,
start_header_id_token=_start_header_id_token,
end_header_id_token=_end_header_id_token,
eor_id=_eor_id,
**kwargs,
)
self._vocab_size = len(self.get_vocab())
def _list_up_reserve_tokens(self):
self.reserve_token_list = [
(
i + self.byte_head_ints[0],
self.byte_head_ints[1],
self.byte_head_ints[2],
self.byte_head_ints[3],
)
for i in range(self.reserve_sizes[0])
]
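# Reserved special-token patches differ from text only in the first byte
# (values 192..250 under the default reserve size of 59). For example,
# reserve_token_list[0] == (192, 128, 64, 0) is assigned to <|pad|> and
# reserve_token_list[1] == (193, 128, 64, 0) to <|end_of_text|> in __init__.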
@property
def vocab_size(self):
return self._vocab_size
def create_tree(
self, byte_options: list[list[int]], byte_index: int, max_byte_index: int
) -> list[list[int]]:
if byte_index == max_byte_index:
return [[reserve_option] for reserve_option in byte_options[byte_index]]
concat_list = []
for byte_reserve_option in byte_options[byte_index]:
if byte_reserve_option is not None:
concat_list += [
[byte_reserve_option] + following_bytes
if following_bytes != [None]
else [byte_reserve_option]
for following_bytes in self.create_tree(
byte_options=byte_options,
byte_index=byte_index + 1,
max_byte_index=max_byte_index,
)
]
else:
concat_list.append([None])
return concat_list
def get_vocab(self):
byte_options = [
list(range(reserve_size, 2**n_free_bits))
for reserve_size, n_free_bits in zip(
self.reserve_sizes, self.byte_n_free_bits
)
]
if not self.patch_padding:
for i in range(len(byte_options) - 1):
byte_options[i] += [None]
byte_options.reverse()
byte_tokens = self.create_tree(
byte_options=byte_options, byte_index=0, max_byte_index=3
)
byte_tokens = sorted(
byte_tokens,
key=lambda lst: sum([e * (256**i) for i, e in enumerate(lst)])
+ 256 ** len(lst),
)
for byte_token_index in range(len(byte_tokens)):
byte_tokens[byte_token_index].reverse()
for position in range(len(byte_tokens[byte_token_index])):
byte_tokens[byte_token_index][position] += self.byte_head_ints[position]
byte_tokens[byte_token_index] = tuple(byte_tokens[byte_token_index])
vocab = {self.convert_ids_to_tokens(tokens): tokens for tokens in byte_tokens}
vocab.pop("")
vocab.update(self.added_tokens_encoder)
return vocab
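# The resulting vocab maps each decodable string to its id patch; under the default
# layout the entry for "a" should come out as the tuple (251, 128, 65, 33), with the
# special tokens merged in from added_tokens_encoder.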
def _get_padding_truncation_strategies(
self, padding=False, truncation=None, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
):
"""
Find the correct padding/truncation strategy
"""
# Backward compatibility for previous behavior, maybe we should deprecate it:
# If you only set max_length, it activates truncation for max_length
if max_length is not None and padding is False and truncation is None:
if verbose:
if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
logger.warning(
"Truncation was not explicitly activated but `max_length` is provided a specific value, please"
" use `truncation=True` to explicitly truncate examples to max length. Defaulting to"
" 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the"
" tokenizer you can select this strategy more precisely by providing a specific strategy to"
" `truncation`."
)
self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
truncation = "longest_first"
# Get padding strategy
if padding is not False:
if padding is True:
if verbose:
if max_length is not None and (
truncation is None or truncation is False or truncation == "do_not_truncate"
):
warnings.warn(
"`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
"To pad to max length, use `padding='max_length'`."
)
padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch
elif not isinstance(padding, PaddingStrategy):
padding_strategy = PaddingStrategy(padding)
elif isinstance(padding, PaddingStrategy):
padding_strategy = padding
else:
padding_strategy = PaddingStrategy.DO_NOT_PAD
# Get truncation strategy
if truncation is not False and truncation is not None:
if truncation is True:
truncation_strategy = (
TruncationStrategy.LONGEST_FIRST
) # Default to truncate the longest sequences in pairs of inputs
elif not isinstance(truncation, TruncationStrategy):
truncation_strategy = TruncationStrategy(truncation)
elif isinstance(truncation, TruncationStrategy):
truncation_strategy = truncation
else:
truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
# Set max length if needed
if max_length is None:
if padding_strategy == PaddingStrategy.MAX_LENGTH:
if self.model_max_length > LARGE_INTEGER:
if verbose:
if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
logger.warning(
"Asking to pad to max_length but no maximum length is provided and the model has no"
" predefined maximum length. Default to no padding."
)
self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
padding_strategy = PaddingStrategy.DO_NOT_PAD
else:
max_length = self.model_max_length
if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
if self.model_max_length > LARGE_INTEGER:
if verbose:
if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
logger.warning(
"Asking to truncate to max_length but no maximum length is provided and the model has"
" no predefined maximum length. Default to no truncation."
)
self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
else:
max_length = self.model_max_length
# Test if we have a padding token
if padding_strategy != PaddingStrategy.DO_NOT_PAD and self.pad_token is None:
raise ValueError(
"Asking to pad but the tokenizer does not have a padding token. "
"Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
"or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
)
# Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
if (
truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
and padding_strategy != PaddingStrategy.DO_NOT_PAD
and pad_to_multiple_of is not None
and max_length is not None
and (max_length % pad_to_multiple_of != 0)
):
raise ValueError(
"Truncation and padding are both activated but "
f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
)
return padding_strategy, truncation_strategy, max_length, kwargs
def _add_bos_if_not_present(self, token_ids: list[int]) -> list[int]:
"""Do not add bos again if user already added it."""
if len(token_ids) > 0 and token_ids[0] == self.bos_token_id:
warnings.warn(
f"This sequence already has {self.bos_token}. In future versions this behavior may lead to duplicated"
" bos tokens being added."
)
return token_ids
else:
return list(self.bos_token_id) + token_ids
def _add_eos_if_not_present(self, token_ids: list[int]) -> list[int]:
"""Do not add eos again if user already added it."""
if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
warnings.warn(
f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
" eos tokens being added."
)
return token_ids
else:
return token_ids + list(self.eos_token_id)
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[str] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
Args:
encoded_inputs:
Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
max_length: maximum length of the returned list and optionally padding length (see below).
Will truncate by taking into account the special tokens.
padding_strategy: PaddingStrategy to use for padding.
- PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
- PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
- PaddingStrategy.DO_NOT_PAD: Do not pad
The tokenizer padding sides are defined in `padding_side` argument:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side:
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
# Load from model defaults
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
required_input = encoded_inputs[self.model_input_names[0]]
if padding_strategy == PaddingStrategy.LONGEST:
max_length = len(required_input)
if (
max_length is not None
and pad_to_multiple_of is not None
and (max_length % pad_to_multiple_of != 0)
):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
needs_to_be_padded = (
padding_strategy != PaddingStrategy.DO_NOT_PAD
and len(required_input) != max_length
)
# Initialize attention mask if not present.
if return_attention_mask and "attention_mask" not in encoded_inputs:
encoded_inputs["attention_mask"] = [1] * len(required_input)
if needs_to_be_padded:
if self.patch_padding:
difference = (max_length - len(required_input)) // len(
self.byte_head_ints
)
mask_patch_size = 4
else:
difference = max_length - len(required_input)
mask_patch_size = 1
padding_side = (
padding_side if padding_side is not None else self.padding_side
)
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = (
encoded_inputs["attention_mask"]
+ [0] * difference * mask_patch_size
)
if "token_type_ids" in encoded_inputs:
encoded_inputs["token_type_ids"] = (
encoded_inputs["token_type_ids"]
+ list(self.pad_token_type_id) * difference
)
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = (
encoded_inputs["special_tokens_mask"]
+ [1] * difference * mask_patch_size
)
encoded_inputs[self.model_input_names[0]] = (
required_input + list(self.pad_token_id) * difference
)
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [
0
] * difference * mask_patch_size + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
encoded_inputs["token_type_ids"] = (
list(self.pad_token_type_id) * difference
+ encoded_inputs["token_type_ids"]
)
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = [
1
] * difference * mask_patch_size + encoded_inputs[
"special_tokens_mask"
]
encoded_inputs[self.model_input_names[0]] = (
list(self.pad_token_id) * difference + required_input
)
else:
raise ValueError(f"Invalid padding strategy:{padding_side}")
return encoded_inputs
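# Patch-padding sketch: with patch_padding=True each padding step appends one full
# pad patch. For an 8-id input padded to max_length=16 (assuming the default
# layout), difference == 2, so (192, 128, 64, 0) is appended twice to input_ids
# and eight 0s are appended to the attention mask.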
def build_inputs_with_special_tokens(
self, token_ids_0: list[int], token_ids_1: list[int] | None = None
) -> list[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A sequence has the following format:
- single sequence: `<|begin_of_text|> X <|end_of_text|>`
- pair of sequences: `<|begin_of_text|> A <|end_of_text|> <|begin_of_text|> B <|end_of_text|>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
token_ids_0 = self._add_bos_if_not_present(token_ids_0)
token_ids_0 = self._add_eos_if_not_present(token_ids_0)
if token_ids_1 is None:
return token_ids_0
else:
token_ids_1 = self._add_bos_if_not_present(token_ids_1)
token_ids_1 = self._add_eos_if_not_present(token_ids_1)
return token_ids_0 + token_ids_1
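# Because special tokens are id patches, bos_token_id / eos_token_id are sequences
# rather than single ints; under the default layout a wrapped single sequence looks
# like [194, 128, 64, 0] + <text byte ids> + [193, 128, 64, 0].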
def _tokenize(self, text: str) -> list[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
token_ids = []
for c in text:
token_ids.extend(self.unicode_to_bytes(ord(c)))
# Convert to string
token_ids = [str(i) for i in token_ids]
return token_ids
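# Illustrative call (default layout): _tokenize("a") yields the stringified byte
# ids of the patch for U+0061, i.e. ["251", "128", "65", "33"]; the numeric ids
# are restored later by _convert_token_to_id.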
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
token_id = int(token) + self.offset
return token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return str(index - self.offset)
def _convert_token_to_id_with_added_voc(self, token):
if token is None:
return None
if token in self._added_tokens_encoder:
return list(self._added_tokens_encoder[token])
return [self._convert_token_to_id(token)]
def convert_tokens_to_ids(
self, tokens: Union[str, List[str]]
) -> Union[int, List[int]]:
"""
Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
vocabulary.
Args:
tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).
Returns:
`int` or `List[int]`: The token id or list of token ids.
"""
if tokens is None:
return None
if isinstance(tokens, str):
return self._convert_token_to_id_with_added_voc(tokens)
ids = []
for token in tokens:
ids.extend(self._convert_token_to_id_with_added_voc(token))
return ids
def convert_bytes_for_single_char_to_char(self, ids: list[int]) -> str:
byte_ints = []
byte_offset = 1
if self.is_special_token(ids): # special token
return self.added_tokens_decoder[tuple(ids)].__str__()
for byte_position in range(1, len(ids) + 1):
byte_int = (
ids[-byte_position]
- self.byte_head_ints[-byte_position]
- self.reserve_sizes[-byte_position]
)
if byte_int != -self.reserve_sizes[-byte_position]: # not padding
byte_ints.append(byte_int * byte_offset)
byte_offset *= (
2 ** self.byte_n_free_bits[-byte_position]
- self.reserve_sizes[-byte_position]
)
codepoint = sum(byte_ints)
if codepoint >= int("110000", base=16):
return None
else:
try:
return chr(codepoint)
except ValueError:
return None
# def is_special_token(self, ids: list[int]):
# return ids[0] < self.byte_head_ints[0] + (self.reserve_sizes[0] - 1)
def is_special_token(self, ids: list[int]):
return tuple(ids) in self._added_tokens_decoder
def convert_ids_to_tokens(
self, ids: list[int] | tuple[int], skip_special_tokens: bool = False
) -> str | None:
"""convert ids for single/multiple unicode character(s) to unicode character(s)"""
decoded_chars = ""
if isinstance(ids, tuple):
ids = list(ids)
if self.patch_padding:
for byte_position in range(0, len(ids), len(self.byte_head_ints)):
char_bytes = ids[
byte_position : byte_position + len(self.byte_head_ints)
]
if (
skip_special_tokens and not self.is_special_token(char_bytes)
) or not skip_special_tokens:
char = self.convert_bytes_for_single_char_to_char(char_bytes)
if char:
decoded_chars += char
return decoded_chars
if not self.is_special_token(ids): # not special token
byte_ints = []
byte_offset = 1
for byte_position in range(1, len(ids) + 1):
if ids[-byte_position] == 0:
break
byte_int = (
ids[-byte_position]
- self.byte_head_ints[-byte_position]
- self.reserve_sizes[-byte_position]
)
assert byte_int >= 0
byte_ints.append(byte_int * byte_offset)
byte_offset *= (
2 ** self.byte_n_free_bits[-byte_position]
- self.reserve_sizes[-byte_position]
)
codepoint = sum(byte_ints)
if codepoint >= int("110000", base=16):
return None
else:
return chr(codepoint)
else: # special token
return self._added_tokens_decoder[tuple(ids)]
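# Decoding sketch (default layout, patch_padding=True): the patch [251, 131, 65, 2]
# decodes to the code point 3 * 4096 + 1 * 64 + 2 = 12354, i.e. "あ" (U+3042),
# while the reserved patch [192, 128, 64, 0] is looked up and returned as <|pad|>.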
def unicode_to_bytes(self, codepoint: int) -> list[int]:
byte_list_reversed = []
for byte_position_from_right in range(len(self.byte_n_free_bits)):
byte_n_free_ids = (
2 ** self.byte_n_free_bits[-1 - byte_position_from_right]
- self.reserve_sizes[-1 - byte_position_from_right]
)
byte_id = (
codepoint % byte_n_free_ids
+ self.reserve_sizes[-1 - byte_position_from_right]
+ self.byte_head_ints[-1 - byte_position_from_right]
)
codepoint //= byte_n_free_ids
byte_list_reversed.append(byte_id)
if codepoint == 0:
if self.patch_padding:
for pad_byte_position_from_right in range(
len(byte_list_reversed), len(self.byte_n_free_bits)
):
byte_list_reversed.append(
self.byte_head_ints[-1 - pad_byte_position_from_right] + self.reserve_sizes[-1 - pad_byte_position_from_right]
)
byte_list_reversed.reverse()
return byte_list_reversed
raise ValueError("codepoint is too large")
# ByteTokenizer has no vocab file
def save_vocabulary(
self, save_directory: str, filename_prefix: str | None = None
) -> tuple[str]:
return ()
def image_to_ids(self, image_data: list[list[list[int]]]) -> list[int]:
image_data = torch.tensor(image_data)
x, y, rgb = image_data.size()
assert rgb == 3
image_br_token = list(self.added_tokens_encoder["<|vision_br|>"])
image_special_byte_index = self.added_tokens_encoder["<|vision_start|>"][0]
# add img byte by padding to the beginning
image_data = torch.nn.functional.pad(
image_data, (1, 0), "constant", value=image_special_byte_index
).view(x, y * 4)
image_data = torch.concat(
[image_data, torch.tensor(image_br_token * x).view(x, 4)], dim=1
).view(-1)
return image_data.tolist()
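# Layout sketch: every RGB pixel becomes a 4-id patch [s, R, G, B], where s is the
# first id of the <|vision_start|> patch, and each image row is terminated by the
# 4-id <|vision_br|> patch; a 2x2 image therefore flattens to 2 * (2 * 4 + 4) = 24
# ids. Channel values are raw 0-255 intensities, not the text byte encoding.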
def save_pretrained(
self,
save_directory: Union[str, os.PathLike],
legacy_format: Optional[bool] = None,
filename_prefix: Optional[str] = None,
push_to_hub: bool = False,
**kwargs,
) -> Tuple[str]:
"""
Save the full tokenizer state.
This method makes sure the full tokenizer can then be re-loaded using the
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] class method.
Warning: this won't save modifications you may have applied to the tokenizer after instantiation (for
instance, modifying `tokenizer.do_lower_case` after creation).
Args:
save_directory (`str` or `os.PathLike`): The path to a directory where the tokenizer will be saved.
legacy_format (`bool`, *optional*):
Only applicable for a fast tokenizer. If unset (default), will save the tokenizer in the unified JSON
format as well as in legacy format if it exists, i.e. with tokenizer specific vocabulary and a separate
added_tokens files.
If `False`, will only save the tokenizer in the unified JSON format. This format is incompatible with
"slow" tokenizers (not powered by the *tokenizers* library), so the tokenizer will not be able to be
loaded in the corresponding "slow" tokenizer.
If `True`, will save the tokenizer in legacy format. If the "slow" tokenizer doesn't exist, a value
error is raised.
filename_prefix (`str`, *optional*):
A prefix to add to the names of the files saved by the tokenizer.
push_to_hub (`bool`, *optional*, defaults to `False`):
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
namespace).
kwargs (`Dict[str, Any]`, *optional*):
Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
Returns:
A tuple of `str`: The files saved.
"""
use_auth_token = kwargs.pop("use_auth_token", None)
if use_auth_token is not None:
warnings.warn(
"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
FutureWarning,
)
if kwargs.get("token", None) is not None:
raise ValueError(
"`token` and `use_auth_token` are both specified. Please set only the argument `token`."
)
kwargs["token"] = use_auth_token
if os.path.isfile(save_directory):
logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
return
os.makedirs(save_directory, exist_ok=True)
if push_to_hub:
commit_message = kwargs.pop("commit_message", None)
repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
repo_id = self._create_repo(repo_id, **kwargs)
files_timestamps = self._get_files_timestamps(save_directory)
special_tokens_map_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE
)
tokenizer_config_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
)
tokenizer_config = copy.deepcopy(self.init_kwargs)
# Let's save the init kwargs
target_keys = set(self.init_kwargs.keys())
# Let's save the special tokens map (only the strings)
target_keys.update(["model_max_length", "clean_up_tokenization_spaces"])
for k in target_keys:
if hasattr(self, k):
tokenizer_config[k] = getattr(self, k)
# Let's make sure we properly save the special tokens.
tokenizer_config.update(self.special_tokens_map)
if self.chat_template is not None:
if isinstance(self.chat_template, dict):
# Chat template dicts are saved to the config as lists of dicts with fixed key names.
# They will be reconstructed as a single dict during loading.
tokenizer_config["chat_template"] = [{"name": k, "template": v} for k, v in self.chat_template.items()]
else:
tokenizer_config["chat_template"] = self.chat_template
if len(self.init_inputs) > 0:
tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
for file_id in self.vocab_files_names.keys():
tokenizer_config.pop(file_id, None)
# No type fields, so that both older fast and slow tokenizers can load it
tokenizer_config = self.convert_added_tokens(tokenizer_config, add_type_field=True, save=True)
# Process added tokens separately: allows previous versions to ignore them!
added_tokens = {}
for key, value in self.added_tokens_decoder.items():
added_tokens[key] = value.__getstate__()
tokenizer_config["added_tokens_decoder"] = added_tokens
# Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained
tokenizer_class = self.__class__.__name__
# Remove the Fast at the end unless we have a special `PreTrainedTokenizerFast`
if tokenizer_class.endswith("Fast") and tokenizer_class != "PreTrainedTokenizerFast":
tokenizer_class = tokenizer_class[:-4]
tokenizer_config["tokenizer_class"] = tokenizer_class
if getattr(self, "_auto_map", None) is not None:
tokenizer_config["auto_map"] = self._auto_map
if getattr(self, "_processor_class", None) is not None:
tokenizer_config["processor_class"] = self._processor_class
# If we have a custom model, we copy the file defining it in the folder and set the attributes so it can be
# loaded from the Hub.
if self._auto_class is not None:
custom_object_save(self, save_directory, config=tokenizer_config)
# remove private information
if "name_or_path" in tokenizer_config:
tokenizer_config.pop("name_or_path")
tokenizer_config.pop("special_tokens_map_file", None)
tokenizer_config.pop("tokenizer_file", None)
with open(tokenizer_config_file, "w", encoding="utf-8") as f:
out_str = json.dumps(
make_serializeable(tokenizer_config),
indent=2,
sort_keys=True,
ensure_ascii=False
) + "\n"
f.write(out_str)
logger.info(f"tokenizer config file saved in {tokenizer_config_file}")
# Sanitize AddedTokens in special_tokens_map
# Kept for forward compatibility; will be removed in transformers v5. Type fields are not saved for forward compatibility, and `special` should not be saved either
write_dict = self.convert_added_tokens(self.special_tokens_map_extended, save=True, add_type_field=False)
with open(special_tokens_map_file, "w", encoding="utf-8") as f:
out_str = json.dumps(write_dict, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
f.write(out_str)
logger.info(f"Special tokens file saved in {special_tokens_map_file}")
file_names = (tokenizer_config_file, special_tokens_map_file)
save_files = self._save_pretrained(
save_directory=save_directory,
file_names=file_names,
legacy_format=legacy_format,
filename_prefix=filename_prefix,
)
if push_to_hub:
self._upload_modified_files(
save_directory,
repo_id,
files_timestamps,
commit_message=commit_message,
token=kwargs.get("token"),
)
return save_files
def _save_pretrained(
self,
save_directory: Union[str, os.PathLike],
file_names: Tuple[str],
legacy_format: Optional[bool] = None,
filename_prefix: Optional[str] = None,
) -> Tuple[str]:
"""
Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
specific [`~tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`]
"""
if legacy_format is False:
raise ValueError(
"Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format."
)
save_directory = str(save_directory)
added_tokens_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
)
# the new get_added_vocab() also returns special tokens and tokens that have an index < vocab_size
# added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size}
added_vocab = {tok: list(index) for tok, index in self.added_tokens_encoder.items()}
if added_vocab:
with open(added_tokens_file, "w", encoding="utf-8") as f:
out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
f.write(out_str)
logger.info(f"added tokens file saved in {added_tokens_file}")
vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
return file_names + vocab_files + (added_tokens_file,)
@classmethod
def _from_pretrained(
cls,
resolved_vocab_files,
pretrained_model_name_or_path,
init_configuration,
*init_inputs,
token=None,
cache_dir=None,
local_files_only=False,
_commit_hash=None,
_is_local=False,
trust_remote_code=False,
**kwargs,
):
# We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
# file or if `from_slow` is set to True.
from_slow = kwargs.get("from_slow", False)
gguf_file = kwargs.get("gguf_file", None)
has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
# If one passes a GGUF file path to `gguf_file` there is no need for this check as the tokenizer will be
# loaded directly from the GGUF file.
if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None and not gguf_file:
slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
copy.deepcopy(resolved_vocab_files),
pretrained_model_name_or_path,
copy.deepcopy(init_configuration),
*init_inputs,
token=token,
cache_dir=cache_dir,
local_files_only=local_files_only,
_commit_hash=_commit_hash,
**(copy.deepcopy(kwargs)),
)
else:
slow_tokenizer = None
# Prepare tokenizer initialization kwargs
# Did we save some inputs and kwargs to reload?
tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
if tokenizer_config_file is not None:
with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
init_kwargs = json.load(tokenizer_config_handle)
# First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers.
config_tokenizer_class = init_kwargs.get("tokenizer_class")
init_kwargs.pop("tokenizer_class", None)
if not has_tokenizer_file:
init_kwargs.pop("tokenizer_file", None)
saved_init_inputs = init_kwargs.pop("init_inputs", ())
if not init_inputs:
init_inputs = saved_init_inputs
else:
config_tokenizer_class = None
init_kwargs = init_configuration
if not _is_local:
if "auto_map" in init_kwargs:
# For backward compatibility with the old format.
if isinstance(init_kwargs["auto_map"], (tuple, list)):
init_kwargs["auto_map"] = {"AutoTokenizer": init_kwargs["auto_map"]}
if config_tokenizer_class is None:
# Matt: This entire block is only used to decide if the tokenizer class matches the class in the repo.
# If not, it raises a warning, but otherwise continues. Since we mostly load tokenizers with
# AutoTokenizer these days, it seems like a lot of work (and a source of bugs) for little gain.
# Maybe we can just remove this entirely?
from transformers.models.auto.configuration_auto import AutoConfig # tests_ignore
# Second attempt. If we have not yet found tokenizer_class, let's try to use the config.
try:
config = AutoConfig.from_pretrained(
pretrained_model_name_or_path,
token=token,
cache_dir=cache_dir,
local_files_only=local_files_only,
trust_remote_code=trust_remote_code,
_commit_hash=_commit_hash,
)
config_tokenizer_class = config.tokenizer_class
except (OSError, ValueError, KeyError):
# skip if an error occurred.
config = None
if config_tokenizer_class is None:
# Third attempt. If we have not yet found the original type of the tokenizer
# we are loading, see if we can infer it from the type of the configuration file.
from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES # tests_ignore
if hasattr(config, "model_type"):
model_type = config.model_type
else:
# Fallback: use pattern matching on the string.
model_type = None
for pattern in TOKENIZER_MAPPING_NAMES.keys():
if pattern in str(pretrained_model_name_or_path):
model_type = pattern
break
if model_type is not None:
config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING_NAMES.get(
model_type, (None, None)
)
if config_tokenizer_class is None:
config_tokenizer_class = config_tokenizer_class_fast
if config_tokenizer_class is not None:
if cls.__name__.replace("Fast", "") != config_tokenizer_class.replace("Fast", ""):
logger.warning(
"The tokenizer class you load from this checkpoint is not the same type as the class this"
" function is called from. It may result in unexpected tokenization. \nThe tokenizer class you"
f" load from this checkpoint is '{config_tokenizer_class}'. \nThe class this function is called"
f" from is '{cls.__name__}'."
)
# Update with newly provided kwargs
init_kwargs.update(kwargs)
# Merge resolved_vocab_files arguments in init_kwargs.
added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
for args_name, file_path in resolved_vocab_files.items():
if args_name not in init_kwargs:
init_kwargs[args_name] = file_path
tokenizer_file = resolved_vocab_files.pop("tokenizer_file", None)
if slow_tokenizer is not None:
init_kwargs["__slow_tokenizer"] = slow_tokenizer
init_kwargs["name_or_path"] = pretrained_model_name_or_path
#### Handle tokenizer serialization of added and special tokens
added_tokens_decoder: Dict[int, AddedToken] = {}
added_tokens_map: Dict[str, AddedToken] = {}
# if we have info on the slow added tokens
if "added_tokens_decoder" in init_kwargs:
for idx, token in init_kwargs["added_tokens_decoder"].items():
if isinstance(token, dict):
token = AddedToken(**token)
if isinstance(token, AddedToken):
added_tokens_decoder[ast.literal_eval(idx)] = token
added_tokens_map[str(token)] = token
else:
raise ValueError(
f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance"
)
else:
# begin legacy: read the added_tokens_file and update kwargs with special_tokens_map if modified
if special_tokens_map_file is not None:
with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
special_tokens_map = json.load(special_tokens_map_handle)
for key, value in special_tokens_map.items():
if key in kwargs and kwargs[key]:
# This value has already been redefined by the kwargs
# We keep this new value and ignore the one stored in the special_tokens_map_file
continue
if isinstance(value, dict):
value = AddedToken(**value, special=True)
elif key == "additional_special_tokens" and isinstance(value, list):
additional_special_tokens = init_kwargs.pop("additional_special_tokens", []) or []
for token in value:
token = AddedToken(**token, special=True) if isinstance(token, dict) else token
if token not in additional_special_tokens:
additional_special_tokens.append(token)
value = additional_special_tokens
init_kwargs[key] = value
# slow -> slow|fast, legacy: convert the `"added_tokens.json"` file to `added_tokens_decoder`.
# This is for legacy purposes; we don't add the tokens after init, for efficiency.
if added_tokens_file is not None:
special_tokens = []
for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys():
if init_kwargs[key] is not None:
if key == "additional_special_tokens":
special_tokens += [str(token) for token in init_kwargs[key]]
else:
special_tokens.append(str(init_kwargs[key]))
with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
added_tok_encoder = json.load(added_tokens_handle)
for str_token, index in added_tok_encoder.items():
# if index not in added_tokens_decoder and str_token not in added_tokens_map:
special = str_token in special_tokens
added_tokens_decoder[index] = AddedToken(
str_token, rstrip=False, lstrip=False, normalized=not special, special=special
)
added_tokens_map[str_token] = added_tokens_decoder[index]
# allows converting a fast -> slow: add the `tokenizer.json`'s `"added_tokens"` to the slow tokenizer
# if `tokenizer_config.json` is `None`
if tokenizer_file is not None:
# This is for slow so can be done before
with open(tokenizer_file, encoding="utf-8") as tokenizer_file_handle:
tokenizer_file_handle = json.load(tokenizer_file_handle)
added_tokens = tokenizer_file_handle.pop("added_tokens")
for serialized_tokens in added_tokens:
idx = serialized_tokens.pop("id")
added_tokens_decoder[idx] = AddedToken(**serialized_tokens)
added_tokens_map[str(added_tokens_decoder[idx])] = added_tokens_decoder[idx]
# end legacy
# Passing AddedTokens and not strings to the class to prevent it from casting the string to a different AddedToken
# convert {'__type': 'AddedToken', 'content': '<ent>', 'lstrip': False, 'normalized': True, ...} to AddedTokens
init_kwargs["added_tokens_decoder"] = added_tokens_decoder
init_kwargs = cls.convert_added_tokens(init_kwargs, save=False)
for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys():
if added_tokens_map != {} and init_kwargs[key] is not None:
if key != "additional_special_tokens":
init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key])
# Instantiate the tokenizer.
try:
tokenizer = cls(*init_inputs, **init_kwargs)
except OSError:
raise OSError(
"Unable to load vocabulary from file. "
"Please check that the provided vocabulary is accessible and not corrupted."
)
# if added_tokens_decoder != {} and max(list(added_tokens_decoder.keys())[-1], 0) > tokenizer.vocab_size:
# logger.warning_advice(
# "Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
# " fine-tuned or trained."
# )
return tokenizer
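# Minimal usage sketch (illustrative only; the printed id values assume the default
# reserve layout and patch_padding=True, and are not authoritative):
if __name__ == "__main__":
    tok = ByteLMTokenizerV3()
    # Encode a short string; special tokens and characters are 4-id patches.
    ids = tok("あ")["input_ids"]
    print(ids)  # expected: bos patch + one character patch + eos patch
    # Round-trip through convert_ids_to_tokens, skipping the special patches.
    print(tok.convert_ids_to_tokens(ids, skip_special_tokens=True))  # "あ"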