# bygpt-jp-multi-lm-head-6.5B-alpha / tokenization_utf8_like_byte_v3.py
# Copyright 2021 T5 Authors and HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization class for model ByT5."""
import warnings
from typing import (
Dict,
List,
Optional,
Union,
Tuple
)
import json
import os
import copy
import ast
import torch
import numpy as np
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
from transformers.tokenization_utils_base import (
BatchEncoding,
EncodedInput,
PaddingStrategy,
TruncationStrategy
)
from transformers.utils import logging
logger = logging.get_logger(__name__)
SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"
LARGE_INTEGER = int(1e20)
def make_serializeable(obj):
if isinstance(obj, dict):
return {str(k): make_serializeable(v) for k, v in obj.items()}
if isinstance(obj, list):
return [make_serializeable(v) for v in obj]
if isinstance(obj, tuple):
return make_serializeable(list(obj))
return obj
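# JSON cannot serialize tuple keys (the byte-patch ids used below), so they are
# stringified first. Illustrative example, assuming the default special-token layout:
#   make_serializeable({(192, 128, 64, 0): "<|pad|>"}) -> {"(192, 128, 64, 0)": "<|pad|>"}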
class ByteLMTokenizerV3(PreTrainedTokenizer):
"""Byte tokenizer with completely seperate space for special tokens.
tok.pad Parameters
----------
PreTrainedTokenizer : _type_
_description_
Returns
-------
_type_
_description_
Raises
------
ValueError
_description_
ValueError
_description_
"""
model_input_names: list[str] = ["input_ids", "attention_mask"]
reserve_sizes: list[int] = [59, 0, 0, 0]
byte_head_ints: list[int] = [
int("11000000", base=2),
int("10000000", base=2),
int("01000000", base=2),
int("00000000", base=2),
]
byte_n_free_bits: list[int] = [6, 6, 6, 6]
patch_padding: bool
reserve_token_list: list[tuple[int, ...]]
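# Layout sketch (default configuration): each character is a patch of 4 ids whose
# heads are 0b11, 0b10, 0b01, 0b00 in the top two bits, each with 6 payload bits.
# The first byte reserves its lowest 59 payload values for special tokens, so text
# bytes at position 0 run from 192 + 59 = 251 to 255. Illustrative encoding of "a"
# (U+0061 = 97 = 1 * 64 + 33), assuming this default layout:
#   position 3: 97 % 64 = 33            -> id 33  (head 0b00000000)
#   position 2: (97 // 64) % 64 = 1     -> id 65  (head 0b01000000)
#   positions 1, 0: exhausted           -> padding ids 128 and 251
#   => "a" is the patch [251, 128, 65, 33]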
def __init__(
self,
patch_padding=True,
pad_token="<|pad|>",
eos_token="<|end_of_text|>",
bos_token="<|begin_of_text|>",
cls_token="<|cls|>",
sep_token="<|sep|>",
mask_token="<|mask|>",
vision_start_token="<|vision_start|>", # for vlm
vision_br_token="<|vision_br|>", # for vlm
vision_end_token="<|vision_end|>", # for vlm
start_header_id_token="<|start_header_id|>", # for it
end_header_id_token="<|end_header_id|>", # for it
eor_id="<|end_of_role|>", # for it
extra_ids=47,
**kwargs,
) -> None:
assert np.prod(
[
2**n_free_bits - reserve_size
for reserve_size, n_free_bits in zip(
self.reserve_sizes, self.byte_n_free_bits
)
]
) >= int(
"110000", base=16
), "Not enough positions for all unicode. Too many reserve size."
self.patch_padding = patch_padding
# list up all reserve tokens
self._list_up_reserve_tokens()
_bos_token = (
AddedToken(bos_token, lstrip=False, rstrip=False)
if isinstance(bos_token, str)
else bos_token
)
_eos_token = (
AddedToken(eos_token, lstrip=False, rstrip=False)
if isinstance(eos_token, str)
else eos_token
)
_pad_token = (
AddedToken(pad_token, lstrip=False, rstrip=False)
if isinstance(pad_token, str)
else pad_token
)
_cls_token = (
AddedToken(cls_token, lstrip=False, rstrip=False)
if isinstance(cls_token, str)
else cls_token
)
_sep_token = (
AddedToken(sep_token, lstrip=False, rstrip=False)
if isinstance(sep_token, str)
else sep_token
)
_mask_token = (
AddedToken(mask_token, lstrip=False, rstrip=False)
if isinstance(mask_token, str)
else mask_token
)
_vision_start_token = (
AddedToken(vision_start_token, lstrip=False, rstrip=False)
if isinstance(vision_start_token, str)
else vision_start_token
)
_vision_br_token = (
AddedToken(vision_br_token, lstrip=False, rstrip=False)
if isinstance(vision_br_token, str)
else vision_br_token
)
_vision_end_token = (
AddedToken(vision_end_token, lstrip=False, rstrip=False)
if isinstance(vision_end_token, str)
else vision_end_token
)
_start_header_id_token = (
AddedToken(start_header_id_token, lstrip=False, rstrip=False)
if isinstance(start_header_id_token, str)
else start_header_id_token
)
_end_header_id_token = (
AddedToken(end_header_id_token, lstrip=False, rstrip=False)
if isinstance(end_header_id_token, str)
else end_header_id_token
)
_eor_id = (
AddedToken(eor_id, lstrip=False, rstrip=False)
if isinstance(eor_id, str)
else eor_id
)
self.offset = 0
self._added_tokens_decoder = {
self.reserve_token_list[i]: special_token
for i, special_token in enumerate(
[
_pad_token,
_eos_token,
_bos_token,
_cls_token,
_sep_token,
_mask_token,
_vision_start_token,
_vision_br_token,
_vision_end_token,
_start_header_id_token,
_end_header_id_token,
_eor_id,
]
)
}
offset = len(self._added_tokens_decoder)
extra_tokens = {
self.reserve_token_list[j + offset]: AddedToken(
f"<|extra_id_{i}|>", lstrip=False, rstrip=False
)
for j, i in enumerate(range(extra_ids))
}
self._added_tokens_decoder.update(extra_tokens)
super().__init__(
bos_token=_bos_token,
eos_token=_eos_token,
pad_token=_pad_token,
cls_token=_cls_token,
sep_token=_sep_token,
mask_token=_mask_token,
vision_start_token=_vision_start_token,
vision_br_token=_vision_br_token,
vision_end_token=_vision_end_token,
start_header_id_token=_start_header_id_token,
end_header_id_token=_end_header_id_token,
eor_id=_eor_id,
**kwargs,
)
self._vocab_size = len(self.get_vocab())
def _list_up_reserve_tokens(self):
self.reserve_token_list = [
(
i + self.byte_head_ints[0],
self.byte_head_ints[1],
self.byte_head_ints[2],
self.byte_head_ints[3],
)
for i in range(self.reserve_sizes[0])
]
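# Reserved special-token patches differ from text only in the first byte
# (values 192..250 under the default reserve size of 59). For example,
# reserve_token_list[0] == (192, 128, 64, 0) is assigned to <|pad|> and
# reserve_token_list[1] == (193, 128, 64, 0) to <|end_of_text|> in __init__.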
@property
def vocab_size(self):
return self._vocab_size
def create_tree(
self, byte_options: list[list[int]], byte_index: int, max_byte_index: int
) -> list[list[int]]:
if byte_index == max_byte_index:
return [[reserve_option] for reserve_option in byte_options[byte_index]]
concat_list = []
for byte_reserve_option in byte_options[byte_index]:
if byte_reserve_option is not None:
concat_list += [
[byte_reserve_option] + following_bytes
if following_bytes != [None]
else [byte_reserve_option]
for following_bytes in self.create_tree(
byte_options=byte_options,
byte_index=byte_index + 1,
max_byte_index=max_byte_index,
)
]
else:
concat_list.append([None])
return concat_list
def get_vocab(self):
byte_options = [
list(range(reserve_size, 2**n_free_bits))
for reserve_size, n_free_bits in zip(
self.reserve_sizes, self.byte_n_free_bits
)
]
if not self.patch_padding:
for i in range(len(byte_options) - 1):
byte_options[i] += [None]
byte_options.reverse()
byte_tokens = self.create_tree(
byte_options=byte_options, byte_index=0, max_byte_index=3
)
byte_tokens = sorted(
byte_tokens,
key=lambda lst: sum([e * (256**i) for i, e in enumerate(lst)])
+ 256 ** len(lst),
)
for byte_token_index in range(len(byte_tokens)):
byte_tokens[byte_token_index].reverse()
for position in range(len(byte_tokens[byte_token_index])):
byte_tokens[byte_token_index][position] += self.byte_head_ints[position]
byte_tokens[byte_token_index] = tuple(byte_tokens[byte_token_index])
vocab = {self.convert_ids_to_tokens(tokens): tokens for tokens in byte_tokens}
vocab.pop("")
vocab.update(self.added_tokens_encoder)
return vocab
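# The resulting vocab maps each decodable string to its id patch; under the default
# layout the entry for "a" should come out as the tuple (251, 128, 65, 33), with the
# special tokens merged in from added_tokens_encoder.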
def _get_padding_truncation_strategies(
self, padding=False, truncation=None, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
):
"""
Find the correct padding/truncation strategy
"""
# Backward compatibility for previous behavior, maybe we should deprecate it:
# If you only set max_length, it activates truncation for max_length
if max_length is not None and padding is False and truncation is None:
if verbose:
if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
logger.warning(
"Truncation was not explicitly activated but `max_length` is provided a specific value, please"
" use `truncation=True` to explicitly truncate examples to max length. Defaulting to"
" 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the"
" tokenizer you can select this strategy more precisely by providing a specific strategy to"
" `truncation`."
)
self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
truncation = "longest_first"
# Get padding strategy
if padding is not False:
if padding is True:
if verbose:
if max_length is not None and (
truncation is None or truncation is False or truncation == "do_not_truncate"
):
warnings.warn(
"`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
"To pad to max length, use `padding='max_length'`."
)
padding_strategy = PaddingStrategy.LONGEST # Default to pad to the longest sequence in the batch
elif not isinstance(padding, PaddingStrategy):
padding_strategy = PaddingStrategy(padding)
elif isinstance(padding, PaddingStrategy):
padding_strategy = padding
else:
padding_strategy = PaddingStrategy.DO_NOT_PAD
# Get truncation strategy
if truncation is not False and truncation is not None:
if truncation is True:
truncation_strategy = (
TruncationStrategy.LONGEST_FIRST
) # Default to truncate the longest sequences in pairs of inputs
elif not isinstance(truncation, TruncationStrategy):
truncation_strategy = TruncationStrategy(truncation)
elif isinstance(truncation, TruncationStrategy):
truncation_strategy = truncation
else:
truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
# Set max length if needed
if max_length is None:
if padding_strategy == PaddingStrategy.MAX_LENGTH:
if self.model_max_length > LARGE_INTEGER:
if verbose:
if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
logger.warning(
"Asking to pad to max_length but no maximum length is provided and the model has no"
" predefined maximum length. Default to no padding."
)
self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
padding_strategy = PaddingStrategy.DO_NOT_PAD
else:
max_length = self.model_max_length
if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
if self.model_max_length > LARGE_INTEGER:
if verbose:
if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
logger.warning(
"Asking to truncate to max_length but no maximum length is provided and the model has"
" no predefined maximum length. Default to no truncation."
)
self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
else:
max_length = self.model_max_length
# Test if we have a padding token
if padding_strategy != PaddingStrategy.DO_NOT_PAD and self.pad_token is None:
raise ValueError(
"Asking to pad but the tokenizer does not have a padding token. "
"Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
"or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
)
# Check that we will truncate to a multiple of pad_to_multiple_of if both are provided
if (
truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
and padding_strategy != PaddingStrategy.DO_NOT_PAD
and pad_to_multiple_of is not None
and max_length is not None
and (max_length % pad_to_multiple_of != 0)
):
raise ValueError(
"Truncation and padding are both activated but "
f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
)
return padding_strategy, truncation_strategy, max_length, kwargs
def _add_bos_if_not_present(self, token_ids: list[int]) -> list[int]:
"""Do not add bos again if user already added it."""
if len(token_ids) > 0 and token_ids[0] == self.bos_token_id:
warnings.warn(
f"This sequence already has {self.bos_token}. In future versions this behavior may lead to duplicated"
" bos tokens being added."
)
return token_ids
else:
return list(self.bos_token_id) + token_ids
def _add_eos_if_not_present(self, token_ids: list[int]) -> list[int]:
"""Do not add eos again if user already added it."""
if len(token_ids) > 0 and token_ids[-1] == self.eos_token_id:
warnings.warn(
f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
" eos tokens being added."
)
return token_ids
else:
return token_ids + list(self.eos_token_id)
def _pad(
self,
encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
max_length: Optional[int] = None,
padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
pad_to_multiple_of: Optional[int] = None,
padding_side: Optional[str] = None,
return_attention_mask: Optional[bool] = None,
) -> dict:
"""
Pad encoded inputs (on left/right and up to predefined length or max length in the batch)
Args:
encoded_inputs:
Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
max_length: maximum length of the returned list and optionally padding length (see below).
Will truncate by taking into account the special tokens.
padding_strategy: PaddingStrategy to use for padding.
- PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
- PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
- PaddingStrategy.DO_NOT_PAD: Do not pad
The tokenizer padding sides are defined in `padding_side` argument:
- 'left': pads on the left of the sequences
- 'right': pads on the right of the sequences
pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
This is especially useful to enable the use of Tensor Core on NVIDIA hardware with compute capability
`>= 7.5` (Volta).
padding_side:
The side on which the model should have padding applied. Should be selected between ['right', 'left'].
Default value is picked from the class attribute of the same name.
return_attention_mask:
(optional) Set to False to avoid returning attention mask (default: set to model specifics)
"""
# Load from model defaults
if return_attention_mask is None:
return_attention_mask = "attention_mask" in self.model_input_names
required_input = encoded_inputs[self.model_input_names[0]]
if padding_strategy == PaddingStrategy.LONGEST:
max_length = len(required_input)
if (
max_length is not None
and pad_to_multiple_of is not None
and (max_length % pad_to_multiple_of != 0)
):
max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of
needs_to_be_padded = (
padding_strategy != PaddingStrategy.DO_NOT_PAD
and len(required_input) != max_length
)
# Initialize attention mask if not present.
if return_attention_mask and "attention_mask" not in encoded_inputs:
encoded_inputs["attention_mask"] = [1] * len(required_input)
if needs_to_be_padded:
if self.patch_padding:
difference = (max_length - len(required_input)) // len(
self.byte_head_ints
)
mask_patch_size = 4
else:
difference = max_length - len(required_input)
mask_patch_size = 1
padding_side = (
padding_side if padding_side is not None else self.padding_side
)
if padding_side == "right":
if return_attention_mask:
encoded_inputs["attention_mask"] = (
encoded_inputs["attention_mask"]
+ [0] * difference * mask_patch_size
)
if "token_type_ids" in encoded_inputs:
encoded_inputs["token_type_ids"] = (
encoded_inputs["token_type_ids"]
+ list(self.pad_token_type_id) * difference
)
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = (
encoded_inputs["special_tokens_mask"]
+ [1] * difference * mask_patch_size
)
encoded_inputs[self.model_input_names[0]] = (
required_input + list(self.pad_token_id) * difference
)
elif padding_side == "left":
if return_attention_mask:
encoded_inputs["attention_mask"] = [
0
] * difference * mask_patch_size + encoded_inputs["attention_mask"]
if "token_type_ids" in encoded_inputs:
encoded_inputs["token_type_ids"] = (
list(self.pad_token_type_id) * difference
+ encoded_inputs["token_type_ids"]
)
if "special_tokens_mask" in encoded_inputs:
encoded_inputs["special_tokens_mask"] = [
1
] * difference * mask_patch_size + encoded_inputs[
"special_tokens_mask"
]
encoded_inputs[self.model_input_names[0]] = (
list(self.pad_token_id) * difference + required_input
)
else:
raise ValueError(f"Invalid padding strategy:{padding_side}")
return encoded_inputs
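# Patch-padding sketch: with patch_padding=True each padding step appends one full
# pad patch. For an 8-id input padded to max_length=16 (assuming the default
# layout), difference == 2, so (192, 128, 64, 0) is appended twice to input_ids
# and eight 0s are appended to the attention mask.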
def build_inputs_with_special_tokens(
self, token_ids_0: list[int], token_ids_1: list[int] | None = None
) -> list[int]:
"""
Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
adding special tokens. A sequence has the following format:
- single sequence: `<|begin_of_text|> X <|end_of_text|>`
- pair of sequences: `<|begin_of_text|> A <|end_of_text|> <|begin_of_text|> B <|end_of_text|>`
Args:
token_ids_0 (`List[int]`):
List of IDs to which the special tokens will be added.
token_ids_1 (`List[int]`, *optional*):
Optional second list of IDs for sequence pairs.
Returns:
`List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
"""
token_ids_0 = self._add_bos_if_not_present(token_ids_0)
token_ids_0 = self._add_eos_if_not_present(token_ids_0)
if token_ids_1 is None:
return token_ids_0
else:
token_ids_1 = self._add_bos_if_not_present(token_ids_1)
token_ids_1 = self._add_eos_if_not_present(token_ids_1)
return token_ids_0 + token_ids_1
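# Because special tokens are id patches, bos_token_id / eos_token_id are sequences
# rather than single ints; under the default layout a wrapped single sequence looks
# like [194, 128, 64, 0] + <text byte ids> + [193, 128, 64, 0].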
def _tokenize(self, text: str) -> list[str]:
"""Take as input a string and return a list of strings (tokens) for words/sub-words"""
token_ids = []
for c in text:
token_ids.extend(self.unicode_to_bytes(ord(c)))
# Convert to string
token_ids = [str(i) for i in token_ids]
return token_ids
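# Illustrative call (default layout): _tokenize("a") yields the stringified byte
# ids of the patch for U+0061, i.e. ["251", "128", "65", "33"]; the numeric ids
# are restored later by _convert_token_to_id.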
def _convert_token_to_id(self, token):
"""Converts a token (str) in an id using the vocab."""
token_id = int(token) + self.offset
return token_id
def _convert_id_to_token(self, index):
"""Converts an index (integer) in a token (str) using the vocab."""
return str(index - self.offset)
def _convert_token_to_id_with_added_voc(self, token):
if token is None:
return None
if token in self._added_tokens_encoder:
return list(self._added_tokens_encoder[token])
return [self._convert_token_to_id(token)]
def convert_tokens_to_ids(
self, tokens: Union[str, List[str]]
) -> Union[int, List[int]]:
"""
Converts a token string (or a sequence of tokens) in a single integer id (or a sequence of ids), using the
vocabulary.
Args:
tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).
Returns:
`int` or `List[int]`: The token id or list of token ids.
"""
if tokens is None:
return None
if isinstance(tokens, str):
return self._convert_token_to_id_with_added_voc(tokens)
ids = []
for token in tokens:
ids.extend(self._convert_token_to_id_with_added_voc(token))
return ids
def convert_bytes_for_single_char_to_char(self, ids: list[int]) -> str:
byte_ints = []
byte_offset = 1
if self.is_special_token(ids): # special token
return self.added_tokens_decoder[tuple(ids)].__str__()
for byte_position in range(1, len(ids) + 1):
byte_int = (
ids[-byte_position]
- self.byte_head_ints[-byte_position]
- self.reserve_sizes[-byte_position]
)
if byte_int != -self.reserve_sizes[-byte_position]: # not padding
byte_ints.append(byte_int * byte_offset)
byte_offset *= (
2 ** self.byte_n_free_bits[-byte_position]
- self.reserve_sizes[-byte_position]
)
codepoint = sum(byte_ints)
if codepoint >= int("110000", base=16):
return None
else:
try:
return chr(codepoint)
except ValueError:
return None
# def is_special_token(self, ids: list[int]):
# return ids[0] < self.byte_head_ints[0] + (self.reserve_sizes[0] - 1)
def is_special_token(self, ids: list[int]):
return tuple(ids) in self._added_tokens_decoder
def convert_ids_to_tokens(
self, ids: list[int] | tuple[int], skip_special_tokens: bool = False
) -> str | None:
"""convert ids for single/multiple unicode character(s) to unicode character(s)"""
decoded_chars = ""
if isinstance(ids, tuple):
ids = list(ids)
if self.patch_padding:
for byte_position in range(0, len(ids), len(self.byte_head_ints)):
char_bytes = ids[
byte_position : byte_position + len(self.byte_head_ints)
]
if (
skip_special_tokens and not self.is_special_token(char_bytes)
) or not skip_special_tokens:
char = self.convert_bytes_for_single_char_to_char(char_bytes)
if char:
decoded_chars += char
return decoded_chars
if not self.is_special_token(ids): # not special token
byte_ints = []
byte_offset = 1
for byte_position in range(1, len(ids) + 1):
if ids[-byte_position] == 0:
break
byte_int = (
ids[-byte_position]
- self.byte_head_ints[-byte_position]
- self.reserve_sizes[-byte_position]
)
assert byte_int >= 0
byte_ints.append(byte_int * byte_offset)
byte_offset *= (
2 ** self.byte_n_free_bits[-byte_position]
- self.reserve_sizes[-byte_position]
)
codepoint = sum(byte_ints)
if codepoint >= int("110000", base=16):
return None
else:
return chr(codepoint)
else: # special token
return self._added_tokens_decoder[tuple(ids)]
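# Decoding sketch (default layout, patch_padding=True): the patch [251, 131, 65, 2]
# decodes to the code point 3 * 4096 + 1 * 64 + 2 = 12354, i.e. "あ" (U+3042),
# while the reserved patch [192, 128, 64, 0] is looked up and returned as <|pad|>.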
def unicode_to_bytes(self, codepoint: int) -> list[int]:
byte_list_reversed = []
for byte_position_from_right in range(len(self.byte_n_free_bits)):
byte_n_free_ids = (
2 ** self.byte_n_free_bits[-1 - byte_position_from_right]
- self.reserve_sizes[-1 - byte_position_from_right]
)
byte_id = (
codepoint % byte_n_free_ids
+ self.reserve_sizes[-1 - byte_position_from_right]
+ self.byte_head_ints[-1 - byte_position_from_right]
)
codepoint //= byte_n_free_ids
byte_list_reversed.append(byte_id)
if codepoint == 0:
if self.patch_padding:
for pad_byte_position_from_right in range(
len(byte_list_reversed), len(self.byte_n_free_bits)
):
byte_list_reversed.append(
self.byte_head_ints[-1 - pad_byte_position_from_right] + self.reserve_sizes[-1 - pad_byte_position_from_right]
)
byte_list_reversed.reverse()
return byte_list_reversed
raise ValueError("codepoint is too large")
# ByteTokenizer has no vocab file
def save_vocabulary(
self, save_directory: str, filename_prefix: str | None = None
) -> tuple[str]:
return ()
def image_to_ids(self, image_data: list[list[list[int]]]) -> list[int]:
image_data = torch.tensor(image_data)
x, y, rgb = image_data.size()
assert rgb == 3
image_br_token = list(self.added_tokens_encoder["<|vision_br|>"])
image_special_byte_index = self.added_tokens_encoder["<|vision_start|>"][0]
# add img byte by padding to the beginning
image_data = torch.nn.functional.pad(
image_data, (1, 0), "constant", value=image_special_byte_index
).view(x, y * 4)
image_data = torch.concat(
[image_data, torch.tensor(image_br_token * x).view(x, 4)], dim=1
).view(-1)
return image_data.tolist()
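# Layout sketch: every RGB pixel becomes a 4-id patch [s, R, G, B], where s is the
# first id of the <|vision_start|> patch, and each image row is terminated by the
# 4-id <|vision_br|> patch; a 2x2 image therefore flattens to 2 * (2 * 4 + 4) = 24
# ids. Channel values are raw 0-255 intensities, not the text byte encoding.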
def save_pretrained(
self,
save_directory: Union[str, os.PathLike],
legacy_format: Optional[bool] = None,
filename_prefix: Optional[str] = None,
push_to_hub: bool = False,
**kwargs,
) -> Tuple[str]:
"""
Save the full tokenizer state.
This method makes sure the full tokenizer can then be re-loaded using the
[`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] class method.
Warning: this won't save modifications you may have applied to the tokenizer after instantiation (for
instance, modifying `tokenizer.do_lower_case` after creation).
Args:
save_directory (`str` or `os.PathLike`): The path to a directory where the tokenizer will be saved.
legacy_format (`bool`, *optional*):
Only applicable for a fast tokenizer. If unset (default), will save the tokenizer in the unified JSON
format as well as in legacy format if it exists, i.e. with tokenizer specific vocabulary and a separate
added_tokens files.
If `False`, will only save the tokenizer in the unified JSON format. This format is incompatible with
"slow" tokenizers (not powered by the *tokenizers* library), so the tokenizer will not be able to be
loaded in the corresponding "slow" tokenizer.
If `True`, will save the tokenizer in legacy format. If the "slow" tokenizer doesn't exist, a value
error is raised.
filename_prefix (`str`, *optional*):
A prefix to add to the names of the files saved by the tokenizer.
push_to_hub (`bool`, *optional*, defaults to `False`):
Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
namespace).
kwargs (`Dict[str, Any]`, *optional*):
Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.
Returns:
A tuple of `str`: The files saved.
"""
use_auth_token = kwargs.pop("use_auth_token", None)
if use_auth_token is not None:
warnings.warn(
"The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
FutureWarning,
)
if kwargs.get("token", None) is not None:
raise ValueError(
"`token` and `use_auth_token` are both specified. Please set only the argument `token`."
)
kwargs["token"] = use_auth_token
if os.path.isfile(save_directory):
logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
return
os.makedirs(save_directory, exist_ok=True)
if push_to_hub:
commit_message = kwargs.pop("commit_message", None)
repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
repo_id = self._create_repo(repo_id, **kwargs)
files_timestamps = self._get_files_timestamps(save_directory)
special_tokens_map_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE
)
tokenizer_config_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
)
tokenizer_config = copy.deepcopy(self.init_kwargs)
# Let's save the init kwargs
target_keys = set(self.init_kwargs.keys())
# Let's save the special tokens map (only the strings)
target_keys.update(["model_max_length", "clean_up_tokenization_spaces"])
for k in target_keys:
if hasattr(self, k):
tokenizer_config[k] = getattr(self, k)
# Let's make sure we properly save the special tokens.
tokenizer_config.update(self.special_tokens_map)
if self.chat_template is not None:
if isinstance(self.chat_template, dict):
# Chat template dicts are saved to the config as lists of dicts with fixed key names.
# They will be reconstructed as a single dict during loading.
tokenizer_config["chat_template"] = [{"name": k, "template": v} for k, v in self.chat_template.items()]
else:
tokenizer_config["chat_template"] = self.chat_template
if len(self.init_inputs) > 0:
tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
for file_id in self.vocab_files_names.keys():
tokenizer_config.pop(file_id, None)
# No type fields, so that both older fast and slow tokenizers can load it
tokenizer_config = self.convert_added_tokens(tokenizer_config, add_type_field=True, save=True)
# Process added tokens separately: allows previous versions to ignore them!
added_tokens = {}
for key, value in self.added_tokens_decoder.items():
added_tokens[key] = value.__getstate__()
tokenizer_config["added_tokens_decoder"] = added_tokens
# Add tokenizer class to the tokenizer config to be able to reload it with from_pretrained
tokenizer_class = self.__class__.__name__
# Remove the Fast at the end unless we have a special `PreTrainedTokenizerFast`
if tokenizer_class.endswith("Fast") and tokenizer_class != "PreTrainedTokenizerFast":
tokenizer_class = tokenizer_class[:-4]
tokenizer_config["tokenizer_class"] = tokenizer_class
if getattr(self, "_auto_map", None) is not None:
tokenizer_config["auto_map"] = self._auto_map
if getattr(self, "_processor_class", None) is not None:
tokenizer_config["processor_class"] = self._processor_class
# If we have a custom model, we copy the file defining it in the folder and set the attributes so it can be
# loaded from the Hub.
if self._auto_class is not None:
custom_object_save(self, save_directory, config=tokenizer_config)
# remove private information
if "name_or_path" in tokenizer_config:
tokenizer_config.pop("name_or_path")
tokenizer_config.pop("special_tokens_map_file", None)
tokenizer_config.pop("tokenizer_file", None)
with open(tokenizer_config_file, "w", encoding="utf-8") as f:
out_str = json.dumps(
make_serializeable(tokenizer_config),
indent=2,
sort_keys=True,
ensure_ascii=False
) + "\n"
f.write(out_str)
logger.info(f"tokenizer config file saved in {tokenizer_config_file}")
# Sanitize AddedTokens in special_tokens_map
# Kept for forward compatibility; will be removed in transformers v5. Type fields are not saved for forward compatibility, and `special` should not be saved either
write_dict = self.convert_added_tokens(self.special_tokens_map_extended, save=True, add_type_field=False)
with open(special_tokens_map_file, "w", encoding="utf-8") as f:
out_str = json.dumps(write_dict, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
f.write(out_str)
logger.info(f"Special tokens file saved in {special_tokens_map_file}")
file_names = (tokenizer_config_file, special_tokens_map_file)
save_files = self._save_pretrained(
save_directory=save_directory,
file_names=file_names,
legacy_format=legacy_format,
filename_prefix=filename_prefix,
)
if push_to_hub:
self._upload_modified_files(
save_directory,
repo_id,
files_timestamps,
commit_message=commit_message,
token=kwargs.get("token"),
)
return save_files
def _save_pretrained(
self,
save_directory: Union[str, os.PathLike],
file_names: Tuple[str],
legacy_format: Optional[bool] = None,
filename_prefix: Optional[str] = None,
) -> Tuple[str]:
"""
Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.
Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
specific [`~tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`]
"""
if legacy_format is False:
raise ValueError(
"Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format."
)
save_directory = str(save_directory)
added_tokens_file = os.path.join(
save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
)
# the new get_added_vocab() also returns special tokens and tokens that have an index < vocab_size
# added_vocab = {tok: index for tok, index in self.added_tokens_encoder.items() if index >= self.vocab_size}
added_vocab = {tok: list(index) for tok, index in self.added_tokens_encoder.items()}
if added_vocab:
with open(added_tokens_file, "w", encoding="utf-8") as f:
out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
f.write(out_str)
logger.info(f"added tokens file saved in {added_tokens_file}")
vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)
return file_names + vocab_files + (added_tokens_file,)
@classmethod
def _from_pretrained(
cls,
resolved_vocab_files,
pretrained_model_name_or_path,
init_configuration,
*init_inputs,
token=None,
cache_dir=None,
local_files_only=False,
_commit_hash=None,
_is_local=False,
trust_remote_code=False,
**kwargs,
):
# We instantiate fast tokenizers based on a slow tokenizer if we don't have access to the tokenizer.json
# file or if `from_slow` is set to True.
from_slow = kwargs.get("from_slow", False)
gguf_file = kwargs.get("gguf_file", None)
has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None
# If one passes a GGUF file path to `gguf_file` there is no need for this check as the tokenizer will be
# loaded directly from the GGUF file.
if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None and not gguf_file:
slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
copy.deepcopy(resolved_vocab_files),
pretrained_model_name_or_path,
copy.deepcopy(init_configuration),
*init_inputs,
token=token,
cache_dir=cache_dir,
local_files_only=local_files_only,
_commit_hash=_commit_hash,
**(copy.deepcopy(kwargs)),
)
else:
slow_tokenizer = None
# Prepare tokenizer initialization kwargs
# Did we save some inputs and kwargs to reload?
tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
if tokenizer_config_file is not None:
with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
init_kwargs = json.load(tokenizer_config_handle)
# First attempt. We get tokenizer_class from tokenizer_config to check mismatch between tokenizers.
config_tokenizer_class = init_kwargs.get("tokenizer_class")
init_kwargs.pop("tokenizer_class", None)
if not has_tokenizer_file:
init_kwargs.pop("tokenizer_file", None)
saved_init_inputs = init_kwargs.pop("init_inputs", ())
if not init_inputs:
init_inputs = saved_init_inputs
else:
config_tokenizer_class = None
init_kwargs = init_configuration
if not _is_local:
if "auto_map" in init_kwargs:
# For backward compatibility with the old format.
if isinstance(init_kwargs["auto_map"], (tuple, list)):
init_kwargs["auto_map"] = {"AutoTokenizer": init_kwargs["auto_map"]}
if config_tokenizer_class is None:
# Matt: This entire block is only used to decide if the tokenizer class matches the class in the repo.
# If not, it raises a warning, but otherwise continues. Since we mostly load tokenizers with
# AutoTokenizer these days, it seems like a lot of work (and a source of bugs) for little gain.
# Maybe we can just remove this entirely?
from transformers.models.auto.configuration_auto import AutoConfig # tests_ignore
# Second attempt. If we have not yet found tokenizer_class, let's try to use the config.
try:
config = AutoConfig.from_pretrained(
pretrained_model_name_or_path,
token=token,
cache_dir=cache_dir,
local_files_only=local_files_only,
trust_remote_code=trust_remote_code,
_commit_hash=_commit_hash,
)
config_tokenizer_class = config.tokenizer_class
except (OSError, ValueError, KeyError):
# skip if an error occurred.
config = None
if config_tokenizer_class is None:
# Third attempt. If we have not yet found the original type of the tokenizer
# we are loading, see if we can infer it from the type of the configuration file.
from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES # tests_ignore
if hasattr(config, "model_type"):
model_type = config.model_type
else:
# Fallback: use pattern matching on the string.
model_type = None
for pattern in TOKENIZER_MAPPING_NAMES.keys():
if pattern in str(pretrained_model_name_or_path):
model_type = pattern
break
if model_type is not None:
config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING_NAMES.get(
model_type, (None, None)
)
if config_tokenizer_class is None:
config_tokenizer_class = config_tokenizer_class_fast
if config_tokenizer_class is not None:
if cls.__name__.replace("Fast", "") != config_tokenizer_class.replace("Fast", ""):
logger.warning(
"The tokenizer class you load from this checkpoint is not the same type as the class this"
" function is called from. It may result in unexpected tokenization. \nThe tokenizer class you"
f" load from this checkpoint is '{config_tokenizer_class}'. \nThe class this function is called"
f" from is '{cls.__name__}'."
)
# Update with newly provided kwargs
init_kwargs.update(kwargs)
# Merge resolved_vocab_files arguments in init_kwargs.
added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
for args_name, file_path in resolved_vocab_files.items():
if args_name not in init_kwargs:
init_kwargs[args_name] = file_path
tokenizer_file = resolved_vocab_files.pop("tokenizer_file", None)
if slow_tokenizer is not None:
init_kwargs["__slow_tokenizer"] = slow_tokenizer
init_kwargs["name_or_path"] = pretrained_model_name_or_path
#### Handle tokenizer serialization of added and special tokens
added_tokens_decoder: Dict[int, AddedToken] = {}
added_tokens_map: Dict[str, AddedToken] = {}
# if we have info on the slow added tokens
if "added_tokens_decoder" in init_kwargs:
for idx, token in init_kwargs["added_tokens_decoder"].items():
if isinstance(token, dict):
token = AddedToken(**token)
if isinstance(token, AddedToken):
added_tokens_decoder[ast.literal_eval(idx)] = token
added_tokens_map[str(token)] = token
else:
raise ValueError(
f"Found a {token.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance"
)
else:
# begin legacy: read the added_tokens_file and update kwargs with special_tokens_map if modified
if special_tokens_map_file is not None:
with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
special_tokens_map = json.load(special_tokens_map_handle)
for key, value in special_tokens_map.items():
if key in kwargs and kwargs[key]:
# This value has already been redefined by the kwargs
# We keep this new value and ignore the one stored in the special_tokens_map_file
continue
if isinstance(value, dict):
value = AddedToken(**value, special=True)
elif key == "additional_special_tokens" and isinstance(value, list):
additional_special_tokens = init_kwargs.pop("additional_special_tokens", []) or []
for token in value:
token = AddedToken(**token, special=True) if isinstance(token, dict) else token
if token not in additional_special_tokens:
additional_special_tokens.append(token)
value = additional_special_tokens
init_kwargs[key] = value
# slow -> slow|fast, legacy: convert the `"added_tokens.json"` file to `added_tokens_decoder`.
# This is for legacy purposes; we don't add the tokens after init, for efficiency.
if added_tokens_file is not None:
special_tokens = []
for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys():
if init_kwargs[key] is not None:
if key == "additional_special_tokens":
special_tokens += [str(token) for token in init_kwargs[key]]
else:
special_tokens.append(str(init_kwargs[key]))
with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
added_tok_encoder = json.load(added_tokens_handle)
for str_token, index in added_tok_encoder.items():
# if index not in added_tokens_decoder and str_token not in added_tokens_map:
special = str_token in special_tokens
added_tokens_decoder[index] = AddedToken(
str_token, rstrip=False, lstrip=False, normalized=not special, special=special
)
added_tokens_map[str_token] = added_tokens_decoder[index]
# allows converting a fast -> slow: add the `tokenizer.json`'s `"added_tokens"` to the slow tokenizer
# if `tokenizer_config.json` is `None`
if tokenizer_file is not None:
# This is for slow so can be done before
with open(tokenizer_file, encoding="utf-8") as tokenizer_file_handle:
tokenizer_file_handle = json.load(tokenizer_file_handle)
added_tokens = tokenizer_file_handle.pop("added_tokens")
for serialized_tokens in added_tokens:
idx = serialized_tokens.pop("id")
added_tokens_decoder[idx] = AddedToken(**serialized_tokens)
added_tokens_map[str(added_tokens_decoder[idx])] = added_tokens_decoder[idx]
# end legacy
# Passing AddedTokens and not strings to the class to prevent it from casting the string to a different AddedToken
# convert {'__type': 'AddedToken', 'content': '<ent>', 'lstrip': False, 'normalized': True, ...} to AddedTokens
init_kwargs["added_tokens_decoder"] = added_tokens_decoder
init_kwargs = cls.convert_added_tokens(init_kwargs, save=False)
for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys():
if added_tokens_map != {} and init_kwargs[key] is not None:
if key != "additional_special_tokens":
init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key])
# Instantiate the tokenizer.
try:
tokenizer = cls(*init_inputs, **init_kwargs)
except OSError:
raise OSError(
"Unable to load vocabulary from file. "
"Please check that the provided vocabulary is accessible and not corrupted."
)
# if added_tokens_decoder != {} and max(list(added_tokens_decoder.keys())[-1], 0) > tokenizer.vocab_size:
# logger.warning_advice(
# "Special tokens have been added in the vocabulary, make sure the associated word embeddings are"
# " fine-tuned or trained."
# )
return tokenizer
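# Minimal usage sketch (illustrative only; the printed id values assume the default
# reserve layout and patch_padding=True, and are not authoritative):
if __name__ == "__main__":
    tok = ByteLMTokenizerV3()
    # Encode a short string; special tokens and characters are 4-id patches.
    ids = tok("あ")["input_ids"]
    print(ids)  # expected: bos patch + one character patch + eos patch
    # Round-trip through convert_ids_to_tokens, skipping the special patches.
    print(tok.convert_ids_to_tokens(ids, skip_special_tokens=True))  # "あ"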