"""Tokenization class for model ByT5.""" |
|
|
|
|
|
import warnings |
|
|
from typing import ( |
|
|
Dict, |
|
|
List, |
|
|
Optional, |
|
|
Union, |
|
|
Tuple |
|
|
) |
|
|
import json |
|
|
import os |
|
|
import copy |
|
|
import ast |
|
|
|
|
|
import torch |
|
|
import numpy as np |
|
|
from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer |
|
|
from transformers.tokenization_utils_base import ( |
|
|
BatchEncoding, |
|
|
EncodedInput, |
|
|
PaddingStrategy, |
|
|
TruncationStrategy |
|
|
) |
|
|
from transformers.utils import logging |
|
|
|
|
|
logger = logging.get_logger(__name__) |
|
|
|
|
|

SPECIAL_TOKENS_MAP_FILE = "special_tokens_map.json"
ADDED_TOKENS_FILE = "added_tokens.json"
TOKENIZER_CONFIG_FILE = "tokenizer_config.json"

LARGE_INTEGER = int(1e20)


def make_serializable(obj):
    """Recursively convert `obj` into a JSON-serializable structure.

    Dictionary keys (tuples of byte ids here) are stringified, and tuples are
    converted to lists.
    """
    if isinstance(obj, dict):
        return {str(k): make_serializable(v) for k, v in obj.items()}
    if isinstance(obj, list):
        return [make_serializable(v) for v in obj]
    if isinstance(obj, tuple):
        return make_serializable(list(obj))
    return obj
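

# A minimal usage sketch: tuple ids such as (192, 128) become string keys and
# nested tuples become lists, so the result can be passed to json.dumps:
#
#   make_serializable({(192, 128): ["<|pad|>", (1, 2)]})
#   # -> {"(192, 128)": ["<|pad|>", [1, 2]]}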


class ByteLMTokenizerV3(PreTrainedTokenizer):
    """Byte tokenizer with a completely separate id space for special tokens.

    Each Unicode character is mapped to a patch of byte ids whose high bits
    (`byte_head_ints`) encode the position of the byte within the patch. The
    first `reserve_sizes[0]` values of the leading byte are reserved for
    special tokens, so special tokens can never collide with ids produced by
    ordinary text.

    Raises
    ------
    ValueError
        If a codepoint does not fit into the available byte positions, or if
        padding is requested without a configured pad token.
    """

    model_input_names: list[str] = ["input_ids", "attention_mask"]
    reserve_sizes: list[int] = [59, 0, 0, 0]
    byte_head_ints: list[int] = [
        int("11000000", base=2),
        int("10000000", base=2),
        int("01000000", base=2),
        int("00000000", base=2),
    ]
    byte_n_free_bits: list[int] = [6, 6, 6, 6]
    patch_padding: bool
    reserve_token_list: list[tuple[int, ...]]
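
    # Capacity sketch for the default configuration: the leading byte keeps
    # 2**6 - 59 = 5 usable values and the other three positions keep 2**6 = 64
    # each, so 5 * 64**3 = 1,310,720 codepoints are representable -- enough
    # for all of Unicode (0x110000 = 1,114,112 codepoints).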

    def __init__(
        self,
        patch_padding=True,
        pad_token="<|pad|>",
        eos_token="<|end_of_text|>",
        bos_token="<|begin_of_text|>",
        cls_token="<|cls|>",
        sep_token="<|sep|>",
        mask_token="<|mask|>",
        vision_start_token="<|vision_start|>",
        vision_br_token="<|vision_br|>",
        vision_end_token="<|vision_end|>",
        start_header_id_token="<|start_header_id|>",
        end_header_id_token="<|end_header_id|>",
        eor_id="<|end_of_role|>",
        extra_ids=47,
        **kwargs,
    ) -> None:
        assert np.prod(
            [
                2**n_free_bits - reserve_size
                for reserve_size, n_free_bits in zip(
                    self.reserve_sizes, self.byte_n_free_bits
                )
            ]
        ) >= int(
            "110000", base=16
        ), "Not enough positions for all of Unicode; the reserve sizes are too large."

        self.patch_padding = patch_padding

        self._list_up_reserve_tokens()

        def _as_added_token(token):
            # Wrap plain strings so that every special token is an AddedToken.
            return (
                AddedToken(token, lstrip=False, rstrip=False)
                if isinstance(token, str)
                else token
            )

        _pad_token = _as_added_token(pad_token)
        _eos_token = _as_added_token(eos_token)
        _bos_token = _as_added_token(bos_token)
        _cls_token = _as_added_token(cls_token)
        _sep_token = _as_added_token(sep_token)
        _mask_token = _as_added_token(mask_token)
        _vision_start_token = _as_added_token(vision_start_token)
        _vision_br_token = _as_added_token(vision_br_token)
        _vision_end_token = _as_added_token(vision_end_token)
        _start_header_id_token = _as_added_token(start_header_id_token)
        _end_header_id_token = _as_added_token(end_header_id_token)
        _eor_id = _as_added_token(eor_id)

        self.offset = 0
        # Map each reserve token patch to its special token, in order.
        self._added_tokens_decoder = {
            self.reserve_token_list[i]: special_token
            for i, special_token in enumerate(
                [
                    _pad_token,
                    _eos_token,
                    _bos_token,
                    _cls_token,
                    _sep_token,
                    _mask_token,
                    _vision_start_token,
                    _vision_br_token,
                    _vision_end_token,
                    _start_header_id_token,
                    _end_header_id_token,
                    _eor_id,
                ]
            )
        }

        offset = len(self._added_tokens_decoder)
        extra_tokens = {
            self.reserve_token_list[i + offset]: AddedToken(
                f"<|extra_id_{i}|>", lstrip=False, rstrip=False
            )
            for i in range(extra_ids)
        }
        self._added_tokens_decoder.update(extra_tokens)

        super().__init__(
            bos_token=_bos_token,
            eos_token=_eos_token,
            pad_token=_pad_token,
            cls_token=_cls_token,
            sep_token=_sep_token,
            mask_token=_mask_token,
            vision_start_token=_vision_start_token,
            vision_br_token=_vision_br_token,
            vision_end_token=_vision_end_token,
            start_header_id_token=_start_header_id_token,
            end_header_id_token=_end_header_id_token,
            eor_id=_eor_id,
            **kwargs,
        )

        self._vocab_size = len(self.get_vocab())

    def _list_up_reserve_tokens(self):
        self.reserve_token_list = [
            (
                i + self.byte_head_ints[0],
                self.byte_head_ints[1],
                self.byte_head_ints[2],
                self.byte_head_ints[3],
            )
            for i in range(self.reserve_sizes[0])
        ]

    @property
    def vocab_size(self):
        return self._vocab_size
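
    # Layout sketch: reserve token i is the patch
    # (0b11000000 + i, 0b10000000, 0b01000000, 0b00000000), so with the
    # defaults <|pad|> (i = 0) is the id tuple (192, 128, 64, 0) and
    # <|end_of_text|> (i = 1) is (193, 128, 64, 0).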

    def create_tree(
        self, byte_options: list[list[int]], byte_index: int, max_byte_index: int
    ) -> list[list[int]]:
        """Enumerate all byte combinations, recursing over byte positions."""
        if byte_index == max_byte_index:
            return [[reserve_option] for reserve_option in byte_options[byte_index]]

        concat_list = []
        for byte_reserve_option in byte_options[byte_index]:
            if byte_reserve_option is not None:
                concat_list += [
                    [byte_reserve_option] + following_bytes
                    if following_bytes != [None]
                    else [byte_reserve_option]
                    for following_bytes in self.create_tree(
                        byte_options=byte_options,
                        byte_index=byte_index + 1,
                        max_byte_index=max_byte_index,
                    )
                ]
            else:
                concat_list.append([None])
        return concat_list
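
    # Behavior sketch on a toy input: create_tree([[0, 1], [7, None]], 0, 1)
    # returns [[0, 7], [0], [1, 7], [1]] -- a None option terminates the
    # token early instead of adding a byte.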

    def get_vocab(self):
        byte_options = [
            list(range(reserve_size, 2**n_free_bits))
            for reserve_size, n_free_bits in zip(
                self.reserve_sizes, self.byte_n_free_bits
            )
        ]

        if not self.patch_padding:
            for i in range(len(byte_options) - 1):
                byte_options[i] += [None]

        byte_options.reverse()
        byte_tokens = self.create_tree(
            byte_options=byte_options, byte_index=0, max_byte_index=3
        )

        byte_tokens = sorted(
            byte_tokens,
            key=lambda lst: sum(e * (256**i) for i, e in enumerate(lst))
            + 256 ** len(lst),
        )

        for byte_token_index in range(len(byte_tokens)):
            byte_tokens[byte_token_index].reverse()
            for position in range(len(byte_tokens[byte_token_index])):
                byte_tokens[byte_token_index][position] += self.byte_head_ints[position]
            byte_tokens[byte_token_index] = tuple(byte_tokens[byte_token_index])

        vocab = {self.convert_ids_to_tokens(tokens): tokens for tokens in byte_tokens}
        vocab.pop("")
        vocab.update(self.added_tokens_encoder)
        return vocab

    def _get_padding_truncation_strategies(
        self, padding=False, truncation=None, max_length=None, pad_to_multiple_of=None, verbose=True, **kwargs
    ):
        """Find the correct padding/truncation strategy."""
        # If `max_length` is set alone, default to 'longest_first' truncation.
        if max_length is not None and padding is False and truncation is None:
            if verbose:
                if not self.deprecation_warnings.get("Truncation-not-explicitly-activated", False):
                    logger.warning(
                        "Truncation was not explicitly activated but `max_length` is provided a specific value, please"
                        " use `truncation=True` to explicitly truncate examples to max length. Defaulting to"
                        " 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the"
                        " tokenizer you can select this strategy more precisely by providing a specific strategy to"
                        " `truncation`."
                    )
                self.deprecation_warnings["Truncation-not-explicitly-activated"] = True
            truncation = "longest_first"

        # Get padding strategy.
        if padding is not False:
            if padding is True:
                if verbose:
                    if max_length is not None and (
                        truncation is None or truncation is False or truncation == "do_not_truncate"
                    ):
                        warnings.warn(
                            "`max_length` is ignored when `padding`=`True` and there is no truncation strategy. "
                            "To pad to max length, use `padding='max_length'`."
                        )
                padding_strategy = PaddingStrategy.LONGEST
            elif not isinstance(padding, PaddingStrategy):
                padding_strategy = PaddingStrategy(padding)
            else:
                padding_strategy = padding
        else:
            padding_strategy = PaddingStrategy.DO_NOT_PAD

        # Get truncation strategy.
        if truncation is not False and truncation is not None:
            if truncation is True:
                truncation_strategy = TruncationStrategy.LONGEST_FIRST
            elif not isinstance(truncation, TruncationStrategy):
                truncation_strategy = TruncationStrategy(truncation)
            else:
                truncation_strategy = truncation
        else:
            truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE

        # Fall back to the model maximum length when `max_length` is not given.
        if max_length is None:
            if padding_strategy == PaddingStrategy.MAX_LENGTH:
                if self.model_max_length > LARGE_INTEGER:
                    if verbose:
                        if not self.deprecation_warnings.get("Asking-to-pad-to-max_length", False):
                            logger.warning(
                                "Asking to pad to max_length but no maximum length is provided and the model has no"
                                " predefined maximum length. Default to no padding."
                            )
                        self.deprecation_warnings["Asking-to-pad-to-max_length"] = True
                    padding_strategy = PaddingStrategy.DO_NOT_PAD
                else:
                    max_length = self.model_max_length

            if truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE:
                if self.model_max_length > LARGE_INTEGER:
                    if verbose:
                        if not self.deprecation_warnings.get("Asking-to-truncate-to-max_length", False):
                            logger.warning(
                                "Asking to truncate to max_length but no maximum length is provided and the model has"
                                " no predefined maximum length. Default to no truncation."
                            )
                        self.deprecation_warnings["Asking-to-truncate-to-max_length"] = True
                    truncation_strategy = TruncationStrategy.DO_NOT_TRUNCATE
                else:
                    max_length = self.model_max_length

        # Padding requires a configured pad token.
        if padding_strategy != PaddingStrategy.DO_NOT_PAD and self.pad_token is None:
            raise ValueError(
                "Asking to pad but the tokenizer does not have a padding token. "
                "Please select a token to use as `pad_token` `(tokenizer.pad_token = tokenizer.eos_token e.g.)` "
                "or add a new pad token via `tokenizer.add_special_tokens({'pad_token': '[PAD]'})`."
            )

        # Check that the truncation length is compatible with `pad_to_multiple_of`.
        if (
            truncation_strategy != TruncationStrategy.DO_NOT_TRUNCATE
            and padding_strategy != PaddingStrategy.DO_NOT_PAD
            and pad_to_multiple_of is not None
            and max_length is not None
            and (max_length % pad_to_multiple_of != 0)
        ):
            raise ValueError(
                "Truncation and padding are both activated but "
                f"truncation length ({max_length}) is not a multiple of pad_to_multiple_of ({pad_to_multiple_of})."
            )

        return padding_strategy, truncation_strategy, max_length, kwargs

    def _add_bos_if_not_present(self, token_ids: list[int]) -> list[int]:
        """Do not add bos again if the user already added it."""
        # bos spans a whole byte patch, so compare id slices, not single ids.
        bos_ids = list(self.bos_token_id)
        if len(token_ids) >= len(bos_ids) and token_ids[: len(bos_ids)] == bos_ids:
            warnings.warn(
                f"This sequence already has {self.bos_token}. In future versions this behavior may lead to duplicated"
                " bos tokens being added."
            )
            return token_ids
        return bos_ids + token_ids

    def _add_eos_if_not_present(self, token_ids: list[int]) -> list[int]:
        """Do not add eos again if the user already added it."""
        eos_ids = list(self.eos_token_id)
        if len(token_ids) >= len(eos_ids) and token_ids[-len(eos_ids):] == eos_ids:
            warnings.warn(
                f"This sequence already has {self.eos_token}. In future versions this behavior may lead to duplicated"
                " eos tokens being added."
            )
            return token_ids
        return token_ids + eos_ids

    def _pad(
        self,
        encoded_inputs: Union[Dict[str, EncodedInput], BatchEncoding],
        max_length: Optional[int] = None,
        padding_strategy: PaddingStrategy = PaddingStrategy.DO_NOT_PAD,
        pad_to_multiple_of: Optional[int] = None,
        padding_side: Optional[str] = None,
        return_attention_mask: Optional[bool] = None,
    ) -> dict:
        """
        Pad encoded inputs (on left/right and up to predefined length or max length in the batch).

        Args:
            encoded_inputs:
                Dictionary of tokenized inputs (`List[int]`) or batch of tokenized inputs (`List[List[int]]`).
            max_length: maximum length of the returned list and optionally padding length (see below).
                Will truncate by taking into account the special tokens.
            padding_strategy: PaddingStrategy to use for padding.

                - PaddingStrategy.LONGEST: Pad to the longest sequence in the batch
                - PaddingStrategy.MAX_LENGTH: Pad to the max length (default)
                - PaddingStrategy.DO_NOT_PAD: Do not pad
                The tokenizer padding sides are defined in the `padding_side` argument:

                - 'left': pads on the left of the sequences
                - 'right': pads on the right of the sequences
            pad_to_multiple_of: (optional) Integer if set will pad the sequence to a multiple of the provided value.
                This is especially useful to enable the use of Tensor Cores on NVIDIA hardware with compute capability
                `>= 7.5` (Volta).
            padding_side:
                The side on which the model should have padding applied. Should be selected between ['right', 'left'].
                Default value is picked from the class attribute of the same name.
            return_attention_mask:
                (optional) Set to False to avoid returning attention mask (default: set to model specifics)
        """
        if return_attention_mask is None:
            return_attention_mask = "attention_mask" in self.model_input_names

        required_input = encoded_inputs[self.model_input_names[0]]

        if padding_strategy == PaddingStrategy.LONGEST:
            max_length = len(required_input)

        if (
            max_length is not None
            and pad_to_multiple_of is not None
            and (max_length % pad_to_multiple_of != 0)
        ):
            max_length = ((max_length // pad_to_multiple_of) + 1) * pad_to_multiple_of

        needs_to_be_padded = (
            padding_strategy != PaddingStrategy.DO_NOT_PAD
            and len(required_input) != max_length
        )

        # Initialize the attention mask if not present.
        if return_attention_mask and "attention_mask" not in encoded_inputs:
            encoded_inputs["attention_mask"] = [1] * len(required_input)

        if needs_to_be_padded:
            if self.patch_padding:
                # The pad token occupies a whole byte patch, so the number of
                # pad tokens is the missing length divided by the patch size.
                difference = (max_length - len(required_input)) // len(
                    self.byte_head_ints
                )
                mask_patch_size = 4
            else:
                difference = max_length - len(required_input)
                mask_patch_size = 1

            padding_side = (
                padding_side if padding_side is not None else self.padding_side
            )

            if padding_side == "right":
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = (
                        encoded_inputs["attention_mask"]
                        + [0] * difference * mask_patch_size
                    )
                if "token_type_ids" in encoded_inputs:
                    encoded_inputs["token_type_ids"] = (
                        encoded_inputs["token_type_ids"]
                        + [self.pad_token_type_id] * difference * mask_patch_size
                    )
                if "special_tokens_mask" in encoded_inputs:
                    encoded_inputs["special_tokens_mask"] = (
                        encoded_inputs["special_tokens_mask"]
                        + [1] * difference * mask_patch_size
                    )
                encoded_inputs[self.model_input_names[0]] = (
                    required_input + list(self.pad_token_id) * difference
                )
            elif padding_side == "left":
                if return_attention_mask:
                    encoded_inputs["attention_mask"] = (
                        [0] * difference * mask_patch_size
                        + encoded_inputs["attention_mask"]
                    )
                if "token_type_ids" in encoded_inputs:
                    encoded_inputs["token_type_ids"] = (
                        [self.pad_token_type_id] * difference * mask_patch_size
                        + encoded_inputs["token_type_ids"]
                    )
                if "special_tokens_mask" in encoded_inputs:
                    encoded_inputs["special_tokens_mask"] = (
                        [1] * difference * mask_patch_size
                        + encoded_inputs["special_tokens_mask"]
                    )
                encoded_inputs[self.model_input_names[0]] = (
                    list(self.pad_token_id) * difference + required_input
                )
            else:
                raise ValueError(f"Invalid padding side: {padding_side}")

        return encoded_inputs
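
    # Padding sketch with the defaults: the pad token is a 4-id patch, so
    # padding an 8-id sequence to max_length=16 appends (16 - 8) // 4 = 2
    # copies of pad_token_id (8 ids) and extends the attention mask by
    # 2 * 4 zeros.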

    def build_inputs_with_special_tokens(
        self, token_ids_0: list[int], token_ids_1: list[int] | None = None
    ) -> list[int]:
        """
        Build model inputs from a sequence or a pair of sequences for sequence classification tasks by concatenating
        and adding special tokens. A sequence has the following format:
        - single sequence: `<|begin_of_text|> X <|end_of_text|>`
        - pair of sequences: `<|begin_of_text|> A <|end_of_text|> <|begin_of_text|> B <|end_of_text|>`
        Args:
            token_ids_0 (`List[int]`):
                List of IDs to which the special tokens will be added.
            token_ids_1 (`List[int]`, *optional*):
                Optional second list of IDs for sequence pairs.
        Returns:
            `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
        """
        token_ids_0 = self._add_bos_if_not_present(token_ids_0)
        token_ids_0 = self._add_eos_if_not_present(token_ids_0)
        if token_ids_1 is None:
            return token_ids_0
        token_ids_1 = self._add_bos_if_not_present(token_ids_1)
        token_ids_1 = self._add_eos_if_not_present(token_ids_1)
        return token_ids_0 + token_ids_1
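
    # Example sketch with the default specials: building inputs for the ids of
    # "A" yields [194, 128, 64, 0] (<|begin_of_text|>) + [251, 128, 65, 1]
    # ("A") + [193, 128, 64, 0] (<|end_of_text|>), 12 ids in total.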

    def _tokenize(self, text: str) -> list[str]:
        """Take as input a string and return a list of strings (tokens) for words/sub-words."""
        token_ids = []
        for c in text:
            token_ids.extend(self.unicode_to_bytes(ord(c)))
        return [str(i) for i in token_ids]
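
    # Worked example (defaults, patch_padding=True): ord("A") == 65 splits
    # into 65 % 64 == 1 (last byte, head 0) and 65 // 64 == 1 (head 64), and
    # two padding bytes (128 and 192 + 59 = 251) fill the patch, so "A"
    # tokenizes to ["251", "128", "65", "1"].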

    def _convert_token_to_id(self, token):
        """Converts a token (str) to an id using the vocab."""
        return int(token) + self.offset

    def _convert_id_to_token(self, index):
        """Converts an index (integer) to a token (str) using the vocab."""
        return str(index - self.offset)

    def _convert_token_to_id_with_added_voc(self, token):
        if token is None:
            return None

        if token in self._added_tokens_encoder:
            # Special tokens expand to their whole byte patch.
            return list(self._added_tokens_encoder[token])
        return [self._convert_token_to_id(token)]

    def convert_tokens_to_ids(
        self, tokens: Union[str, List[str]]
    ) -> Optional[List[int]]:
        """
        Converts a token string (or a sequence of tokens) into a list of integer ids using the vocabulary.

        Unlike the base implementation, a single token may map to several ids here: special tokens expand to a whole
        byte patch.

        Args:
            tokens (`str` or `List[str]`): One or several token(s) to convert to token id(s).

        Returns:
            `List[int]`: The token ids.
        """
        if tokens is None:
            return None

        if isinstance(tokens, str):
            return self._convert_token_to_id_with_added_voc(tokens)

        ids = []
        for token in tokens:
            ids.extend(self._convert_token_to_id_with_added_voc(token))
        return ids

    def convert_bytes_for_single_char_to_char(self, ids: list[int]) -> str | None:
        byte_ints = []
        byte_offset = 1

        if self.is_special_token(ids):
            return str(self.added_tokens_decoder[tuple(ids)])

        for byte_position in range(1, len(ids) + 1):
            byte_int = (
                ids[-byte_position]
                - self.byte_head_ints[-byte_position]
                - self.reserve_sizes[-byte_position]
            )
            # An id equal to its position's head byte is patch padding and is skipped.
            if byte_int != -self.reserve_sizes[-byte_position]:
                byte_ints.append(byte_int * byte_offset)
            byte_offset *= (
                2 ** self.byte_n_free_bits[-byte_position]
                - self.reserve_sizes[-byte_position]
            )

        codepoint = sum(byte_ints)
        if codepoint >= int("110000", base=16):
            return None
        try:
            return chr(codepoint)
        except ValueError:
            return None

    def is_special_token(self, ids: list[int]) -> bool:
        return tuple(ids) in self._added_tokens_decoder

    def convert_ids_to_tokens(
        self, ids: list[int] | tuple[int, ...], skip_special_tokens: bool = False
    ) -> str | AddedToken | None:
        """Convert ids for single/multiple unicode character(s) to unicode character(s)."""
        decoded_chars = ""

        if isinstance(ids, tuple):
            ids = list(ids)

        if self.patch_padding:
            for byte_position in range(0, len(ids), len(self.byte_head_ints)):
                char_bytes = ids[
                    byte_position : byte_position + len(self.byte_head_ints)
                ]
                if not (skip_special_tokens and self.is_special_token(char_bytes)):
                    char = self.convert_bytes_for_single_char_to_char(char_bytes)
                    if char:
                        decoded_chars += char
            return decoded_chars

        if not self.is_special_token(ids):
            byte_ints = []
            byte_offset = 1
            for byte_position in range(1, len(ids) + 1):
                if ids[-byte_position] == 0:
                    break
                byte_int = (
                    ids[-byte_position]
                    - self.byte_head_ints[-byte_position]
                    - self.reserve_sizes[-byte_position]
                )
                assert byte_int >= 0
                byte_ints.append(byte_int * byte_offset)
                byte_offset *= (
                    2 ** self.byte_n_free_bits[-byte_position]
                    - self.reserve_sizes[-byte_position]
                )

            codepoint = sum(byte_ints)
            if codepoint >= int("110000", base=16):
                return None
            return chr(codepoint)
        return self._added_tokens_decoder[tuple(ids)]

    def unicode_to_bytes(self, codepoint: int) -> list[int]:
        byte_list_reversed = []
        for byte_position_from_right in range(len(self.byte_n_free_bits)):
            byte_n_free_ids = (
                2 ** self.byte_n_free_bits[-1 - byte_position_from_right]
                - self.reserve_sizes[-1 - byte_position_from_right]
            )
            byte_id = (
                codepoint % byte_n_free_ids
                + self.reserve_sizes[-1 - byte_position_from_right]
                + self.byte_head_ints[-1 - byte_position_from_right]
            )
            codepoint //= byte_n_free_ids
            byte_list_reversed.append(byte_id)

            # Stop once the codepoint is exhausted; without patch padding this
            # yields variable-length byte sequences.
            if codepoint == 0:
                if self.patch_padding:
                    # Fill the remaining positions of the patch with padding
                    # bytes (the position's head byte plus its reserve size).
                    for pad_byte_position_from_right in range(
                        len(byte_list_reversed), len(self.byte_n_free_bits)
                    ):
                        byte_list_reversed.append(
                            self.byte_head_ints[-1 - pad_byte_position_from_right]
                            + self.reserve_sizes[-1 - pad_byte_position_from_right]
                        )
                byte_list_reversed.reverse()
                return byte_list_reversed
        raise ValueError("codepoint is too large")
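
    # Round-trip sketch (defaults): unicode_to_bytes(ord("é")) returns
    # [251, 128, 67, 41] because 0xE9 == 233 == 3 * 64 + 41, and
    # convert_bytes_for_single_char_to_char([251, 128, 67, 41]) recovers "é".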

    def save_vocabulary(
        self, save_directory: str, filename_prefix: str | None = None
    ) -> tuple[str, ...]:
        # The vocabulary is fully determined by the byte scheme, so there is
        # no vocabulary file to write.
        return ()

    def image_to_ids(self, image_data: list[list[list[int]]]) -> list[int]:
        image_data = torch.tensor(image_data)
        x, y, rgb = image_data.size()
        assert rgb == 3
        image_br_token = list(self.added_tokens_encoder["<|vision_br|>"])
        image_special_byte_index = self.added_tokens_encoder["<|vision_start|>"][0]

        # Prefix every RGB pixel with the vision marker byte, giving 4 ids per pixel.
        image_data = torch.nn.functional.pad(
            image_data, (1, 0), "constant", value=image_special_byte_index
        ).view(x, y * 4)

        # Terminate each pixel row with the 4-id <|vision_br|> token.
        image_data = torch.concat(
            [image_data, torch.tensor(image_br_token * x).view(x, 4)], dim=1
        ).view(-1)
        return image_data.tolist()
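
    # Shape sketch: a 2 x 3 RGB image becomes 2 * (3 * 4 + 4) = 32 ids --
    # 4 ids per pixel (marker byte + R, G, B) plus one 4-id <|vision_br|>
    # patch per row.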

    def save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
        push_to_hub: bool = False,
        **kwargs,
    ) -> Tuple[str]:
        """
        Save the full tokenizer state.

        This method makes sure the full tokenizer can then be re-loaded using the
        [`~tokenization_utils_base.PreTrainedTokenizer.from_pretrained`] class method.

        Warning: this won't save modifications you may have applied to the tokenizer after the instantiation (for
        instance, modifying `tokenizer.do_lower_case` after creation).

        Args:
            save_directory (`str` or `os.PathLike`): The path to a directory where the tokenizer will be saved.
            legacy_format (`bool`, *optional*):
                Only applicable for a fast tokenizer. If unset (default), will save the tokenizer in the unified JSON
                format as well as in legacy format if it exists, i.e. with tokenizer specific vocabulary and a separate
                added_tokens files.

                If `False`, will only save the tokenizer in the unified JSON format. This format is incompatible with
                "slow" tokenizers (not powered by the *tokenizers* library), so the tokenizer will not be able to be
                loaded in the corresponding "slow" tokenizer.

                If `True`, will save the tokenizer in legacy format. If the "slow" tokenizer doesn't exist, a
                `ValueError` is raised.
            filename_prefix (`str`, *optional*):
                A prefix to add to the names of the files saved by the tokenizer.
            push_to_hub (`bool`, *optional*, defaults to `False`):
                Whether or not to push your model to the Hugging Face model hub after saving it. You can specify the
                repository you want to push to with `repo_id` (will default to the name of `save_directory` in your
                namespace).
            kwargs (`Dict[str, Any]`, *optional*):
                Additional keyword arguments passed along to the [`~utils.PushToHubMixin.push_to_hub`] method.

        Returns:
            A tuple of `str`: The files saved.
        """
        use_auth_token = kwargs.pop("use_auth_token", None)

        if use_auth_token is not None:
            warnings.warn(
                "The `use_auth_token` argument is deprecated and will be removed in v5 of Transformers. Please use `token` instead.",
                FutureWarning,
            )
            if kwargs.get("token", None) is not None:
                raise ValueError(
                    "`token` and `use_auth_token` are both specified. Please set only the argument `token`."
                )
            kwargs["token"] = use_auth_token

        if os.path.isfile(save_directory):
            logger.error(f"Provided path ({save_directory}) should be a directory, not a file")
            return

        os.makedirs(save_directory, exist_ok=True)

        if push_to_hub:
            commit_message = kwargs.pop("commit_message", None)
            repo_id = kwargs.pop("repo_id", save_directory.split(os.path.sep)[-1])
            repo_id = self._create_repo(repo_id, **kwargs)
            files_timestamps = self._get_files_timestamps(save_directory)

        special_tokens_map_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + SPECIAL_TOKENS_MAP_FILE
        )
        tokenizer_config_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + TOKENIZER_CONFIG_FILE
        )

        tokenizer_config = copy.deepcopy(self.init_kwargs)

        # Also carry over attributes that may have been changed after init.
        target_keys = set(self.init_kwargs.keys())
        target_keys.update(["model_max_length", "clean_up_tokenization_spaces"])

        for k in target_keys:
            if hasattr(self, k):
                tokenizer_config[k] = getattr(self, k)

        tokenizer_config.update(self.special_tokens_map)

        if self.chat_template is not None:
            if isinstance(self.chat_template, dict):
                # Chat template dicts are saved as lists of name/template pairs.
                tokenizer_config["chat_template"] = [{"name": k, "template": v} for k, v in self.chat_template.items()]
            else:
                tokenizer_config["chat_template"] = self.chat_template

        if len(self.init_inputs) > 0:
            tokenizer_config["init_inputs"] = copy.deepcopy(self.init_inputs)
        for file_id in self.vocab_files_names.keys():
            tokenizer_config.pop(file_id, None)

        tokenizer_config = self.convert_added_tokens(tokenizer_config, add_type_field=True, save=True)

        added_tokens = {}
        for key, value in self.added_tokens_decoder.items():
            added_tokens[key] = value.__getstate__()
        tokenizer_config["added_tokens_decoder"] = added_tokens

        tokenizer_class = self.__class__.__name__

        if tokenizer_class.endswith("Fast") and tokenizer_class != "PreTrainedTokenizerFast":
            tokenizer_class = tokenizer_class[:-4]
        tokenizer_config["tokenizer_class"] = tokenizer_class
        if getattr(self, "_auto_map", None) is not None:
            tokenizer_config["auto_map"] = self._auto_map
        if getattr(self, "_processor_class", None) is not None:
            tokenizer_config["processor_class"] = self._processor_class

        # If we have a custom tokenizer, copy the file defining it into the
        # folder so it can be reloaded from the Hub.
        if self._auto_class is not None:
            custom_object_save(self, save_directory, config=tokenizer_config)

        # Remove path-dependent keys before saving.
        if "name_or_path" in tokenizer_config:
            tokenizer_config.pop("name_or_path")
        tokenizer_config.pop("special_tokens_map_file", None)
        tokenizer_config.pop("tokenizer_file", None)

        with open(tokenizer_config_file, "w", encoding="utf-8") as f:
            out_str = json.dumps(
                make_serializable(tokenizer_config),
                indent=2,
                sort_keys=True,
                ensure_ascii=False,
            ) + "\n"
            f.write(out_str)
        logger.info(f"tokenizer config file saved in {tokenizer_config_file}")

        # Save the special tokens map.
        write_dict = self.convert_added_tokens(self.special_tokens_map_extended, save=True, add_type_field=False)
        with open(special_tokens_map_file, "w", encoding="utf-8") as f:
            out_str = json.dumps(write_dict, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
            f.write(out_str)
        logger.info(f"Special tokens file saved in {special_tokens_map_file}")

        file_names = (tokenizer_config_file, special_tokens_map_file)

        save_files = self._save_pretrained(
            save_directory=save_directory,
            file_names=file_names,
            legacy_format=legacy_format,
            filename_prefix=filename_prefix,
        )

        if push_to_hub:
            self._upload_modified_files(
                save_directory,
                repo_id,
                files_timestamps,
                commit_message=commit_message,
                token=kwargs.get("token"),
            )

        return save_files
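
    # On-disk sketch: tok.save_pretrained("out/") is expected to write
    # out/tokenizer_config.json, out/special_tokens_map.json and, since the
    # added-token ids are tuples, out/added_tokens.json with list-valued ids.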

    def _save_pretrained(
        self,
        save_directory: Union[str, os.PathLike],
        file_names: Tuple[str],
        legacy_format: Optional[bool] = None,
        filename_prefix: Optional[str] = None,
    ) -> Tuple[str]:
        """
        Save a tokenizer using the slow-tokenizer/legacy format: vocabulary + added tokens.

        Fast tokenizers can also be saved in a unique JSON file containing {config + vocab + added-tokens} using the
        specific [`~tokenization_utils_fast.PreTrainedTokenizerFast._save_pretrained`].
        """
        if legacy_format is False:
            raise ValueError(
                "Only fast tokenizers (instances of PreTrainedTokenizerFast) can be saved in non legacy format."
            )

        save_directory = str(save_directory)

        added_tokens_file = os.path.join(
            save_directory, (filename_prefix + "-" if filename_prefix else "") + ADDED_TOKENS_FILE
        )

        # Added-token ids are tuples of byte ids; store them as lists in JSON.
        added_vocab = {tok: list(index) for tok, index in self.added_tokens_encoder.items()}
        if added_vocab:
            with open(added_tokens_file, "w", encoding="utf-8") as f:
                out_str = json.dumps(added_vocab, indent=2, sort_keys=True, ensure_ascii=False) + "\n"
                f.write(out_str)
            logger.info(f"added tokens file saved in {added_tokens_file}")

        vocab_files = self.save_vocabulary(save_directory, filename_prefix=filename_prefix)

        return file_names + vocab_files + (added_tokens_file,)

    @classmethod
    def _from_pretrained(
        cls,
        resolved_vocab_files,
        pretrained_model_name_or_path,
        init_configuration,
        *init_inputs,
        token=None,
        cache_dir=None,
        local_files_only=False,
        _commit_hash=None,
        _is_local=False,
        trust_remote_code=False,
        **kwargs,
    ):
        from_slow = kwargs.get("from_slow", False)
        gguf_file = kwargs.get("gguf_file", None)
        has_tokenizer_file = resolved_vocab_files.get("tokenizer_file", None) is not None

        if (from_slow or not has_tokenizer_file) and cls.slow_tokenizer_class is not None and not gguf_file:
            slow_tokenizer = (cls.slow_tokenizer_class)._from_pretrained(
                copy.deepcopy(resolved_vocab_files),
                pretrained_model_name_or_path,
                copy.deepcopy(init_configuration),
                *init_inputs,
                token=token,
                cache_dir=cache_dir,
                local_files_only=local_files_only,
                _commit_hash=_commit_hash,
                **(copy.deepcopy(kwargs)),
            )
        else:
            slow_tokenizer = None

        # Prepare tokenizer initialization kwargs.
        tokenizer_config_file = resolved_vocab_files.pop("tokenizer_config_file", None)
        if tokenizer_config_file is not None:
            with open(tokenizer_config_file, encoding="utf-8") as tokenizer_config_handle:
                init_kwargs = json.load(tokenizer_config_handle)

            config_tokenizer_class = init_kwargs.get("tokenizer_class")
            init_kwargs.pop("tokenizer_class", None)
            if not has_tokenizer_file:
                init_kwargs.pop("tokenizer_file", None)
            saved_init_inputs = init_kwargs.pop("init_inputs", ())
            if not init_inputs:
                init_inputs = saved_init_inputs
        else:
            config_tokenizer_class = None
            init_kwargs = init_configuration

        if not _is_local:
            if "auto_map" in init_kwargs:
                # Older tokenizers stored the auto map as a tuple/list.
                if isinstance(init_kwargs["auto_map"], (tuple, list)):
                    init_kwargs["auto_map"] = {"AutoTokenizer": init_kwargs["auto_map"]}

        if config_tokenizer_class is None:
            # Fall back to the model config, then to pattern matching on the
            # checkpoint name, to determine the tokenizer class.
            from transformers.models.auto.configuration_auto import AutoConfig

            try:
                config = AutoConfig.from_pretrained(
                    pretrained_model_name_or_path,
                    token=token,
                    cache_dir=cache_dir,
                    local_files_only=local_files_only,
                    trust_remote_code=trust_remote_code,
                    _commit_hash=_commit_hash,
                )
                config_tokenizer_class = config.tokenizer_class
            except (OSError, ValueError, KeyError):
                config = None
            if config_tokenizer_class is None:
                from transformers.models.auto.tokenization_auto import TOKENIZER_MAPPING_NAMES

                if hasattr(config, "model_type"):
                    model_type = config.model_type
                else:
                    model_type = None
                    for pattern in TOKENIZER_MAPPING_NAMES.keys():
                        if pattern in str(pretrained_model_name_or_path):
                            model_type = pattern
                            break

                if model_type is not None:
                    config_tokenizer_class, config_tokenizer_class_fast = TOKENIZER_MAPPING_NAMES.get(
                        model_type, (None, None)
                    )
                    if config_tokenizer_class is None:
                        config_tokenizer_class = config_tokenizer_class_fast

        if config_tokenizer_class is not None:
            if cls.__name__.replace("Fast", "") != config_tokenizer_class.replace("Fast", ""):
                logger.warning(
                    "The tokenizer class you load from this checkpoint is not the same type as the class this"
                    " function is called from. It may result in unexpected tokenization. \nThe tokenizer class you"
                    f" load from this checkpoint is '{config_tokenizer_class}'. \nThe class this function is called"
                    f" from is '{cls.__name__}'."
                )

        # Update with newly provided kwargs.
        init_kwargs.update(kwargs)

        # Merge the resolved vocab files into the init kwargs.
        added_tokens_file = resolved_vocab_files.pop("added_tokens_file", None)
        special_tokens_map_file = resolved_vocab_files.pop("special_tokens_map_file", None)
        for args_name, file_path in resolved_vocab_files.items():
            if args_name not in init_kwargs:
                init_kwargs[args_name] = file_path
        tokenizer_file = resolved_vocab_files.pop("tokenizer_file", None)

        if slow_tokenizer is not None:
            init_kwargs["__slow_tokenizer"] = slow_tokenizer
        init_kwargs["name_or_path"] = pretrained_model_name_or_path

        # Keys are tuples of byte ids in this tokenizer.
        added_tokens_decoder: Dict[tuple, AddedToken] = {}
        added_tokens_map: Dict[str, AddedToken] = {}

        if "added_tokens_decoder" in init_kwargs:
            for idx, token_obj in init_kwargs["added_tokens_decoder"].items():
                if isinstance(token_obj, dict):
                    token_obj = AddedToken(**token_obj)
                if isinstance(token_obj, AddedToken):
                    # Keys were serialized as strings such as "(192, 128, 64, 0)".
                    added_tokens_decoder[ast.literal_eval(idx)] = token_obj
                    added_tokens_map[str(token_obj)] = token_obj
                else:
                    raise ValueError(
                        f"Found a {token_obj.__class__} in the saved `added_tokens_decoder`, should be a dictionary or an AddedToken instance"
                    )
        else:
            # Fall back to the legacy files: special_tokens_map.json and added_tokens.json.
            if special_tokens_map_file is not None:
                with open(special_tokens_map_file, encoding="utf-8") as special_tokens_map_handle:
                    special_tokens_map = json.load(special_tokens_map_handle)
                for key, value in special_tokens_map.items():
                    if key in kwargs and kwargs[key]:
                        # This value has already been redefined by the kwargs; do not change it.
                        continue
                    if isinstance(value, dict):
                        value = AddedToken(**value, special=True)
                    elif key == "additional_special_tokens" and isinstance(value, list):
                        additional_special_tokens = init_kwargs.pop("additional_special_tokens", []) or []
                        for token_value in value:
                            token_value = (
                                AddedToken(**token_value, special=True) if isinstance(token_value, dict) else token_value
                            )
                            if token_value not in additional_special_tokens:
                                additional_special_tokens.append(token_value)
                        value = additional_special_tokens
                    init_kwargs[key] = value

            # If an added_tokens.json file exists, rebuild the added tokens from it.
            if added_tokens_file is not None:
                special_tokens = []
                for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys():
                    if init_kwargs[key] is not None:
                        if key == "additional_special_tokens":
                            special_tokens += [str(token_str) for token_str in init_kwargs[key]]
                        else:
                            special_tokens.append(str(init_kwargs[key]))

                with open(added_tokens_file, encoding="utf-8") as added_tokens_handle:
                    added_tok_encoder = json.load(added_tokens_handle)
                for str_token, index in added_tok_encoder.items():
                    # Ids were saved as lists; convert back to hashable tuples.
                    index = tuple(index)
                    special = str_token in special_tokens
                    added_tokens_decoder[index] = AddedToken(
                        str_token, rstrip=False, lstrip=False, normalized=not special, special=special
                    )
                    added_tokens_map[str_token] = added_tokens_decoder[index]

            # When a tokenizer.json file exists, its added tokens take priority.
            if tokenizer_file is not None:
                with open(tokenizer_file, encoding="utf-8") as tokenizer_file_handle:
                    tokenizer_file_handle = json.load(tokenizer_file_handle)
                    added_tokens = tokenizer_file_handle.pop("added_tokens")
                for serialized_tokens in added_tokens:
                    idx = serialized_tokens.pop("id")
                    added_tokens_decoder[idx] = AddedToken(**serialized_tokens)
                    added_tokens_map[str(added_tokens_decoder[idx])] = added_tokens_decoder[idx]

        # Pass the loaded tokens to the tokenizer constructor.
        init_kwargs["added_tokens_decoder"] = added_tokens_decoder
        init_kwargs = cls.convert_added_tokens(init_kwargs, save=False)
        for key in cls.SPECIAL_TOKENS_ATTRIBUTES & init_kwargs.keys():
            if added_tokens_map != {} and init_kwargs[key] is not None:
                if key != "additional_special_tokens":
                    init_kwargs[key] = added_tokens_map.get(str(init_kwargs[key]), init_kwargs[key])

        # Instantiate the tokenizer.
        try:
            tokenizer = cls(*init_inputs, **init_kwargs)
        except OSError:
            raise OSError(
                "Unable to load vocabulary from file. "
                "Please check that the provided vocabulary is accessible and not corrupted."
            )

        return tokenizer