Upload 3 files
Browse files- __init__.py +0 -0
- tokenization_bartpho.py +329 -0
- tokenization_bartpho_fast.py +334 -0
    	
        __init__.py
    ADDED
    
    | 
            File without changes
         | 
    	
        tokenization_bartpho.py
    ADDED
    
    | @@ -0,0 +1,329 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # coding=utf-8
         | 
| 2 | 
            +
            # Copyright 2021 VinAI Research and the HuggingFace Inc. team.
         | 
| 3 | 
            +
            #
         | 
| 4 | 
            +
            # Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 5 | 
            +
            # you may not use this file except in compliance with the License.
         | 
| 6 | 
            +
            # You may obtain a copy of the License at
         | 
| 7 | 
            +
            #
         | 
| 8 | 
            +
            #     http://www.apache.org/licenses/LICENSE-2.0
         | 
| 9 | 
            +
            #
         | 
| 10 | 
            +
            # Unless required by applicable law or agreed to in writing, software
         | 
| 11 | 
            +
            # distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 12 | 
            +
            # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 13 | 
            +
            # See the License for the specific language governing permissions and
         | 
| 14 | 
            +
            # limitations under the License
         | 
| 15 | 
            +
            """ Tokenization classes for BARTpho-syllable model."""
         | 
| 16 | 
            +
             | 
| 17 | 
            +
             | 
| 18 | 
            +
            import os
         | 
| 19 | 
            +
            from shutil import copyfile
         | 
| 20 | 
            +
            from typing import Any, Dict, List, Optional, Tuple
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            import sentencepiece as spm
         | 
| 23 | 
            +
             | 
| 24 | 
            +
            from transformers.tokenization_utils import AddedToken, PreTrainedTokenizer
         | 
| 25 | 
            +
            from transformers.utils import logging
         | 
| 26 | 
            +
             | 
| 27 | 
            +
             | 
| 28 | 
            +
            logger = logging.get_logger(__name__)
         | 
| 29 | 
            +
             | 
| 30 | 
            +
            SPIECE_UNDERLINE = "▁"
         | 
| 31 | 
            +
             | 
| 32 | 
            +
            VOCAB_FILES_NAMES = {"vocab_file": "sentencepiece.bpe.model", "monolingual_vocab_file": "dict.txt"}
         | 
| 33 | 
            +
             | 
| 34 | 
            +
            PRETRAINED_VOCAB_FILES_MAP = {
         | 
| 35 | 
            +
                "vocab_file": {
         | 
| 36 | 
            +
                    "vinai/bartpho-syllable": "https://huggingface.co/vinai/bartpho-syllable/resolve/main/sentencepiece.bpe.model",
         | 
| 37 | 
            +
                },
         | 
| 38 | 
            +
                "monolingual_vocab_file": {
         | 
| 39 | 
            +
                    "vinai/bartpho-syllable": "https://huggingface.co/vinai/bartpho-syllable/resolve/main/dict.txt",
         | 
| 40 | 
            +
                },
         | 
| 41 | 
            +
            }
         | 
| 42 | 
            +
             | 
| 43 | 
            +
            PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"vinai/bartpho-syllable": 1024}
         | 
| 44 | 
            +
             | 
| 45 | 
            +
             | 
| 46 | 
            +
            class BartphoTokenizer(PreTrainedTokenizer):
         | 
| 47 | 
            +
                """
         | 
| 48 | 
            +
                Adapted from [`XLMRobertaTokenizer`]. Based on [SentencePiece](https://github.com/google/sentencepiece).
         | 
| 49 | 
            +
             | 
| 50 | 
            +
                This tokenizer inherits from [`PreTrainedTokenizer`] which contains most of the main methods. Users should refer to
         | 
| 51 | 
            +
                this superclass for more information regarding those methods.
         | 
| 52 | 
            +
             | 
| 53 | 
            +
                Args:
         | 
| 54 | 
            +
                    vocab_file (`str`):
         | 
| 55 | 
            +
                        Path to the vocabulary file. This vocabulary is the pre-trained SentencePiece model available from the
         | 
| 56 | 
            +
                        multilingual XLM-RoBERTa, also used in mBART, consisting of 250K types.
         | 
| 57 | 
            +
                    monolingual_vocab_file (`str`):
         | 
| 58 | 
            +
                        Path to the monolingual vocabulary file. This monolingual vocabulary consists of Vietnamese-specialized
         | 
| 59 | 
            +
                        types extracted from the multilingual vocabulary vocab_file of 250K types.
         | 
| 60 | 
            +
                    bos_token (`str`, *optional*, defaults to `"<s>"`):
         | 
| 61 | 
            +
                        The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier token.
         | 
| 62 | 
            +
             | 
| 63 | 
            +
                        <Tip>
         | 
| 64 | 
            +
             | 
| 65 | 
            +
                        When building a sequence using special tokens, this is not the token that is used for the beginning of
         | 
| 66 | 
            +
                        sequence. The token used is the `cls_token`.
         | 
| 67 | 
            +
             | 
| 68 | 
            +
                        </Tip>
         | 
| 69 | 
            +
             | 
| 70 | 
            +
                    eos_token (`str`, *optional*, defaults to `"</s>"`):
         | 
| 71 | 
            +
                        The end of sequence token.
         | 
| 72 | 
            +
             | 
| 73 | 
            +
                        <Tip>
         | 
| 74 | 
            +
             | 
| 75 | 
            +
                        When building a sequence using special tokens, this is not the token that is used for the end of sequence.
         | 
| 76 | 
            +
                        The token used is the `sep_token`.
         | 
| 77 | 
            +
             | 
| 78 | 
            +
                        </Tip>
         | 
| 79 | 
            +
             | 
| 80 | 
            +
                    sep_token (`str`, *optional*, defaults to `"</s>"`):
         | 
| 81 | 
            +
                        The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
         | 
| 82 | 
            +
                        sequence classification or for a text and a question for question answering. It is also used as the last
         | 
| 83 | 
            +
                        token of a sequence built with special tokens.
         | 
| 84 | 
            +
                    cls_token (`str`, *optional*, defaults to `"<s>"`):
         | 
| 85 | 
            +
                        The classifier token which is used when doing sequence classification (classification of the whole sequence
         | 
| 86 | 
            +
                        instead of per-token classification). It is the first token of the sequence when built with special tokens.
         | 
| 87 | 
            +
                    unk_token (`str`, *optional*, defaults to `"<unk>"`):
         | 
| 88 | 
            +
                        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
         | 
| 89 | 
            +
                        token instead.
         | 
| 90 | 
            +
                    pad_token (`str`, *optional*, defaults to `"<pad>"`):
         | 
| 91 | 
            +
                        The token used for padding, for example when batching sequences of different lengths.
         | 
| 92 | 
            +
                    mask_token (`str`, *optional*, defaults to `"<mask>"`):
         | 
| 93 | 
            +
                        The token used for masking values. This is the token used when training this model with masked language
         | 
| 94 | 
            +
                        modeling. This is the token which the model will try to predict.
         | 
| 95 | 
            +
                    additional_special_tokens (`List[str]`, *optional*, defaults to `["<s>NOTUSED", "</s>NOTUSED"]`):
         | 
| 96 | 
            +
                        Additional special tokens used by the tokenizer.
         | 
| 97 | 
            +
                    sp_model_kwargs (`dict`, *optional*):
         | 
| 98 | 
            +
                        Will be passed to the `SentencePieceProcessor.__init__()` method. The [Python wrapper for
         | 
| 99 | 
            +
                        SentencePiece](https://github.com/google/sentencepiece/tree/master/python) can be used, among other things,
         | 
| 100 | 
            +
                        to set:
         | 
| 101 | 
            +
             | 
| 102 | 
            +
                        - `enable_sampling`: Enable subword regularization.
         | 
| 103 | 
            +
                        - `nbest_size`: Sampling parameters for unigram. Invalid for BPE-Dropout.
         | 
| 104 | 
            +
             | 
| 105 | 
            +
                          - `nbest_size = {0,1}`: No sampling is performed.
         | 
| 106 | 
            +
                          - `nbest_size > 1`: samples from the nbest_size results.
         | 
| 107 | 
            +
                          - `nbest_size < 0`: assuming that nbest_size is infinite and samples from the all hypothesis (lattice)
         | 
| 108 | 
            +
                            using forward-filtering-and-backward-sampling algorithm.
         | 
| 109 | 
            +
             | 
| 110 | 
            +
                        - `alpha`: Smoothing parameter for unigram sampling, and dropout probability of merge operations for
         | 
| 111 | 
            +
                          BPE-dropout.
         | 
| 112 | 
            +
             | 
| 113 | 
            +
                Attributes:
         | 
| 114 | 
            +
                    sp_model (`SentencePieceProcessor`):
         | 
| 115 | 
            +
                        The *SentencePiece* processor that is used for every conversion (string, tokens and IDs).
         | 
| 116 | 
            +
                """
         | 
| 117 | 
            +
             | 
| 118 | 
            +
                vocab_files_names = VOCAB_FILES_NAMES
         | 
| 119 | 
            +
                pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
         | 
| 120 | 
            +
                max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
         | 
| 121 | 
            +
                model_input_names = ["input_ids", "attention_mask"]
         | 
| 122 | 
            +
             | 
| 123 | 
            +
                def __init__(
                    self,
                    vocab_file,
                    monolingual_vocab_file,
                    bos_token="<s>",
                    eos_token="</s>",
                    sep_token="</s>",
                    cls_token="<s>",
                    unk_token="<unk>",
                    pad_token="<pad>",
                    mask_token="<mask>",
                    sp_model_kwargs: Optional[Dict[str, Any]] = None,
                    **kwargs
                ) -> None:
                    """
                    Load the SentencePiece model and build the reduced (fairseq-style) vocabulary tables.

                    The id table is laid out as: the distinct special tokens in the order bos, pad, eos, unk, sep, cls;
                    then the first whitespace-separated field of every non-empty line of `monolingual_vocab_file`; then
                    the mask token if it is not already present.

                    Args:
                        vocab_file (`str`):
                            Path to the SentencePiece model file.
                        monolingual_vocab_file (`str`):
                            Path to the text file listing one vocabulary entry per line (first field is the token).
                        bos_token, eos_token, sep_token, cls_token, unk_token, pad_token, mask_token:
                            Special tokens, forwarded to the parent constructor.
                        sp_model_kwargs (`dict`, *optional*):
                            Keyword arguments passed to `SentencePieceProcessor.__init__()`.
                        **kwargs:
                            Forwarded unchanged to the parent constructor.
                    """
                    # Mask token behave like a normal word, i.e. include the space before it
                    mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

                    self.sp_model_kwargs = {} if sp_model_kwargs is None else sp_model_kwargs

                    self.vocab_file = vocab_file
                    self.monolingual_vocab_file = monolingual_vocab_file
                    self.sp_model = spm.SentencePieceProcessor(**self.sp_model_kwargs)
                    self.sp_model.Load(str(vocab_file))

                    # Load the reduced vocab

                    # Keep order of special tokens for backward compatibility
                    self.fairseq_tokens_to_ids = {}
                    for token in [bos_token, pad_token, eos_token, unk_token, sep_token, cls_token]:
                        # setdefault keeps the first id when two roles share a token (e.g. bos/cls).
                        self.fairseq_tokens_to_ids.setdefault(str(token), len(self.fairseq_tokens_to_ids))

                    with open(monolingual_vocab_file, "r", encoding="utf-8") as f:
                        for line in f:
                            fields = line.split()
                            if not fields:
                                # Blank line (e.g. trailing newline at end of file): nothing to register.
                                continue
                            # Never reassign an existing token: doing so would not grow the table, so the next
                            # new token would receive the same id and one id would be left without a token.
                            self.fairseq_tokens_to_ids.setdefault(fields[0], len(self.fairseq_tokens_to_ids))

                    if str(mask_token) not in self.fairseq_tokens_to_ids:
                        self.fairseq_tokens_to_ids[str(mask_token)] = len(self.fairseq_tokens_to_ids)

                    self.fairseq_ids_to_tokens = {v: k for k, v in self.fairseq_tokens_to_ids.items()}

                    # Call the parent constructor last: it may look tokens up through this instance
                    # (`get_vocab`, `_convert_token_to_id`), which requires the tables above to exist.
                    super().__init__(
                        bos_token=bos_token,
                        eos_token=eos_token,
                        unk_token=unk_token,
                        sep_token=sep_token,
                        cls_token=cls_token,
                        pad_token=pad_token,
                        mask_token=mask_token,
                        sp_model_kwargs=self.sp_model_kwargs,
                        **kwargs,
                    )
         | 
| 176 | 
            +
             | 
| 177 | 
            +
                def __getstate__(self):
         | 
| 178 | 
            +
                    state = self.__dict__.copy()
         | 
| 179 | 
            +
                    state["sp_model"] = None
         | 
| 180 | 
            +
                    state["sp_model_proto"] = self.sp_model.serialized_model_proto()
         | 
| 181 | 
            +
                    return state
         | 
| 182 | 
            +
             | 
| 183 | 
            +
                def __setstate__(self, d):
                    """
                    Restore the instance from a state dict produced by `__getstate__`.

                    The SentencePiece processor is re-created and loaded from the serialized proto stored in the state.
                    """
                    self.__dict__ = d

                    # for backward compatibility
                    try:
                        self.sp_model_kwargs
                    except AttributeError:
                        self.sp_model_kwargs = {}

                    self.sp_model = processor = spm.SentencePieceProcessor(**self.sp_model_kwargs)
                    processor.LoadFromSerializedProto(self.sp_model_proto)
         | 
| 192 | 
            +
             | 
| 193 | 
            +
                def build_inputs_with_special_tokens(
                    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
                ) -> List[int]:
                    """
                    Wrap one sequence, or a pair of sequences, with the classifier and separator token ids.

                    The resulting layout is:

                    - single sequence: `<s> X </s>`
                    - pair of sequences: `<s> A </s></s> B </s>`

                    Args:
                        token_ids_0 (`List[int]`):
                            List of IDs to which the special tokens will be added.
                        token_ids_1 (`List[int]`, *optional*):
                            Optional second list of IDs for sequence pairs.

                    Returns:
                        `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
                    """
                    opening = [self.cls_token_id]
                    closing = [self.sep_token_id]

                    first_segment = opening + token_ids_0 + closing
                    if token_ids_1 is None:
                        return first_segment
                    # The second segment is introduced by an extra separator and closed by another one.
                    return first_segment + closing + token_ids_1 + closing
         | 
| 218 | 
            +
             | 
| 219 | 
            +
                def get_special_tokens_mask(
                    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None, already_has_special_tokens: bool = False
                ) -> List[int]:
                    """
                    Build a mask marking where special tokens sit once they are added around the given sequence(s).

                    Args:
                        token_ids_0 (`List[int]`):
                            List of IDs.
                        token_ids_1 (`List[int]`, *optional*):
                            Optional second list of IDs for sequence pairs.
                        already_has_special_tokens (`bool`, *optional*, defaults to `False`):
                            Whether or not the token list is already formatted with special tokens for the model. When
                            `True`, the computation is delegated to the parent class.

                    Returns:
                        `List[int]`: A list of integers in the range [0, 1]: 1 for a special token, 0 for a sequence token.
                    """
                    if already_has_special_tokens:
                        return super().get_special_tokens_mask(
                            token_ids_0=token_ids_0, token_ids_1=token_ids_1, already_has_special_tokens=True
                        )

                    # One special token on each side of the first sequence.
                    mask = [1, *([0] * len(token_ids_0)), 1]
                    if token_ids_1 is not None:
                        # The second sequence is likewise framed by a special token on each side.
                        mask += [1, *([0] * len(token_ids_1)), 1]
                    return mask
         | 
| 246 | 
            +
             | 
| 247 | 
            +
                def create_token_type_ids_from_sequences(
                    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
                ) -> List[int]:
                    """
                    Return an all-zero token-type list as long as the sequence(s) once special tokens are added.

                    Args:
                        token_ids_0 (`List[int]`):
                            List of IDs.
                        token_ids_1 (`List[int]`, *optional*):
                            Optional second list of IDs for sequence pairs.

                    Returns:
                        `List[int]`: List of zeros.
                    """
                    closing = [self.sep_token_id]
                    opening = [self.cls_token_id]

                    # Assemble the same layout the model input would have, only to measure its length.
                    layout = opening + token_ids_0 + closing
                    if token_ids_1 is not None:
                        layout = layout + closing + token_ids_1 + closing
                    return [0] * len(layout)
         | 
| 271 | 
            +
             | 
| 272 | 
            +
                @property
         | 
| 273 | 
            +
                def vocab_size(self):
         | 
| 274 | 
            +
                    return len(self.fairseq_ids_to_tokens)
         | 
| 275 | 
            +
             | 
| 276 | 
            +
                def get_vocab(self):
                    """
                    Return the token-to-id mapping: every id below `vocab_size`, then the added tokens.

                    Entries from `added_tokens_encoder` are applied last and therefore override equal keys.
                    """
                    vocab = {}
                    for index in range(self.vocab_size):
                        vocab[self.convert_ids_to_tokens(index)] = index
                    vocab.update(self.added_tokens_encoder)
                    return vocab
         | 
| 280 | 
            +
             | 
| 281 | 
            +
                def _tokenize(self, text: str) -> List[str]:
         | 
| 282 | 
            +
                    return self.sp_model.encode(text, out_type=str)
         | 
| 283 | 
            +
             | 
| 284 | 
            +
                def _convert_token_to_id(self, token):
         | 
| 285 | 
            +
                    """Converts a token (str) in an id using the vocab."""
         | 
| 286 | 
            +
                    if token in self.fairseq_tokens_to_ids:
         | 
| 287 | 
            +
                        return self.fairseq_tokens_to_ids[token]
         | 
| 288 | 
            +
                    else:
         | 
| 289 | 
            +
                        return self.unk_token_id
         | 
| 290 | 
            +
             | 
| 291 | 
            +
                def _convert_id_to_token(self, index):
         | 
| 292 | 
            +
                    """Converts an index (integer) in a token (str) using the vocab."""
         | 
| 293 | 
            +
                    return self.fairseq_ids_to_tokens[index]
         | 
| 294 | 
            +
             | 
| 295 | 
            +
                def convert_tokens_to_string(self, tokens):
                    """Join sub-word tokens into one string, turning the SentencePiece underline marker into spaces."""
                    joined = "".join(tokens)
                    spaced = joined.replace(SPIECE_UNDERLINE, " ")
                    return spaced.strip()
         | 
| 299 | 
            +
             | 
| 300 | 
            +
                def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
                    """
                    Save the SentencePiece model and the monolingual vocabulary file into `save_directory`.

                    Each file is copied from its original location when that file still exists and is not already the
                    destination. When the original file no longer exists, it is regenerated from the in-memory state
                    (serialized SentencePiece proto, or the non-special entries of the reduced vocab).

                    Args:
                        save_directory (`str`):
                            Existing directory in which to write the files.
                        filename_prefix (`str`, *optional*):
                            When given, `"<prefix>-"` is prepended to both file names.

                    Returns:
                        `Tuple[str]`: Paths of the SentencePiece model file and the monolingual vocabulary file. If
                        `save_directory` is not a directory, an error is logged and `None` is returned instead.
                    """
                    if not os.path.isdir(save_directory):
                        logger.error(f"Vocabulary path ({save_directory}) should be a directory")
                        return

                    prefix = filename_prefix + "-" if filename_prefix else ""
                    out_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
                    out_monolingual_vocab_file = os.path.join(
                        save_directory, prefix + VOCAB_FILES_NAMES["monolingual_vocab_file"]
                    )

                    if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file) and os.path.isfile(self.vocab_file):
                        copyfile(self.vocab_file, out_vocab_file)
                    elif not os.path.isfile(self.vocab_file):
                        # Source model file is gone: dump the in-memory model instead.
                        # Serialize before opening so a failure does not leave an empty file behind.
                        content_spiece_model = self.sp_model.serialized_model_proto()
                        with open(out_vocab_file, "wb") as fi:
                            fi.write(content_spiece_model)

                    if os.path.abspath(self.monolingual_vocab_file) != os.path.abspath(
                        out_monolingual_vocab_file
                    ) and os.path.isfile(self.monolingual_vocab_file):
                        copyfile(self.monolingual_vocab_file, out_monolingual_vocab_file)
                    elif not os.path.isfile(self.monolingual_vocab_file):
                        # Evaluate the special-token list once; it does not change while writing.
                        special_tokens = set(self.all_special_tokens)
                        with open(out_monolingual_vocab_file, "w", encoding="utf-8") as fp:
                            fp.writelines(
                                f"{str(token)} \n" for token in self.fairseq_tokens_to_ids if token not in special_tokens
                            )

                    return out_vocab_file, out_monolingual_vocab_file
         | 
    	
        tokenization_bartpho_fast.py
    ADDED
    
    | @@ -0,0 +1,334 @@ | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | |
|  | 
|  | |
| 1 | 
            +
            # coding=utf-8
         | 
| 2 | 
            +
            # Copyright 2021 VinAI Research and the HuggingFace Inc. team.
         | 
| 3 | 
            +
            #
         | 
| 4 | 
            +
            # Licensed under the Apache License, Version 2.0 (the "License");
         | 
| 5 | 
            +
            # you may not use this file except in compliance with the License.
         | 
| 6 | 
            +
            # You may obtain a copy of the License at
         | 
| 7 | 
            +
            #
         | 
| 8 | 
            +
            #     http://www.apache.org/licenses/LICENSE-2.0
         | 
| 9 | 
            +
            #
         | 
| 10 | 
            +
            # Unless required by applicable law or agreed to in writing, software
         | 
| 11 | 
            +
            # distributed under the License is distributed on an "AS IS" BASIS,
         | 
| 12 | 
            +
            # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
         | 
| 13 | 
            +
            # See the License for the specific language governing permissions and
         | 
| 14 | 
            +
            # limitations under the License
         | 
| 15 | 
            +
            """ Tokenization classes for BARTpho-syllable model."""
         | 
| 16 | 
            +
             | 
| 17 | 
            +
            import os
         | 
| 18 | 
            +
            from collections import defaultdict
         | 
| 19 | 
            +
            from shutil import copyfile
         | 
| 20 | 
            +
            from typing import Any, Dict, List, Optional, Tuple, Union
         | 
| 21 | 
            +
             | 
| 22 | 
            +
            from transformers.tokenization_utils import AddedToken
         | 
| 23 | 
            +
            from transformers.tokenization_utils_base import EncodingFast
         | 
| 24 | 
            +
            from transformers.tokenization_utils_fast import PreTrainedTokenizerFast
         | 
| 25 | 
            +
            from transformers.utils import is_sentencepiece_available, logging
         | 
| 26 | 
            +
             | 
| 27 | 
            +
             | 
| 28 | 
            +
            # Module-level logger obtained through the transformers logging helper.
            logger = logging.get_logger(__name__)

            # `BartphoTokenizer` is only importable when sentencepiece is installed; without it
            # the name stays bound to None.
            BartphoTokenizer = None
            if is_sentencepiece_available():
                from .tokenization_bartpho import BartphoTokenizer

            # File name expected for each vocabulary resource of the tokenizer.
            VOCAB_FILES_NAMES = {
                "vocab_file": "sentencepiece.bpe.model",
                "monolingual_vocab_file": "dict.txt",
                "tokenizer_file": "tokenizer.json",
            }

            # Per resource: checkpoint name -> URL of the corresponding file.
            PRETRAINED_VOCAB_FILES_MAP = {
                "vocab_file": {
                    "vinai/bartpho-syllable": "https://huggingface.co/vinai/bartpho-syllable/resolve/main/sentencepiece.bpe.model",
                },
                "monolingual_vocab_file": {
                    "vinai/bartpho-syllable": "https://huggingface.co/vinai/bartpho-syllable/resolve/main/dict.txt",
                },
                "tokenizer_file": {
                    "vinai/bartpho-syllable": "https://huggingface.co/vinai/bartpho-syllable/resolve/main/tokenizer.json",
                },
            }

            # Checkpoint name -> size of the positional embeddings (used as the maximum model input size).
            PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {"vinai/bartpho-syllable": 1024}
         | 
| 55 | 
            +
             | 
| 56 | 
            +
             | 
| 57 | 
            +
            class BartphoTokenizerFast(PreTrainedTokenizerFast):
                """
                Construct a "fast" BARTpho tokenizer (backed by HuggingFace's *tokenizers* library). Adapted from
                [`XLMRobertaTokenizerFast`]. Based on [SentencePiece](https://github.com/google/sentencepiece).

                This tokenizer inherits from [`PreTrainedTokenizerFast`] which contains most of the main methods. Users should
                refer to this superclass for more information regarding those methods.

                Args:
                    vocab_file (`str`, *optional*):
                        Path to the SentencePiece model file (`sentencepiece.bpe.model`). Needed, together with
                        `monolingual_vocab_file`, to be able to save the files of the slow tokenizer.
                    monolingual_vocab_file (`str`, *optional*):
                        Path to the monolingual vocabulary file (`dict.txt`). Needed, together with `vocab_file`, to be able
                        to save the files of the slow tokenizer.
                    tokenizer_file (`str`, *optional*):
                        Path to the serialized *tokenizers* file (`tokenizer.json`).
                    bos_token (`str`, *optional*, defaults to `"<s>"`):
                        The beginning of sequence token that was used during pretraining. Can be used as a sequence classifier
                        token.

                        <Tip>

                        When building a sequence using special tokens, this is not the token that is used for the beginning of
                        sequence. The token used is the `cls_token`.

                        </Tip>

                    eos_token (`str`, *optional*, defaults to `"</s>"`):
                        The end of sequence token.

                        <Tip>

                        When building a sequence using special tokens, this is not the token that is used for the end of sequence.
                        The token used is the `sep_token`.

                        </Tip>

                    sep_token (`str`, *optional*, defaults to `"</s>"`):
                        The separator token, which is used when building a sequence from multiple sequences, e.g. two sequences for
                        sequence classification or for a text and a question for question answering. It is also used as the last
                        token of a sequence built with special tokens.
                    cls_token (`str`, *optional*, defaults to `"<s>"`):
                        The classifier token which is used when doing sequence classification (classification of the whole sequence
                        instead of per-token classification). It is the first token of the sequence when built with special tokens.
                    unk_token (`str`, *optional*, defaults to `"<unk>"`):
                        The unknown token. A token that is not in the vocabulary cannot be converted to an ID and is set to be this
                        token instead.
                    pad_token (`str`, *optional*, defaults to `"<pad>"`):
                        The token used for padding, for example when batching sequences of different lengths.
                    mask_token (`str`, *optional*, defaults to `"<mask>"`):
                        The token used for masking values. This is the token used when training this model with masked language
                        modeling. This is the token which the model will try to predict.
                    kwargs:
                        Additional keyword arguments forwarded unchanged to [`PreTrainedTokenizerFast`].
                """

                vocab_files_names = VOCAB_FILES_NAMES
                pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
                max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
                model_input_names = ["input_ids", "attention_mask"]
                slow_tokenizer_class = BartphoTokenizer

                def __init__(
                    self,
                    vocab_file=None,
                    monolingual_vocab_file=None,
                    tokenizer_file=None,
                    bos_token="<s>",
                    eos_token="</s>",
                    sep_token="</s>",
                    cls_token="<s>",
                    unk_token="<unk>",
                    pad_token="<pad>",
                    mask_token="<mask>",
                    **kwargs
                ):
                    # The mask token behaves like a normal word, i.e. it includes the space before it.
                    mask_token = AddedToken(mask_token, lstrip=True, rstrip=False) if isinstance(mask_token, str) else mask_token

                    super().__init__(
                        vocab_file,
                        monolingual_vocab_file,
                        tokenizer_file=tokenizer_file,
                        bos_token=bos_token,
                        eos_token=eos_token,
                        sep_token=sep_token,
                        cls_token=cls_token,
                        unk_token=unk_token,
                        pad_token=pad_token,
                        mask_token=mask_token,
                        **kwargs,
                    )

                    self.vocab_file = vocab_file
                    self.monolingual_vocab_file = monolingual_vocab_file
                    # `save_vocabulary` copies BOTH files, so both paths must be known. Checking only `vocab_file` let
                    # `save_vocabulary` fail with a TypeError on `os.path.abspath(None)` when the monolingual file was missing.
                    self.can_save_slow_tokenizer = bool(self.vocab_file and self.monolingual_vocab_file)

                def get_added_vocab_hacking(self):
                    """
                    Build the lookup tables used to re-number the tokens added on top of the base vocabulary.

                    The backend tokenizer stores added tokens at ids `base_vocab_size .. full_vocab_size - 1`. This method
                    re-numbers them so that they start right after `mask_token_id`.

                    Returns:
                        `Dict[str, int], Dict[int, int]`: A mapping from each added token to its re-numbered id, and a mapping
                        from each re-numbered id back to the id the backend tokenizer reports for that token. Both are empty
                        when the backend tokenizer has no added tokens.
                    """
                    base_vocab_size = self._tokenizer.get_vocab_size(with_added_tokens=False)
                    full_vocab_size = self._tokenizer.get_vocab_size(with_added_tokens=True)
                    if full_vocab_size == base_vocab_size:
                        return {}, {}

                    # Constant shift that moves backend id `base_vocab_size` to `mask_token_id + 1`.
                    offset = self.mask_token_id + 1 - base_vocab_size
                    added_vocab = {
                        self._tokenizer.id_to_token(index): index + offset for index in range(base_vocab_size, full_vocab_size)
                    }

                    id_mapping = {new_id: self._tokenizer.token_to_id(token) for token, new_id in added_vocab.items()}

                    return added_vocab, id_mapping

                def _decode(
                    self,
                    token_ids: Union[int, List[int]],
                    skip_special_tokens: bool = False,
                    clean_up_tokenization_spaces: bool = True,
                    **kwargs
                ) -> str:
                    """
                    Convert a single id or a list of ids back into a string.

                    Ids of added tokens are first translated from their re-numbered values (see `get_added_vocab_hacking`)
                    to the ids known by the backend tokenizer; every other id is passed through unchanged.

                    Args:
                        token_ids (`int` or `List[int]`):
                            Id(s) to decode.
                        skip_special_tokens (`bool`, *optional*, defaults to `False`):
                            Forwarded to the backend tokenizer's `decode`.
                        clean_up_tokenization_spaces (`bool`, *optional*, defaults to `True`):
                            Whether to post-process the decoded text with `clean_up_tokenization`.
                        kwargs:
                            Only `use_source_tokenizer` is read (stored on `self._decode_use_source_tokenizer`).

                    Returns:
                        `str`: The decoded text.
                    """
                    self._decode_use_source_tokenizer = kwargs.pop("use_source_tokenizer", False)

                    if isinstance(token_ids, int):
                        token_ids = [token_ids]

                    # Map re-numbered ids back to the ids of the backend tokenizer.
                    _, id_mapping = self.get_added_vocab_hacking()
                    if id_mapping:
                        token_ids = [id_mapping.get(token_id, token_id) for token_id in token_ids]

                    text = self._tokenizer.decode(token_ids, skip_special_tokens=skip_special_tokens)

                    if clean_up_tokenization_spaces:
                        return self.clean_up_tokenization(text)
                    return text

                def _convert_encoding(
                    self,
                    encoding: EncodingFast,
                    return_token_type_ids: Optional[bool] = None,
                    return_attention_mask: Optional[bool] = None,
                    return_overflowing_tokens: bool = False,
                    return_special_tokens_mask: bool = False,
                    return_offsets_mapping: bool = False,
                    return_length: bool = False,
                    verbose: bool = True,
                ) -> Tuple[Dict[str, Any], List[EncodingFast]]:
                    """
                    Convert the encoding representation (from low-level HuggingFace tokenizer output) to a python Dict and a list
                    of encodings, take care of building a batch from overflowing tokens.

                    Overflowing tokens are converted to additional examples (like batches) so the output values of the dict are
                    lists (overflows) of lists (tokens).

                    Ids up to and including `mask_token_id` are kept as produced by the backend tokenizer. Any larger id is
                    replaced by the re-numbered id of the corresponding added token (see `get_added_vocab_hacking`), or by
                    `unk_token_id` when the token is not an added token.

                    Output shape: (overflows, sequence length)
                    """
                    if return_token_type_ids is None:
                        return_token_type_ids = "token_type_ids" in self.model_input_names
                    if return_attention_mask is None:
                        return_attention_mask = "attention_mask" in self.model_input_names

                    if return_overflowing_tokens and encoding.overflowing is not None:
                        encodings = [encoding] + encoding.overflowing
                    else:
                        encodings = [encoding]

                    encoding_dict = defaultdict(list)
                    added_vocab, _ = self.get_added_vocab_hacking()
                    # Loop invariants, read once instead of once per token.
                    mask_token_id = self.mask_token_id
                    unk_token_id = self.unk_token_id
                    for e in encodings:
                        # Reassign the ids that lie above `mask_token_id` (see the docstring).
                        ids = [
                            token_id if token_id <= mask_token_id else added_vocab.get(token.strip(), unk_token_id)
                            for token_id, token in zip(e.ids, e.tokens)
                        ]

                        encoding_dict["input_ids"].append(ids)

                        if return_token_type_ids:
                            encoding_dict["token_type_ids"].append(e.type_ids)
                        if return_attention_mask:
                            encoding_dict["attention_mask"].append(e.attention_mask)
                        if return_special_tokens_mask:
                            encoding_dict["special_tokens_mask"].append(e.special_tokens_mask)
                        if return_offsets_mapping:
                            encoding_dict["offset_mapping"].append(e.offsets)
                        if return_length:
                            encoding_dict["length"].append(len(ids))

                    return encoding_dict, encodings

                def build_inputs_with_special_tokens(
                    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
                ) -> List[int]:
                    """
                    Build model inputs from a sequence or a pair of sequence for sequence classification tasks by concatenating and
                    adding special tokens. A BARTpho sequence has the following format:

                    - single sequence: `<s> X </s>`
                    - pair of sequences: `<s> A </s></s> B </s>`

                    Args:
                        token_ids_0 (`List[int]`):
                            List of IDs to which the special tokens will be added.
                        token_ids_1 (`List[int]`, *optional*):
                            Optional second list of IDs for sequence pairs.

                    Returns:
                        `List[int]`: List of [input IDs](../glossary#input-ids) with the appropriate special tokens.
                    """
                    cls = [self.cls_token_id]
                    sep = [self.sep_token_id]

                    if token_ids_1 is None:
                        return cls + token_ids_0 + sep
                    return cls + token_ids_0 + sep + sep + token_ids_1 + sep

                def create_token_type_ids_from_sequences(
                    self, token_ids_0: List[int], token_ids_1: Optional[List[int]] = None
                ) -> List[int]:
                    """
                    Create a mask from the two sequences passed to be used in a sequence-pair classification task. BARTpho does not
                    make use of token type ids, therefore a list of zeros is returned.

                    Args:
                        token_ids_0 (`List[int]`):
                            List of IDs.
                        token_ids_1 (`List[int]`, *optional*):
                            Optional second list of IDs for sequence pairs.

                    Returns:
                        `List[int]`: List of zeros, as long as the sequence built by `build_inputs_with_special_tokens`.
                    """
                    sep = [self.sep_token_id]
                    cls = [self.cls_token_id]

                    if token_ids_1 is None:
                        return len(cls + token_ids_0 + sep) * [0]
                    return len(cls + token_ids_0 + sep + sep + token_ids_1 + sep) * [0]

                def save_vocabulary(self, save_directory: str, filename_prefix: Optional[str] = None) -> Tuple[str]:
                    """
                    Copy the SentencePiece model and the monolingual vocabulary file into `save_directory`.

                    Args:
                        save_directory (`str`):
                            Existing directory that receives the files.
                        filename_prefix (`str`, *optional*):
                            Prefix prepended, followed by `-`, to the name of each saved file.

                    Returns:
                        `Tuple[str]`: Paths of the saved SentencePiece model and monolingual vocabulary file, or `None` (after
                        logging an error) when `save_directory` is not a directory.

                    Raises:
                        ValueError: If `can_save_slow_tokenizer` is false.
                    """
                    if not self.can_save_slow_tokenizer:
                        raise ValueError(
                            "Your fast tokenizer does not have the necessary information to save the vocabulary for a "
                            "slow tokenizer."
                        )

                    if not os.path.isdir(save_directory):
                        logger.error(f"Vocabulary path ({save_directory}) should be a directory.")
                        return

                    prefix = filename_prefix + "-" if filename_prefix else ""
                    out_vocab_file = os.path.join(save_directory, prefix + VOCAB_FILES_NAMES["vocab_file"])
                    out_monolingual_vocab_file = os.path.join(
                        save_directory, prefix + VOCAB_FILES_NAMES["monolingual_vocab_file"]
                    )

                    # Skip the copy when source and destination are the same file.
                    if os.path.abspath(self.vocab_file) != os.path.abspath(out_vocab_file):
                        copyfile(self.vocab_file, out_vocab_file)

                    if os.path.abspath(self.monolingual_vocab_file) != os.path.abspath(out_monolingual_vocab_file):
                        copyfile(self.monolingual_vocab_file, out_monolingual_vocab_file)

                    return (out_vocab_file, out_monolingual_vocab_file)
         | 

