Source code for transformers.models.retribert.tokenization_retribert_fast
# coding=utf-8
# Copyright 2018 The HuggingFace Inc. team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for RetriBERT."""

from ...utils import logging
from ..bert.tokenization_bert_fast import BertTokenizerFast
from .tokenization_retribert import RetriBertTokenizer


# Module-level logger obtained through the package's own logging helper.
logger = logging.get_logger(__name__)

# File names used for each tokenizer resource: the plain-text vocabulary and
# the serialized "fast" tokenizer definition.
VOCAB_FILES_NAMES = {
    "vocab_file": "vocab.txt",
    "tokenizer_file": "tokenizer.json",
}

# Download URL of every resource, keyed first by resource kind (same keys as
# VOCAB_FILES_NAMES) and then by checkpoint identifier.
PRETRAINED_VOCAB_FILES_MAP = {
    "vocab_file": {
        "yjernite/retribert-base-uncased": "https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/vocab.txt",
    },
    "tokenizer_file": {
        "yjernite/retribert-base-uncased": "https://huggingface.co/yjernite/retribert-base-uncased/resolve/main/tokenizer.json",
    },
}

# Per-checkpoint size value; the tokenizer class in this module exposes this
# mapping as its `max_model_input_sizes` attribute.
PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES = {
    "yjernite/retribert-base-uncased": 512,
}

# Per-checkpoint initialization options; the only checkpoint listed is
# configured with lower-casing enabled.
PRETRAINED_INIT_CONFIGURATION = {
    "yjernite/retribert-base-uncased": {"do_lower_case": True},
}
class RetriBertTokenizerFast(BertTokenizerFast):
    r"""
    Construct a "fast" RetriBERT tokenizer (backed by HuggingFace's `tokenizers` library).

    :class:`~transformers.RetriBertTokenizerFast` is identical to :class:`~transformers.BertTokenizerFast` and runs
    end-to-end tokenization: punctuation splitting and wordpiece.

    Refer to superclass :class:`~transformers.BertTokenizerFast` for usage examples and documentation concerning
    parameters.
    """

    # This subclass defines no methods; it only rebinds the class-level
    # configuration to the RetriBERT-specific module constants.

    # Resource-kind -> file name mapping.
    vocab_files_names = VOCAB_FILES_NAMES
    # Resource-kind -> checkpoint -> download URL mapping.
    pretrained_vocab_files_map = PRETRAINED_VOCAB_FILES_MAP
    # Checkpoint -> size mapping (512 for the single listed checkpoint).
    max_model_input_sizes = PRETRAINED_POSITIONAL_EMBEDDINGS_SIZES
    # Checkpoint -> default initialization options.
    pretrained_init_configuration = PRETRAINED_INIT_CONFIGURATION
    # The non-fast tokenizer class paired with this one.
    slow_tokenizer_class = RetriBertTokenizer
    # Input names declared for this tokenizer; only these two are listed.
    model_input_names = ["input_ids", "attention_mask"]