kiddothe2b committed
Commit: dc1adc3
Parent(s): 00a2eff

Add HAT implementation files

Files changed:
- configuration_hat.py +150 -0
- modelling_hat.py +0 -0
- tokenization_hat.py +249 -0
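The three modules follow the Hugging Face remote-code convention, so a checkpoint that ships them is typically loaded with trust_remote_code=True. A minimal sketch, assuming the hub repo registers these classes in its config.json auto_map (that mapping is not part of this commit):

from transformers import AutoConfig, AutoModel

repo = "kiddothe2b/hierarchical-transformer-base-4096"

# trust_remote_code lets transformers execute configuration_hat.py / modelling_hat.py
# fetched from the repo instead of a built-in architecture.
config = AutoConfig.from_pretrained(repo, trust_remote_code=True)
model = AutoModel.from_pretrained(repo, trust_remote_code=True)

print(config.model_type)  # "hierarchical-transformer"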
configuration_hat.py
ADDED
@@ -0,0 +1,150 @@
# coding=utf-8
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
""" HAT configuration"""
from collections import OrderedDict
from typing import Mapping

from transformers.onnx import OnnxConfig
from transformers.utils import logging
from transformers import PretrainedConfig


logger = logging.get_logger(__name__)

HAT_PRETRAINED_CONFIG_ARCHIVE_MAP = {
    "kiddothe2b/hierarchical-transformer-base-4096": "https://huggingface.co/kiddothe2b/hierarchical-transformer-base-4096/resolve/main/config.json",
    "kiddothe2b/adhoc-hierarchical-transformer-base-4096": "https://huggingface.co/kiddothe2b/adhoc-hierarchical-transformer-base-4096/resolve/main/config.json",
}


class HATConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a :class:`~transformers.HAT`.
    It is used to instantiate a HAT model according to the specified arguments,
    defining the model architecture. Instantiating a configuration with the defaults will yield a similar configuration
    to that of the HAT `kiddothe2b/hierarchical-transformer-base-4096
    <https://huggingface.co/kiddothe2b/hierarchical-transformer-base-4096>`__ architecture.

    Configuration objects inherit from :class:`~transformers.PretrainedConfig` and can be used to control the model
    outputs. Read the documentation from :class:`~transformers.PretrainedConfig` for more information.

    Args:
        vocab_size (:obj:`int`, `optional`, defaults to 30522):
            Vocabulary size of the BERT model. Defines the number of different tokens that can be represented by the
            :obj:`inputs_ids` passed when calling :class:`~transformers.BertModel` or
            :class:`~transformers.TFBertModel`.
        max_sentences (:obj:`int`, `optional`, defaults to 64):
            The maximum number of sentences that this model might ever be used with.
        max_sentence_size (:obj:`int`, `optional`, defaults to 128):
            The maximum sentence length that this model might ever be used with.
        model_max_length (:obj:`int`, `optional`, defaults to 8192):
            The maximum sequence length (max_sentences * max_sentence_size) that this model might ever be used with.
        encoder_layout (:obj:`Dict`):
            The sentence/document encoder layout.
        hidden_size (:obj:`int`, `optional`, defaults to 768):
            Dimensionality of the encoder layers and the pooler layer.
        num_hidden_layers (:obj:`int`, `optional`, defaults to 12):
            Number of hidden layers in the Transformer encoder.
        num_attention_heads (:obj:`int`, `optional`, defaults to 12):
            Number of attention heads for each attention layer in the Transformer encoder.
        intermediate_size (:obj:`int`, `optional`, defaults to 3072):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the Transformer encoder.
        hidden_act (:obj:`str` or :obj:`Callable`, `optional`, defaults to :obj:`"gelu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string,
            :obj:`"gelu"`, :obj:`"relu"`, :obj:`"silu"` and :obj:`"gelu_new"` are supported.
        hidden_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_probs_dropout_prob (:obj:`float`, `optional`, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        max_position_embeddings (:obj:`int`, `optional`, defaults to 512):
            The maximum sequence length that this model might ever be used with. Typically set this to something large
            just in case (e.g., 512 or 1024 or 2048).
        type_vocab_size (:obj:`int`, `optional`, defaults to 2):
            The vocabulary size of the :obj:`token_type_ids` passed when calling :class:`~transformers.BertModel` or
            :class:`~transformers.TFBertModel`.
        initializer_range (:obj:`float`, `optional`, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        layer_norm_eps (:obj:`float`, `optional`, defaults to 1e-12):
            The epsilon used by the layer normalization layers.
        position_embedding_type (:obj:`str`, `optional`, defaults to :obj:`"absolute"`):
            Type of position embedding. Choose one of :obj:`"absolute"`, :obj:`"relative_key"`,
            :obj:`"relative_key_query"`. For positional embeddings use :obj:`"absolute"`. For more information on
            :obj:`"relative_key"`, please refer to `Self-Attention with Relative Position Representations (Shaw et al.)
            <https://arxiv.org/abs/1803.02155>`__. For more information on :obj:`"relative_key_query"`, please refer to
            `Method 4` in `Improve Transformer Models with Better Relative Position Embeddings (Huang et al.)
            <https://arxiv.org/abs/2009.13658>`__.
        use_cache (:obj:`bool`, `optional`, defaults to :obj:`True`):
            Whether or not the model should return the last key/values attentions (not used by all models). Only
            relevant if ``config.is_decoder=True``.
        classifier_dropout (:obj:`float`, `optional`):
            The dropout ratio for the classification head.
    """
    model_type = "hierarchical-transformer"

    def __init__(
        self,
        vocab_size=30522,
        hidden_size=768,
        max_sentences=64,
        max_sentence_size=128,
        model_max_length=8192,
        num_hidden_layers=12,
        num_attention_heads=12,
        intermediate_size=3072,
        hidden_act="gelu",
        hidden_dropout_prob=0.1,
        attention_probs_dropout_prob=0.1,
        max_position_embeddings=512,
        type_vocab_size=2,
        initializer_range=0.02,
        layer_norm_eps=1e-12,
        pad_token_id=0,
        position_embedding_type="absolute",
        encoder_layout=None,
        use_cache=True,
        classifier_dropout=None,
        **kwargs
    ):
        super().__init__(pad_token_id=pad_token_id, **kwargs)

        self.vocab_size = vocab_size
        self.hidden_size = hidden_size
        self.max_sentences = max_sentences
        self.max_sentence_size = max_sentence_size
        self.model_max_length = model_max_length
        self.encoder_layout = encoder_layout
        self.num_hidden_layers = num_hidden_layers
        self.num_attention_heads = num_attention_heads
        self.hidden_act = hidden_act
        self.intermediate_size = intermediate_size
        self.hidden_dropout_prob = hidden_dropout_prob
        self.attention_probs_dropout_prob = attention_probs_dropout_prob
        self.max_position_embeddings = max_position_embeddings
        self.type_vocab_size = type_vocab_size
        self.initializer_range = initializer_range
        self.layer_norm_eps = layer_norm_eps
        self.position_embedding_type = position_embedding_type
        self.use_cache = use_cache
        self.classifier_dropout = classifier_dropout


class HATOnnxConfig(OnnxConfig):
    @property
    def inputs(self) -> Mapping[str, Mapping[int, str]]:
        return OrderedDict(
            [
                ("input_ids", {0: "batch", 1: "sequence"}),
                ("attention_mask", {0: "batch", 1: "sequence"}),
            ]
        )
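For reference, a minimal usage sketch of the HATConfig class defined above. The two-layer encoder_layout shown here is an illustrative assumption (its schema is not defined in this file; real checkpoints ship their own layout in config.json); everything else mirrors the defaults:

from configuration_hat import HATConfig  # assumes this file is importable from the working directory

# Illustrative layout: which layers run sentence-level vs. document-level attention.
layout = {
    "0": {"sentence_encoder": True, "document_encoder": False},
    "1": {"sentence_encoder": True, "document_encoder": True},
}

config = HATConfig(
    vocab_size=30522,
    hidden_size=768,
    max_sentences=64,
    max_sentence_size=128,
    model_max_length=8192,
    encoder_layout=layout,
)

print(config.model_type)                                # "hierarchical-transformer"
print(config.max_sentences * config.max_sentence_size)  # 8192 token positions overall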
modelling_hat.py
ADDED
The diff for this file is too large to render; see the raw diff.
tokenization_hat.py
ADDED
@@ -0,0 +1,249 @@
# coding=utf-8
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""Tokenization classes for HAT."""
import torch
from transformers import RobertaTokenizer, BertTokenizer
from .configuration_hat import HATConfig
from transformers.utils import logging

try:
    from nltk import sent_tokenize
except ImportError:
    raise Exception('NLTK is not installed! Install it with `pip install nltk`...')

logger = logging.get_logger(__name__)


class HATTokenizer:
    def __init__(self, tokenizer=None):
        self._tokenizer = tokenizer
        self.config = HATConfig.from_pretrained(self._tokenizer.name_or_path)
        self._tokenizer.model_max_length = self.model_max_length
        self.type2id = {'input_ids': (self._tokenizer.cls_token_id, self._tokenizer.pad_token_id),
                        'token_type_ids': (0, 0),
                        'attention_mask': (1, 0),
                        'special_tokens_mask': (1, -100)}

    @property
    def model_max_length(self):
        return self.config.model_max_length

    @property
    def mask_token(self):
        return self._tokenizer.mask_token

    @property
    def mask_token_id(self):
        return self._tokenizer.mask_token_id

    @property
    def pad_token_id(self):
        return self._tokenizer.pad_token_id

    @property
    def cls_token_id(self):
        return self._tokenizer.cls_token_id

    @property
    def sep_token_id(self):
        return self._tokenizer.sep_token_id

    @property
    def vocab(self):
        return self._tokenizer.vocab

    def __len__(self):
        """
        Size of the full vocabulary with the added tokens.
        """
        return len(self._tokenizer)

    def pad(self, *args, **kwargs):
        return self._tokenizer.pad(*args, **kwargs)

    def convert_tokens_to_ids(self, *args, **kwargs):
        return self._tokenizer.convert_tokens_to_ids(*args, **kwargs)

    def batch_decode(self, *args, **kwargs):
        return self._tokenizer.batch_decode(*args, **kwargs)

    def decode(self, *args, **kwargs):
        return self._tokenizer.decode(*args, **kwargs)

    def tokenize(self, text, **kwargs):
        return self._tokenizer.tokenize(text, **kwargs)

    def encode(self, text, **kwargs):
        input_ids = self._tokenizer.encode_plus(text, add_special_tokens=False, **kwargs)
        input_ids = self.chunks(input_ids[: self.model_max_length - self.config.max_sentences],
                                chunk_size=self.config.max_sentence_length, special_id=self.type2id['input_ids'])
        return input_ids

    def get_special_tokens_mask(self, *args, **kwargs):
        return self._tokenizer.get_special_tokens_mask(*args, **kwargs)

    @classmethod
    def from_pretrained(cls, pretrained_model_name_or_path, **kwargs):
        try:
            tokenizer = RobertaTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        except Exception:
            tokenizer = BertTokenizer.from_pretrained(pretrained_model_name_or_path, **kwargs)
        return cls(tokenizer=tokenizer)

    def save_pretrained(self, *args, **kwargs):
        return self._tokenizer.save_pretrained(*args, **kwargs)

    def __call__(self, text, **kwargs):
        greedy_chunking = kwargs.pop('greedy_chunking', None)
        text_pair = kwargs.pop('text_pair', None)
        if isinstance(text[0], list):
            batch = self.auto_chunking(text, **kwargs)
        elif greedy_chunking:
            # fixed uniform chunking
            batch = self.uniform_chunking(text, **kwargs)
        else:
            # dynamic sentence splitting and grouping
            batch = self.sentence_splitting(text, **kwargs)

        if text_pair:
            batch_b = self._tokenizer(text_pair, add_special_tokens=False,
                                      padding=False, truncation=False)
            for idx, sample in enumerate(batch['input_ids']):
                n_sentences = sum(sample[::self.config.max_sentence_size])
                for input_key in batch:
                    batch[input_key][idx][self.config.max_sentence_size * n_sentences:
                                          self.config.max_sentence_size * (n_sentences + 1)] = \
                        self.pad_sentence(batch_b[input_key][idx],
                                          special_id=(self.sep_token_id, self.pad_token_id)
                                          if input_key == 'input_ids' else self.type2id[input_key])

        return batch

    def uniform_chunking(self, texts, **kwargs):
        original_batch = self._tokenizer(texts, add_special_tokens=False, **kwargs)
        batch = {input_type: [] for input_type in original_batch}
        for input_type in original_batch:
            fixed_batch = []
            for example in original_batch[input_type]:
                fixed_batch.append(self.chunks(example[: self.model_max_length - self.config.max_sentences],
                                               chunk_size=self.config.max_sentence_length,
                                               special_id=self.type2id[input_type]))
            batch[input_type] = fixed_batch if isinstance(fixed_batch[0], list) else torch.stack(fixed_batch)

        if kwargs['padding']:
            batch = self.pad(batch,
                             padding=kwargs['padding'],
                             max_length=kwargs['max_length'],
                             pad_to_multiple_of=kwargs['max_length'])

        return batch

    def auto_chunking(self, texts, **kwargs):
        batch = {}
        for text_idx, text in enumerate(texts):
            example_batch = self._tokenizer(text, add_special_tokens=False, **kwargs)
            for input_key in example_batch:
                key_inputs_list = []
                for idx, example in enumerate(example_batch[input_key][:self.config.max_sentences]):
                    key_inputs_list.append(self.pad_sentence(example, special_id=self.type2id[input_key]))
                if isinstance(key_inputs_list[0], list):
                    key_inputs_list = [token for sentence in key_inputs_list for token in sentence]
                else:
                    key_inputs_list = torch.stack(key_inputs_list)
                if input_key in batch:
                    batch[input_key].append(key_inputs_list)
                else:
                    batch[input_key] = [key_inputs_list]

        if kwargs['padding']:
            batch = self.pad(batch,
                             padding=kwargs['padding'],
                             max_length=kwargs['max_length'],
                             pad_to_multiple_of=kwargs['max_length'])

        return batch

    def chunks(self, flat_inputs, chunk_size=128, special_id=0):
        if isinstance(flat_inputs, list):
            return self.list_chunks(flat_inputs, chunk_size, special_id)
        else:
            return self.tensor_chunks(flat_inputs, chunk_size, special_id)

    def list_chunks(self, flat_inputs, chunk_size=128, special_id=(0, 0)):
        """Yield successive n-sized chunks from lst."""
        structured_inputs = [[special_id[0] if sum(flat_inputs[i:i + chunk_size-1]) else special_id[1]]
                             + flat_inputs[i:i + chunk_size-1] for i in range(0, len(flat_inputs), chunk_size-1)]
        return [token_input for sentence_inputs in structured_inputs for token_input in sentence_inputs]

    def tensor_chunks(self, flat_inputs, chunk_size=128, special_id=(0, 0)):
        """Yield successive n-sized chunks from lst."""
        structured_inputs = torch.stack([torch.cat((torch.tensor([special_id[0] if flat_inputs[i:i + chunk_size-1].sum() else special_id[1]], dtype=torch.int),
                                                    flat_inputs[i:i + chunk_size-1])) for i in range(0, len(flat_inputs), chunk_size-1)])
        return structured_inputs.reshape(-1)

    def sentence_splitting(self, texts, **kwargs):
        fixed_batch = []
        doc_out = {}
        for text in texts:
            # sentence splitting
            sentences = sent_tokenize(text)
            # tokenization of sentences
            sentences = self._tokenizer(sentences, add_special_tokens=False, padding=False, truncation=False)
            # sentence grouping - merging short sentences to minimize padding
            doc_out = self.sentence_grouping(sentences)
            fixed_batch.append(doc_out)
        # batchify examples
        batch = {input_type: [] for input_type in doc_out}
        for input_type in batch:
            batch[input_type] = [example[input_type] for example in fixed_batch]
            if not isinstance(batch[input_type][0], list):
                batch[input_type] = torch.stack(batch[input_type])

        if kwargs['padding']:
            batch = self.pad(batch,
                             padding=kwargs['padding'],
                             max_length=kwargs['max_length'],
                             pad_to_multiple_of=kwargs['max_length'])

        return batch

    def sentence_grouping(self, sentences):
        doc_out = {input_type: [] for input_type in sentences}
        for input_type in sentences:
            tmp_doc = []
            tmp_sentence = []
            for example in sentences[input_type]:
                if len(tmp_doc) >= self.config.max_sentences:
                    break
                if len(tmp_sentence) + len(example) <= self.config.max_sentence_length - 1:
                    tmp_sentence.extend(example)
                else:
                    tmp_doc.append(self.pad_sentence(tmp_sentence if len(tmp_sentence) else example,
                                                     chunk_size=self.config.max_sentence_length,
                                                     special_id=self.type2id[input_type]))
                    tmp_sentence = example if len(tmp_sentence) else example[self.config.max_sentence_length:]
            if len(tmp_sentence) and len(tmp_doc) < self.config.max_sentences:
                tmp_doc.append(self.pad_sentence(tmp_sentence,
                                                 chunk_size=self.config.max_sentence_length,
                                                 special_id=self.type2id[input_type]))
            doc_out[input_type] = [token for sentence in tmp_doc for token in sentence]
        return doc_out

    def pad_sentence(self, flat_input, chunk_size=128, special_id=(0, 0)):
        if isinstance(flat_input, list):
            return [special_id[0]] + flat_input[:chunk_size-1] + [self.pad_token_id] * max(0, chunk_size - len(flat_input) - 1)
        else:
            return torch.cat((torch.tensor([special_id[0] if flat_input[:chunk_size-1].sum()
                                            else special_id[1]], dtype=torch.int),
                              flat_input[:chunk_size-1],
                              torch.tensor([self.pad_token_id] * max(0, chunk_size - len(flat_input) - 1), dtype=torch.int)
                              ))
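Finally, a short usage sketch of the tokenizer wrapper above. It is hedged: the checkpoint name is reused from the archive map in configuration_hat.py, the modules are assumed to be importable as a package named hat (hypothetical, since tokenization_hat.py uses a relative import), the checkpoint's config is assumed to expose max_sentence_length and max_sentences, and NLTK's punkt data must be installed for the default sentence-splitting path:

import nltk

# sent_tokenize needs the punkt sentence models
nltk.download("punkt")

# Hypothetical package name; the relative import inside tokenization_hat.py
# requires these files to live in a package (or be loaded via remote code).
from hat.tokenization_hat import HATTokenizer

# Wraps a RoBERTa (or BERT) tokenizer and reads sentence limits from HATConfig.
tokenizer = HATTokenizer.from_pretrained("kiddothe2b/hierarchical-transformer-base-4096")

texts = ["This is the first sentence of a long document. This is another sentence. " * 20]

# Default path: NLTK sentence splitting + grouping into fixed-size sentence blocks.
batch = tokenizer(texts, padding="max_length", max_length=tokenizer.model_max_length)

# Greedy path: uniform chunking of the flat token stream into max_sentence_length blocks.
greedy_batch = tokenizer(texts, greedy_chunking=True, padding="max_length",
                         max_length=tokenizer.model_max_length, truncation=True)

# Each example is a flat sequence of [CLS]-prefixed, padded sentence blocks.
print(len(batch["input_ids"][0]))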