"""Conformer model configuration""" |
|
from transformers.configuration_utils import PretrainedConfig
from transformers.utils import logging


logger = logging.get_logger(__name__)


class Speech2TextConformerConfig(PretrainedConfig):
    r"""
    This is the configuration class to store the configuration of a [`ConformerEncoderDecoderModel`]. It is used to
    instantiate a Conformer model according to the specified arguments, defining the model architecture. Instantiating
    a configuration with the defaults will yield a configuration similar to that of the Conformer base architecture
    in https://github.com/hlt-mt/FBK-fairseq/.

    Configuration objects inherit from [`PretrainedConfig`] and can be used to control the model outputs. Read the
    documentation from [`PretrainedConfig`] for more information.

    Args:
        vocab_size (`int`, *optional*, defaults to 10000):
            Vocabulary size of the Conformer model. Defines the number of different tokens that can be represented by
            the `input_ids` passed when calling [`ConformerEncoderDecoderModel`].
        encoder_layers (`int`, *optional*, defaults to 12):
            Number of encoder layers.
        feed_forward_expansion_factor (`int`, *optional*, defaults to 4):
            Expansion factor that controls the size of the "intermediate" (often named feed-forward) layer in the
            encoder.
        conv_expansion_factor (`int`, *optional*, defaults to 2):
            Expansion factor that controls the size of the intermediate convolution layers in the encoder.
        conformer_feedforward_dropout (`float`, *optional*, defaults to 0.1):
            Dropout probability of the Conformer FeedForward module.
        conformer_attention_dropout (`float`, *optional*, defaults to 0.1):
            Dropout probability of the Conformer Attention module.
        conformer_conv_dropout (`float`, *optional*, defaults to 0.1):
            Dropout probability of the Conformer Convolution module.
        conformer_conv_kernel_size (`int`, *optional*, defaults to 31):
            Kernel size of the Conformer Convolution module.
        conformer_half_step_residual (`bool`, *optional*, defaults to `True`):
            Whether to use half-step residual connections.
        no_syncbatchnorm (`bool`, *optional*, defaults to `False`):
            If `True`, SyncBatchNorm is replaced by BatchNorm1d in the Conformer Convolution module.
        batch_unsafe_relative_shift (`bool`, *optional*, defaults to `False`):
            If `True`, the relative_shift implementation disregards padding (returning different results for the same
            input with different amounts of padding) but is faster. This may lead to inconsistencies across batch
            sizes.
        ctc_compress_strategy (`str`, *optional*, defaults to `"none"`):
            Strategy to use when compressing the CTC output. Valid strategies are `"none"`, `"avg"`, `"weighted"`,
            `"softmax"`, and `"fixed"`.
        ctc_compress_fixed_ratio (`int`, *optional*, defaults to 4):
            If `ctc_compress_strategy` is set to `"fixed"`, this fixed ratio controls how many consecutive steps are
            merged.
        ctc_compress_max_out_size (`int`, *optional*, defaults to -1):
            If CTC compression is enabled (`ctc_compress_strategy != "none"`) and this argument is set to a positive
            number, every input is forced to be at most as long as this value, even if the CTC alone would not
            compress it enough. Intuitively, this parameter should be set to 1/4 of the maximum input length to
            ensure that the maximum sequence length of the self-attention input is the same as in the case of models
            having 2 initial convolutions with stride 2.
        encoder_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer encoder.
        decoder_layers (`int`, *optional*, defaults to 6):
            Number of decoder layers.
        decoder_ffn_dim (`int`, *optional*, defaults to 2048):
            Dimensionality of the "intermediate" (often named feed-forward) layer in the decoder.
        decoder_attention_heads (`int`, *optional*, defaults to 8):
            Number of attention heads for each attention layer in the Transformer decoder.
        decoder_layerdrop (`float`, *optional*, defaults to 0.0):
            The LayerDrop probability for the decoder. See the [LayerDrop paper](https://arxiv.org/abs/1909.11556)
            for more details.
        use_cache (`bool`, *optional*, defaults to `True`):
            Whether the model should return the last key/values attentions (not used by all models).
        is_encoder_decoder (`bool`, *optional*, defaults to `True`):
            Whether the model is set up as an encoder-decoder architecture for sequence-to-sequence tasks.
        activation_function (`str` or `function`, *optional*, defaults to `"relu"`):
            The non-linear activation function (function or string) in the encoder and pooler. If string, `"gelu"`,
            `"relu"`, `"silu"` and `"gelu_new"` are supported.
        d_model (`int`, *optional*, defaults to 512):
            Dimensionality of the layers and the pooler layer.
        dropout (`float`, *optional*, defaults to 0.1):
            The dropout probability for all fully connected layers in the embeddings, encoder, and pooler.
        attention_dropout (`float`, *optional*, defaults to 0.1):
            The dropout ratio for the attention probabilities.
        activation_dropout (`float`, *optional*, defaults to 0.1):
            The dropout ratio for activations inside the fully connected layer.
        init_std (`float`, *optional*, defaults to 0.02):
            The standard deviation of the truncated_normal_initializer for initializing all weight matrices.
        decoder_start_token_id (`int`, *optional*, defaults to 2):
            The initial token id of the decoder when decoding sequences.
        scale_embedding (`bool`, *optional*, defaults to `True`):
            Whether the embeddings are scaled by the square root of `d_model`.
        pad_token_id (`int`, *optional*, defaults to 1):
            Padding token id.
        bos_token_id (`int`, *optional*, defaults to 0):
            The id of the beginning-of-sequence token.
        eos_token_id (`int`, *optional*, defaults to 2):
            The id of the end-of-sequence token.
        max_source_positions (`int`, *optional*, defaults to 6000):
            The maximum sequence length of log-mel filter-bank features that this model might ever be used with.
        max_target_positions (`int`, *optional*, defaults to 1024):
            The maximum sequence length that this model might ever be used with. Typically, set this to something
            large just in case (e.g., 512 or 1024 or 2048).
        num_conv_layers (`int`, *optional*, defaults to 2):
            Number of 1D convolutional layers in the conv module.
        conv_kernel_sizes (`Tuple[int]`, *optional*, defaults to `(5, 5)`):
            A tuple of integers defining the kernel size of each 1D convolutional layer in the conv module. The
            length of `conv_kernel_sizes` has to match `num_conv_layers`.
        conv_channels (`int`, *optional*, defaults to 1024):
            An integer defining the number of output channels of each convolution layer except the final one in the
            conv module.
        input_feat_per_channel (`int`, *optional*, defaults to 80):
            An integer specifying the size of the feature vector. This is also the dimension of the log-mel
            filter-bank features.
        input_channels (`int`, *optional*, defaults to 1):
            An integer specifying the number of input channels of the input feature vector.

    Example:

    ```python
    >>> from transformers import Speech2TextConformerConfig, ConformerEncoderDecoderModel

    >>> # Initializing a configuration with default params
    >>> configuration = Speech2TextConformerConfig()

    >>> # Initializing a model (with random weights) from the default configuration
    >>> model = ConformerEncoderDecoderModel(configuration)

    >>> # Accessing the model configuration
    >>> configuration = model.config
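    >>> # Customizing the configuration, e.g. enabling CTC compression (the values below are
    >>> # illustrative only, not recommended settings)
    >>> custom_configuration = Speech2TextConformerConfig(
    ...     encoder_layers=8,
    ...     conformer_conv_kernel_size=15,
    ...     ctc_compress_strategy="avg",
    ... )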
|
    ```"""

    model_type = "conformer_encoder_decoder"
    keys_to_ignore_at_inference = ["past_key_values"]
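    # Map generic attribute names used across the library to this config's own parameter names.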
    attribute_map = {"num_attention_heads": "encoder_attention_heads", "hidden_size": "d_model"}

    def __init__(
        self,
        vocab_size=10000,
        encoder_layers=12,
        feed_forward_expansion_factor=4,
        conv_expansion_factor=2,
        conformer_feedforward_dropout=0.1,
        conformer_attention_dropout=0.1,
        conformer_conv_dropout=0.1,
        conformer_conv_kernel_size=31,
        conformer_half_step_residual=True,
        no_syncbatchnorm=False,
        batch_unsafe_relative_shift=False,
        ctc_compress_strategy="none",
        ctc_compress_fixed_ratio=4,
        ctc_compress_max_out_size=-1,
        encoder_attention_heads=8,
        decoder_layers=6,
        decoder_ffn_dim=2048,
        decoder_attention_heads=8,
        decoder_layerdrop=0.0,
        use_cache=True,
        is_encoder_decoder=True,
        activation_function="relu",
        d_model=512,
        dropout=0.1,
        attention_dropout=0.1,
        activation_dropout=0.1,
        init_std=0.02,
        decoder_start_token_id=2,
        scale_embedding=True,
        pad_token_id=1,
        bos_token_id=0,
        eos_token_id=2,
        max_source_positions=6000,
        max_target_positions=1024,
        num_conv_layers=2,
        conv_kernel_sizes=(5, 5),
        conv_channels=1024,
        input_feat_per_channel=80,
        input_channels=1,
        **kwargs,
    ):
        self.vocab_size = vocab_size
        self.d_model = d_model
        self.feed_forward_expansion_factor = feed_forward_expansion_factor
        self.conv_expansion_factor = conv_expansion_factor
        self.conformer_feedforward_dropout = conformer_feedforward_dropout
        self.conformer_attention_dropout = conformer_attention_dropout
        self.conformer_conv_dropout = conformer_conv_dropout
        self.conformer_conv_kernel_size = conformer_conv_kernel_size
        self.conformer_half_step_residual = conformer_half_step_residual
        self.no_syncbatchnorm = no_syncbatchnorm
        self.batch_unsafe_relative_shift = batch_unsafe_relative_shift
        self.ctc_compress_strategy = ctc_compress_strategy
        self.ctc_compress_fixed_ratio = ctc_compress_fixed_ratio
        self.ctc_compress_max_out_size = ctc_compress_max_out_size
        self.encoder_layers = encoder_layers
        self.encoder_attention_heads = encoder_attention_heads
        self.decoder_ffn_dim = decoder_ffn_dim
        self.decoder_layers = decoder_layers
        self.decoder_attention_heads = decoder_attention_heads
        self.dropout = dropout
        self.attention_dropout = attention_dropout
        self.activation_dropout = activation_dropout
        self.activation_function = activation_function
        self.init_std = init_std
        self.decoder_layerdrop = decoder_layerdrop
        self.use_cache = use_cache
        self.num_hidden_layers = encoder_layers
        self.scale_embedding = scale_embedding
        self.max_source_positions = max_source_positions
        self.max_target_positions = max_target_positions
        self.num_conv_layers = num_conv_layers
        self.conv_kernel_sizes = list(conv_kernel_sizes)
        self.conv_channels = conv_channels
        self.input_feat_per_channel = input_feat_per_channel
        self.input_channels = input_channels
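        # Consistency checks: the CTC compression strategy must be one of the supported values, and the
        # conv module must define exactly one kernel size per convolutional layer.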
        if self.ctc_compress_strategy not in ["none", "avg", "weighted", "softmax", "fixed"]:
            raise ValueError(
                f"Configuration value for `ctc_compress_strategy` is invalid. `{self.ctc_compress_strategy}` is set, "
                "but the allowed values are: `none`, `avg`, `weighted`, `softmax`, `fixed`."
            )

        if len(self.conv_kernel_sizes) != self.num_conv_layers:
            raise ValueError(
                "Configuration for the convolutional module is incorrect. It is required that "
                "`len(config.conv_kernel_sizes) == config.num_conv_layers`, but "
                f"`len(config.conv_kernel_sizes) = {len(self.conv_kernel_sizes)}` and "
                f"`config.num_conv_layers = {self.num_conv_layers}`."
            )

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            is_encoder_decoder=is_encoder_decoder,
            decoder_start_token_id=decoder_start_token_id,
            **kwargs,
        )