|
|
|
|
|
|
|
|
|
|
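# Linformer encoder components built on top of HuggingFace Roberta modules:
# a Linformer self-attention encoder layer (LinformerTransformerEncoderLayer),
# its feed-forward output block (RobertaOutput), and a bi-directional encoder
# stack (LinformerTransformerEncoder) for BERT/XLM style pre-trained models.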
|
import torch |
|
from fairseq import utils |
|
from fairseq.models.transformer import * |
|
from typing import Callable, List, Optional, Set, Tuple, Union |
|
import inspect |
|
|
|
import math |
|
|
|
import torch.nn as nn |
|
|
|
|
|
|
|
|
|
|
|
|
from .multihead_linear_attention import MultiheadLinearAttention |
|
from transformers.models.roberta.modeling_roberta import RobertaEncoder, RobertaConfig, RobertaModel, RobertaLMHead, RobertaForMaskedLM, RobertaLayer |
|
|
|
|
|
class LinformerTransformerEncoderLayer(RobertaLayer): |
|
""" |
|
Implements a Linformer Encoder Layer used in BERT/XLM style pre-trained |
|
models. |
|
""" |
|
|
|
    def __init__(self, config, shared_compress_layer):
        # wrap the shared compression layer in a plain list so it is not
        # registered as a submodule of every encoder layer
        self.shared_compress_layer = [shared_compress_layer]

        super().__init__(config)
        self.config = config
        self.attention = self.build_self_attention(config.embed_dim, config)
        self.attn_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.final_layer_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)
        self.output = RobertaOutput(config)
|
|
|
|
|
|
|
def build_self_attention(self, embed_dim, args): |
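        # Linformer self-attention: keys and values are projected from
        # max_seq_len positions down to max_seq_len // compressed positions,
        # optionally sharing that projection across k/v and across layers.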
|
|
|
attn = MultiheadLinearAttention( |
|
embed_dim, |
|
args.encoder_attention_heads, |
|
dropout=args.dropout, |
|
self_attention=True, |
|
q_noise=args.quant_noise_pq, |
|
qn_block_size=args.quant_noise_pq_block_size, |
|
compressed=args.compressed, |
|
max_seq_len=args.max_positions, |
|
shared_kv_compressed=args.shared_kv_compressed, |
|
shared_compress_layer=self.shared_compress_layer[0], |
|
freeze_compress=args.freeze_compress, |
|
) |
|
return attn |
|
|
|
def feed_forward_chunk(self, attention_output): |
|
|
|
residual = attention_output |
|
|
|
x = self.intermediate(attention_output) |
|
|
|
layer_output = self.output(x, residual) |
|
|
|
return layer_output |
|
|
|
def forward( |
|
self, |
|
hidden_states: torch.Tensor, |
|
attention_mask: Optional[torch.FloatTensor] = None, |
|
head_mask: Optional[torch.FloatTensor] = None, |
|
encoder_hidden_states: Optional[torch.FloatTensor] = None, |
|
encoder_attention_mask: Optional[torch.FloatTensor] = None, |
|
past_key_value: Optional[Tuple[Tuple[torch.FloatTensor]]] = None, |
|
output_attentions: Optional[bool] = False, |
|
    ) -> Tuple[torch.Tensor, ...]:
|
|
|
residual = hidden_states |
|
|
|
if self.attn_layer_norm is not None: |
|
hidden_states = self.attn_layer_norm(hidden_states) |
|
|
|
|
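        # self.attention is expected to accept the HuggingFace-style arguments
        # used below; the first two entries of past_key_value are the cached
        # self-attention key/value states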
|
self_attn_past_key_value = past_key_value[:2] if past_key_value is not None else None |
|
self_attention_outputs = self.attention( |
|
hidden_states, |
|
attention_mask, |
|
head_mask, |
|
output_attentions=output_attentions, |
|
past_key_value=self_attn_past_key_value, |
|
) |
|
attention_output = self_attention_outputs[0] |
|
|
|
|
|
if self.is_decoder: |
|
outputs = self_attention_outputs[1:-1] |
|
present_key_value = self_attention_outputs[-1] |
|
else: |
|
outputs = self_attention_outputs[1:] |
|
|
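        # cross-attention is only exercised when this layer is used inside a
        # decoder; the last two entries of past_key_value cache its key/values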
|
cross_attn_present_key_value = None |
|
if self.is_decoder and encoder_hidden_states is not None: |
|
if not hasattr(self, "crossattention"): |
|
raise ValueError( |
|
f"If `encoder_hidden_states` are passed, {self} has to be instantiated with cross-attention layers" |
|
" by setting `config.add_cross_attention=True`" |
|
) |
|
|
|
|
|
cross_attn_past_key_value = past_key_value[-2:] if past_key_value is not None else None |
|
cross_attention_outputs = self.crossattention( |
|
attention_output, |
|
attention_mask, |
|
head_mask, |
|
encoder_hidden_states, |
|
encoder_attention_mask, |
|
cross_attn_past_key_value, |
|
output_attentions, |
|
) |
|
attention_output = cross_attention_outputs[0] |
|
outputs = outputs + cross_attention_outputs[1:-1] |
|
|
|
|
|
cross_attn_present_key_value = cross_attention_outputs[-1] |
|
present_key_value = present_key_value + cross_attn_present_key_value |
|
|
|
attention_output = attention_output + residual |
|
|
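        # feed-forward sub-block (pre-norm): save the residual, normalize, run the
        # chunked intermediate/output projection, then add the residual back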
|
residual = attention_output |
|
|
|
attention_output = self.final_layer_norm(attention_output) |
|
|
|
layer_output = apply_chunking_to_forward( |
|
self.feed_forward_chunk, self.chunk_size_feed_forward, self.seq_len_dim, attention_output |
|
) |
|
layer_output = layer_output + residual |
|
|
|
outputs = (layer_output,) + outputs |
|
|
|
|
|
if self.is_decoder: |
|
outputs = outputs + (present_key_value,) |
|
|
|
return outputs |
|
|
|
    def upgrade_state_dict_named(self, state_dict, name):
        # the HuggingFace base class does not define upgrade_state_dict_named,
        # so only delegate when a parent class actually provides it
        if hasattr(super(), "upgrade_state_dict_named"):
            super().upgrade_state_dict_named(state_dict, name)
        prefix = name + "." if name != "" else ""

        # older checkpoints stored the shared compression layer under this module;
        # rebuild the attention without sharing so those checkpoints still load
        if utils.item(state_dict.get(f"{prefix}version", torch.tensor(1))) < 2:
            state_dict[f"{prefix}version"] = torch.tensor(1)
            if f"{prefix}shared_compress_layer.weight" in state_dict:
                # reinitialize the block without a shared compression layer
                self.shared_compress_layer = [
                    torch.nn.Linear(
                        self.shared_compress_layer[0].weight.size(1),
                        self.shared_compress_layer[0].weight.size(0),
                    )
                ]
                self.attention = self.build_self_attention(self.config.embed_dim, self.config)
                # drop the now-unused shared weights from the state dict
                del state_dict[f"{prefix}shared_compress_layer.weight"]
                if f"{prefix}shared_compress_layer.bias" in state_dict:
                    del state_dict[f"{prefix}shared_compress_layer.bias"]
|
|
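# Feed-forward output projection used by LinformerTransformerEncoderLayer. Unlike
# the stock HuggingFace RobertaOutput, the residual connection and layer norm are
# handled by the calling layer (pre-norm), so this block only projects and applies
# dropout.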
|
class RobertaOutput(nn.Module): |
|
def __init__(self, config): |
|
super().__init__() |
|
self.dense = nn.Linear(config.intermediate_size, config.hidden_size) |
|
self.LayerNorm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps) |
|
self.dropout = nn.Dropout(config.hidden_dropout_prob) |
|
|
|
    def forward(self, hidden_states: torch.Tensor, input_tensor: torch.Tensor) -> torch.Tensor:
        # project back to the model dimension and apply dropout; the residual
        # connection and layer norm are applied by the calling encoder layer
        # (pre-norm), so input_tensor is intentionally not consumed here
        hidden_states = self.dense(hidden_states)
        hidden_states = self.dropout(hidden_states)
        return hidden_states
|
|
|
|
|
|
|
class LinformerTransformerEncoder(RobertaEncoder): |
|
""" |
|
Implementation for a Bi-directional Linformer based Sentence Encoder used |
|
in BERT/XLM style pre-trained models. |
|
|
|
This first computes the token embedding using the token embedding matrix, |
|
position embeddings (if specified) and segment embeddings |
|
(if specified). After applying the specified number of |
|
    LinformerTransformerEncoderLayers, it outputs all the internal states of the
|
encoder as well as the final representation associated with the first |
|
token (usually CLS token). |
|
|
|
Input: |
|
- tokens: B x T matrix representing sentences |
|
- segment_labels: B x T matrix representing segment label for tokens |
|
|
|
Output: |
|
- a tuple of the following: |
|
- a list of internal model states used to compute the |
|
predictions where each tensor has shape T x B x C |
|
- sentence representation associated with first input token |
|
in format B x C. |
|
""" |
|
|
|
    def __init__(self, config, **kwargs):
        compress_layer = None
        if config.shared_layer_kv_compressed == 1:
            # a single key/value compression projection shared by every layer
            compress_layer = nn.Linear(
                config.max_positions,
                config.max_positions // config.compressed,
            )
            # initialize parameters of the compression layer
            nn.init.xavier_uniform_(compress_layer.weight, gain=1 / math.sqrt(2))
            if config.freeze_compress == 1:
                compress_layer.weight.requires_grad = False

        super().__init__(config)

        self.layer = nn.ModuleList(
            [
                LinformerTransformerEncoderLayer(config, compress_layer)
                for _ in range(config.num_layers)
            ]
        )
        self.compress_layer = compress_layer
        self.layer_norm = nn.LayerNorm(config.embed_dim)
|
|
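# Chunked feed-forward helper, mirroring apply_chunking_to_forward from
# HuggingFace Transformers; kept at module level so the encoder layer above can
# call it directly.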
|
def apply_chunking_to_forward( |
|
forward_fn: Callable[..., torch.Tensor], chunk_size: int, chunk_dim: int, *input_tensors |
|
) -> torch.Tensor: |
|
""" |
|
This function chunks the `input_tensors` into smaller input tensor parts of size `chunk_size` over the dimension |
|
`chunk_dim`. It then applies a layer `forward_fn` to each chunk independently to save memory. |
|
|
|
If the `forward_fn` is independent across the `chunk_dim` this function will yield the same result as directly |
|
applying `forward_fn` to `input_tensors`. |
|
|
|
Args: |
|
forward_fn (`Callable[..., torch.Tensor]`): |
|
The forward function of the model. |
|
chunk_size (`int`): |
|
The chunk size of a chunked tensor: `num_chunks = len(input_tensors[0]) / chunk_size`. |
|
chunk_dim (`int`): |
|
The dimension over which the `input_tensors` should be chunked. |
|
input_tensors (`Tuple[torch.Tensor]`): |
|
The input tensors of `forward_fn` which will be chunked |
|
|
|
Returns: |
|
        `torch.Tensor`: A tensor with the same shape as `forward_fn` would have given if applied.
|
|
|
|
|
Examples: |
|
|
|
```python |
|
# rename the usual forward() fn to forward_chunk() |
|
def forward_chunk(self, hidden_states): |
|
hidden_states = self.decoder(hidden_states) |
|
return hidden_states |
|
|
|
|
|
# implement a chunked forward function |
|
def forward(self, hidden_states): |
|
return apply_chunking_to_forward(self.forward_chunk, self.chunk_size_lm_head, self.seq_len_dim, hidden_states) |
|
```""" |
|
|
|
assert len(input_tensors) > 0, f"{input_tensors} has to be a tuple/list of tensors" |
|
|
|
|
|
num_args_in_forward_chunk_fn = len(inspect.signature(forward_fn).parameters) |
|
if num_args_in_forward_chunk_fn != len(input_tensors): |
|
raise ValueError( |
|
f"forward_chunk_fn expects {num_args_in_forward_chunk_fn} arguments, but only {len(input_tensors)} input " |
|
"tensors are given" |
|
) |
|
|
|
if chunk_size > 0: |
|
tensor_shape = input_tensors[0].shape[chunk_dim] |
|
for input_tensor in input_tensors: |
|
if input_tensor.shape[chunk_dim] != tensor_shape: |
|
raise ValueError( |
|
f"All input tenors have to be of the same shape: {tensor_shape}, " |
|
f"found shape {input_tensor.shape[chunk_dim]}" |
|
) |
|
|
|
if input_tensors[0].shape[chunk_dim] % chunk_size != 0: |
|
raise ValueError( |
|
f"The dimension to be chunked {input_tensors[0].shape[chunk_dim]} has to be a multiple of the chunk " |
|
f"size {chunk_size}" |
|
) |
|
|
|
num_chunks = input_tensors[0].shape[chunk_dim] // chunk_size |
|
|
|
|
|
input_tensors_chunks = tuple(input_tensor.chunk(num_chunks, dim=chunk_dim) for input_tensor in input_tensors) |
|
|
|
output_chunks = tuple(forward_fn(*input_tensors_chunk) for input_tensors_chunk in zip(*input_tensors_chunks)) |
|
|
|
return torch.cat(output_chunks, dim=chunk_dim) |
|
|
|
return forward_fn(*input_tensors) |
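

# Usage sketch (an assumption, not part of the original module): the `config`
# object must carry both the standard Roberta fields and the Linformer-specific
# attributes accessed above (embed_dim, encoder_attention_heads, max_positions,
# compressed, shared_kv_compressed, shared_layer_kv_compressed, freeze_compress,
# num_layers, quant_noise_pq, quant_noise_pq_block_size, dropout). A hypothetical
# setup could look like:
#
#     config = RobertaConfig(hidden_size=768, intermediate_size=3072)
#     config.embed_dim = config.hidden_size
#     config.encoder_attention_heads = config.num_attention_heads
#     config.max_positions = config.max_position_embeddings
#     config.compressed = 4                  # Linformer compression factor
#     config.shared_kv_compressed = 1        # share the k/v projection within a layer
#     config.shared_layer_kv_compressed = 1  # share the projection across layers
#     config.freeze_compress = 0
#     config.num_layers = config.num_hidden_layers
#     config.quant_noise_pq = 0.0
#     config.quant_noise_pq_block_size = 8
#     config.dropout = config.attention_probs_dropout_prob
#
#     encoder = LinformerTransformerEncoder(config)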
|
|