from abc import ABC, abstractmethod
from typing import List

import torch

from nemo.core.classes import ModelPT
from nemo.core.classes.common import PretrainedModelInfo
from nemo.core.classes.exportable import Exportable
from nemo.core.classes.mixins import AccessMixin
from nemo.core.utils.neural_type_utils import get_io_names
from nemo.utils import logging, model_utils
from nemo.utils.cast_utils import cast_all

__all__ = ['ASRModel']


class ASRModel(ModelPT, ABC):
|
    @abstractmethod
    def transcribe(self, paths2audio_files: List[str], batch_size: int = 4) -> List[str]:
        """
        Takes paths to audio files and returns text transcriptions.

        Args:
            paths2audio_files: paths to the audio fragments to be transcribed.
            batch_size: number of audio files to process per inference batch.

        Returns:
            A list of transcription texts, one per input audio file.
        """
        pass
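    # A minimal usage sketch (hypothetical audio paths; this abstract method is
    # implemented by concrete subclasses such as the EncDec* ASR models):
    #
    #     asr_model = SomeASRSubclass.restore_from('model.nemo')  # hypothetical subclass
    #     texts = asr_model.transcribe(paths2audio_files=['a.wav', 'b.wav'], batch_size=2)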
|
    def multi_validation_epoch_end(self, outputs, dataloader_idx: int = 0):
        # Loss is averaged across batches; WER is aggregated as the ratio of summed
        # error counts to summed reference-word counts, i.e. a corpus-level WER.
        val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean()
        wer_num = torch.stack([x['val_wer_num'] for x in outputs]).sum()
        wer_denom = torch.stack([x['val_wer_denom'] for x in outputs]).sum()
        tensorboard_logs = {'val_loss': val_loss_mean, 'val_wer': wer_num / wer_denom}
        return {'val_loss': val_loss_mean, 'log': tensorboard_logs}

    def multi_test_epoch_end(self, outputs, dataloader_idx: int = 0):
        test_loss_mean = torch.stack([x['test_loss'] for x in outputs]).mean()
        wer_num = torch.stack([x['test_wer_num'] for x in outputs]).sum()
        wer_denom = torch.stack([x['test_wer_denom'] for x in outputs]).sum()
        tensorboard_logs = {'test_loss': test_loss_mean, 'test_wer': wer_num / wer_denom}
        return {'test_loss': test_loss_mean, 'log': tensorboard_logs}
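    # Why sum numerators and denominators instead of averaging per-batch WERs: the two
    # are not equivalent when batches differ in length. A toy example (made-up counts):
    #
    #     batch 1: 2 errors / 10 words   -> WER 0.20
    #     batch 2: 30 errors / 100 words -> WER 0.30
    #     mean of per-batch WERs: 0.25; corpus-level WER: 32 / 110 ≈ 0.29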
|
    @classmethod
    def list_available_models(cls) -> 'List[PretrainedModelInfo]':
        """
        This method returns a list of pre-trained models which can be instantiated directly from NVIDIA's NGC cloud.

        Returns:
            List of available pre-trained models.
        """
        list_of_models = model_utils.resolve_subclass_pretrained_model_info(cls)
        return list_of_models
|
    def add_auxiliary_losses(self, loss: torch.Tensor, reset_registry: bool = False) -> torch.Tensor:
        """
        Utility method to enable calculation of auxiliary losses for ASR training.

        Args:
            loss: The output loss value prior to addition with auxiliary losses.
            reset_registry: Bool, whether to reset the AccessMixin registry after adding auxiliary losses.

        Returns:
            Loss tensor used for back propagation.
        """
        # Fold in any adapter auxiliary losses that submodules registered via AccessMixin.
        if AccessMixin.is_access_enabled():
            registry = AccessMixin.get_module_registry(self)
            log_dict = {}

            for loss_key, loss_registry in registry.items():
                # Add this module's auxiliary loss to the total loss
                if 'adapter_loss' in loss_registry:
                    loss_list = loss_registry['adapter_loss']
                    loss_value = sum(loss_list)
                    loss += loss_value

                    # Log the loss under a key derived from the module path
                    keys = loss_key.split(".")
                    key = "/".join(keys)
                    key = "adapter_loss/" + key
                    log_dict[key] = loss_value.detach()

            if len(log_dict) > 0:
                self.log_dict(log_dict)

        if reset_registry:
            AccessMixin.reset_registry(self)

        return loss
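    # Typical call pattern (sketch): a training_step computes its primary loss, then
    # folds in any registered adapter losses before backpropagation:
    #
    #     loss = self.add_auxiliary_losses(loss, reset_registry=True)
    #
    # (When to reset the registry is illustrative here; concrete models decide.)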
|
    def setup_optimization_flags(self):
        """
        Utility method that must be explicitly called by the subclass in order to support optional optimization flags.
        This method is the only valid place to access self.cfg prior to DDP training.

        The subclass may choose not to support this method, therefore all variables here must be checked via hasattr()
        """
        # Skip the update step if NaN/Inf gradients appear on any rank.
        self._skip_nan_grad = False
        if "skip_nan_grad" in self._cfg and self._cfg["skip_nan_grad"]:
            self._skip_nan_grad = self._cfg["skip_nan_grad"]
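    # Config sketch (YAML) for enabling the flag above, assuming the subclass calls
    # setup_optimization_flags() in its constructor:
    #
    #     model:
    #       skip_nan_grad: true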
|
    def on_after_backward(self):
        """
        Zero out the gradients if any of them contain NaN or Inf values.
        """
        super().on_after_backward()

        if hasattr(self, '_skip_nan_grad') and self._skip_nan_grad:
            device = next(self.parameters()).device
            valid_gradients = torch.tensor([1], device=device, dtype=torch.float32)

            # Check whether any gradient on this rank contains NaN or Inf values
            for param_name, param in self.named_parameters():
                if param.grad is not None:
                    is_not_nan_or_inf = not (torch.isnan(param.grad).any() or torch.isinf(param.grad).any())
                    if not is_not_nan_or_inf:
                        valid_gradients = valid_gradients * 0
                        break

            if torch.distributed.is_initialized():
                torch.distributed.all_reduce(valid_gradients, op=torch.distributed.ReduceOp.MIN)

            if valid_gradients < 1:
                logging.warning('detected inf or nan values in gradients! Setting gradients to zero.')
                self.zero_grad()
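    # Distributed note: the all_reduce with ReduceOp.MIN means that if any single rank
    # saw a NaN/Inf gradient, every rank observes valid_gradients == 0 and zeroes its
    # gradients, so DDP replicas skip the update together and stay in sync.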
|
|
class ExportableEncDecModel(Exportable):
    """
    Simple utility mix-in to export models that consist of an encoder/decoder pair
    plus pre/post processor, but have to be exported as the encoder/decoder pair only
    (covers most ASR classes)
    """

    @property
    def input_module(self):
        return self.encoder

    @property
    def output_module(self):
        return self.decoder

    @property
    def output_names(self):
        otypes = self.output_module.output_types
        if hasattr(self.input_module, 'export_cache_support') and self.input_module.export_cache_support:
            # For cache-aware (streaming) export, keep the decoder's first output and
            # append the encoder's cache outputs so the exported graph returns them.
            in_types = self.input_module.output_types
            otypes = {n: t for (n, t) in list(otypes.items())[:1]}
            for (n, t) in list(in_types.items())[1:]:
                otypes[n] = t
        return get_io_names(otypes, self.disabled_deployment_output_names)
|
    def forward_for_export(
        self, input, length=None, cache_last_channel=None, cache_last_time=None, cache_last_channel_len=None
    ):
        """
        This forward is used when we need to export the model to ONNX format.
        The inputs cache_last_channel and cache_last_time need to be passed for exporting streaming models.
        When they are passed, the inputs are run through the encoder part only; note that the ONNX
        conversion does not yet fully work for this case.

        Args:
            input: Tensor that represents a batch of raw audio signals of shape [B, T],
                where T is the number of timesteps.
            length: Vector of length B that contains the individual lengths of the audio sequences.
            cache_last_channel: Tensor of shape [N, B, T, H] which contains the cache for last channel layers.
            cache_last_time: Tensor of shape [N, B, H, T] which contains the cache for last time layers.
                N is the number of such layers which need caching, B is the batch size, H is the hidden
                size of activations, and T is the length of the cache.

        Returns:
            the output of the model
        """
        # Prefer the module's export-specific forward when it provides one.
        if hasattr(self.input_module, 'forward_for_export'):
            encoder_forward = self.input_module.forward_for_export
        else:
            encoder_forward = self.input_module

        if cache_last_channel is None and cache_last_time is None:
            encoder_output = encoder_forward(audio_signal=input, length=length)
        else:
            encoder_output = encoder_forward(
                audio_signal=input,
                length=length,
                cache_last_channel=cache_last_channel,
                cache_last_time=cache_last_time,
                cache_last_channel_len=cache_last_channel_len,
            )

        if isinstance(encoder_output, tuple):
            decoder_input = encoder_output[0]
        else:
            decoder_input = encoder_output

        if hasattr(self.output_module, 'forward_for_export'):
            ret = self.output_module.forward_for_export(encoder_output=decoder_input)
        else:
            ret = self.output_module(encoder_output=decoder_input)

        # For cache-aware export, append the updated cache tensors from the encoder
        # output so they appear among the exported graph's outputs.
        if cache_last_channel is not None or cache_last_time is not None:
            if isinstance(ret, tuple):
                ret = (ret[0], encoder_output[1], encoder_output[2], encoder_output[3], encoder_output[4])
            else:
                ret = (ret, encoder_output[1], encoder_output[2], encoder_output[3], encoder_output[4])
        return cast_all(ret, from_dtype=torch.float16, to_dtype=torch.float32)
|
    @property
    def disabled_deployment_input_names(self):
        return self.encoder.disabled_deployment_input_names

    @property
    def disabled_deployment_output_names(self):
        return self.encoder.disabled_deployment_output_names
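# A minimal export sketch (assumes a concrete NeMo ASR model class that mixes in
# ExportableEncDecModel; 'asr_model.onnx' is a hypothetical output path):
#
#     model = SomeEncDecASRModel.restore_from('model.nemo')  # hypothetical class name
#     model.export('asr_model.onnx')  # Exportable provides the export() entry point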