# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

import os

import pytest
import torch
from omegaconf import DictConfig
from pytorch_lightning import Trainer

from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy

# Compute capability is used to skip the bf16 test on pre-Ampere (< sm_80) GPUs.
DEVICE_CAPABILITY = None
if torch.cuda.is_available():
    DEVICE_CAPABILITY = torch.cuda.get_device_capability()


@pytest.fixture()
def model_cfg(test_data_dir):
    # Minimal single-layer GPT config: small enough to build quickly on one GPU.
    model_cfg = {
        'precision': 16,
        'micro_batch_size': 4,
        'global_batch_size': 8,
        'tensor_model_parallel_size': 1,
        'pipeline_model_parallel_size': 1,
        'resume_from_checkpoint': None,
        'encoder_seq_length': 512,
        'max_position_embeddings': 512,
        'num_layers': 1,
        'hidden_size': 128,
        'ffn_hidden_size': 512,
        'num_attention_heads': 2,
        'init_method_std': 0.02,
        'hidden_dropout': 0.1,
        'kv_channels': None,
        'apply_query_key_layer_scaling': True,
        'layernorm_epsilon': 1e-5,
        'make_vocab_size_divisible_by': 128,
        'pre_process': True,
        'post_process': True,
        'persist_layer_norm': True,
        'gradient_as_bucket_view': True,
        'tokenizer': {
            'library': 'megatron',
            'type': 'GPT2BPETokenizer',
            'model': None,
            'vocab_file': os.path.join(test_data_dir, 'nlp/gpt_vocab_merges/vocab.json'),
            'merge_file': os.path.join(test_data_dir, 'nlp/gpt_vocab_merges/merges.txt'),
            'delimiter': None,
        },
        'native_amp_init_scale': 4294967296,
        'native_amp_growth_interval': 1000,
        'hysteresis': 2,
        'fp32_residual_connection': False,
        'fp16_lm_cross_entropy': False,
        'megatron_amp_O2': False,
        'seed': 1234,
        'use_cpu_initialization': False,
        'onnx_safe': False,
        'apex_transformer_log_level': 30,
        'activations_checkpoint_method': None,
        'activations_checkpoint_num_layers': 1,
        'data': {
            # '???' is OmegaConf's mandatory-value marker; these tests never build a dataset.
            'data_prefix': '???',
            'index_mapping_dir': None,
            'data_impl': 'mmap',
            'splits_string': '900,50,50',
            'seq_length': 512,
            'skip_warmup': True,
            'num_workers': 2,
            'dataloader_type': 'single',
            'reset_position_ids': False,
            'reset_attention_mask': False,
            'eod_mask_loss': False,
        },
        'optim': {
            'name': 'fused_adam',
            'lr': 2e-4,
            'weight_decay': 0.01,
            'betas': [0.9, 0.98],
            'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 'constant_steps': 50000, 'min_lr': 2e-5},
        },
    }
    return model_cfg


@pytest.fixture()
def trainer_cfg():
    trainer_cfg = {
        'devices': 1,
        'num_nodes': 1,
        'accelerator': 'gpu',
        'precision': 16,
        'logger': False,
        'enable_checkpointing': False,
        'replace_sampler_ddp': False,
        'max_epochs': 1000,
        'max_steps': 100000,
        'log_every_n_steps': 10,
        'val_check_interval': 100,
        'limit_val_batches': 50,
        'limit_test_batches': 500,
        'accumulate_grad_batches': 1,
        'gradient_clip_val': 1.0,
    }
    return trainer_cfg


@pytest.fixture()
def precision():
    return 32
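
# NOTE: `test_forward` below overrides the `precision` fixture via
# `@pytest.mark.parametrize("precision", ...)`; pytest substitutes the
# parametrized value for the fixture, so `gpt_model` is rebuilt once per
# precision (32, 16, and bf16 where the device supports it).
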
@pytest.fixture()
def gpt_model(model_cfg, trainer_cfg, precision):
    model_cfg['precision'] = precision
    trainer_cfg['precision'] = precision

    strategy = NLPDDPStrategy()
    trainer = Trainer(strategy=strategy, **trainer_cfg)
    cfg = DictConfig(model_cfg)
    model = MegatronGPTModel(cfg=cfg, trainer=trainer)

    return model


@pytest.fixture()
def test_text():
    test_text = [
        "hello, world",
        "four score and seven years ago",
        "Your time is limited",
        "If you set goals rediculously high",  # intentional misspelling; the expected token ids below depend on it
    ]
    return test_text


@pytest.mark.run_only_on('GPU')
class TestGPTModel:
    @pytest.mark.unit
    def test_constructor(self, gpt_model):
        assert isinstance(gpt_model, MegatronGPTModel)

        num_weights = gpt_model.num_weights
        assert num_weights == 6702976

    @pytest.mark.unit
    def test_tokenizer(self, gpt_model, test_text):
        assert isinstance(gpt_model.tokenizer, AutoTokenizer)
        assert gpt_model.tokenizer.name == 'GPT2Tokenizer'
        assert gpt_model.tokenizer.vocab_size == 50257

        ids = [gpt_model.tokenizer.text_to_ids(text) for text in test_text]
        true_ids = [
            [31373, 11, 995],
            [14337, 4776, 290, 3598, 812, 2084],
            [7120, 640, 318, 3614],
            [1532, 345, 900, 4661, 2266, 291, 18117, 1029],
        ]
        assert sum([id_list == true_id_list for id_list, true_id_list in zip(ids, true_ids)]) == 4

    @pytest.mark.parametrize(
        "precision",
        [
            32,
            16,
            pytest.param(
                "bf16",
                marks=pytest.mark.skipif(
                    not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8,
                    reason='bfloat16 is not supported on this device',
                ),
            ),
        ],
    )
    @pytest.mark.unit
    def test_forward(self, gpt_model, test_text):
        dtype = None
        if gpt_model.cfg['precision'] == 32:
            dtype = torch.float
        elif gpt_model.cfg['precision'] == 16:
            dtype = torch.float16
        elif gpt_model.cfg['precision'] == 'bf16':
            dtype = torch.bfloat16
        else:
            raise ValueError(f"precision: {gpt_model.cfg['precision']} is not supported.")

        gpt_model.eval()

        ids = [gpt_model.tokenizer.text_to_ids(text) for text in test_text]

        # Each prompt becomes a [1, seq_len] batch of token ids.
        id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids]

        masks_and_position_ids = [
            get_ltor_masks_and_position_ids(id_tensor, gpt_model.tokenizer.eos_id, False, False, False)
            for id_tensor in id_tensors
        ]

        output_tensors = []
        with torch.no_grad():
            for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids):
                attn_mask, _, pos_ids = attn_mask_and_pos_ids

                assert tokens.shape == pos_ids.shape
                assert attn_mask.shape[2] == attn_mask.shape[3] == tokens.shape[1] == pos_ids.shape[1]

                with torch.autocast('cuda', dtype=dtype):
                    output_tensor = gpt_model.forward(
                        tokens=tokens.cuda(),
                        text_position_ids=pos_ids.cuda(),
                        attention_mask=attn_mask.cuda(),
                        labels=None,
                    )

                # output is [b, s, v]: batch, sequence length, padded vocab size
                assert output_tensor.shape[0] == 1
                assert output_tensor.shape[1] == tokens.shape[1]
                assert output_tensor.shape[2] == gpt_model.padded_vocab_size
                assert output_tensor.dtype == dtype
                output_tensors.append(output_tensor)
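
# Running these tests requires a CUDA-capable GPU (see `run_only_on('GPU')`)
# and the `test_data_dir` fixture, which is expected to come from the
# repository's conftest.py and point at the NeMo test data checkout.
# A typical invocation (file path assumed from the usual repo layout) would be:
#
#   pytest -m unit tests/collections/nlp/test_gpt_model.py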