# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
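
"""Unit tests for MegatronGPTModel: constructor, tokenizer round-trip, and forward pass."""
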
import os

import pytest
import torch
from omegaconf import DictConfig
from pytorch_lightning import Trainer

from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy

# Used by test_forward below to skip the bf16 parametrization on GPUs older than
# compute capability 8.0 (pre-Ampere), which lack bfloat16 support.
DEVICE_CAPABILITY = None
if torch.cuda.is_available():
    DEVICE_CAPABILITY = torch.cuda.get_device_capability()


@pytest.fixture()
def model_cfg(test_data_dir):
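    # Minimal single-layer Megatron GPT config (hidden_size=128, two attention heads)
    # using the GPT-2 BPE vocab/merges bundled with the test data directory.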
    model_cfg = {
        'precision': 16,
        'micro_batch_size': 4,
        'global_batch_size': 8,
        'tensor_model_parallel_size': 1,
        'pipeline_model_parallel_size': 1,
        'resume_from_checkpoint': None,
        'encoder_seq_length': 512,
        'max_position_embeddings': 512,
        'num_layers': 1,
        'hidden_size': 128,
        'ffn_hidden_size': 512,
        'num_attention_heads': 2,
        'init_method_std': 0.02,
        'hidden_dropout': 0.1,
        'kv_channels': None,
        'apply_query_key_layer_scaling': True,
        'layernorm_epsilon': 1e-5,
        'make_vocab_size_divisible_by': 128,
        'pre_process': True,
        'post_process': True,
        'persist_layer_norm': True,
        'gradient_as_bucket_view': True,
        'tokenizer': {
            'library': 'megatron',
            'type': 'GPT2BPETokenizer',
            'model': None,
            'vocab_file': os.path.join(test_data_dir, 'nlp/gpt_vocab_merges/vocab.json'),
            'merge_file': os.path.join(test_data_dir, 'nlp/gpt_vocab_merges/merges.txt'),
            'delimiter': None,
        },
        'native_amp_init_scale': 4294967296,
        'native_amp_growth_interval': 1000,
        'hysteresis': 2,
        'fp32_residual_connection': False,
        'fp16_lm_cross_entropy': False,
        'megatron_amp_O2': False,
        'seed': 1234,
        'use_cpu_initialization': False,
        'onnx_safe': False,
        'apex_transformer_log_level': 30,
        'activations_checkpoint_method': None,
        'activations_checkpoint_num_layers': 1,
        'data': {
            'data_prefix': '???',
            'index_mapping_dir': None,
            'data_impl': 'mmap',
            'splits_string': '900,50,50',
            'seq_length': 512,
            'skip_warmup': True,
            'num_workers': 2,
            'dataloader_type': 'single',
            'reset_position_ids': False,
            'reset_attention_mask': False,
            'eod_mask_loss': False,
        },
        'optim': {
            'name': 'fused_adam',
            'lr': 2e-4,
            'weight_decay': 0.01,
            'betas': [0.9, 0.98],
            'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 'constant_steps': 50000, 'min_lr': '2e-5'},
        },
    }
    return model_cfg


@pytest.fixture()
def trainer_cfg():
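    # Single-GPU PyTorch Lightning Trainer settings; 'precision' is overwritten by the
    # gpt_model fixture to match the requested model precision.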
    trainer_cfg = {
        'devices': 1,
        'num_nodes': 1,
        'accelerator': 'gpu',
        'precision': 16,
        'logger': False,
        'enable_checkpointing': False,
        'replace_sampler_ddp': False,
        'max_epochs': 1000,
        'max_steps': 100000,
        'log_every_n_steps': 10,
        'val_check_interval': 100,
        'limit_val_batches': 50,
        'limit_test_batches': 500,
        'accumulate_grad_batches': 1,
        'gradient_clip_val': 1.0,
    }
    return trainer_cfg


@pytest.fixture()
def precision():
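    # Default precision; test_forward overrides this fixture via pytest.mark.parametrize("precision", ...).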
    return 32


@pytest.fixture()
def gpt_model(model_cfg, trainer_cfg, precision):
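    # Builds a MegatronGPTModel attached to a Trainer using NLPDDPStrategy, applying the
    # requested precision to both the model and trainer configs.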
    model_cfg['precision'] = precision
    trainer_cfg['precision'] = precision
    strategy = NLPDDPStrategy()
    trainer = Trainer(strategy=strategy, **trainer_cfg)
    cfg = DictConfig(model_cfg)
    model = MegatronGPTModel(cfg=cfg, trainer=trainer)
    return model


@pytest.fixture()
def test_text():
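    # Short prompts with known GPT-2 BPE encodings; the spelling in the last prompt is kept
    # as-is because the expected token ids in test_tokenizer encode the text exactly as written.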
    test_text = [
        "hello, world",
        "four score and seven years ago",
        "Your time is limited",
        "If you set goals rediculously high",
    ]
    return test_text


@pytest.mark.run_only_on('GPU')
class TestGPTModel:
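    # All tests in this class require a CUDA device; the class-level run_only_on('GPU')
    # marker above skips them on CPU-only machines.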
    @pytest.mark.unit
    def test_constructor(self, gpt_model):
        assert isinstance(gpt_model, MegatronGPTModel)
        num_weights = gpt_model.num_weights
        # 6,702,976 = word embeddings (50304 * 128, vocab padded to a multiple of 128)
        # + position embeddings (512 * 128) + one transformer layer (198,272) + final layernorm (256)
        assert num_weights == 6702976

    @pytest.mark.unit
    def test_tokenizer(self, gpt_model, test_text):
        assert isinstance(gpt_model.tokenizer, AutoTokenizer)
        assert gpt_model.tokenizer.name == 'GPT2Tokenizer'
        assert gpt_model.tokenizer.vocab_size == 50257
        ids = [gpt_model.tokenizer.text_to_ids(text) for text in test_text]
        true_ids = [
            [31373, 11, 995],
            [14337, 4776, 290, 3598, 812, 2084],
            [7120, 640, 318, 3614],
            [1532, 345, 900, 4661, 2266, 291, 18117, 1029],
        ]
        assert sum([id_list == true_id_list for id_list, true_id_list in zip(ids, true_ids)]) == 4

    @pytest.mark.parametrize(
        "precision",
        [
            32,
            16,
            pytest.param(
                "bf16",
                marks=pytest.mark.skipif(
                    not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8,
                    reason='bfloat16 is not supported on this device',
                ),
            ),
        ],
    )
    @pytest.mark.unit
    def test_forward(self, gpt_model, test_text):
        dtype = None
        if gpt_model.cfg['precision'] == 32:
            dtype = torch.float
        elif gpt_model.cfg['precision'] == 16:
            dtype = torch.float16
        elif gpt_model.cfg['precision'] == 'bf16':
            dtype = torch.bfloat16
        else:
            raise ValueError(f"precision: {gpt_model.cfg['precision']} is not supported.")

        gpt_model.eval()

        ids = [gpt_model.tokenizer.text_to_ids(text) for text in test_text]
        id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids]
        masks_and_position_ids = [
            get_ltor_masks_and_position_ids(id_tensor, gpt_model.tokenizer.eos_id, False, False, False)
            for id_tensor in id_tensors
        ]
        output_tensors = []
        with torch.no_grad():
            for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids):
                attn_mask, _, pos_ids = attn_mask_and_pos_ids

                assert tokens.shape == pos_ids.shape
                assert attn_mask.shape[2] == attn_mask.shape[3] == tokens.shape[1] == pos_ids.shape[1]

                with torch.autocast('cuda', dtype=dtype):
                    output_tensor = gpt_model.forward(
                        tokens=tokens.cuda(),
                        text_position_ids=pos_ids.cuda(),
                        attention_mask=attn_mask.cuda(),
                        labels=None,
                    )

                # output logits are [b, s, padded_vocab_size]
                assert output_tensor.shape[0] == 1
                assert output_tensor.shape[1] == tokens.shape[1]
                assert output_tensor.shape[2] == gpt_model.padded_vocab_size
                assert output_tensor.dtype == dtype
                output_tensors.append(output_tensor)