import os

import pytest
import torch
from omegaconf import DictConfig
from pytorch_lightning import Trainer

from nemo.collections.common.tokenizers.huggingface.auto_tokenizer import AutoTokenizer
from nemo.collections.nlp.models.language_modeling.megatron_gpt_model import MegatronGPTModel
from nemo.collections.nlp.modules.common.megatron.utils import get_ltor_masks_and_position_ids
from nemo.collections.nlp.parts.nlp_overrides import NLPDDPStrategy

# Used below to skip the bf16 parameterization on GPUs without bfloat16 support
# (compute capability < 8.0).
DEVICE_CAPABILITY = None
if torch.cuda.is_available():
    DEVICE_CAPABILITY = torch.cuda.get_device_capability()


@pytest.fixture()
def model_cfg(test_data_dir):
    # Minimal single-layer GPT config so the model builds quickly in unit tests.
    model_cfg = {
        'precision': 16,
        'micro_batch_size': 4,
        'global_batch_size': 8,
        'tensor_model_parallel_size': 1,
        'pipeline_model_parallel_size': 1,
        'resume_from_checkpoint': None,
        'encoder_seq_length': 512,
        'max_position_embeddings': 512,
        'num_layers': 1,
        'hidden_size': 128,
        'ffn_hidden_size': 512,
        'num_attention_heads': 2,
        'init_method_std': 0.02,
        'hidden_dropout': 0.1,
        'kv_channels': None,
        'apply_query_key_layer_scaling': True,
        'layernorm_epsilon': 1e-5,
        'make_vocab_size_divisible_by': 128,
        'pre_process': True,
        'post_process': True,
        'persist_layer_norm': True,
        'gradient_as_bucket_view': True,
        'tokenizer': {
            'library': 'megatron',
            'type': 'GPT2BPETokenizer',
            'model': None,
            'vocab_file': os.path.join(test_data_dir, 'nlp/gpt_vocab_merges/vocab.json'),
            'merge_file': os.path.join(test_data_dir, 'nlp/gpt_vocab_merges/merges.txt'),
            'delimiter': None,
        },
        'native_amp_init_scale': 4294967296,
        'native_amp_growth_interval': 1000,
        'hysteresis': 2,
        'fp32_residual_connection': False,
        'fp16_lm_cross_entropy': False,
        'megatron_amp_O2': False,
        'seed': 1234,
        'use_cpu_initialization': False,
        'onnx_safe': False,
        'apex_transformer_log_level': 30,
        'activations_checkpoint_method': None,
        'activations_checkpoint_num_layers': 1,
        'data': {
            'data_prefix': '???',
            'index_mapping_dir': None,
            'data_impl': 'mmap',
            'splits_string': '900,50,50',
            'seq_length': 512,
            'skip_warmup': True,
            'num_workers': 2,
            'dataloader_type': 'single',
            'reset_position_ids': False,
            'reset_attention_mask': False,
            'eod_mask_loss': False,
        },
        'optim': {
            'name': 'fused_adam',
            'lr': 2e-4,
            'weight_decay': 0.01,
            'betas': [0.9, 0.98],
            'sched': {'name': 'CosineAnnealing', 'warmup_steps': 500, 'constant_steps': 50000, 'min_lr': 2e-5},
        },
    }
    return model_cfg


@pytest.fixture()
def trainer_cfg():
    trainer_cfg = {
        'devices': 1,
        'num_nodes': 1,
        'accelerator': 'gpu',
        'precision': 16,
        'logger': False,
        'enable_checkpointing': False,
        'replace_sampler_ddp': False,
        'max_epochs': 1000,
        'max_steps': 100000,
        'log_every_n_steps': 10,
        'val_check_interval': 100,
        'limit_val_batches': 50,
        'limit_test_batches': 500,
        'accumulate_grad_batches': 1,
        'gradient_clip_val': 1.0,
    }
    return trainer_cfg


@pytest.fixture()
def precision():
    # Default precision; test_forward overrides this via @pytest.mark.parametrize.
    return 32


@pytest.fixture()
def gpt_model(model_cfg, trainer_cfg, precision):
    model_cfg['precision'] = precision
    trainer_cfg['precision'] = precision

    strategy = NLPDDPStrategy()
    trainer = Trainer(strategy=strategy, **trainer_cfg)
    cfg = DictConfig(model_cfg)
    model = MegatronGPTModel(cfg=cfg, trainer=trainer)

    return model


@pytest.fixture()
def test_text():
    # Note: the misspelling in the last sentence is intentional; the expected token ids
    # in test_tokenizer depend on the exact text.
    test_text = [
        "hello, world",
        "four score and seven years ago",
        "Your time is limited",
        "If you set goals rediculously high",
    ]
    return test_text


@pytest.mark.run_only_on('GPU')
class TestGPTModel:
    @pytest.mark.unit
    def test_constructor(self, gpt_model):
        assert isinstance(gpt_model, MegatronGPTModel)

        num_weights = gpt_model.num_weights
        assert num_weights == 6702976

    @pytest.mark.unit
    def test_tokenizer(self, gpt_model, test_text):
        assert isinstance(gpt_model.tokenizer, AutoTokenizer)
        assert gpt_model.tokenizer.name == 'GPT2Tokenizer'
        assert gpt_model.tokenizer.vocab_size == 50257

        ids = [gpt_model.tokenizer.text_to_ids(text) for text in test_text]

        true_ids = [
            [31373, 11, 995],
            [14337, 4776, 290, 3598, 812, 2084],
            [7120, 640, 318, 3614],
            [1532, 345, 900, 4661, 2266, 291, 18117, 1029],
        ]
        assert sum([id_list == true_id_list for id_list, true_id_list in zip(ids, true_ids)]) == 4

    @pytest.mark.parametrize(
        "precision",
        [
            32,
            16,
            pytest.param(
                "bf16",
                marks=pytest.mark.skipif(
                    not DEVICE_CAPABILITY or DEVICE_CAPABILITY[0] < 8,
                    reason='bfloat16 is not supported on this device',
                ),
            ),
        ],
    )
    @pytest.mark.unit
    def test_forward(self, gpt_model, test_text):
        dtype = None
        if gpt_model.cfg['precision'] == 32:
            dtype = torch.float
        elif gpt_model.cfg['precision'] == 16:
            dtype = torch.float16
        elif gpt_model.cfg['precision'] == 'bf16':
            dtype = torch.bfloat16
        else:
            raise ValueError(f"precision: {gpt_model.cfg['precision']} is not supported.")

        gpt_model.eval()

        ids = [gpt_model.tokenizer.text_to_ids(text) for text in test_text]
        id_tensors = [torch.unsqueeze(torch.LongTensor(id_list), dim=0) for id_list in ids]

        # get_ltor_masks_and_position_ids returns (attention_mask, loss_mask, position_ids).
        masks_and_position_ids = [
            get_ltor_masks_and_position_ids(id_tensor, gpt_model.tokenizer.eos_id, False, False, False)
            for id_tensor in id_tensors
        ]

        output_tensors = []
        with torch.no_grad():
            for tokens, attn_mask_and_pos_ids in zip(id_tensors, masks_and_position_ids):
                attn_mask, _, pos_ids = attn_mask_and_pos_ids
                assert tokens.shape == pos_ids.shape
                assert attn_mask.shape[2] == attn_mask.shape[3] == tokens.shape[1] == pos_ids.shape[1]
                with torch.autocast('cuda', dtype=dtype):
                    output_tensor = gpt_model.forward(
                        tokens=tokens.cuda(),
                        text_position_ids=pos_ids.cuda(),
                        attention_mask=attn_mask.cuda(),
                        labels=None,
                    )

                # Logits come back as [batch, seq_len, padded_vocab_size] in the autocast dtype.
                assert output_tensor.shape[0] == 1
                assert output_tensor.shape[1] == tokens.shape[1]
                assert output_tensor.shape[2] == gpt_model.padded_vocab_size
                assert output_tensor.dtype == dtype
                output_tensors.append(output_tensor)
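

# Example invocation (a sketch under assumptions: the repository's conftest.py is presumed to
# provide the `test_data_dir` fixture and to handle the custom `run_only_on('GPU')` marker, and
# a CUDA-capable GPU must be available; the bf16 case additionally needs compute capability >= 8.0):
#
#   pytest -m unit <path-to-this-test-file>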