# Copyright (c) 2022, NVIDIA CORPORATION. All rights reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os

import numpy as np
import pytest
import torch
from numpy.testing import assert_array_equal
from omegaconf import OmegaConf

from nemo.collections.nlp.data.language_modeling.megatron.indexed_retrieval_dataset import (
    KNNIndex,
    MMapRetrievalIndexedDataset,
    MMapRetrievalIndexedDatasetBuilder,
    merge_knn_files,
)
from nemo.collections.nlp.data.language_modeling.megatron.retro_dataset import RETRODataset
from scripts.nlp_language_modeling.build_knn_map_index import build_map, dedup
try:
from apex.transformer import parallel_state
HAVE_APEX = True
except (ImportError, ModuleNotFoundError):
HAVE_APEX = False
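
# Tests for the memory-mapped retrieval dataset, the KNN map index, and the RETRODataset
# wrapper. They need apex and a one-process torch.distributed group (created in setup_class).
# Typical invocation (illustrative): pytest tests/collections/nlp/test_indexed_retrieval_dataset.py -m unit
#
# Minimal builder/reader sketch mirroring the calls exercised below; the /tmp paths and token
# values are placeholders only:
#
#   builder = MMapRetrievalIndexedDatasetBuilder('/tmp/example.bin', 64, 0, True)  # chunk_size=64, pad_id=0, retrieval db
#   builder.add_item(torch.arange(0, 200, dtype=torch.int64))
#   builder.finalize('/tmp/example.idx')
#   ds = MMapRetrievalIndexedDataset('/tmp/example')
#   chunk = ds.get_chunk(ds.get_chunk_id(0, 0))  # 2 * chunk_size tokens for a retrieval db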
@pytest.mark.run_only_on('GPU')
@pytest.mark.skipif(not HAVE_APEX, reason="apex is not installed")
class TestRetrievalIndexFiles:
@classmethod
def setup_class(cls):
        master_ip = 'localhost'
        master_port = '6000'
        init_method = f'tcp://{master_ip}:{master_port}'
        torch.distributed.init_process_group(backend='gloo', world_size=1, rank=0, init_method=init_method)
parallel_state.initialize_model_parallel(1, 1)
@pytest.mark.unit
def test_index(self):
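        # Write an .idx file for two documents (sizes 128 and 256) with stride 32, reload it,
        # and check the chunk count, per-document chunk start ids, chunk byte addresses,
        # and document pointers.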
chunk_size = 64
stride = 32
sizes = np.array([128, 256], dtype=np.int32)
dtype = np.int64
itemsize = dtype().itemsize
index_file = '/tmp/test.idx'
try:
with MMapRetrievalIndexedDataset.Index.writer(index_file, dtype, False) as index:
index.write(sizes, chunk_size, stride=stride)
index_load = MMapRetrievalIndexedDataset.Index(index_file)
assert index_load.chunk_size == chunk_size
assert not index_load.retrieval_db
assert np.array_equal(index_load.sizes, sizes)
assert np.array_equal(
index_load._chunk_id_start,
np.array([0, len(range(0, sizes[0] - chunk_size + 1, stride))], dtype=np.int64),
)
add1 = [i * itemsize for i in list(range(0, sizes[0] - chunk_size + 1, stride))]
start = max(add1) + chunk_size * itemsize
add2 = [i * itemsize + start for i in list(range(0, sizes[1] - chunk_size + 1, stride))]
addr = add1 + add2
assert np.array_equal(index_load._chunk_address, np.array(addr, dtype=np.int64))
assert np.array_equal(index_load._pointers, np.array([0, sizes[0] * itemsize], dtype=np.int64))
assert len(index_load._chunk_address) == index_load.num_chunks
finally:
os.remove(index_file)
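        # Repeat the round-trip with stride equal to chunk_size, i.e. non-overlapping chunks.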
chunk_size = 64
stride = 64
sizes = np.array([128, 256], dtype=np.int32)
dtype = np.int64
itemsize = dtype().itemsize
index_file = '/tmp/test.idx'
try:
with MMapRetrievalIndexedDataset.Index.writer(index_file, dtype, False) as index:
index.write(sizes, chunk_size, stride=stride)
index_load = MMapRetrievalIndexedDataset.Index(index_file)
assert index_load.chunk_size == chunk_size
assert not index_load.retrieval_db
assert np.array_equal(index_load.sizes, sizes)
assert np.array_equal(
index_load._chunk_id_start,
np.array([0, len(range(0, sizes[0] - chunk_size + 1, stride))], dtype=np.int64),
)
add1 = [i * itemsize for i in list(range(0, sizes[0] - chunk_size + 1, stride))]
start = max(add1) + chunk_size * itemsize
add2 = [i * itemsize + start for i in list(range(0, sizes[1] - chunk_size + 1, stride))]
addr = add1 + add2
assert np.array_equal(index_load._chunk_address, np.array(addr, dtype=np.int64))
assert np.array_equal(index_load._pointers, np.array([0, sizes[0] * itemsize], dtype=np.int64))
assert len(index_load._chunk_address) == index_load.num_chunks
finally:
os.remove(index_file)
@pytest.mark.unit
def test_create_data_index_stride32(self):
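        # Build a non-retrieval dataset (fourth builder argument False) with overlapping chunks
        # (stride 32 < chunk_size 64); verify document retrieval, chunk-id lookup,
        # chunk-to-document mapping, and the token content of individual chunks.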
chunk_size = 64
pad_id = 0
stride = 32
sentence1 = torch.arange(0, 200, 2, dtype=torch.int64)
padded_size = chunk_size - (len(sentence1) % chunk_size)
gt1 = np.pad(sentence1, (0, padded_size), 'constant', constant_values=pad_id)
sentence2 = torch.arange(1, 500, 2, dtype=torch.int64)
padded_size = chunk_size - (len(sentence2) % chunk_size)
gt2 = np.pad(sentence2, (0, padded_size), 'constant', constant_values=pad_id)
data_file = '/tmp/test'
index_file = data_file + '.idx'
bin_file = data_file + '.bin'
try:
builder = MMapRetrievalIndexedDatasetBuilder(
bin_file, chunk_size, pad_id, False, dtype=np.int64, stride=stride
)
builder.add_item(sentence1)
builder.add_item(sentence2)
builder.finalize(index_file)
# load the data
ds = MMapRetrievalIndexedDataset(data_file)
assert np.array_equal(ds.get(0), gt1)
assert np.array_equal(ds.get(1), gt2)
fetch1, fetch2 = ds[0:2]
assert np.array_equal(fetch1, gt1)
assert np.array_equal(fetch2, gt2)
chunk_id = ds.get_chunk_id(0, 64)
assert chunk_id == 2
assert ds.from_chunk_id_to_doc_id(0) == 0
assert ds.from_chunk_id_to_doc_id(1) == 0
assert ds.from_chunk_id_to_doc_id(2) == 0
with pytest.raises(ValueError):
ds.get_chunk_id(0, 128)
assert np.array_equal(ds.get_chunk(chunk_id), gt1[64 : 64 + 64])
chunk_id = ds.get_chunk_id(1, 0)
assert chunk_id == 3
assert ds.from_chunk_id_to_doc_id(3) == 1
assert ds.from_chunk_id_to_doc_id(4) == 1
assert ds.from_chunk_id_to_doc_id(5) == 1
assert ds.from_chunk_id_to_doc_id(6) == 1
assert ds.from_chunk_id_to_doc_id(7) == 1
assert ds.from_chunk_id_to_doc_id(8) == 1
assert ds.from_chunk_id_to_doc_id(9) == 1
with pytest.raises(ValueError):
ds.from_chunk_id_to_doc_id(10)
assert np.array_equal(ds.get_chunk(chunk_id), gt2[0:64])
assert np.array_equal(ds.get_chunk(chunk_id + 1), gt2[stride : stride + chunk_size])
assert np.array_equal(ds.get_chunk(chunk_id + 2), gt2[stride * 2 : stride * 2 + chunk_size])
assert np.array_equal(ds.get_chunk(chunk_id + 3), gt2[stride * 3 : stride * 3 + chunk_size])
assert ds.get_chunk_id(1, 64) == 5
assert ds.get_chunk_id(1, 128) == 7
assert ds.get_chunk_id(1, 192) == 9
with pytest.raises(ValueError):
ds.get_chunk_id(0, 256)
finally:
os.remove(index_file)
os.remove(bin_file)
@pytest.mark.unit
def test_create_data_index(self):
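        # Same as the stride-32 case above, but without an explicit stride; the asserted
        # chunk ids show non-overlapping 64-token chunks tiling each padded document.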
chunk_size = 64
pad_id = 0
sentence1 = torch.arange(0, 200, 2, dtype=torch.int64)
padded_size = chunk_size - (len(sentence1) % chunk_size)
gt1 = np.pad(sentence1, (0, padded_size), 'constant', constant_values=pad_id)
sentence2 = torch.arange(1, 500, 2, dtype=torch.int64)
padded_size = chunk_size - (len(sentence2) % chunk_size)
gt2 = np.pad(sentence2, (0, padded_size), 'constant', constant_values=pad_id)
data_file = '/tmp/test'
index_file = data_file + '.idx'
bin_file = data_file + '.bin'
try:
builder = MMapRetrievalIndexedDatasetBuilder(bin_file, chunk_size, pad_id, False)
builder.add_item(sentence1)
builder.add_item(sentence2)
builder.finalize(index_file)
# load the data
ds = MMapRetrievalIndexedDataset(data_file)
assert np.array_equal(ds.get(0), gt1)
assert np.array_equal(ds.get(1), gt2)
fetch1, fetch2 = ds[0:2]
assert np.array_equal(fetch1, gt1)
assert np.array_equal(fetch2, gt2)
chunk_id = ds.get_chunk_id(0, 64)
assert chunk_id == 1
assert ds.from_chunk_id_to_doc_id(0) == 0
assert ds.from_chunk_id_to_doc_id(1) == 0
with pytest.raises(ValueError):
ds.get_chunk_id(0, 128)
assert np.array_equal(ds.get_chunk(chunk_id), gt1[64 : 64 + 64])
chunk_id = ds.get_chunk_id(1, 0)
assert chunk_id == 2
assert ds.from_chunk_id_to_doc_id(2) == 1
assert ds.from_chunk_id_to_doc_id(3) == 1
assert ds.from_chunk_id_to_doc_id(4) == 1
assert ds.from_chunk_id_to_doc_id(5) == 1
with pytest.raises(ValueError):
ds.from_chunk_id_to_doc_id(6)
assert np.array_equal(ds.get_chunk(chunk_id), gt2[0:64])
assert np.array_equal(ds.get_chunk(chunk_id + 1), gt2[64:128])
assert np.array_equal(ds.get_chunk(chunk_id + 2), gt2[128:192])
assert np.array_equal(ds.get_chunk(chunk_id + 3), gt2[192:256])
assert ds.get_chunk_id(1, 64) == 3
assert ds.get_chunk_id(1, 128) == 4
assert ds.get_chunk_id(1, 192) == 5
with pytest.raises(ValueError):
ds.get_chunk_id(0, 256)
finally:
os.remove(index_file)
os.remove(bin_file)
@pytest.mark.unit
def test_create_retrieval_data_index_stride32(self):
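        # Build a retrieval database (fourth builder argument True) with stride 32;
        # get_chunk returns 2 * chunk_size tokens (the chunk plus its continuation),
        # which is why the expected slices below come from padded_gt1/padded_gt2.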
stride = 32
chunk_size = 64
pad_id = 0
sentence1 = torch.arange(0, 200, 2, dtype=torch.int64)
padded_size = chunk_size - (len(sentence1) % chunk_size)
gt1 = np.pad(sentence1, (0, padded_size), 'constant', constant_values=pad_id)
padded_gt1 = np.pad(sentence1, (0, padded_size + chunk_size), 'constant', constant_values=pad_id)
sentence2 = torch.arange(1, 500, 2, dtype=torch.int64)
padded_size = chunk_size - (len(sentence2) % chunk_size)
gt2 = np.pad(sentence2, (0, padded_size), 'constant', constant_values=pad_id)
padded_gt2 = np.pad(sentence2, (0, padded_size + chunk_size), 'constant', constant_values=pad_id)
data_file = '/tmp/test'
index_file = data_file + '.idx'
bin_file = data_file + '.bin'
try:
builder = MMapRetrievalIndexedDatasetBuilder(bin_file, chunk_size, pad_id, True, stride=stride)
builder.add_item(sentence1)
builder.add_item(sentence2)
builder.finalize(index_file)
# load the data
ds = MMapRetrievalIndexedDataset(data_file)
assert np.array_equal(ds.get(0), gt1)
assert np.array_equal(ds.get(1), gt2)
fetch1, fetch2 = ds[0:2]
assert np.array_equal(fetch1, gt1)
assert np.array_equal(fetch2, gt2)
chunk_id = ds.get_chunk_id(0, 64)
assert chunk_id == 2
assert ds.from_chunk_id_to_doc_id(0) == 0
assert ds.from_chunk_id_to_doc_id(1) == 0
assert ds.from_chunk_id_to_doc_id(2) == 0
with pytest.raises(ValueError):
ds.get_chunk_id(0, 128)
assert np.array_equal(ds.get_chunk(chunk_id), padded_gt1[64 : 64 + 64 * 2])
chunk_id = ds.get_chunk_id(1, 0)
assert chunk_id == 3
assert ds.from_chunk_id_to_doc_id(3) == 1
assert ds.from_chunk_id_to_doc_id(4) == 1
assert ds.from_chunk_id_to_doc_id(5) == 1
assert ds.from_chunk_id_to_doc_id(6) == 1
assert ds.from_chunk_id_to_doc_id(7) == 1
assert ds.from_chunk_id_to_doc_id(8) == 1
assert ds.from_chunk_id_to_doc_id(9) == 1
with pytest.raises(ValueError):
ds.from_chunk_id_to_doc_id(10)
assert np.array_equal(ds.get_chunk(chunk_id), padded_gt2[0 : chunk_size * 2])
assert np.array_equal(ds.get_chunk(chunk_id + 1), gt2[stride : stride + chunk_size * 2])
assert np.array_equal(ds.get_chunk(chunk_id + 2), gt2[stride * 2 : stride * 2 + chunk_size * 2])
assert np.array_equal(ds.get_chunk(chunk_id + 3), gt2[stride * 3 : stride * 3 + chunk_size * 2])
assert ds.get_chunk_id(1, 64) == 5
assert ds.get_chunk_id(1, 128) == 7
assert ds.get_chunk_id(1, 192) == 9
with pytest.raises(ValueError):
ds.get_chunk_id(0, 256)
chunk_id = ds.get_chunk_id(1, 64)
assert np.array_equal(ds.get_chunk(chunk_id), padded_gt2[64:192])
multi_chunks = ds.get_chunk(slice(0, ds.chunks))
assert np.array_equal(multi_chunks[0], padded_gt1[0 : chunk_size * 2])
assert np.array_equal(multi_chunks[1], padded_gt1[stride : stride + chunk_size * 2])
assert np.array_equal(multi_chunks[2], padded_gt1[stride * 2 : stride * 2 + chunk_size * 2])
assert np.array_equal(multi_chunks[3], padded_gt2[0 : chunk_size * 2])
assert np.array_equal(multi_chunks[4], padded_gt2[stride : stride + chunk_size * 2])
assert np.array_equal(multi_chunks[5], padded_gt2[stride * 2 : stride * 2 + chunk_size * 2])
assert np.array_equal(multi_chunks[6], padded_gt2[stride * 3 : stride * 3 + chunk_size * 2])
assert np.array_equal(multi_chunks[7], padded_gt2[stride * 4 : stride * 4 + chunk_size * 2])
assert np.array_equal(multi_chunks[8], padded_gt2[stride * 5 : stride * 5 + chunk_size * 2])
assert np.array_equal(multi_chunks[9], padded_gt2[stride * 6 : stride * 6 + chunk_size * 2])
finally:
os.remove(index_file)
os.remove(bin_file)
@pytest.mark.unit
def test_create_retrieval_data_index(self):
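        # Retrieval database without an explicit stride; chunks again span 2 * chunk_size
        # tokens, and the last chunk of each document is padded with pad_id
        # (hence padded_gt1/padded_gt2).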
chunk_size = 64
pad_id = 0
sentence1 = torch.arange(0, 200, 2, dtype=torch.int64)
padded_size = chunk_size - (len(sentence1) % chunk_size)
gt1 = np.pad(sentence1, (0, padded_size), 'constant', constant_values=pad_id)
padded_gt1 = np.pad(sentence1, (0, padded_size + chunk_size), 'constant', constant_values=pad_id)
sentence2 = torch.arange(1, 500, 2, dtype=torch.int64)
padded_size = chunk_size - (len(sentence2) % chunk_size)
gt2 = np.pad(sentence2, (0, padded_size), 'constant', constant_values=pad_id)
padded_gt2 = np.pad(sentence2, (0, padded_size + chunk_size), 'constant', constant_values=pad_id)
data_file = '/tmp/test'
index_file = data_file + '.idx'
bin_file = data_file + '.bin'
try:
builder = MMapRetrievalIndexedDatasetBuilder(bin_file, chunk_size, pad_id, True)
builder.add_item(sentence1)
builder.add_item(sentence2)
builder.finalize(index_file)
# load the data
ds = MMapRetrievalIndexedDataset(data_file)
assert np.array_equal(ds.get(0), gt1)
assert np.array_equal(ds.get(1), gt2)
fetch1, fetch2 = ds[0:2]
assert np.array_equal(fetch1, gt1)
assert np.array_equal(fetch2, gt2)
chunk_id = ds.get_chunk_id(0, 64)
assert chunk_id == 1
assert ds.from_chunk_id_to_doc_id(0) == 0
assert ds.from_chunk_id_to_doc_id(1) == 0
with pytest.raises(ValueError):
ds.get_chunk_id(0, 128)
assert np.array_equal(ds.get_chunk(chunk_id), padded_gt1[64 : 64 + 64 * 2])
chunk_id = ds.get_chunk_id(1, 0)
assert chunk_id == 2
assert ds.from_chunk_id_to_doc_id(2) == 1
assert ds.from_chunk_id_to_doc_id(3) == 1
assert ds.from_chunk_id_to_doc_id(4) == 1
assert ds.from_chunk_id_to_doc_id(5) == 1
with pytest.raises(ValueError):
ds.from_chunk_id_to_doc_id(6)
assert np.array_equal(ds.get_chunk(chunk_id), padded_gt2[0:128])
assert np.array_equal(ds.get_chunk(chunk_id + 1), padded_gt2[64:192])
assert np.array_equal(ds.get_chunk(chunk_id + 2), padded_gt2[128:256])
assert np.array_equal(ds.get_chunk(chunk_id + 3), padded_gt2[192:320])
assert ds.get_chunk_id(1, 64) == 3
assert ds.get_chunk_id(1, 128) == 4
assert ds.get_chunk_id(1, 192) == 5
with pytest.raises(ValueError):
ds.get_chunk_id(0, 256)
chunk_id = ds.get_chunk_id(1, 64)
assert np.array_equal(ds.get_chunk(chunk_id), padded_gt2[64:192])
multi_chunks = ds.get_chunk(slice(0, ds.chunks))
assert np.array_equal(multi_chunks[0], padded_gt1[0:128])
assert np.array_equal(multi_chunks[1], padded_gt1[64 : 64 + 128])
assert np.array_equal(multi_chunks[2], padded_gt2[0:128])
assert np.array_equal(multi_chunks[3], padded_gt2[64 : 64 + 128])
assert np.array_equal(multi_chunks[4], padded_gt2[128 : 128 + 128])
assert np.array_equal(multi_chunks[5], padded_gt2[192 : 192 + 128])
finally:
os.remove(index_file)
os.remove(bin_file)
@pytest.mark.unit
def test_knn_index(self):
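        # Write a KNN map (K neighbors per chunk) in several batches of rows, reload it and
        # check K, length, and chunk id offsets; then write three sharded index files with
        # offsets 0/100/200 and verify that merge_knn_files reproduces the concatenated map.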
data_file = '/tmp/test'
index_file = data_file + '.idx'
K = 8
index_files = [f'{data_file}_{i}.idx' for i in range(3)]
merged_file = '/tmp/merged.idx'
try:
with KNNIndex.writer(index_file, K) as w:
map_np0 = np.random.randint(0, 100, (50, K))
w.write(map_np0)
map_np1 = np.random.randint(0, 100, (50, K))
w.write(map_np1)
map_np2 = np.random.randint(0, 100, (50, K))
w.write(map_np2)
f = KNNIndex(index_file)
assert f.K == K
assert f.len == map_np0.shape[0] + map_np1.shape[0] + map_np2.shape[0]
assert np.array_equal(map_np0, f.knn_map[:50])
assert np.array_equal(map_np1, f.knn_map[50:100])
assert np.array_equal(map_np2, f.knn_map[100:])
assert np.array_equal(f.get_KNN_chunk_ids(5), map_np0[5])
assert f.chunk_start_id == 0
assert f.chunk_end_id == f.len
with KNNIndex.writer(index_file, K, 100) as w:
map_np0 = np.random.randint(0, 100, (50, K))
w.write(map_np0)
map_np1 = np.random.randint(0, 100, (50, K))
w.write(map_np1)
map_np2 = np.random.randint(0, 100, (50, K))
w.write(map_np2)
f = KNNIndex(index_file)
assert f.K == K
assert f.len == map_np0.shape[0] + map_np1.shape[0] + map_np2.shape[0]
assert np.array_equal(map_np0, f.knn_map[:50])
assert np.array_equal(map_np1, f.knn_map[50:100])
assert np.array_equal(map_np2, f.knn_map[100:])
assert np.array_equal(f.get_KNN_chunk_ids(5 + 100), map_np0[5])
assert f.chunk_start_id == 100
assert f.chunk_end_id == f.len + 100
# test multiple sharding indices
inputs = []
start = 0
for i in range(3):
with KNNIndex.writer(index_files[i], K, offset=start) as w:
map_np0 = np.random.randint(0, 100, (50, K))
inputs.append(map_np0)
w.write(map_np0)
map_np1 = np.random.randint(0, 100, (50, K))
inputs.append(map_np1)
w.write(map_np1)
f = KNNIndex(index_files[i])
start += f.len
merge_knn_files(index_files, merged_file)
f = KNNIndex(merged_file)
input_array = np.vstack(inputs)
assert f.len == 100 * 3
for i in range(300):
assert np.array_equal(f.get_KNN_chunk_ids(i), input_array[i])
assert f.chunk_start_id == 0
assert f.chunk_end_id == f.len
assert f.K == K
finally:
os.remove(index_file)
for i in range(3):
os.remove(index_files[i])
os.remove(merged_file)
@pytest.mark.unit
@pytest.mark.skipif(not HAVE_APEX, reason="apex is not installed")
def test_retro_dataset(self):
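        # End-to-end check of RETRODataset: build a training dataset, a retrieval database,
        # and a random KNN map, then verify the shape of every record (tokens, labels,
        # retrieved_ids, tokens_mask); a second pass below backs both roles with the same file.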
chunk_size = 64
pad_id = 0
sentence1 = torch.arange(0, 200, 2, dtype=torch.int64)
sentence2 = torch.arange(1, 500, 2, dtype=torch.int64)
sentence3 = torch.arange(0, 300, 2, dtype=torch.int64)
sentence4 = torch.arange(1, 400, 2, dtype=torch.int64)
# test the case that
# training data and retrieval data are different
data_file = '/tmp/test_data'
data_index_file = data_file + '.idx'
data_bin_file = data_file + '.bin'
db_file = '/tmp/test_db_data'
db_index_file = db_file + '.idx'
db_bin_file = db_file + '.bin'
K = 8
map_index_file = '/tmp/test_map.idx'
index_path = '/tmp'
cfg = OmegaConf.create({'data': {"index_mapping_dir": index_path}})
# dummy tokenizer
class Tokenizer:
eos_id = 1
pad_id = 0
tokenizer = Tokenizer()
num_samples = 100
seq_len = 192
name = 'test'
data_prefix = 'pref'
seed = 1
_filename = index_path + '/' + data_prefix
_filename += '_{}_indexmap'.format(name)
_filename += '_{}ns'.format(num_samples)
_filename += '_{}sl'.format(seq_len)
_filename += '_{}s'.format(seed)
doc_idx_filename = _filename + '_doc_idx.npy'
sample_idx_filename = _filename + '_sample_idx.npy'
shuffle_idx_filename = _filename + '_shuffle_idx.npy'
try:
builder = MMapRetrievalIndexedDatasetBuilder(data_bin_file, chunk_size, pad_id, False)
builder.add_item(sentence1)
builder.add_item(sentence2)
builder.finalize(data_index_file)
builder = MMapRetrievalIndexedDatasetBuilder(db_bin_file, chunk_size, pad_id, True)
builder.add_item(sentence3)
builder.add_item(sentence4)
builder.finalize(db_index_file)
# load the data
data_index = MMapRetrievalIndexedDataset(data_file)
db_index = MMapRetrievalIndexedDataset(db_file)
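            # random KNN map for each training chunk; ids below zero are included, presumably
            # to exercise the handling of missing/invalid neighbors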
with KNNIndex.writer(map_index_file, K) as w:
map_np = np.random.randint(-3, db_index.chunks, (data_index.chunks, K))
w.write(map_np)
map_index = KNNIndex(map_index_file)
documents = np.arange(0, data_index.sizes.shape[0])
d = RETRODataset(
cfg,
None,
tokenizer,
name,
data_prefix,
documents,
data_index,
num_samples,
seq_len,
seed,
map_index,
db_index,
)
for i in range(len(d)):
record = d[i]
assert record['tokens'].shape[0] == seq_len
assert record['labels'].shape[0] == seq_len
assert record['retrieved_ids'].shape[0] == seq_len // chunk_size
assert record['retrieved_ids'].shape[1] == K
assert record['retrieved_ids'].shape[2] == chunk_size * 2
assert record['tokens_mask'].shape[0] == seq_len
finally:
os.remove(data_bin_file)
os.remove(data_index_file)
os.remove(db_bin_file)
os.remove(db_index_file)
os.remove(map_index_file)
os.remove(doc_idx_filename)
os.remove(sample_idx_filename)
os.remove(shuffle_idx_filename)
# test the case that
# training data and retrieval data are the same
try:
builder = MMapRetrievalIndexedDatasetBuilder(db_bin_file, chunk_size, pad_id, True)
builder.add_item(sentence1)
builder.add_item(sentence2)
builder.add_item(sentence3)
builder.add_item(sentence4)
builder.finalize(db_index_file)
# load the data
data_index = MMapRetrievalIndexedDataset(db_file)
db_index = MMapRetrievalIndexedDataset(db_file)
with KNNIndex.writer(map_index_file, K) as w:
map_np = np.random.randint(-3, db_index.chunks, (data_index.chunks, K))
w.write(map_np)
map_index = KNNIndex(map_index_file)
documents = np.arange(0, data_index.sizes.shape[0])
d = RETRODataset(
cfg,
None,
tokenizer,
name,
data_prefix,
documents,
data_index,
num_samples,
seq_len,
seed,
map_index,
db_index,
)
for i in range(len(d)):
record = d[i]
assert record['tokens'].shape[0] == seq_len
assert record['labels'].shape[0] == seq_len
assert record['retrieved_ids'].shape[0] == seq_len // chunk_size
assert record['retrieved_ids'].shape[1] == K
assert record['retrieved_ids'].shape[2] == chunk_size * 2
assert record['tokens_mask'].shape[0] == seq_len
finally:
os.remove(db_bin_file)
os.remove(db_index_file)
os.remove(map_index_file)
os.remove(doc_idx_filename)
os.remove(sample_idx_filename)
os.remove(shuffle_idx_filename)
@pytest.mark.unit
@pytest.mark.skipif(not HAVE_APEX, reason="apex is not installed")
def test_retro_dataset_stride32(self):
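        # Same end-to-end RETRODataset check as above, with both datasets built using stride=32.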
chunk_size = 64
pad_id = 0
sentence1 = torch.arange(0, 200, 2, dtype=torch.int64)
sentence2 = torch.arange(1, 500, 2, dtype=torch.int64)
sentence3 = torch.arange(0, 300, 2, dtype=torch.int64)
sentence4 = torch.arange(1, 400, 2, dtype=torch.int64)
# test the case that
# training data and retrieval data are different
data_file = '/tmp/test_data'
data_index_file = data_file + '.idx'
data_bin_file = data_file + '.bin'
db_file = '/tmp/test_db_data'
db_index_file = db_file + '.idx'
db_bin_file = db_file + '.bin'
K = 8
map_index_file = '/tmp/test_map.idx'
index_path = '/tmp'
cfg = OmegaConf.create({'data': {"index_mapping_dir": index_path}})
# dummy tokenizer
class Tokenizer:
eos_id = 1
pad_id = 0
tokenizer = Tokenizer()
num_samples = 100
stride = 32
seq_len = 192
name = 'test'
data_prefix = 'pref'
seed = 1
_filename = index_path + '/' + data_prefix
_filename += '_{}_indexmap'.format(name)
_filename += '_{}ns'.format(num_samples)
_filename += '_{}sl'.format(seq_len)
_filename += '_{}s'.format(seed)
doc_idx_filename = _filename + '_doc_idx.npy'
sample_idx_filename = _filename + '_sample_idx.npy'
shuffle_idx_filename = _filename + '_shuffle_idx.npy'
try:
builder = MMapRetrievalIndexedDatasetBuilder(data_bin_file, chunk_size, pad_id, False, stride=32)
builder.add_item(sentence1)
builder.add_item(sentence2)
builder.finalize(data_index_file)
builder = MMapRetrievalIndexedDatasetBuilder(db_bin_file, chunk_size, pad_id, True, stride=32)
builder.add_item(sentence3)
builder.add_item(sentence4)
builder.finalize(db_index_file)
# load the data
data_index = MMapRetrievalIndexedDataset(data_file)
db_index = MMapRetrievalIndexedDataset(db_file)
with KNNIndex.writer(map_index_file, K) as w:
map_np = np.random.randint(-3, db_index.chunks, (data_index.chunks, K))
w.write(map_np)
map_index = KNNIndex(map_index_file)
documents = np.arange(0, data_index.sizes.shape[0])
d = RETRODataset(
cfg,
None,
tokenizer,
name,
data_prefix,
documents,
data_index,
num_samples,
seq_len,
seed,
map_index,
db_index,
)
for i in range(len(d)):
record = d[i]
assert record['tokens'].shape[0] == seq_len
assert record['labels'].shape[0] == seq_len
assert record['retrieved_ids'].shape[0] == seq_len // chunk_size
assert record['retrieved_ids'].shape[1] == K
assert record['retrieved_ids'].shape[2] == chunk_size * 2
assert record['tokens_mask'].shape[0] == seq_len
finally:
os.remove(data_bin_file)
os.remove(data_index_file)
os.remove(db_bin_file)
os.remove(db_index_file)
os.remove(map_index_file)
os.remove(doc_idx_filename)
os.remove(sample_idx_filename)
os.remove(shuffle_idx_filename)
# test the case that
# training data and retrieval data are the same
try:
builder = MMapRetrievalIndexedDatasetBuilder(db_bin_file, chunk_size, pad_id, True, stride=32)
builder.add_item(sentence1)
builder.add_item(sentence2)
builder.add_item(sentence3)
builder.add_item(sentence4)
builder.finalize(db_index_file)
# load the data
data_index = MMapRetrievalIndexedDataset(db_file)
db_index = MMapRetrievalIndexedDataset(db_file)
with KNNIndex.writer(map_index_file, K) as w:
map_np = np.random.randint(-3, db_index.chunks, (data_index.chunks, K))
w.write(map_np)
map_index = KNNIndex(map_index_file)
documents = np.arange(0, data_index.sizes.shape[0])
d = RETRODataset(
cfg,
None,
tokenizer,
name,
data_prefix,
documents,
data_index,
num_samples,
seq_len,
seed,
map_index,
db_index,
)
for i in range(len(d)):
record = d[i]
assert record['tokens'].shape[0] == seq_len
assert record['labels'].shape[0] == seq_len
assert record['retrieved_ids'].shape[0] == seq_len // chunk_size
assert record['retrieved_ids'].shape[1] == K
assert record['retrieved_ids'].shape[2] == chunk_size * 2
assert record['tokens_mask'].shape[0] == seq_len
finally:
os.remove(db_bin_file)
os.remove(db_index_file)
os.remove(map_index_file)
os.remove(doc_idx_filename)
os.remove(sample_idx_filename)
os.remove(shuffle_idx_filename)
@pytest.mark.unit
@pytest.mark.skipif(not HAVE_APEX, reason="apex is not installed")
def test_dedup(self):
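        # build_map fills, for each chunk id in [beg, end), the document-boundary pair
        # [doc_start, next_doc_start) that contains it (the last document ends at `total`);
        # dedup then drops neighbor chunk ids that fall inside the query chunk's own document,
        # packing the survivors to the left and leaving -1 in the trailing slots.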
total = 1000
id_start = np.array([0, 100, 200, 300, 500, 900])
beg = 30
end = 210
chunk_id_to_doc_id_map = np.zeros((end - beg, 2), dtype=np.int64)
build_map(id_start, chunk_id_to_doc_id_map, total, beg, end)
for i in range(30, 100):
assert_array_equal(chunk_id_to_doc_id_map[i - beg], id_start[0:2])
for i in range(100, 200):
assert_array_equal(chunk_id_to_doc_id_map[i - beg], id_start[1:3])
for i in range(200, 210):
assert_array_equal(chunk_id_to_doc_id_map[i - beg], id_start[2:4])
beg = 5
end = 100
chunk_id_to_doc_id_map = np.zeros((end - beg, 2), dtype=np.int64)
build_map(id_start, chunk_id_to_doc_id_map, total, beg, end)
for i in range(beg, end):
assert_array_equal(chunk_id_to_doc_id_map[i - beg], id_start[0:2])
beg = 100
end = 200
chunk_id_to_doc_id_map = np.zeros((end - beg, 2), dtype=np.int64)
build_map(id_start, chunk_id_to_doc_id_map, total, beg, end)
for i in range(beg, end):
assert_array_equal(chunk_id_to_doc_id_map[i - beg], id_start[1:3])
beg = 900
end = 1000
chunk_id_to_doc_id_map = np.zeros((end - beg, 2), dtype=np.int64)
build_map(id_start, chunk_id_to_doc_id_map, total, beg, end)
for i in range(beg, end):
assert_array_equal(chunk_id_to_doc_id_map[i - beg], np.array([900, 1000]))
beg = 150
end = 250
chunk_id_to_doc_id_map = np.zeros((end - beg, 2), dtype=np.int64)
build_map(id_start, chunk_id_to_doc_id_map, total, beg, end)
for i in range(beg, 200):
assert_array_equal(chunk_id_to_doc_id_map[i - beg], id_start[1:3])
for i in range(200, end):
assert_array_equal(chunk_id_to_doc_id_map[i - beg], id_start[2:4])
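        # dedup only accepts query chunk ids inside the [beg, end) window covered by the map;
        # ids outside it (0 and 250 here) raise ValueError.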
I = np.arange(1000)[None, :]
tmp_neighbors = np.ones_like(I) * -1
with pytest.raises(ValueError):
dedup(chunk_id_to_doc_id_map, I, tmp_neighbors, 0, beg)
I = np.arange(1000)[None, :]
tmp_neighbors = np.ones_like(I) * -1
with pytest.raises(ValueError):
dedup(chunk_id_to_doc_id_map, I, tmp_neighbors, 250, beg)
for i in range(beg, 200):
I = np.arange(1000)[None, :]
tmp_neighbors = np.ones_like(I) * -1
dedup(chunk_id_to_doc_id_map, I, tmp_neighbors, i, beg)
gt = np.array(list(range(100)) + list(range(200, 1000)) + ([-1] * 100))
assert_array_equal(tmp_neighbors[0], gt)
for i in range(200, 250):
I = np.arange(1000)[None, :]
tmp_neighbors = np.ones_like(I) * -1
dedup(chunk_id_to_doc_id_map, I, tmp_neighbors, i, beg)
gt = np.array(list(range(200)) + list(range(300, 1000)) + ([-1] * 100))
assert_array_equal(tmp_neighbors[0], gt)
I = np.arange(1000)[None, :]
I = np.repeat(I, 70, axis=0)
tmp_neighbors = np.ones_like(I) * -1
dedup(chunk_id_to_doc_id_map, I, tmp_neighbors, 180, beg)
gt0 = np.array(list(range(100)) + list(range(200, 1000)) + ([-1] * 100))
gt1 = np.array(list(range(200)) + list(range(300, 1000)) + ([-1] * 100))
for i in range(20):
assert_array_equal(tmp_neighbors[i], gt0)
for i in range(20, 70):
assert_array_equal(tmp_neighbors[i], gt1)