diff --git a/megatron/data/blendable_dataset.py b/megatron/data/blendable_dataset.py
index 979e9a174e..ab164fdc48 100755
--- a/megatron/data/blendable_dataset.py
+++ b/megatron/data/blendable_dataset.py
@@ -41,7 +41,7 @@ def __init__(self, datasets, weights, size, *,
         # Build indicies.
         @dlp.log
         def _build_indices():
-            start_time = time.time()
+            start_time = time.perf_counter()
             dataset_index = np.zeros(self.size, dtype=np.int64)
             dataset_sample_index = np.zeros(self.size, dtype=np.int64)
 
@@ -77,14 +77,14 @@ def _build_indices():
             dataset_index, dataset_sample_index = _build_indices()
             try:
                 log.debug(" > saving index map files")
-                start_time = time.time()
+                start_time = time.perf_counter()
                 os.makedirs(os.path.dirname(index_path), exist_ok=True)
                 with open(desc_path, 'wt') as fd:
                     fd.write(desc)
                 np.save(index_path, dataset_index, allow_pickle=True)
                 np.save(sample_index_path, dataset_sample_index, allow_pickle=True)
-                log.info(f" > finished saving index map files in {time.time() - start_time} seconds")
+                log.info(f" > finished saving index map files in {time.perf_counter() - start_time} seconds")
             except OSError:
                 print(f'There was an error trying to create the data cache directory ({data_cache_path})')
                 print('or a file in it. This is set with the --data-cache-path argument. Please')
@@ -108,14 +108,14 @@ def _build_indices():
             torch.distributed.barrier(group=mpu.get_pipeline_model_parallel_group())
             torch.distributed.barrier(group=mpu.get_data_parallel_group())
-            start_time = time.time()
+            start_time = time.perf_counter()
             log.info(f'> loading blendable dataset index: {index_path}')
             self.dataset_index = np.load(index_path, allow_pickle=True, mmap_mode='r')
             assert self.dataset_index.size == self.size
             log.info(f'> loading blendable dataset sample index: {sample_index_path}')
             self.dataset_sample_index = np.load(sample_index_path, allow_pickle=True, mmap_mode='r')
             assert self.dataset_sample_index.size == self.size
-            log.info(f'> finished loading in {time.time() - start_time} seconds')
+            log.info(f'> finished loading in {time.perf_counter() - start_time} seconds')
         else:
             self.dataset_index, self.dataset_sample_index = _build_indices()