From 53663be2e5d5f750c954f35b358dfb05d7c4284a Mon Sep 17 00:00:00 2001 From: rly Date: Tue, 14 May 2024 08:57:55 -0700 Subject: [PATCH] Use _get_max_num_chunks --- lindi/LindiH5ZarrStore/LindiH5ZarrStore.py | 4 ++-- lindi/LindiH5ZarrStore/_util.py | 14 +++++++++++++- 2 files changed, 15 insertions(+), 3 deletions(-) diff --git a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py index 2d6fbff..21ffd61 100644 --- a/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py +++ b/lindi/LindiH5ZarrStore/LindiH5ZarrStore.py @@ -8,6 +8,7 @@ from tqdm import tqdm from ._util import ( _read_bytes, + _get_max_num_chunks, _apply_to_all_chunk_info, _get_chunk_byte_range, _get_byte_range_for_contiguous_dataset, @@ -450,8 +451,7 @@ def _add_chunk_info_to_refs(self, key_parent: str, add_ref: Callable, add_ref_ch if h5_item.chunks is not None: # Set up progress bar for manual updates because h5py chunk_iter used in _apply_to_all_chunk_info # does not provide a way to hook in a progress bar - dsid = h5_item.id - num_chunks = dsid.get_num_chunks() # NOTE: this is very slow if dataset is remote and has many chunks + num_chunks = _get_max_num_chunks(h5_item) # NOTE: unallocated chunks are counted pbar = tqdm( total=num_chunks, desc=f"Writing chunk info for {key_parent}", diff --git a/lindi/LindiH5ZarrStore/_util.py b/lindi/LindiH5ZarrStore/_util.py index 719a3cf..90b4b8b 100644 --- a/lindi/LindiH5ZarrStore/_util.py +++ b/lindi/LindiH5ZarrStore/_util.py @@ -2,6 +2,7 @@ import json import numpy as np import h5py +import math import warnings @@ -11,6 +12,17 @@ def _read_bytes(file: IO, offset: int, count: int): return file.read(count) +def _get_max_num_chunks(h5_dataset: h5py.Dataset): + """Get the maximum number of chunks in an h5py dataset. + + This is similar to h5_dataset.id.get_num_chunks() but significantly faster. It does not account for + whether some chunks are allocated. 
+ """
+ chunk_size = h5_dataset.chunks
+ assert chunk_size is not None
+ return math.prod([math.ceil(a / b) for a, b in zip(h5_dataset.shape, chunk_size)])
+ + + def _apply_to_all_chunk_info(h5_dataset: h5py.Dataset, callback: Callable):
 """Apply the callback function to each chunk of an h5py dataset. The chunks are iterated in order such that the last dimension changes the fastest,
@@ -32,7 +44,7 @@ def _apply_to_all_chunk_info(h5_dataset: h5py.Dataset, callback: Callable):
 dsid.chunk_iter(callback)
 except AttributeError:
 # chunk_iter is not available
-            num_chunks = dsid.get_num_chunks()  # NOTE: this is very slow if dataset is remote and has many chunks
+            num_chunks = _get_max_num_chunks(h5_dataset)
 if num_chunks > 100:
 warnings.warn(
 f"Dataset {h5_dataset.name} has {num_chunks} chunks. Using get_chunk_info is slow. "