Commit

Use _get_max_num_chunks
rly committed May 14, 2024
1 parent c9c8955 commit 53663be
Showing 2 changed files with 15 additions and 3 deletions.
4 changes: 2 additions & 2 deletions lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
@@ -8,6 +8,7 @@
 from tqdm import tqdm
 from ._util import (
     _read_bytes,
+    _get_max_num_chunks,
     _apply_to_all_chunk_info,
     _get_chunk_byte_range,
     _get_byte_range_for_contiguous_dataset,
@@ -450,8 +451,7 @@ def _add_chunk_info_to_refs(self, key_parent: str, add_ref: Callable, add_ref_ch
         if h5_item.chunks is not None:
             # Set up progress bar for manual updates because h5py chunk_iter used in _apply_to_all_chunk_info
             # does not provide a way to hook in a progress bar
-            dsid = h5_item.id
-            num_chunks = dsid.get_num_chunks()  # NOTE: this is very slow if dataset is remote and has many chunks
+            num_chunks = _get_max_num_chunks(h5_item)  # NOTE: unallocated chunks are counted
             pbar = tqdm(
                 total=num_chunks,
                 desc=f"Writing chunk info for {key_parent}",
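Note: because _get_max_num_chunks counts chunks that may never have been allocated, the tqdm total above is an upper bound. The per-chunk callback can fire fewer times than the total, and the bar simply finishes below 100%. A minimal standalone sketch of that behavior (the counts here are hypothetical, not from this commit):

from tqdm import tqdm

num_chunks = 16        # upper bound, as _get_max_num_chunks would report
allocated_chunks = 12  # hypothetical: only 12 chunks were actually written

pbar = tqdm(total=num_chunks, desc="Writing chunk info")
for _ in range(allocated_chunks):  # stand-in for the per-chunk callback
    pbar.update(1)
pbar.close()  # bar ends at 12/16; overestimating the total is harmless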
14 changes: 13 additions & 1 deletion lindi/LindiH5ZarrStore/_util.py
@@ -2,6 +2,7 @@
 import json
 import numpy as np
 import h5py
+import math
 import warnings
 
 
@@ -11,6 +12,17 @@ def _read_bytes(file: IO, offset: int, count: int):
     return file.read(count)
 
 
+def _get_max_num_chunks(h5_dataset: h5py.Dataset):
+    """Get the maximum possible number of chunks in an h5py dataset.
+
+    This is similar to h5_dataset.id.get_num_chunks() but significantly faster. It does not
+    account for whether chunks are actually allocated, so the result is an upper bound.
+    """
+    chunk_size = h5_dataset.chunks
+    assert chunk_size is not None
+    return math.prod([math.ceil(a / b) for a, b in zip(h5_dataset.shape, chunk_size)])
+
+
 def _apply_to_all_chunk_info(h5_dataset: h5py.Dataset, callback: Callable):
     """Apply the callback function to each chunk of an h5py dataset.
     The chunks are iterated in order such that the last dimension changes the fastest,
@@ -32,7 +44,7 @@ def _apply_to_all_chunk_info(h5_dataset: h5py.Dataset, callback: Callable):
         dsid.chunk_iter(callback)
     except AttributeError:
         # chunk_iter is not available
-        num_chunks = dsid.get_num_chunks()  # NOTE: this is very slow if dataset is remote and has many chunks
+        num_chunks = _get_max_num_chunks(h5_dataset)  # pass the Dataset, not dsid, which lacks .chunks
         if num_chunks > 100:
             warnings.warn(
                 f"Dataset {h5_dataset.name} has {num_chunks} chunks. Using get_chunk_info is slow. "
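For intuition about what _get_max_num_chunks computes, here is a self-contained sketch (not part of the commit) comparing the ceil-division count with h5py's get_num_chunks(), which counts only allocated chunks and must walk the file to do so, hence its slowness on remote datasets:

import math

import h5py

# In-memory HDF5 file so the example needs no disk I/O.
with h5py.File("example.h5", "w", driver="core", backing_store=False) as f:
    dset = f.create_dataset("data", shape=(1000, 1000), chunks=(300, 300), dtype="f4")

    # Arithmetic upper bound: ceil(1000/300) * ceil(1000/300) = 4 * 4 = 16
    max_chunks = math.prod(math.ceil(a / b) for a, b in zip(dset.shape, dset.chunks))
    print(max_chunks)  # 16

    # get_num_chunks() counts only allocated chunks; nothing has been written yet.
    print(dset.id.get_num_chunks())  # 0

    dset[:300, :300] = 1.0  # write into exactly one chunk
    print(dset.id.get_num_chunks())  # 1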