Commit

Use _get_max_num_chunks
rly committed May 14, 2024
1 parent c9c8955 commit 53663be
Showing 2 changed files with 15 additions and 3 deletions.
4 changes: 2 additions & 2 deletions lindi/LindiH5ZarrStore/LindiH5ZarrStore.py
@@ -8,6 +8,7 @@
 from tqdm import tqdm
 from ._util import (
     _read_bytes,
+    _get_max_num_chunks,
     _apply_to_all_chunk_info,
     _get_chunk_byte_range,
     _get_byte_range_for_contiguous_dataset,
@@ -450,8 +451,7 @@ def _add_chunk_info_to_refs(self, key_parent: str, add_ref: Callable, add_ref_ch
         if h5_item.chunks is not None:
             # Set up progress bar for manual updates because h5py chunk_iter used in _apply_to_all_chunk_info
             # does not provide a way to hook in a progress bar
-            dsid = h5_item.id
-            num_chunks = dsid.get_num_chunks()  # NOTE: this is very slow if dataset is remote and has many chunks
+            num_chunks = _get_max_num_chunks(h5_item)  # NOTE: unallocated chunks are counted
             pbar = tqdm(
                 total=num_chunks,
                 desc=f"Writing chunk info for {key_parent}",
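Note: because _get_max_num_chunks counts chunks that may never have been allocated, the tqdm total above is an upper bound. The per-chunk callback can fire fewer times than the total, and the bar simply finishes below 100%. A minimal standalone sketch of that behavior (the counts here are hypothetical, not from this commit):

from tqdm import tqdm

num_chunks = 16        # upper bound, as _get_max_num_chunks would report
allocated_chunks = 12  # hypothetical: only 12 chunks were actually written

pbar = tqdm(total=num_chunks, desc="Writing chunk info")
for _ in range(allocated_chunks):  # stand-in for the per-chunk callback
    pbar.update(1)
pbar.close()  # bar ends at 12/16; overestimating the total is harmless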
14 changes: 13 additions & 1 deletion lindi/LindiH5ZarrStore/_util.py
@@ -2,6 +2,7 @@
 import json
 import numpy as np
 import h5py
+import math
 import warnings
 
 
@@ -11,6 +12,17 @@ def _read_bytes(file: IO, offset: int, count: int):
     return file.read(count)
 
 
+def _get_max_num_chunks(h5_dataset: h5py.Dataset):
+    """Get the maximum possible number of chunks in an h5py dataset.
+
+    This is similar to h5_dataset.id.get_num_chunks() but significantly faster. It does not
+    account for whether chunks are actually allocated, so the result is an upper bound.
+    """
+    chunk_size = h5_dataset.chunks
+    assert chunk_size is not None
+    return math.prod([math.ceil(a / b) for a, b in zip(h5_dataset.shape, chunk_size)])
+
+
 def _apply_to_all_chunk_info(h5_dataset: h5py.Dataset, callback: Callable):
     """Apply the callback function to each chunk of an h5py dataset.
     The chunks are iterated in order such that the last dimension changes the fastest,
@@ -32,7 +44,7 @@ def _apply_to_all_chunk_info(h5_dataset: h5py.Dataset, callback: Callable):
         dsid.chunk_iter(callback)
     except AttributeError:
         # chunk_iter is not available
-        num_chunks = dsid.get_num_chunks()  # NOTE: this is very slow if dataset is remote and has many chunks
+        num_chunks = _get_max_num_chunks(h5_dataset)  # pass the Dataset, not dsid, which lacks .chunks
         if num_chunks > 100:
             warnings.warn(
                 f"Dataset {h5_dataset.name} has {num_chunks} chunks. Using get_chunk_info is slow. "
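For intuition about what _get_max_num_chunks computes, here is a self-contained sketch (not part of the commit) comparing the ceil-division count with h5py's get_num_chunks(), which counts only allocated chunks and must walk the file to do so, hence its slowness on remote datasets:

import math

import h5py

# In-memory HDF5 file so the example needs no disk I/O.
with h5py.File("example.h5", "w", driver="core", backing_store=False) as f:
    dset = f.create_dataset("data", shape=(1000, 1000), chunks=(300, 300), dtype="f4")

    # Arithmetic upper bound: ceil(1000/300) * ceil(1000/300) = 4 * 4 = 16
    max_chunks = math.prod(math.ceil(a / b) for a, b in zip(dset.shape, dset.chunks))
    print(max_chunks)  # 16

    # get_num_chunks() counts only allocated chunks; nothing has been written yet.
    print(dset.id.get_num_chunks())  # 0

    dset[:300, :300] = 1.0  # write into exactly one chunk
    print(dset.id.get_num_chunks())  # 1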