From d58d935fe7989a9d3aee334ff1f17c971792ec23 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Wed, 3 Apr 2024 16:34:42 -0400 Subject: [PATCH 1/3] add create_dataset_with_zarr_compressor() --- lindi/LindiH5pyFile/LindiH5pyFile.py | 15 ++++++++++++ lindi/LindiH5pyFile/LindiH5pyGroup.py | 17 +++++++++++++- .../write/LindiH5pyGroupWrite.py | 19 ++++++++++++--- .../create_zarr_dataset_from_h5_data.py | 23 ++++++++++++++++--- tests/test_zarr_write.py | 23 +++++++++++++++++++ 5 files changed, 90 insertions(+), 7 deletions(-) diff --git a/lindi/LindiH5pyFile/LindiH5pyFile.py b/lindi/LindiH5pyFile/LindiH5pyFile.py index 021b6dc..607280a 100644 --- a/lindi/LindiH5pyFile/LindiH5pyFile.py +++ b/lindi/LindiH5pyFile/LindiH5pyFile.py @@ -5,6 +5,7 @@ import h5py import zarr from zarr.storage import Store as ZarrStore +from numcodecs.abc import Codec from .LindiH5pyGroup import LindiH5pyGroup from .LindiH5pyDataset import LindiH5pyDataset @@ -331,6 +332,20 @@ def create_dataset(self, name, shape=None, dtype=None, data=None, **kwds): raise Exception("Cannot create dataset in read-only mode") return self._the_group.create_dataset(name, shape=shape, dtype=dtype, data=data, **kwds) + def create_dataset_with_zarr_compressor( + self, + name, + shape=None, + dtype=None, + data=None, + *, + compressor: Union[Codec, Literal['default']] = 'default', + **kwds + ): + if self._mode not in ['r+']: + raise Exception("Cannot create dataset in read-only mode") + return self._the_group.create_dataset_with_zarr_compressor(name, shape=shape, dtype=dtype, data=data, compressor=compressor, **kwds) + def _download_file(url: str, filename: str) -> None: headers = { diff --git a/lindi/LindiH5pyFile/LindiH5pyGroup.py b/lindi/LindiH5pyFile/LindiH5pyGroup.py index db8b755..47f2990 100644 --- a/lindi/LindiH5pyFile/LindiH5pyGroup.py +++ b/lindi/LindiH5pyFile/LindiH5pyGroup.py @@ -1,6 +1,7 @@ -from typing import TYPE_CHECKING, Union +from typing import TYPE_CHECKING, Union, Literal import h5py import zarr +from numcodecs.abc import Codec from .LindiH5pyDataset import LindiH5pyDataset from .LindiH5pyLink import LindiH5pyHardLink, LindiH5pySoftLink @@ -173,6 +174,20 @@ def create_dataset(self, name, shape=None, dtype=None, data=None, **kwds): raise Exception('Cannot create dataset in read-only mode') return self._write.create_dataset(name, shape=shape, dtype=dtype, data=data, **kwds) + def create_dataset_with_zarr_compressor( + self, + name, + shape=None, + dtype=None, + data=None, + *, + compressor: Union[Codec, Literal['default']] = 'default', + **kwds + ): + if self._file._mode not in ['r+']: + raise Exception('Cannot create dataset in read-only mode') + return self._write.create_dataset(name, shape=shape, dtype=dtype, data=data, _zarr_compressor=compressor, **kwds) + def __setitem__(self, name, obj): if self._file._mode not in ['r+']: raise Exception('Cannot set item in read-only mode') diff --git a/lindi/LindiH5pyFile/write/LindiH5pyGroupWrite.py b/lindi/LindiH5pyFile/write/LindiH5pyGroupWrite.py index 5526aa9..822f4a0 100644 --- a/lindi/LindiH5pyFile/write/LindiH5pyGroupWrite.py +++ b/lindi/LindiH5pyFile/write/LindiH5pyGroupWrite.py @@ -1,7 +1,8 @@ -from typing import TYPE_CHECKING +from typing import TYPE_CHECKING, Union, Literal import h5py import numpy as np import zarr +from numcodecs.abc import Codec from ..LindiH5pyDataset import LindiH5pyDataset from ..LindiH5pyReference import LindiH5pyReference @@ -39,7 +40,16 @@ def require_group(self, name): return ret return self.create_group(name) - def create_dataset(self, name, shape=None, dtype=None, data=None, **kwds): + def create_dataset( + self, + name, + shape=None, + dtype=None, + data=None, + *, + _zarr_compressor: Union[Codec, Literal['default']] = 'default', + **kwds + ): chunks = None for k, v in kwds.items(): if k == 'chunks': @@ -48,6 +58,8 @@ def create_dataset(self, name, shape=None, dtype=None, data=None, **kwds): raise Exception(f'Unsupported kwds in create_dataset: {k}') if isinstance(self.p._group_object, h5py.Group): + if _zarr_compressor != 'default': + raise Exception('zarr_compressor is not supported when _group_object is h5py.Group') return LindiH5pyDataset( self._group_object.create_dataset(name, shape=shape, dtype=dtype, data=data, chunks=chunks), # type: ignore self.p._file @@ -77,7 +89,8 @@ def create_dataset(self, name, shape=None, dtype=None, data=None, **kwds): h5_shape=shape, h5_dtype=dtype, h5_data=data, - h5f=None + h5f=None, + zarr_compressor=_zarr_compressor ) return LindiH5pyDataset(ds, self.p._file) else: diff --git a/lindi/conversion/create_zarr_dataset_from_h5_data.py b/lindi/conversion/create_zarr_dataset_from_h5_data.py index 0e36127..5a91233 100644 --- a/lindi/conversion/create_zarr_dataset_from_h5_data.py +++ b/lindi/conversion/create_zarr_dataset_from_h5_data.py @@ -1,7 +1,8 @@ -from typing import Union, List, Any, Tuple +from typing import Union, List, Any, Tuple, Literal from dataclasses import dataclass import numpy as np import numcodecs +from numcodecs.abc import Codec import h5py import zarr from .h5_ref_to_zarr_attr import h5_ref_to_zarr_attr @@ -17,7 +18,8 @@ def create_zarr_dataset_from_h5_data( h5f: Union[h5py.File, None], name: str, label: str, - h5_chunks: Union[Tuple, None] + h5_chunks: Union[Tuple, None], + zarr_compressor: Union[Codec, Literal['default']] = 'default' ): """Create a zarr dataset from an h5py dataset. @@ -41,6 +43,9 @@ def create_zarr_dataset_from_h5_data( The name of the h5py dataset for error messages. h5_chunks : tuple The chunk shape of the h5py dataset. + zarr_compressor : numcodecs.abc.Codec + The codec compressor to use when writing the dataset. If default, the + default compressor will be used. """ if h5_dtype is None: raise Exception(f'No dtype in h5_to_zarr_dataset_prep for dataset {label}') @@ -53,6 +58,9 @@ def create_zarr_dataset_from_h5_data( if h5_data is None: raise Exception(f'Data must be provided for scalar dataset {label}') + if zarr_compressor != 'default': + raise Exception('zarr_compressor is not supported for scalar datasets') + if _is_numeric_dtype(h5_dtype) or h5_dtype in [bool, np.bool_]: # Handle the simple numeric types ds = zarr_parent_group.create_dataset( @@ -118,10 +126,13 @@ def create_zarr_dataset_from_h5_data( shape=h5_shape, chunks=h5_chunks, dtype=h5_dtype, - data=h5_data + data=h5_data, + compressor=zarr_compressor ) elif h5_dtype.kind == 'O': # For type object, we are going to use the JSON codec + if zarr_compressor != 'default': + raise Exception('zarr_compressor is not supported for object datasets') if h5_data is not None: if isinstance(h5_data, h5py.Dataset): h5_data = h5_data[:] @@ -138,6 +149,8 @@ def create_zarr_dataset_from_h5_data( object_codec=object_codec ) elif h5_dtype.kind == 'S': # byte string + if zarr_compressor != 'default': + raise Exception('zarr_compressor is not supported for byte string datasets') if h5_data is None: raise Exception(f'Data must be provided when converting dataset {label} with dtype {h5_dtype}') return zarr_parent_group.create_dataset( @@ -148,8 +161,12 @@ def create_zarr_dataset_from_h5_data( data=h5_data ) elif h5_dtype.kind == 'U': # unicode string + if zarr_compressor != 'default': + raise Exception('zarr_compressor is not supported for unicode string datasets') raise Exception(f'Array of unicode strings not supported: dataset {label} with dtype {h5_dtype} and shape {h5_shape}') elif h5_dtype.kind == 'V' and h5_dtype.fields is not None: # compound dtype + if zarr_compressor != 'default': + raise Exception('zarr_compressor is not supported for compound datasets') if h5_data is None: raise Exception(f'Data must be provided when converting compound dataset {label}') h5_data_1d_view = h5_data.ravel() diff --git a/tests/test_zarr_write.py b/tests/test_zarr_write.py index 0f44836..dea2c14 100644 --- a/tests/test_zarr_write.py +++ b/tests/test_zarr_write.py @@ -6,6 +6,7 @@ import lindi from lindi.conversion.attr_conversion import h5_to_zarr_attr import pytest +import numcodecs def test_zarr_write(): @@ -21,6 +22,28 @@ def test_zarr_write(): compare_example_h5_data(h5f_backed_by_zarr, tmpdir=tmpdir) +def test_zarr_write_with_zstd_compressor(): + with tempfile.TemporaryDirectory() as tmpdir: + dirname = f'{tmpdir}/test.zarr' + store = zarr.DirectoryStore(dirname) + zarr.group(store=store) + with lindi.LindiH5pyFile.from_zarr_store(store, mode='r+') as h5f_backed_by_zarr: + h5f_backed_by_zarr.create_dataset_with_zarr_compressor( + 'dset_float32', + data=np.array([1, 2, 3], dtype=np.float32), + compressor=numcodecs.Zstd(), # this compressor not supported in hdf5 + ) + + store2 = zarr.DirectoryStore(dirname) + with lindi.LindiH5pyFile.from_zarr_store(store2) as h5f_backed_by_zarr: + dset = h5f_backed_by_zarr['dset_float32'] + assert isinstance(dset, h5py.Dataset) + if not _arrays_are_equal(dset[()], np.array([1, 2, 3], dtype=np.float32)): + print(dset[()]) + print(np.array([1, 2, 3], dtype=np.float32)) + raise Exception('Data mismatch') + + def write_example_h5_data(h5f: h5py.File): h5f.attrs['attr_str'] = 'hello' h5f.attrs['attr_int'] = 42 From c2944286796d98136ed9591aa9808515ad47b5b1 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Thu, 4 Apr 2024 11:30:33 -0400 Subject: [PATCH 2/3] fix create_dataset_with_zarr_compressor --- lindi/LindiH5pyFile/LindiH5pyGroup.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lindi/LindiH5pyFile/LindiH5pyGroup.py b/lindi/LindiH5pyFile/LindiH5pyGroup.py index d70bcd3..02020a2 100644 --- a/lindi/LindiH5pyFile/LindiH5pyGroup.py +++ b/lindi/LindiH5pyFile/LindiH5pyGroup.py @@ -200,9 +200,10 @@ def create_dataset_with_zarr_compressor( compressor: Union[Codec, Literal['default']] = 'default', **kwds ): - if self._file._mode not in ['r+']: + if self._readonly: raise Exception('Cannot create dataset in read-only mode') - return self._write.create_dataset(name, shape=shape, dtype=dtype, data=data, _zarr_compressor=compressor, **kwds) + assert self._writer is not None + return self._writer.create_dataset(name, shape=shape, dtype=dtype, data=data, _zarr_compressor=compressor, **kwds) def __setitem__(self, name, obj): if self._readonly: From 22fc137153742817a8d3e60f97a6314af47869c3 Mon Sep 17 00:00:00 2001 From: Jeremy Magland Date: Thu, 18 Apr 2024 20:23:55 -0500 Subject: [PATCH 3/3] compression parameter in create_dataset --- lindi/LindiH5pyFile/LindiH5pyFile.py | 15 -------- lindi/LindiH5pyFile/LindiH5pyGroup.py | 18 +--------- .../writers/LindiH5pyGroupWriter.py | 35 +++++++++++++++++-- tests/test_zarr_write.py | 4 +-- 4 files changed, 35 insertions(+), 37 deletions(-) diff --git a/lindi/LindiH5pyFile/LindiH5pyFile.py b/lindi/LindiH5pyFile/LindiH5pyFile.py index 43fab45..9a93d28 100644 --- a/lindi/LindiH5pyFile/LindiH5pyFile.py +++ b/lindi/LindiH5pyFile/LindiH5pyFile.py @@ -5,7 +5,6 @@ import h5py import zarr from zarr.storage import Store as ZarrStore -from numcodecs.abc import Codec from .LindiH5pyGroup import LindiH5pyGroup from .LindiH5pyDataset import LindiH5pyDataset @@ -342,20 +341,6 @@ def require_dataset(self, name, shape, dtype, exact=False, **kwds): raise Exception("Cannot require dataset in read-only mode") return self._the_group.require_dataset(name, shape, dtype, exact=exact, **kwds) - def create_dataset_with_zarr_compressor( - self, - name, - shape=None, - dtype=None, - data=None, - *, - compressor: Union[Codec, Literal['default']] = 'default', - **kwds - ): - if self._mode not in ['r+']: - raise Exception("Cannot create dataset in read-only mode") - return self._the_group.create_dataset_with_zarr_compressor(name, shape=shape, dtype=dtype, data=data, compressor=compressor, **kwds) - def _download_file(url: str, filename: str) -> None: headers = { diff --git a/lindi/LindiH5pyFile/LindiH5pyGroup.py b/lindi/LindiH5pyFile/LindiH5pyGroup.py index c9acdaf..fda4dcb 100644 --- a/lindi/LindiH5pyFile/LindiH5pyGroup.py +++ b/lindi/LindiH5pyFile/LindiH5pyGroup.py @@ -1,7 +1,6 @@ -from typing import TYPE_CHECKING, Union, Literal +from typing import TYPE_CHECKING, Union import h5py import zarr -from numcodecs.abc import Codec from .LindiH5pyDataset import LindiH5pyDataset from .LindiH5pyLink import LindiH5pyHardLink, LindiH5pySoftLink @@ -188,21 +187,6 @@ def require_dataset(self, name, shape, dtype, exact=False, **kwds): assert self._writer is not None return self._writer.require_dataset(name, shape, dtype, exact=exact, **kwds) - def create_dataset_with_zarr_compressor( - self, - name, - shape=None, - dtype=None, - data=None, - *, - compressor: Union[Codec, Literal['default']] = 'default', - **kwds - ): - if self._readonly: - raise Exception('Cannot create dataset in read-only mode') - assert self._writer is not None - return self._writer.create_dataset(name, shape=shape, dtype=dtype, data=data, _zarr_compressor=compressor, **kwds) - def __setitem__(self, name, obj): if self._readonly: raise Exception('Cannot set item in read-only mode') diff --git a/lindi/LindiH5pyFile/writers/LindiH5pyGroupWriter.py b/lindi/LindiH5pyFile/writers/LindiH5pyGroupWriter.py index 6da07e1..cdf6dc9 100644 --- a/lindi/LindiH5pyFile/writers/LindiH5pyGroupWriter.py +++ b/lindi/LindiH5pyFile/writers/LindiH5pyGroupWriter.py @@ -1,7 +1,8 @@ -from typing import TYPE_CHECKING, Union, Literal +from typing import TYPE_CHECKING import h5py import numpy as np import zarr +import numcodecs from numcodecs.abc import Codec from ..LindiH5pyDataset import LindiH5pyDataset @@ -12,6 +13,8 @@ from ...conversion.create_zarr_dataset_from_h5_data import create_zarr_dataset_from_h5_data +_compression_not_specified_ = object() + class LindiH5pyGroupWriter: def __init__(self, p: 'LindiH5pyGroup'): @@ -46,17 +49,43 @@ def create_dataset( shape=None, dtype=None, data=None, - *, - _zarr_compressor: Union[Codec, Literal['default']] = 'default', **kwds ): chunks = None + compression = _compression_not_specified_ + compression_opts = None for k, v in kwds.items(): if k == 'chunks': chunks = v + elif k == 'compression': + compression = v + elif k == 'compression_opts': + compression_opts = v else: raise Exception(f'Unsupported kwds in create_dataset: {k}') + if compression is _compression_not_specified_: + _zarr_compressor = 'default' + if compression_opts is not None: + raise Exception('compression_opts is only supported when compression is provided') + elif isinstance(compression, Codec): + _zarr_compressor = compression + if compression_opts is not None: + raise Exception('compression_opts is not supported when compression is provided as a Codec') + elif isinstance(compression, str): + if compression == 'gzip': + if compression_opts is None: + level = 4 # default for h5py + elif isinstance(compression_opts, int): + level = compression_opts + else: + raise Exception(f'Unexpected type for compression_opts: {type(compression_opts)}') + _zarr_compressor = numcodecs.GZip(level=level) + else: + raise Exception(f'Compression {compression} is not supported') + else: + raise Exception(f'Unexpected type for compression: {type(compression)}') + if isinstance(self.p._group_object, h5py.Group): if _zarr_compressor != 'default': raise Exception('zarr_compressor is not supported when _group_object is h5py.Group') diff --git a/tests/test_zarr_write.py b/tests/test_zarr_write.py index a96f65f..d5380a4 100644 --- a/tests/test_zarr_write.py +++ b/tests/test_zarr_write.py @@ -49,10 +49,10 @@ def test_zarr_write_with_zstd_compressor(): store = zarr.DirectoryStore(dirname) zarr.group(store=store) with lindi.LindiH5pyFile.from_zarr_store(store, mode='r+') as h5f_backed_by_zarr: - h5f_backed_by_zarr.create_dataset_with_zarr_compressor( + h5f_backed_by_zarr.create_dataset( 'dset_float32', data=np.array([1, 2, 3], dtype=np.float32), - compressor=numcodecs.Zstd(), # this compressor not supported in hdf5 + compression=numcodecs.Zstd(), # this compressor not supported without plugin in hdf5 ) store2 = zarr.DirectoryStore(dirname)