diff --git a/README.md b/README.md index baec2fb..46a9493 100644 --- a/README.md +++ b/README.md @@ -11,19 +11,24 @@ $ pip install lazy_ops ## Usage: ```python -import h5py from lazy_ops import DatasetView -dsetview = DatasetView(dataset) # dataset is an instantiated h5py dataset -view1 = dsetview.lazy_slice[1:10:2,:,0:50:5].lazy_transpose([2,0,1]).lazy_slice[25:55,1,1:4:1,:].lazy_transpose() +# h5py # +import h5py +dsetview = DatasetView(dataset) # dataset is an instance of h5py.Dataset +view1 = dsetview.lazy_slice[1:40:2,:,0:50:5].lazy_transpose([2,0,1]).lazy_slice[8,5:10] + +# zarr # +import zarr +zarrview = DatasetView(zarray) # zarray is an instance of zarr.core.Array +view1 = zarrview.lazy_slice[1:10:2,:,5:10].lazy_transpose([0,2,1]).lazy_slice[0:3,1:4] -A = view1[:] # Brackets on DataSetView call the h5py slicing method, that returns the data +# reading from view on either h5py or zarr +A = view1[:] # Brackets on DatasetView call the h5py or zarr slicing method, returning the data B = view1.dsetread() # same as view1[:] -for ib in view1.lazy_iter(axis=1): +# iterating on either h5py or zarr +for ib in view1.lazy_iter(axis=1): print(ib[0]) ``` - - - diff --git a/lazy_ops/lazy_loading.py b/lazy_ops/lazy_loading.py index a87003b..bf7d2d0 100644 --- a/lazy_ops/lazy_loading.py +++ b/lazy_ops/lazy_loading.py @@ -1,34 +1,65 @@ -"""Provides a class to allow for lazy transposing and slicing operations on h5py datasets -Example Usage: -import h5py +"""Provides a class to allow for lazy transposing and slicing operations on h5py datasets and zarr arrays + +## Usage: + from lazy_ops import DatasetView +# h5py # +import h5py +dsetview = DatasetView(dataset) # dataset is an instance of h5py.Dataset +view1 = dsetview.lazy_slice[1:40:2,:,0:50:5].lazy_transpose([2,0,1]).lazy_slice[8,5:10] -dsetview = DatasetView(dataset) # dataset is an instantiated h5py dataset -view1 = dsetview.lazy_slice[1:10:2,:,0:50:5].lazy_transpose([2,0,1]).lazy_slice[25:55,1,1:4:1,:] -A = view1[:] # 
Brackets on DataSetView call the h5py slicing method, that returns dataset data +# zarr # +import zarr +zarrview = DatasetView(zarray) # zarray is an instance of zarr.core.Array +view1 = zarrview.lazy_slice[1:10:2,:,5:10].lazy_transpose([0,2,1]).lazy_slice[0:3,1:4] + +# reading from view on either h5py or zarr +A = view1[:] # Brackets on DatasetView call the h5py or zarr slicing method, returning the data B = view1.dsetread() # same as view1[:] +# iterating on either h5py or zarr +for ib in view1.lazy_iter(axis=1): + print(ib[0]) + """ -import h5py import numpy as np +from abc import ABCMeta, abstractmethod +from typing import Union +import h5py +import zarr -class DatasetView(h5py.Dataset): +class DatasetView(metaclass=ABCMeta): - def __init__(self, dataset: h5py.Dataset = None, slice_index=(np.index_exp[:],()), axis_order=None): + def __new__(cls, dataset: Union[h5py.Dataset,zarr.core.Array] = None, slice_index=(np.index_exp[:],()), axis_order=None): """ Args: dataset: the underlying dataset slice_index: the aggregate slice and int indices after multiple lazy calls axis_order: the aggregate axis_order after multiple transpositions Returns: - lazy object of the view + lazy object """ - if dataset is None or isinstance(dataset,h5py.Dataset) is False: - raise TypeError("DatasetView requires a h5py.Dataset as positional argument") + if cls == DatasetView: + if isinstance(dataset,h5py.Dataset): + dsetview = DatasetViewh5py(dataset=dataset) + elif isinstance(dataset,zarr.core.Array): + dsetview = DatasetViewzarr(dataset=dataset) + else: + raise TypeError("DatasetView requires either an h5py dataset or a zarr array as first argument") + return dsetview else: - h5py.Dataset.__init__(self, dataset.id) + return super().__new__(cls) + + def __init__(self, dataset: Union[h5py.Dataset,zarr.core.Array] = None, slice_index=(np.index_exp[:],()), axis_order=None): + """ + Args: + dataset: the underlying dataset + slice_index: the aggregate slice and int indices after multiple lazy 
calls + axis_order: the aggregate axis_order after multiple transpositions + """ + if axis_order is None: self._axis_order = tuple(range(len(dataset.shape))) else: @@ -102,7 +133,7 @@ def _slice_shape(self, slice_): slice_regindices = [slice(*slice_[i].indices(self.dataset.shape[self.axis_order[i]])) if isinstance(slice_[i],slice) else slice_[i] for i in range(len(slice_))] - + slice_shape = () int_index = () axis_order = () @@ -130,11 +161,11 @@ def _slice_shape(self, slice_): return slice_shape, slice_regindices, int_index, axis_order def __getitem__(self, new_slice): - """ supports python's colon slicing syntax + """ supports python's colon slicing syntax Args: new_slice: the new slice to compose with the lazy instance's self.key slice Returns: - lazy object of the view + lazy object """ key_reinit = self._slice_composition(new_slice) if self._lazy_slice_call: @@ -257,7 +288,7 @@ def lazy_transpose(self, axis_order=None): Args: axis_order: permutation order for transpose Returns: - lazy object of the view + lazy object """ if axis_order is None: @@ -333,7 +364,7 @@ def read_direct(self, dest, source_sel=None, dest_sel=None): self.dataset.read_direct(reversed_dest, source_sel=reversed_slice_key, dest_sel=reversed_dest_sel) np.copyto(dest, reversed_dest.transpose(axis_order_read)) -def lazy_transpose(dset: h5py.Dataset, axes=None): +def lazy_transpose(dset: Union[h5py.Dataset,zarr.core.Array], axes=None): """ Array lazy transposition, not passing axis argument reverses the order of dimensions Args: dset: h5py dataset @@ -345,3 +376,20 @@ def lazy_transpose(dset: h5py.Dataset, axes=None): axes = tuple(reversed(range(len(dset.shape)))) return DatasetView(dset).lazy_transpose(axis_order=axes) + +class DatasetViewh5py(DatasetView, h5py.Dataset): + + def __new__(cls,dataset): + + _self = super().__new__(cls) + h5py.Dataset.__init__(_self, dataset.id) + return _self + +class DatasetViewzarr(DatasetView, zarr.core.Array): + + def __new__(cls,dataset): + + _self = 
super().__new__(cls) + zarr.core.Array.__init__(_self, dataset.store, path=dataset.path) + return _self + diff --git a/requirements.txt b/requirements.txt index c5a4eac..f81cb68 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1 +1,2 @@ h5py +zarr \ No newline at end of file diff --git a/tests/test_dsetview.py b/tests/test_dsetview.py index 9e06eb6..f7ce1c8 100644 --- a/tests/test_dsetview.py +++ b/tests/test_dsetview.py @@ -1,4 +1,3 @@ -import h5py import numpy as np from lazy_ops import DatasetView, lazy_transpose import secrets @@ -6,6 +5,9 @@ import unittest import tempfile from functools import wraps +import h5py +import zarr +import pytest # Define decorator to iterate over dset_list def dset_iterator(f): @@ -15,27 +17,10 @@ def wrapper(self, *args, **kwargs): f(self, *args, **kwargs) return wrapper -class LazyOpsTest(unittest.TestCase): - ''' Class array equality test ''' +class LazyOpsBase(object): srand = secrets.SystemRandom() - def setUp(self): - self.temp_file = tempfile.NamedTemporaryFile(suffix=".hdf5", delete=False) - self.temp_file.close() - self.h5py_file = h5py.File(self.temp_file.name,'w') - - self.ndims = 7 - num_datasets = 5 - self.dset_list = list(self.h5py_file.create_dataset(name='dset'+str(i), - data=np.random.rand(*self.srand.choices(range(1, 90//self.ndims), k=self.ndims))) - for i in range(num_datasets)) - self.dsetview_list = list(DatasetView(self.dset_list[i]) for i in range(num_datasets)) - - def tearDown(self): - self.temp_file.delete = True - self.temp_file.close() - @classmethod def _slices(cls,shape): ''' find an appropriate tuple of slices ''' @@ -135,12 +120,6 @@ def test_dsetview_lazy_slice(self): slices = self._slices(self.dset.shape) assert_array_equal(self.dset[slices], self.dsetview.lazy_slice[slices]) - @dset_iterator - def test_dsetview_lazy_slice_bool(self): - # test __getitem__ read after lazy_slice, single slice - indexing = self._bool_indexing(self.dset.shape) - assert_array_equal(self.dset[indexing], 
self.dsetview.lazy_slice[indexing]) - @dset_iterator def test_dsetview_lazy_slice_lower_dimensions(self): for num_slice_dims in range(1, len(self.dset.shape)+1): @@ -161,30 +140,6 @@ def test_dsetview_lazy_slice_int_indexing(self): # int indexing only assert_array_equal(self.dset[indexing], self.dsetview.lazy_slice[indexing]) - @dset_iterator - def test_dsetview_lazy_slice_array_indexing(self): - for num_slice_dims in range(1, len(self.dset.shape)+1): - indexing = self._array_indexing(self.dset.shape[:num_slice_dims]) - # test __getitem__ read specifying lower dimensions - assert_array_equal(self.dset[indexing], self.dsetview[indexing]) - # test __getitem__ read after lazy_slice - # for lower and all dimensions - # array indexing only - assert_array_equal(self.dset[indexing], self.dsetview.lazy_slice[indexing]) - - @dset_iterator - def test_dsetview_lazy_slice_bool_indexing(self): - for num_slice_dims in range(2, len(self.dset.shape)+1): - # num_slice_dims starts from 2, dset[(1-D bool np.ndarray,)] is invalid in h5py - # dset[(1-D bool np.ndarray, slice(None))] is valid - indexing = self._bool_indexing(self.dset.shape[:num_slice_dims]) - # test __getitem__ read specifying lower dimensions - assert_array_equal(self.dset[indexing], self.dsetview[indexing]) - # test __getitem__ read after lazy_slice - # for lower and all dimensions - # bool indexing only - assert_array_equal(self.dset[indexing], self.dsetview.lazy_slice[indexing]) - @dset_iterator def test_dsetview_lazy_iter(self): for axis in range(len(self.dset.shape)): @@ -237,48 +192,6 @@ def _dsetview_multi_lazy_slice_with_slice_and_int_indexing(cls, dset, dsetview): if np.prod(dset_new.shape) != 0: cls._dsetview_multi_lazy_slice_with_slice_and_int_indexing(dset_new, dsetview_new) - # multi lazy_slice using slices and array indexing - @dset_iterator - def test_dsetview_multi_lazy_slice_with_slice_and_array_indexing(self): - remaining_slice_calls = 10 - array_dim = self.srand.randint(0, len(self.dset.shape)-1) - 
self._dsetview_multi_lazy_slice_with_slice_and_array_indexing(self.dset, self.dsetview, remaining_slice_calls, array_dim) - - @classmethod - def _dsetview_multi_lazy_slice_with_slice_and_array_indexing(cls, dset, dsetview, remaining_slice_calls, array_dim): - for num_slice_dims in range(array_dim+1, len(dset.shape)+1): - indexing = cls._slices_and_array(dset.shape[:num_slice_dims], array_dim) - dset_new = dset[indexing] - dsetview_new = dsetview.lazy_slice[indexing] - # test __getitem__ read after lazy_slice - # for lower and all dimensions - # combination of slice and array indexing - assert_array_equal(dset_new, dsetview_new) - if np.prod(dset_new.shape) != 0 and remaining_slice_calls > 0: - cls._dsetview_multi_lazy_slice_with_slice_and_array_indexing(dset_new, dsetview_new, remaining_slice_calls - 1, array_dim) - - # multi lazy_slice using slices and boolean array indexing - @dset_iterator - def test_dsetview_multi_lazy_slice_with_slice_and_bool_indexing(self): - remaining_slice_calls = 4 - array_dim = self.srand.randint(1, len(self.dset.shape)-1) - # array_dim starts from 1, for array_dim=0, dset[(1-D bool np.ndarray,)] is invalid in h5py - # dset[(slice(None),1-D bool np.ndarray)] is valid - self._dsetview_multi_lazy_slice_with_slice_and_bool_indexing(self.dset, self.dsetview, remaining_slice_calls, array_dim) - - @classmethod - def _dsetview_multi_lazy_slice_with_slice_and_bool_indexing(cls, dset, dsetview, remaining_slice_calls, array_dim): - for num_slice_dims in range(array_dim+1, len(dset.shape)+1): - indexing = cls._slices_and_bool(dset.shape[:num_slice_dims], array_dim) - dset_new = dset[indexing] - dsetview_new = dsetview.lazy_slice[indexing] - # test __getitem__ read after lazy_slice - # for lower and all dimensions - # combination of slice and bool indexing - assert_array_equal(dset_new, dsetview_new) - if np.prod(dset_new.shape) != 0 and remaining_slice_calls > 0: - cls._dsetview_multi_lazy_slice_with_slice_and_bool_indexing(dset_new, dsetview_new, 
remaining_slice_calls - 1, array_dim) - ########################################### # tests for multiple lazy operation calls # ########################################### @@ -352,3 +265,121 @@ def _dsetview_multi_lazy_ops_with_slice_and_int_indexing(cls, dset, dsetview): if np.prod(dset_new.shape) != 0: cls._dsetview_multi_lazy_ops_with_slice_and_int_indexing(dset_new, dsetview_new) +class LazyOpsBaseh5py(object): + + ########################################################### + # tests for single lazy operation calls specific to h5py # + ########################################################### + + @dset_iterator + def test_dsetview_lazy_slice_bool(self): + # test __getitem__ read after lazy_slice, single slice + indexing = self._bool_indexing(self.dset.shape) + assert_array_equal(self.dset[indexing], self.dsetview.lazy_slice[indexing]) + + @dset_iterator + def test_dsetview_lazy_slice_array_indexing(self): + for num_slice_dims in range(1, len(self.dset.shape)+1): + indexing = self._array_indexing(self.dset.shape[:num_slice_dims]) + # test __getitem__ read specifying lower dimensions + assert_array_equal(self.dset[indexing], self.dsetview[indexing]) + # test __getitem__ read after lazy_slice + # for lower and all dimensions + # array indexing only + assert_array_equal(self.dset[indexing], self.dsetview.lazy_slice[indexing]) + + @dset_iterator + def test_dsetview_lazy_slice_bool_indexing(self): + for num_slice_dims in range(2, len(self.dset.shape)+1): + # num_slice_dims starts from 2, dset[(1-D bool np.ndarray,)] is invalid in h5py + # dset[(1-D bool np.ndarray, slice(None))] is valid + indexing = self._bool_indexing(self.dset.shape[:num_slice_dims]) + # test __getitem__ read specifying lower dimensions + assert_array_equal(self.dset[indexing], self.dsetview[indexing]) + # test __getitem__ read after lazy_slice + # for lower and all dimensions + # bool indexing only + assert_array_equal(self.dset[indexing], self.dsetview.lazy_slice[indexing]) + + 
######################################################## + # tests for multiple lazy slice calls specific to h5py # + ######################################################## + + # multi lazy_slice using slices and array indexing + @dset_iterator + def test_dsetview_multi_lazy_slice_with_slice_and_array_indexing(self): + remaining_slice_calls = 10 + array_dim = self.srand.randint(0, len(self.dset.shape)-1) + self._dsetview_multi_lazy_slice_with_slice_and_array_indexing(self.dset, self.dsetview, remaining_slice_calls, array_dim) + + @classmethod + def _dsetview_multi_lazy_slice_with_slice_and_array_indexing(cls, dset, dsetview, remaining_slice_calls, array_dim): + for num_slice_dims in range(array_dim+1, len(dset.shape)+1): + indexing = cls._slices_and_array(dset.shape[:num_slice_dims], array_dim) + dset_new = dset[indexing] + dsetview_new = dsetview.lazy_slice[indexing] + # test __getitem__ read after lazy_slice + # for lower and all dimensions + # combination of slice and array indexing + assert_array_equal(dset_new, dsetview_new) + if np.prod(dset_new.shape) != 0 and remaining_slice_calls > 0: + cls._dsetview_multi_lazy_slice_with_slice_and_array_indexing(dset_new, dsetview_new, remaining_slice_calls - 1, array_dim) + + # multi lazy_slice using slices and boolean array indexing + @dset_iterator + def test_dsetview_multi_lazy_slice_with_slice_and_bool_indexing(self): + remaining_slice_calls = 4 + array_dim = self.srand.randint(1, len(self.dset.shape)-1) + # array_dim starts from 1, for array_dim=0, dset[(1-D bool np.ndarray,)] is invalid in h5py + # dset[(slice(None),1-D bool np.ndarray)] is valid + self._dsetview_multi_lazy_slice_with_slice_and_bool_indexing(self.dset, self.dsetview, remaining_slice_calls, array_dim) + + @classmethod + def _dsetview_multi_lazy_slice_with_slice_and_bool_indexing(cls, dset, dsetview, remaining_slice_calls, array_dim): + for num_slice_dims in range(array_dim+1, len(dset.shape)+1): + indexing = 
cls._slices_and_bool(dset.shape[:num_slice_dims], array_dim) + dset_new = dset[indexing] + dsetview_new = dsetview.lazy_slice[indexing] + # test __getitem__ read after lazy_slice + # for lower and all dimensions + # combination of slice and bool indexing + assert_array_equal(dset_new, dsetview_new) + if np.prod(dset_new.shape) != 0 and remaining_slice_calls > 0: + cls._dsetview_multi_lazy_slice_with_slice_and_bool_indexing(dset_new, dsetview_new, remaining_slice_calls - 1, array_dim) + +class LazyOpszarrTest(LazyOpsBase,unittest.TestCase): + ''' Class zarr array equality test ''' + + def setUp(self): + self.ndims = 7 + num_datasets = 3 + + self.temp_dir_zarr = tempfile.TemporaryDirectory(suffix=".zgroup") + self.zarr_group = zarr.group(store=self.temp_dir_zarr.name, overwrite=True) + self.dset_list = list(self.zarr_group.create_dataset(name='zarray'+str(i), + data=np.random.rand(*self.srand.choices(range(1, 90//self.ndims), k=self.ndims))) + for i in range(num_datasets)) + self.dsetview_list = list(DatasetView(self.dset_list[i]) for i in range(num_datasets)) + print(LazyOpszarrTest) + + def tearDown(self): + self.temp_dir_zarr.cleanup() + +class LazyOpsh5pyTest(LazyOpsBase,LazyOpsBaseh5py,unittest.TestCase): + ''' Class h5py dataset array equality test ''' + + def setUp(self): + self.temp_file = tempfile.NamedTemporaryFile(suffix=".hdf5", delete=False) + self.temp_file.close() + self.h5py_file = h5py.File(self.temp_file.name,'w') + + self.ndims = 7 + num_datasets = 3 + self.dset_list = list(self.h5py_file.create_dataset(name='dset'+str(i), + data=np.random.rand(*self.srand.choices(range(1, 90//self.ndims), k=self.ndims))) + for i in range(num_datasets)) + self.dsetview_list = list(DatasetView(self.dset_list[i]) for i in range(num_datasets)) + + def tearDown(self): + self.temp_file.delete = True + self.temp_file.close() diff --git a/tests/test_file.py b/tests/test_file.py deleted file mode 100644 index bdf2e27..0000000 --- a/tests/test_file.py +++ /dev/null @@ -1,82 
+0,0 @@ -import os -import h5py -import numpy as np -from lazy_ops import DatasetView -import secrets -from random import shuffle -from numpy.testing import assert_array_equal - - -def test_func(): - - f = h5py.File('testfile.hdf5','w') - ndims = 4 - secret_rand = secrets.SystemRandom() - dset = f.create_dataset(name='test_dataset', data=np.random.rand(*secret_rand.sample(range(1, 200//ndims), ndims))) - dsetview = DatasetView(dset) - - randslice = lambda d: slice(*secret_rand.sample(range(-dsetview.shape[d] * 5 // 4, dsetview.shape[d] * 5 // 4), 1), - *secret_rand.sample(range(-dsetview.shape[d] * 5 // 4, dsetview.shape[d] * 5 // 4), 1), - *secret_rand.sample(range(1, dsetview.shape[d] + 1), 1)) - - for _ in range(5): - - slice_list = [randslice(0), - np.s_[:], - np.index_exp[:], - tuple(randslice(i) for i in range(secrets.randbelow(len(dsetview.shape)+1))), - tuple(randslice(i) for i in range(secrets.randbelow(len(dsetview.shape)+1))), - tuple(randslice(i) for i in range(secrets.randbelow(len(dsetview.shape)+1))), - tuple(randslice(i) for i in range(secrets.randbelow(len(dsetview.shape)+1))), - tuple(randslice(i) for i in range(secrets.randbelow(len(dsetview.shape)+1)))] - - slice_list = [secrets.choice(slice_list) for _ in range(5)] - - shuffle_list = [list(range(ndims)) for _ in range(5)] - for li in shuffle_list: - shuffle(li) - - assert_array_equal(dset[slice_list[1]], - dsetview.lazy_slice[slice_list[1]]) - - assert_array_equal(dset[slice_list[1]].T, dsetview.lazy_slice[slice_list[1]].lazy_transpose()) - - assert_array_equal(dset[slice_list[1]].T, dsetview.lazy_slice[slice_list[1]].T) - - assert_array_equal(dsetview[slice_list[1]][slice_list[2]], - dsetview.lazy_slice[slice_list[1]].lazy_slice[slice_list[2]]) - - assert_array_equal(dsetview[slice_list[1]][slice_list[2]][slice_list[3]], - dsetview.lazy_slice[slice_list[1]].lazy_slice[slice_list[2]].lazy_slice[slice_list[3]]) - - assert_array_equal(dset[slice_list[1]][slice_list[2]][slice_list[3]], - 
dsetview.lazy_slice[slice_list[1]].lazy_slice[slice_list[2]].lazy_slice[slice_list[3]]) - - assert_array_equal(dset[slice_list[1]].transpose(), dsetview.lazy_slice[slice_list[1]].lazy_transpose()) - - assert_array_equal(dsetview[slice_list[1]].transpose(), dsetview.lazy_slice[slice_list[1]].lazy_transpose()) - - assert_array_equal(dsetview[slice_list[1]].transpose()[slice_list[2]], - dsetview.lazy_slice[slice_list[1]].lazy_transpose().lazy_slice[slice_list[2]]) - - assert_array_equal(dsetview[slice_list[1]].transpose()[slice_list[2]][slice_list[3]], - dsetview.lazy_slice[slice_list[1]].lazy_transpose().lazy_slice[slice_list[2]]. - lazy_slice[slice_list[3]]) - - assert_array_equal(dset[slice_list[1]].transpose()[slice_list[2]][slice_list[3]], - dsetview.lazy_slice[slice_list[1]].lazy_transpose().lazy_slice[slice_list[2]]. - lazy_slice[slice_list[3]]) - - assert_array_equal(dsetview[slice_list[1]].transpose(shuffle_list[3])[slice_list[2]][slice_list[3]], - dsetview.lazy_slice[slice_list[1]].lazy_transpose(shuffle_list[3]).lazy_slice[slice_list[2]]. - lazy_slice[slice_list[3]].dsetread()) - - assert_array_equal(dset[slice_list[1]].transpose(shuffle_list[3])[slice_list[2]][slice_list[3]]. - transpose(shuffle_list[4]), - dsetview.lazy_slice[slice_list[1]].lazy_transpose(shuffle_list[3]).lazy_slice[slice_list[2]] - .lazy_slice[slice_list[3]].lazy_transpose(shuffle_list[4])) - - assert_array_equal(dset[:].transpose(shuffle_list[0]), dsetview.lazy_transpose(shuffle_list[0]).dsetread()) - - os.remove('testfile.hdf5') - diff --git a/tox.ini b/tox.ini new file mode 100644 index 0000000..560630e --- /dev/null +++ b/tox.ini @@ -0,0 +1,9 @@ +[tox] +envlist = py35, py36, py37, py38 +[testenv] +# install testing framework +deps = + -rrequirements.txt + pytest +# run the tests +commands = pytest