From aef6710e359bb54baa29a0cf34b48050a55bf05b Mon Sep 17 00:00:00 2001 From: d-sot <54951034+d-sot@users.noreply.github.com> Date: Fri, 7 Feb 2020 01:29:04 -0500 Subject: [PATCH] Bool indexing, tests (#17) * add 1-D boolean indexing support * add tests * add requirements.txt --- .github/workflows/pythonapp.yml | 1 + lazy_ops/lazy_loading.py | 41 +++++---- requirements.txt | 1 + tests/test_dsetview.py | 157 +++++++++++++++++++++++++++----- 4 files changed, 162 insertions(+), 38 deletions(-) create mode 100644 requirements.txt diff --git a/.github/workflows/pythonapp.yml b/.github/workflows/pythonapp.yml index 7dcf795..5f44587 100644 --- a/.github/workflows/pythonapp.yml +++ b/.github/workflows/pythonapp.yml @@ -16,6 +16,7 @@ jobs: - name: Install dependencies run: | python -m pip install --upgrade pip + pip install -r requirements.txt pip install . - name: Test with pytest run: | diff --git a/lazy_ops/lazy_loading.py b/lazy_ops/lazy_loading.py index 59ab8a4..a87003b 100644 --- a/lazy_ops/lazy_loading.py +++ b/lazy_ops/lazy_loading.py @@ -14,7 +14,6 @@ import h5py import numpy as np - class DatasetView(h5py.Dataset): def __init__(self, dataset: h5py.Dataset = None, slice_index=(np.index_exp[:],()), axis_order=None): @@ -26,8 +25,10 @@ def __init__(self, dataset: h5py.Dataset = None, slice_index=(np.index_exp[:],() Returns: lazy object of the view """ - - h5py.Dataset.__init__(self, dataset.id) + if dataset is None or isinstance(dataset,h5py.Dataset) is False: + raise TypeError("DatasetView requires a h5py.Dataset as positional argument") + else: + h5py.Dataset.__init__(self, dataset.id) if axis_order is None: self._axis_order = tuple(range(len(dataset.shape))) else: @@ -72,7 +73,7 @@ def _slice_tuple(self, key): Returns: The slice object tuple """ - if isinstance(key, (slice,int)): + if isinstance(key, (slice,int,np.ndarray)): key = key, else: key = *key, @@ -88,9 +89,9 @@ def _slice_shape(self, slice_): slice_key: An equivalent slice tuple with positive starts and 
stops int_index: a nested tuple, int_index records the information needed by dsetread to access data Each element of int_index, denoted ind is given by: - int_index[2] is the dataset axis at which the integer index operates - int_index[1] is the value of the integer index entered by the user - int_index[0] is the lazy_axis at which the integer index operates + ind[2] is the dataset axis at which the integer index operates + ind[1] is the value of the integer index entered by the user + ind[0] is the lazy_axis at which the integer index operates ,the lazy_axis is the axis number had the operations been carried out by h5py instead of lazy_ops axis_order: removes the elements of current axis_order where integer indexing has been applied @@ -220,20 +221,28 @@ def _slice_composition(self, new_slice): slice_result += (new_slice[i],) else: try: - if any(not isinstance(el,int) for el in new_slice[i]): - raise ValueError("Indices must be integers") + if not all(isinstance(el,int) for el in new_slice[i]): + if new_slice[i].dtype.kind != 'b': + raise ValueError("Indices must be either integers or booleans") + else: + # boolean indexing + if len(new_slice[i]) != self.shape[i]: + raise IndexError("Length of boolean index %d must be equal to size %d in dim %d" % (len(new_slice[i]),self.shape[i],i)) + new_slice_i = new_slice[i].nonzero()[0] + else: + new_slice_i = new_slice[i] if i < len(self.key): - if any(el >= self._shape[i] or el <= ~self._shape[i] for el in new_slice[i]): - raise IndexError("Index %s out of range, dim %d of size %d" % (str(new_slice[i]),i,self._shape[i])) + if any(el >= self._shape[i] or el <= ~self._shape[i] for el in new_slice_i): + raise IndexError("Index %s out of range, dim %d of size %d" % (str(new_slice_i),i,self._shape[i])) if isinstance(self.key[i],slice): - slice_result += (tuple(self.key[i].start + self.key[i].step*(ind%self._shape[i]) for ind in new_slice[i]),) + slice_result += (tuple(self.key[i].start + self.key[i].step*(ind%self._shape[i]) for 
ind in new_slice_i),) else: # self.key[i] is an iterator of integers - slice_result += (tuple(self.key[i][ind] for ind in new_slice[i]),) + slice_result += (tuple(self.key[i][ind] for ind in new_slice_i),) else: - slice_result += (new_slice[i],) + slice_result += (new_slice_i,) except: - raise IndexError("Indices must be either integers, iterators of integers, or slice objects") + raise IndexError("Indices must be either integers, iterators of integers, slice objects, or numpy boolean arrays") slice_result += self.key[len(new_slice):] return slice_result @@ -273,7 +282,7 @@ def _ellipsis_slices(self, new_slice): Returns: equivalent slices with Ellipsis expanded """ - ellipsis_count = new_slice.count(Ellipsis) + ellipsis_count = sum(s==Ellipsis for s in new_slice if not isinstance(s,np.ndarray)) if ellipsis_count == 1: ellipsis_index = new_slice.index(Ellipsis) if ellipsis_index == len(new_slice)-1: diff --git a/requirements.txt b/requirements.txt new file mode 100644 index 0000000..c5a4eac --- /dev/null +++ b/requirements.txt @@ -0,0 +1 @@ +h5py diff --git a/tests/test_dsetview.py b/tests/test_dsetview.py index cec4c81..9e06eb6 100644 --- a/tests/test_dsetview.py +++ b/tests/test_dsetview.py @@ -27,7 +27,6 @@ def setUp(self): self.ndims = 7 num_datasets = 5 - self.srand = secrets.SystemRandom() self.dset_list = list(self.h5py_file.create_dataset(name='dset'+str(i), data=np.random.rand(*self.srand.choices(range(1, 90//self.ndims), k=self.ndims))) for i in range(num_datasets)) @@ -65,15 +64,46 @@ def _array_indexing(cls,shape): return tuple(slice(None,None,None) if i != single_array_dim else single_array_indexing for i in range(len(shape))) + @classmethod + def _bool_indexing(cls,shape): + ''' find an appropriate tuple with a single array index ''' + single_array_dim = cls.srand.randrange(0,len(shape)) + single_bool_indexing = np.array(cls.srand.choices([True,False], k=shape[single_array_dim])) + return tuple(slice(None,None,None) if i != single_array_dim else 
single_bool_indexing + for i in range(len(shape))) + @classmethod def _slices_and_int(cls,shape): ''' find an appropriate tuple of slices and integers ''' return tuple(slice(cls.srand.randint(~s-1, s+1), cls.srand.randint(~s-1, s+1), cls.srand.randint(1, s)) - if cls.srand.choice([True, False]) else + if cls.srand.choice([True, False]) else cls.srand.randint(0, s-1) for s in shape) + @classmethod + def _slices_and_array(cls,shape, single_array_dim): + ''' find an appropriate tuple of slices and a single array index''' + single_array_len = cls.srand.randrange(0,shape[single_array_dim]) + single_array_indexing = sorted(cls.srand.sample(range(shape[single_array_dim]), + single_array_len)) + return tuple(slice(cls.srand.randint(~s-1, s+1), cls.srand.randint(~s-1, s+1), + cls.srand.randint(1, s)) + if i != single_array_dim else + single_array_indexing + for i, s in enumerate(shape)) + + @classmethod + def _slices_and_bool(cls,shape, single_array_dim): + ''' find an appropriate tuple of slices and a single array index''' + single_bool_indexing = np.array(cls.srand.choices([True,False], k=shape[single_array_dim])) + return tuple(slice(cls.srand.randint(~s-1, s+1), cls.srand.randint(~s-1, s+1), + cls.srand.randint(1, s)) + if i != single_array_dim else + single_bool_indexing + for i, s in enumerate(shape)) + + ########################################## # basic tests # ########################################## @@ -105,6 +135,12 @@ def test_dsetview_lazy_slice(self): slices = self._slices(self.dset.shape) assert_array_equal(self.dset[slices], self.dsetview.lazy_slice[slices]) + @dset_iterator + def test_dsetview_lazy_slice_bool(self): + # test __getitem__ read after lazy_slice, single slice + indexing = self._bool_indexing(self.dset.shape) + assert_array_equal(self.dset[indexing], self.dsetview.lazy_slice[indexing]) + @dset_iterator def test_dsetview_lazy_slice_lower_dimensions(self): for num_slice_dims in range(1, len(self.dset.shape)+1): @@ -136,6 +172,19 @@ def 
test_dsetview_lazy_slice_array_indexing(self): # array indexing only assert_array_equal(self.dset[indexing], self.dsetview.lazy_slice[indexing]) + @dset_iterator + def test_dsetview_lazy_slice_bool_indexing(self): + for num_slice_dims in range(2, len(self.dset.shape)+1): + # num_slice_dims starts from 2, dset[(1-D bool np.ndarray,)] is invalid in h5py + # dset[(1-D bool np.ndarray, slice(None))] is valid + indexing = self._bool_indexing(self.dset.shape[:num_slice_dims]) + # test __getitem__ read specifying lower dimensions + assert_array_equal(self.dset[indexing], self.dsetview[indexing]) + # test __getitem__ read after lazy_slice + # for lower and all dimensions + # bool indexing only + assert_array_equal(self.dset[indexing], self.dsetview.lazy_slice[indexing]) + @dset_iterator def test_dsetview_lazy_iter(self): for axis in range(len(self.dset.shape)): @@ -150,6 +199,86 @@ def test_dsetview_lazy_transpose(self): # test lazy_ops.lazy_transpose assert_array_equal(np.transpose(self.dset[()], axis),lazy_transpose(self.dsetview, axis)) + ########################################### + # tests for multiple lazy slice calls # + ########################################### + + # multi lazy_slice using only slices + @dset_iterator + def test_dsetview_multi_lazy_slice(self): + self._dsetview_multi_lazy_slice(self.dset, self.dsetview) + + @classmethod + def _dsetview_multi_lazy_slice(cls, dset, dsetview): + for num_slice_dims in range(1, len(dset.shape)+1): + slices = cls._slices(dset.shape[:num_slice_dims]) + dset_new = dset[slices] + dsetview_new = dsetview.lazy_slice[slices] + # test __getitem__ read after lazy_slice for lower dimensions + assert_array_equal(dset_new, dsetview_new) + if np.prod(dset_new.shape) != 0: + cls._dsetview_multi_lazy_slice(dset_new, dsetview_new) + + # multi lazy_slice using slices and int indexing + @dset_iterator + def test_dsetview_multi_lazy_slice_with_slice_and_int_indexing(self): + 
self._dsetview_multi_lazy_slice_with_slice_and_int_indexing(self.dset, self.dsetview) + + @classmethod + def _dsetview_multi_lazy_slice_with_slice_and_int_indexing(cls, dset, dsetview): + for num_slice_dims in range(1, len(dset.shape)+1): + indexing = cls._slices_and_int(dset.shape[:num_slice_dims]) + dset_new = dset[indexing] + dsetview_new = dsetview.lazy_slice[indexing] + # test __getitem__ read after lazy_slice + # for lower and all dimensions + # combination of slice and int indexing + assert_array_equal(dset_new, dsetview_new) + if np.prod(dset_new.shape) != 0: + cls._dsetview_multi_lazy_slice_with_slice_and_int_indexing(dset_new, dsetview_new) + + # multi lazy_slice using slices and array indexing + @dset_iterator + def test_dsetview_multi_lazy_slice_with_slice_and_array_indexing(self): + remaining_slice_calls = 10 + array_dim = self.srand.randint(0, len(self.dset.shape)-1) + self._dsetview_multi_lazy_slice_with_slice_and_array_indexing(self.dset, self.dsetview, remaining_slice_calls, array_dim) + + @classmethod + def _dsetview_multi_lazy_slice_with_slice_and_array_indexing(cls, dset, dsetview, remaining_slice_calls, array_dim): + for num_slice_dims in range(array_dim+1, len(dset.shape)+1): + indexing = cls._slices_and_array(dset.shape[:num_slice_dims], array_dim) + dset_new = dset[indexing] + dsetview_new = dsetview.lazy_slice[indexing] + # test __getitem__ read after lazy_slice + # for lower and all dimensions + # combination of slice and array indexing + assert_array_equal(dset_new, dsetview_new) + if np.prod(dset_new.shape) != 0 and remaining_slice_calls > 0: + cls._dsetview_multi_lazy_slice_with_slice_and_array_indexing(dset_new, dsetview_new, remaining_slice_calls - 1, array_dim) + + # multi lazy_slice using slices and boolean array indexing + @dset_iterator + def test_dsetview_multi_lazy_slice_with_slice_and_bool_indexing(self): + remaining_slice_calls = 4 + array_dim = self.srand.randint(1, len(self.dset.shape)-1) + # array_dim starts from 1, for 
array_dim=0, dset[(1-D bool np.ndarray,)] is invalid in h5py + # dset[(slice(None),1-D bool np.ndarray)] is valid + self._dsetview_multi_lazy_slice_with_slice_and_bool_indexing(self.dset, self.dsetview, remaining_slice_calls, array_dim) + + @classmethod + def _dsetview_multi_lazy_slice_with_slice_and_bool_indexing(cls, dset, dsetview, remaining_slice_calls, array_dim): + for num_slice_dims in range(array_dim+1, len(dset.shape)+1): + indexing = cls._slices_and_bool(dset.shape[:num_slice_dims], array_dim) + dset_new = dset[indexing] + dsetview_new = dsetview.lazy_slice[indexing] + # test __getitem__ read after lazy_slice + # for lower and all dimensions + # combination of slice and bool indexing + assert_array_equal(dset_new, dsetview_new) + if np.prod(dset_new.shape) != 0 and remaining_slice_calls > 0: + cls._dsetview_multi_lazy_slice_with_slice_and_bool_indexing(dset_new, dsetview_new, remaining_slice_calls - 1, array_dim) + ########################################### # tests for multiple lazy operation calls # ########################################### @@ -180,22 +309,6 @@ def _dsetview_multi_lazy_transpose(self, dset, dsetview, remaining_transpose_cal if remaining_transpose_calls > 0: self._dsetview_multi_lazy_transpose(dset_new, dsetview_new, remaining_transpose_calls - 1) - # multi lazy_slice using only slices - @dset_iterator - def test_dsetview_multi_lazy_slice(self): - self._dsetview_multi_lazy_slice(self.dset, self.dsetview) - - @classmethod - def _dsetview_multi_lazy_slice(cls, dset, dsetview): - for num_slice_dims in range(1, len(dset.shape)+1): - slices = cls._slices(dset.shape[:num_slice_dims]) - dset_new = dset[slices] - dsetview_new = dsetview.lazy_slice[slices] - # test __getitem__ read after lazy_slice for lower dimensions - assert_array_equal(dset_new, dsetview_new) - if np.prod(dset_new.shape) != 0: - cls._dsetview_multi_lazy_slice(dset_new, dsetview_new) - # multi lazy_transpose and lazy_slice using only slices @dset_iterator def 
test_dsetview_multi_lazy_ops_with_slice_indexing(self): @@ -223,11 +336,11 @@ def _dsetview_multi_lazy_ops_with_slice_indexing(cls, dset, dsetview, remaining_ # multi lazy_transpose and lazy_slice using slices and int @dset_iterator - def test_dsetview_multi_lazy_slice_with_slice_and_int_indexing(self): - self._dsetview_multi_lazy_slice_with_slice_and_int_indexing(self.dset, self.dsetview) + def test_dsetview_multi_lazy_ops_with_slice_and_int_indexing(self): + self._dsetview_multi_lazy_ops_with_slice_and_int_indexing(self.dset, self.dsetview) @classmethod - def _dsetview_multi_lazy_slice_with_slice_and_int_indexing(cls, dset, dsetview): + def _dsetview_multi_lazy_ops_with_slice_and_int_indexing(cls, dset, dsetview): for num_slice_dims in range(1, len(dset.shape)+1): slices = cls._slices_and_int(dset.shape[:num_slice_dims]) dset_new = dset[slices] @@ -237,5 +350,5 @@ def _dsetview_multi_lazy_slice_with_slice_and_int_indexing(cls, dset, dsetview): # combination of slice and int indexing assert_array_equal(dset_new, dsetview_new) if np.prod(dset_new.shape) != 0: - cls._dsetview_multi_lazy_slice_with_slice_and_int_indexing(dset_new, dsetview_new) + cls._dsetview_multi_lazy_ops_with_slice_and_int_indexing(dset_new, dsetview_new)