From 5604eac9aeac4de1ecf06f5a51ec2ff734d9fe03 Mon Sep 17 00:00:00 2001 From: giovp Date: Sun, 14 Nov 2021 11:54:48 +0100 Subject: [PATCH 001/110] first attempt to support awkward arrays --- anndata/_core/aligned_mapping.py | 25 +++++++++++++++- anndata/_core/anndata.py | 2 +- anndata/_core/index.py | 15 ++++++++++ anndata/_core/views.py | 49 ++++++++++++++++++++++++++++++++ 4 files changed, 89 insertions(+), 2 deletions(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index 2c8430794..56ad13e1b 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -3,6 +3,17 @@ from typing import Union, Optional, Type, ClassVar, TypeVar # Special types from typing import Iterator, Mapping, Sequence # ABCs from typing import Tuple, List, Dict # Generic base types +from functools import singledispatch + +import warnings + +try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + import awkward as ak +except ImportError: + ak = None + import numpy as np import pandas as pd @@ -47,7 +58,7 @@ def _ipython_key_completions_(self) -> List[str]: def _validate_value(self, val: V, key: str) -> V: """Raises an error if value is invalid""" for i, axis in enumerate(self.axes): - if self.parent.shape[axis] != val.shape[i]: + if self.parent.shape[axis] != dim_len(val, i): # val.shape[i]: right_shape = tuple(self.parent.shape[a] for a in self.axes) raise ValueError( f"Value passed for key {key!r} is of incorrect shape. 
" @@ -349,3 +360,15 @@ def __init__( PairwiseArraysBase._view_class = PairwiseArraysView PairwiseArraysBase._actual_class = PairwiseArrays + + +@singledispatch +def dim_len(x, dim): + return x.shape[dim] + + +@dim_len.register(ak.Array) +def dim_len_array(x, dim): + if dim != 0: + raise IndexError() + return len(x) diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index e53f2226d..95966dcc2 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -1846,7 +1846,7 @@ def _check_dimensions(self, key=None): if "obsm" in key: obsm = self._obsm if ( - not all([o.shape[0] == self._n_obs for o in obsm.values()]) + not all([len(o) == self._n_obs for o in obsm.values()]) and len(obsm.dim_names) != self._n_obs ): raise ValueError( diff --git a/anndata/_core/index.py b/anndata/_core/index.py index 8082d48c6..5da327105 100644 --- a/anndata/_core/index.py +++ b/anndata/_core/index.py @@ -8,6 +8,14 @@ import pandas as pd from scipy.sparse import spmatrix, issparse +import warnings + +try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + import awkward as ak +except ImportError: + ak = None Index1D = Union[slice, int, str, np.int64, np.ndarray] Index = Union[Index1D, Tuple[Index1D, Index1D], spmatrix] @@ -140,6 +148,13 @@ def _subset_df(df: pd.DataFrame, subset_idx: Index): return df.iloc[subset_idx] +@_subset.register(ak.Array) +def _subset_awkarray(a: ak.Array, subset_idx: Index): + if all(isinstance(x, cabc.Iterable) for x in subset_idx): + subset_idx = np.ix_(*subset_idx) + return a[subset_idx] + + # Registration for SparseDataset occurs in sparse_dataset.py @_subset.register(h5py.Dataset) def _subset_dataset(d, subset_idx): diff --git a/anndata/_core/views.py b/anndata/_core/views.py index 1afc0be4b..497424417 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -12,6 +12,15 @@ from ..logging import anndata_logger as logger from ..compat import ZappyArray +import warnings + +try: + with 
warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + import awkward as ak +except ImportError: + ak = None + class _SetItemMixin: """\ @@ -112,6 +121,41 @@ def drop(self, *args, inplace: bool = False, **kw): df.drop(*args, inplace=True, **kw) +class AwkwardArrayView(_ViewMixin, ak.Array): + def copy(self, order: str = "C") -> np.ndarray: + # we want a conventional array + return ak.copy(self) + + +# class AwkwardArrayView(_SetItemMixin, np.ndarray): +# def __new__( +# cls, +# input_array: Sequence[Any], +# view_args: Tuple["anndata.AnnData", str, Tuple[str, ...]] = None, +# ): +# arr = np.asanyarray(input_array).view(cls) + +# if view_args is not None: +# view_args = ElementRef(*view_args) +# arr._view_args = view_args +# return arr + +# def __array_finalize__(self, obj: Optional[np.ndarray]): +# if obj is not None: +# self._view_args = getattr(obj, "_view_args", None) + +# def keys(self) -> KeysView[str]: +# # it’s a structured array +# return self.dtype.names + +# def copy(self, order: str = "C") -> np.ndarray: +# # we want a conventional array +# return np.array(self) + +# def toarray(self) -> np.ndarray: +# return self.copy() + + @singledispatch def as_view(obj, view_args): raise NotImplementedError(f"No view type has been registered for {type(obj)}") @@ -149,6 +193,11 @@ def as_view_zappy(z, view_args): return z +@as_view.register(ak.Array) +def as_view_awkarray(array, view_args): + return AwkwardArrayView(array, view_args=view_args) + + def _resolve_idxs(old, new, adata): t = tuple(_resolve_idx(old[i], new[i], adata.shape[i]) for i in (0, 1)) return t From 7dbe9084a5084c241a821bbf4a14286e5e5eec75 Mon Sep 17 00:00:00 2001 From: giovp Date: Sun, 14 Nov 2021 12:13:43 +0100 Subject: [PATCH 002/110] remove comments --- anndata/_core/views.py | 29 ----------------------------- 1 file changed, 29 deletions(-) diff --git a/anndata/_core/views.py b/anndata/_core/views.py index 497424417..980c1ee41 100644 --- a/anndata/_core/views.py +++ 
b/anndata/_core/views.py @@ -127,35 +127,6 @@ def copy(self, order: str = "C") -> np.ndarray: return ak.copy(self) -# class AwkwardArrayView(_SetItemMixin, np.ndarray): -# def __new__( -# cls, -# input_array: Sequence[Any], -# view_args: Tuple["anndata.AnnData", str, Tuple[str, ...]] = None, -# ): -# arr = np.asanyarray(input_array).view(cls) - -# if view_args is not None: -# view_args = ElementRef(*view_args) -# arr._view_args = view_args -# return arr - -# def __array_finalize__(self, obj: Optional[np.ndarray]): -# if obj is not None: -# self._view_args = getattr(obj, "_view_args", None) - -# def keys(self) -> KeysView[str]: -# # it’s a structured array -# return self.dtype.names - -# def copy(self, order: str = "C") -> np.ndarray: -# # we want a conventional array -# return np.array(self) - -# def toarray(self) -> np.ndarray: -# return self.copy() - - @singledispatch def as_view(obj, view_args): raise NotImplementedError(f"No view type has been registered for {type(obj)}") From c0bbf5ad9bba9b2a153749dba6aa76afe9ac1d1e Mon Sep 17 00:00:00 2001 From: giovp Date: Sun, 14 Nov 2021 12:14:06 +0100 Subject: [PATCH 003/110] better comment --- anndata/_core/views.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anndata/_core/views.py b/anndata/_core/views.py index 980c1ee41..ef6dfad9b 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -123,7 +123,7 @@ def drop(self, *args, inplace: bool = False, **kw): class AwkwardArrayView(_ViewMixin, ak.Array): def copy(self, order: str = "C") -> np.ndarray: - # we want a conventional array + # we want a copy of an akward array return ak.copy(self) From 028132465318ff275b79ae601a00101b4ed8f4f0 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 15 Nov 2021 15:31:41 +0100 Subject: [PATCH 004/110] add type to gen_adata --- anndata/_core/anndata.py | 3 ++- anndata/tests/helpers.py | 21 +++++++++++++++++++-- 2 files changed, 21 insertions(+), 3 deletions(-) diff --git a/anndata/_core/anndata.py 
b/anndata/_core/anndata.py index 95966dcc2..75eec179c 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -56,6 +56,7 @@ _overloaded_uns, OverloadedDict, ) +from .aligned_mapping import dim_len class StorageType(Enum): @@ -1846,7 +1847,7 @@ def _check_dimensions(self, key=None): if "obsm" in key: obsm = self._obsm if ( - not all([len(o) == self._n_obs for o in obsm.values()]) + not all([dim_len(o, 0) == self._n_obs for o in obsm.values()]) and len(obsm.dim_names) != self._n_obs ): raise ValueError( diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index f830cb04d..4514ca4ff 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -17,6 +17,13 @@ from anndata._core.aligned_mapping import AlignedMapping from anndata.utils import asarray +try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + import awkward as ak +except ImportError: + ak = None + def gen_vstr_recarray(m, n, dtype=None): size = m * n @@ -68,8 +75,18 @@ def gen_adata( X_dtype=np.float32, # obs_dtypes, # var_dtypes, - obsm_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), - varm_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), + obsm_types: "Collection[Type]" = ( + sparse.csr_matrix, + np.ndarray, + pd.DataFrame, + ak.Array, + ), + varm_types: "Collection[Type]" = ( + sparse.csr_matrix, + np.ndarray, + pd.DataFrame, + ak.Array, + ), layers_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), ) -> AnnData: """\ From 624a529781191cde510ef069defe56f1d9e6dc17 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 15 Nov 2021 18:15:49 +0100 Subject: [PATCH 005/110] first attempt at concat --- anndata/_core/aligned_mapping.py | 24 +----------------------- anndata/_core/anndata.py | 5 ++--- anndata/_core/merge.py | 23 +++++++++++++++++++---- anndata/utils.py | 23 +++++++++++++++++++++++ 4 files changed, 45 insertions(+), 30 deletions(-) diff --git 
a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index 56ad13e1b..ae6c9f7e7 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -3,23 +3,13 @@ from typing import Union, Optional, Type, ClassVar, TypeVar # Special types from typing import Iterator, Mapping, Sequence # ABCs from typing import Tuple, List, Dict # Generic base types -from functools import singledispatch - -import warnings - -try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) - import awkward as ak -except ImportError: - ak = None import numpy as np import pandas as pd from scipy.sparse import spmatrix -from ..utils import deprecated, ensure_df_homogeneous +from ..utils import deprecated, ensure_df_homogeneous, dim_len from . import raw, anndata from .views import as_view from .access import ElementRef @@ -360,15 +350,3 @@ def __init__( PairwiseArraysBase._view_class = PairwiseArraysView PairwiseArraysBase._actual_class = PairwiseArrays - - -@singledispatch -def dim_len(x, dim): - return x.shape[dim] - - -@dim_len.register(ak.Array) -def dim_len_array(x, dim): - if dim != 0: - raise IndexError() - return len(x) diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 75eec179c..d0f947de0 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -44,7 +44,7 @@ ) from .sparse_dataset import SparseDataset from .. 
import utils -from ..utils import convert_to_dict, ensure_df_homogeneous +from ..utils import convert_to_dict, ensure_df_homogeneous, dim_len from ..logging import anndata_logger as logger from ..compat import ( ZarrArray, @@ -56,7 +56,6 @@ _overloaded_uns, OverloadedDict, ) -from .aligned_mapping import dim_len class StorageType(Enum): @@ -1857,7 +1856,7 @@ def _check_dimensions(self, key=None): if "varm" in key: varm = self._varm if ( - not all([v.shape[0] == self._n_vars for v in varm.values()]) + not all([dim_len(v, 0) == self._n_vars for v in varm.values()]) and len(varm.dim_names) != self._n_vars ): raise ValueError( diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 17a024a40..b810f1825 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -17,8 +17,15 @@ from .anndata import AnnData from ..compat import Literal -from ..utils import asarray - +from ..utils import asarray, dim_len +import warnings + +try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + import awkward as ak +except ImportError: + ak = None T = TypeVar("T") ################### @@ -282,12 +289,14 @@ def apply(self, el, *, axis, fill_value=None): Missing values are to be replaced with `fill_value`. 
""" - if self.no_change and (el.shape[axis] == len(self.old_idx)): + if self.no_change and (dim_len(el, axis) == len(self.old_idx)): return el if isinstance(el, pd.DataFrame): return self._apply_to_df(el, axis=axis, fill_value=fill_value) elif isinstance(el, sparse.spmatrix): return self._apply_to_sparse(el, axis=axis, fill_value=fill_value) + elif isinstance(el, ak.Array): + return self._apply_to_awkward(el, axis=axis, fill_value=fill_value) else: return self._apply_to_array(el, axis=axis, fill_value=fill_value) @@ -365,6 +374,9 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix: return out + def _apply_to_awkward(self, el: ak.Array, *, axis, fill_value=None): + return el + def merge_indices( inds: Iterable[pd.Index], join: Literal["inner", "outer"] @@ -438,6 +450,8 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): ], format="csr", ) + elif any(isinstance(a, ak.Array) for a in arrays): + return ak.concatenate([f(a) for f, a in zip(reindexers, arrays)]) else: return np.concatenate( [ @@ -450,7 +464,6 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): def inner_concat_aligned_mapping(mappings, reindexers=None, index=None, axis=0): result = {} - for k in intersect_keys(mappings): els = [m[k] for m in mappings] if reindexers is None: @@ -474,6 +487,8 @@ def gen_inner_reindexers(els, new_index, axis: Literal[0, 1] = 0): lambda x, y: x.intersection(y), (df_indices(el) for el in els) ) reindexers = [Reindexer(df_indices(el), common_ind) for el in els] + elif all(isinstance(el, ak.Array) for el in els if not_missing(el)): + reindexers = [gen_reindexer(pd.RangeIndex(0), pd.RangeIndex(0)) for _ in els] else: min_ind = min(el.shape[alt_axis] for el in els) reindexers = [ diff --git a/anndata/utils.py b/anndata/utils.py index 2be5eb49f..784e5a359 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -9,6 +9,14 @@ from .logging import get_logger from ._core.sparse_dataset import 
SparseDataset +import warnings + +try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + import awkward as ak +except ImportError: + ak = None logger = get_logger(__name__) @@ -252,3 +260,18 @@ def func(*_, **__): raise error return func + + +@singledispatch +def dim_len(x, dim): + return x.shape[dim] + + +@dim_len.register(ak.Array) +def dim_len_array(x, dim): + if dim == 0: + return len(x) + elif dim == 1: + return None + else: + raise IndexError() From 05c6c754bdbbdd72fec6337a1c5faae452abe850 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 15 Nov 2021 18:36:11 +0100 Subject: [PATCH 006/110] remove comment --- anndata/_core/aligned_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index ae6c9f7e7..55d4a7896 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -48,7 +48,7 @@ def _ipython_key_completions_(self) -> List[str]: def _validate_value(self, val: V, key: str) -> V: """Raises an error if value is invalid""" for i, axis in enumerate(self.axes): - if self.parent.shape[axis] != dim_len(val, i): # val.shape[i]: + if self.parent.shape[axis] != dim_len(val, i): right_shape = tuple(self.parent.shape[a] for a in self.axes) raise ValueError( f"Value passed for key {key!r} is of incorrect shape. 
" From 3d359de8daeab34e192e51693fec4babde375adb Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 15 Nov 2021 21:44:05 +0100 Subject: [PATCH 007/110] add outer concat --- anndata/_core/merge.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index b810f1825..ac5933331 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -506,6 +506,8 @@ def gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0): else (lambda x: pd.DataFrame(index=range(shape))) for el, shape in zip(els, shapes) ] + elif all(isinstance(el, ak.Array) for el in els if not_missing(el)): + reindexers = [gen_reindexer(pd.RangeIndex(0), pd.RangeIndex(0)) for _ in els] else: # if fill_value is None: # fill_value = default_fill_value(els) From 9bf0cb9f41c82cd465fd1bc0a33eb28eb2ed24d8 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 15 Nov 2021 22:06:32 +0100 Subject: [PATCH 008/110] add awkward to test dep --- pyproject.toml | 1 + 1 file changed, 1 insertion(+) diff --git a/pyproject.toml b/pyproject.toml index 7845737be..5b80d4be0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,6 +87,7 @@ test = [ "joblib", "boltons", "scanpy", + "awkward" ] [tool.flit.sdist] From 974040cdbd3c36566e0504356ce766911e385c7e Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 15 Nov 2021 22:36:50 +0100 Subject: [PATCH 009/110] add awk arr to data gen --- anndata/tests/helpers.py | 13 +++++++++++++ pyproject.toml | 2 +- 2 files changed, 14 insertions(+), 1 deletion(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 4514ca4ff..ca3ca0f9f 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -53,6 +53,17 @@ def gen_typed_df(n, index=None): ) +def gen_awkward(n, index=None): + rng = np.random.default_rng(42) + arr = ak.Array( + [ + rng.standard_normal((rng.integers(1, 10), rng.integers(1, 10))) + for _ in range(n) + ] + ) + return arr + + def gen_typed_df_t2_size(m, n, index=None, columns=None) -> pd.DataFrame: s 
= 0 df = pd.DataFrame() @@ -129,12 +140,14 @@ def gen_adata( array=np.random.random((M, 50)), sparse=sparse.random(M, 100, format="csr"), df=gen_typed_df(M, obs_names), + awk=gen_awkward(M), ) obsm = {k: v for k, v in obsm.items() if type(v) in obsm_types} varm = dict( array=np.random.random((N, 50)), sparse=sparse.random(N, 100, format="csr"), df=gen_typed_df(N, var_names), + awk=gen_awkward(N), ) varm = {k: v for k, v in varm.items() if type(v) in varm_types} layers = dict( diff --git a/pyproject.toml b/pyproject.toml index 5b80d4be0..b151fcff2 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -87,7 +87,7 @@ test = [ "joblib", "boltons", "scanpy", - "awkward" + "awkward", ] [tool.flit.sdist] From 13c4d59c32052496663778665461175dceb19e10 Mon Sep 17 00:00:00 2001 From: giovp Date: Tue, 16 Nov 2021 13:14:20 +0100 Subject: [PATCH 010/110] fix test base --- anndata/_core/aligned_mapping.py | 13 ++++++++++++- anndata/tests/helpers.py | 5 +++++ anndata/utils.py | 5 +++++ 3 files changed, 22 insertions(+), 1 deletion(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index 55d4a7896..366478dfa 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -8,6 +8,14 @@ import numpy as np import pandas as pd from scipy.sparse import spmatrix +import warnings + +try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + import awkward as ak +except ImportError: + ak = None from ..utils import deprecated, ensure_df_homogeneous, dim_len from . 
import raw, anndata @@ -85,7 +93,10 @@ def parent(self) -> Union["anndata.AnnData", "raw.Raw"]: def copy(self): d = self._actual_class(self.parent, self._axis) for k, v in self.items(): - d[k] = v.copy() + if isinstance(v, ak.Array): + d[k] = v + else: + d[k] = v.copy() return d def _view(self, parent: "anndata.AnnData", subset_idx: I): diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index ca3ca0f9f..6e9c4fe4f 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -358,6 +358,11 @@ def are_equal_dataframe(a, b, exact=False, elem_name=None): ) +@assert_equal.register(ak.Array) +def assert_equal_awkarray(a, b, exact=False, elem_name=None): + assert ak.all(a == b) + + @assert_equal.register(Mapping) def assert_equal_mapping(a, b, exact=False, elem_name=None): assert set(a.keys()) == set(b.keys()), format_msg(elem_name) diff --git a/anndata/utils.py b/anndata/utils.py index 784e5a359..a884e0236 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -42,6 +42,11 @@ def asarray_h5py_dataset(x): return x[...] 
+@asarray.register(ak.Array) +def asarray_awkward(x): + return x.toarray() + + @singledispatch def convert_to_dict(obj) -> dict: return dict(obj) From 74ae9e32e1ee2e402c31e746d47082811347b4e9 Mon Sep 17 00:00:00 2001 From: giovp Date: Tue, 16 Nov 2021 13:22:54 +0100 Subject: [PATCH 011/110] init test for concat --- anndata/_core/merge.py | 5 +++++ anndata/utils.py | 2 +- 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index ac5933331..c35b130ab 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -129,6 +129,11 @@ def equal_sparse(a, b) -> bool: return False +@equal.register(ak.Array) +def equal_dataframe(a, b) -> bool: + return ak.all(a == b) + + def as_sparse(x): if not isinstance(x, sparse.spmatrix): return sparse.csr_matrix(x) diff --git a/anndata/utils.py b/anndata/utils.py index a884e0236..08010b201 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -44,7 +44,7 @@ def asarray_h5py_dataset(x): @asarray.register(ak.Array) def asarray_awkward(x): - return x.toarray() + return x @singledispatch From 1d0e6298e5a382e057ecf815650ca836dd9179b0 Mon Sep 17 00:00:00 2001 From: giovp Date: Fri, 19 Nov 2021 18:22:36 +0100 Subject: [PATCH 012/110] fix concatenate tests --- anndata/_core/aligned_mapping.py | 4 +++- anndata/_core/merge.py | 7 +++++-- anndata/tests/helpers.py | 2 +- anndata/tests/test_concatenate.py | 19 +++++++++++++++---- anndata/utils.py | 2 +- 5 files changed, 25 insertions(+), 9 deletions(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index 366478dfa..d2ef50e6e 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -56,7 +56,9 @@ def _ipython_key_completions_(self) -> List[str]: def _validate_value(self, val: V, key: str) -> V: """Raises an error if value is invalid""" for i, axis in enumerate(self.axes): - if self.parent.shape[axis] != dim_len(val, i): + if (not isinstance(val, ak.Array)) and ( # don't 
validate awk arrays + self.parent.shape[axis] != dim_len(val, i) + ): right_shape = tuple(self.parent.shape[a] for a in self.axes) raise ValueError( f"Value passed for key {key!r} is of incorrect shape. " diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index c35b130ab..2f5213165 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -130,8 +130,11 @@ def equal_sparse(a, b) -> bool: @equal.register(ak.Array) -def equal_dataframe(a, b) -> bool: - return ak.all(a == b) +def equal_awkward(a, b) -> bool: + if dim_len(a, 0) == dim_len(b, 0): + return ak.all(a == b) + else: + return False def as_sparse(x): diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 6e9c4fe4f..7beeaaeba 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -57,7 +57,7 @@ def gen_awkward(n, index=None): rng = np.random.default_rng(42) arr = ak.Array( [ - rng.standard_normal((rng.integers(1, 10), rng.integers(1, 10))) + rng.standard_normal(size=(rng.integers(1, 10), rng.integers(1, 10))) for _ in range(n) ] ) diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index 6609b259a..b0d5921bc 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -18,7 +18,16 @@ from anndata._core import merge from anndata.tests import helpers from anndata.tests.helpers import assert_equal, gen_adata -from anndata.utils import asarray +from anndata.utils import asarray, dim_len + +import warnings + +try: + with warnings.catch_warnings(): + warnings.simplefilter("ignore", UserWarning) + import awkward as ak +except ImportError: + ak = None @singledispatch @@ -430,19 +439,21 @@ def get_obs_els(adata): adata1 = gen_adata((10, 10)) adata1.obsm = { - k: v for k, v in adata1.obsm.items() if not isinstance(v, pd.DataFrame) + k: v + for k, v in adata1.obsm.items() + if not isinstance(v, (pd.DataFrame, ak.Array)) } adata2 = gen_adata((10, 5)) adata2.obsm = { k: v[:, : v.shape[1] // 2] for k, v in 
adata2.obsm.items() - if not isinstance(v, pd.DataFrame) + if not isinstance(v, (pd.DataFrame, ak.Array)) } adata3 = gen_adata((7, 3)) adata3.obsm = { k: v[:, : v.shape[1] // 3] for k, v in adata3.obsm.items() - if not isinstance(v, pd.DataFrame) + if not isinstance(v, (pd.DataFrame, ak.Array)) } joined = adata1.concatenate([adata2, adata3], join="outer", fill_value=fill_val) diff --git a/anndata/utils.py b/anndata/utils.py index 08010b201..e7c6ea3c2 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -44,7 +44,7 @@ def asarray_h5py_dataset(x): @asarray.register(ak.Array) def asarray_awkward(x): - return x + return ak.copy(x) @singledispatch From aeba5492fe5c5e54c5a92ed00880542eade80fd8 Mon Sep 17 00:00:00 2001 From: giovp Date: Thu, 25 Nov 2021 16:38:42 +0100 Subject: [PATCH 013/110] create mock class for awkward array --- anndata/_core/aligned_mapping.py | 13 +++---------- anndata/_core/index.py | 14 +++----------- anndata/_core/merge.py | 23 ++++++++--------------- anndata/_core/views.py | 15 +++------------ anndata/compat/__init__.py | 7 +++++++ anndata/tests/helpers.py | 15 +++++---------- anndata/tests/test_concatenate.py | 10 +--------- 7 files changed, 30 insertions(+), 67 deletions(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index d2ef50e6e..c658265c5 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -8,20 +8,13 @@ import numpy as np import pandas as pd from scipy.sparse import spmatrix -import warnings - -try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) - import awkward as ak -except ImportError: - ak = None from ..utils import deprecated, ensure_df_homogeneous, dim_len from . 
import raw, anndata from .views import as_view from .access import ElementRef from .index import _subset +from ..compat import AwkArray OneDIdx = Union[Sequence[int], Sequence[bool], slice] @@ -56,7 +49,7 @@ def _ipython_key_completions_(self) -> List[str]: def _validate_value(self, val: V, key: str) -> V: """Raises an error if value is invalid""" for i, axis in enumerate(self.axes): - if (not isinstance(val, ak.Array)) and ( # don't validate awk arrays + if (not isinstance(val, AwkArray)) and ( # don't validate awk arrays self.parent.shape[axis] != dim_len(val, i) ): right_shape = tuple(self.parent.shape[a] for a in self.axes) @@ -95,7 +88,7 @@ def parent(self) -> Union["anndata.AnnData", "raw.Raw"]: def copy(self): d = self._actual_class(self.parent, self._axis) for k, v in self.items(): - if isinstance(v, ak.Array): + if isinstance(v, AwkArray): d[k] = v else: d[k] = v.copy() diff --git a/anndata/_core/index.py b/anndata/_core/index.py index 5da327105..444adba81 100644 --- a/anndata/_core/index.py +++ b/anndata/_core/index.py @@ -7,15 +7,7 @@ import numpy as np import pandas as pd from scipy.sparse import spmatrix, issparse - -import warnings - -try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) - import awkward as ak -except ImportError: - ak = None +from ..compat import AwkArray Index1D = Union[slice, int, str, np.int64, np.ndarray] Index = Union[Index1D, Tuple[Index1D, Index1D], spmatrix] @@ -148,8 +140,8 @@ def _subset_df(df: pd.DataFrame, subset_idx: Index): return df.iloc[subset_idx] -@_subset.register(ak.Array) -def _subset_awkarray(a: ak.Array, subset_idx: Index): +@_subset.register(AwkArray) +def _subset_awkarray(a: AwkArray, subset_idx: Index): if all(isinstance(x, cabc.Iterable) for x in subset_idx): subset_idx = np.ix_(*subset_idx) return a[subset_idx] diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 2f5213165..5c679f34c 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -16,16 
+16,9 @@ from scipy.sparse.base import spmatrix from .anndata import AnnData -from ..compat import Literal +from ..compat import Literal, AwkArray from ..utils import asarray, dim_len -import warnings - -try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) - import awkward as ak -except ImportError: - ak = None + T = TypeVar("T") ################### @@ -129,7 +122,7 @@ def equal_sparse(a, b) -> bool: return False -@equal.register(ak.Array) +@equal.register(AwkArray) def equal_awkward(a, b) -> bool: if dim_len(a, 0) == dim_len(b, 0): return ak.all(a == b) @@ -303,7 +296,7 @@ def apply(self, el, *, axis, fill_value=None): return self._apply_to_df(el, axis=axis, fill_value=fill_value) elif isinstance(el, sparse.spmatrix): return self._apply_to_sparse(el, axis=axis, fill_value=fill_value) - elif isinstance(el, ak.Array): + elif isinstance(el, AwkArray): return self._apply_to_awkward(el, axis=axis, fill_value=fill_value) else: return self._apply_to_array(el, axis=axis, fill_value=fill_value) @@ -382,7 +375,7 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix: return out - def _apply_to_awkward(self, el: ak.Array, *, axis, fill_value=None): + def _apply_to_awkward(self, el: AwkArray, *, axis, fill_value=None): return el @@ -458,7 +451,7 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): ], format="csr", ) - elif any(isinstance(a, ak.Array) for a in arrays): + elif any(isinstance(a, AwkArray) for a in arrays): return ak.concatenate([f(a) for f, a in zip(reindexers, arrays)]) else: return np.concatenate( @@ -495,7 +488,7 @@ def gen_inner_reindexers(els, new_index, axis: Literal[0, 1] = 0): lambda x, y: x.intersection(y), (df_indices(el) for el in els) ) reindexers = [Reindexer(df_indices(el), common_ind) for el in els] - elif all(isinstance(el, ak.Array) for el in els if not_missing(el)): + elif all(isinstance(el, AwkArray) for el in els if not_missing(el)): reindexers = 
[gen_reindexer(pd.RangeIndex(0), pd.RangeIndex(0)) for _ in els] else: min_ind = min(el.shape[alt_axis] for el in els) @@ -514,7 +507,7 @@ def gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0): else (lambda x: pd.DataFrame(index=range(shape))) for el, shape in zip(els, shapes) ] - elif all(isinstance(el, ak.Array) for el in els if not_missing(el)): + elif all(isinstance(el, AwkArray) for el in els if not_missing(el)): reindexers = [gen_reindexer(pd.RangeIndex(0), pd.RangeIndex(0)) for _ in els] else: # if fill_value is None: diff --git a/anndata/_core/views.py b/anndata/_core/views.py index ef6dfad9b..84fd88e1e 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -10,16 +10,7 @@ from .access import ElementRef from ..logging import anndata_logger as logger -from ..compat import ZappyArray - -import warnings - -try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) - import awkward as ak -except ImportError: - ak = None +from ..compat import ZappyArray, AwkArray class _SetItemMixin: @@ -121,7 +112,7 @@ def drop(self, *args, inplace: bool = False, **kw): df.drop(*args, inplace=True, **kw) -class AwkwardArrayView(_ViewMixin, ak.Array): +class AwkwardArrayView(_ViewMixin, AwkArray): def copy(self, order: str = "C") -> np.ndarray: # we want a copy of an akward array return ak.copy(self) @@ -164,7 +155,7 @@ def as_view_zappy(z, view_args): return z -@as_view.register(ak.Array) +@as_view.register(AwkArray) def as_view_awkarray(array, view_args): return AwkwardArrayView(array, view_args=view_args) diff --git a/anndata/compat/__init__.py b/anndata/compat/__init__.py index fa8025d54..ccc156f85 100644 --- a/anndata/compat/__init__.py +++ b/anndata/compat/__init__.py @@ -26,6 +26,13 @@ def __repr__(): return "mock zarr.core.Array" +try: + from awkward import Array as AwkArray + import awkward as ak +except ImportError: + ak = None + + try: from zappy.base import ZappyArray except ImportError: diff --git 
a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 7beeaaeba..0773a5eca 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -17,12 +17,7 @@ from anndata._core.aligned_mapping import AlignedMapping from anndata.utils import asarray -try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) - import awkward as ak -except ImportError: - ak = None +from ..compat import AwkArray def gen_vstr_recarray(m, n, dtype=None): @@ -55,7 +50,7 @@ def gen_typed_df(n, index=None): def gen_awkward(n, index=None): rng = np.random.default_rng(42) - arr = ak.Array( + arr = AwkArray( [ rng.standard_normal(size=(rng.integers(1, 10), rng.integers(1, 10))) for _ in range(n) @@ -90,13 +85,13 @@ def gen_adata( sparse.csr_matrix, np.ndarray, pd.DataFrame, - ak.Array, + AwkArray, ), varm_types: "Collection[Type]" = ( sparse.csr_matrix, np.ndarray, pd.DataFrame, - ak.Array, + AwkArray, ), layers_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), ) -> AnnData: @@ -358,7 +353,7 @@ def are_equal_dataframe(a, b, exact=False, elem_name=None): ) -@assert_equal.register(ak.Array) +@assert_equal.register(AwkArray) def assert_equal_awkarray(a, b, exact=False, elem_name=None): assert ak.all(a == b) diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index b0d5921bc..8f4164dd3 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -19,15 +19,7 @@ from anndata.tests import helpers from anndata.tests.helpers import assert_equal, gen_adata from anndata.utils import asarray, dim_len - -import warnings - -try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) - import awkward as ak -except ImportError: - ak = None +from anndata.compat import AwkArray, ak @singledispatch From 88a5c83db56395adec08c930b3b2194ed702317e Mon Sep 17 00:00:00 2001 From: giovp Date: Thu, 25 Nov 2021 16:39:51 +0100 Subject: [PATCH 014/110] remove space --- 
anndata/tests/test_concatenate.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index 8f4164dd3..c07b69b2a 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -4,6 +4,7 @@ from functools import partial, singledispatch import warnings + import numpy as np from numpy import ma import pandas as pd From 15b3d1a30493a3a20b8fb9c45d1d429404703954 Mon Sep 17 00:00:00 2001 From: giovp Date: Sun, 28 Nov 2021 20:53:34 +0100 Subject: [PATCH 015/110] import ak when needed --- anndata/_core/merge.py | 4 ++++ anndata/_core/views.py | 2 ++ anndata/compat/__init__.py | 7 +++++-- anndata/tests/test_concatenate.py | 2 +- 4 files changed, 12 insertions(+), 3 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 5c679f34c..e84e6bd99 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -124,6 +124,8 @@ def equal_sparse(a, b) -> bool: @equal.register(AwkArray) def equal_awkward(a, b) -> bool: + import awkward as ak + if dim_len(a, 0) == dim_len(b, 0): return ak.all(a == b) else: @@ -452,6 +454,8 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): format="csr", ) elif any(isinstance(a, AwkArray) for a in arrays): + import awkward as ak + return ak.concatenate([f(a) for f, a in zip(reindexers, arrays)]) else: return np.concatenate( diff --git a/anndata/_core/views.py b/anndata/_core/views.py index 84fd88e1e..e9cfe7b4c 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -114,6 +114,8 @@ def drop(self, *args, inplace: bool = False, **kw): class AwkwardArrayView(_ViewMixin, AwkArray): def copy(self, order: str = "C") -> np.ndarray: + import awkward as ak + # we want a copy of an akward array return ak.copy(self) diff --git a/anndata/compat/__init__.py b/anndata/compat/__init__.py index ccc156f85..8a2142837 100644 --- a/anndata/compat/__init__.py +++ b/anndata/compat/__init__.py @@ -28,9 +28,12 @@ def 
__repr__(): try: from awkward import Array as AwkArray - import awkward as ak except ImportError: - ak = None + + class AwkArray: + @staticmethod + def __repr__(): + return "mock awkward.highlevel.Array" try: diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index c07b69b2a..86bf61196 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -20,7 +20,7 @@ from anndata.tests import helpers from anndata.tests.helpers import assert_equal, gen_adata from anndata.utils import asarray, dim_len -from anndata.compat import AwkArray, ak +from anndata.compat import AwkArray @singledispatch From 7e6beaa198dcf2aaa9abafce21eb256bc245f0b6 Mon Sep 17 00:00:00 2001 From: giovp Date: Sun, 28 Nov 2021 21:30:39 +0100 Subject: [PATCH 016/110] relative import of awk array --- anndata/_core/aligned_mapping.py | 2 +- anndata/_core/index.py | 2 +- anndata/tests/helpers.py | 4 +++- 3 files changed, 5 insertions(+), 3 deletions(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index c658265c5..1ead38d31 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -14,7 +14,7 @@ from .views import as_view from .access import ElementRef from .index import _subset -from ..compat import AwkArray +from anndata.compat import AwkArray OneDIdx = Union[Sequence[int], Sequence[bool], slice] diff --git a/anndata/_core/index.py b/anndata/_core/index.py index 444adba81..c17c2e790 100644 --- a/anndata/_core/index.py +++ b/anndata/_core/index.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd from scipy.sparse import spmatrix, issparse -from ..compat import AwkArray +from anndata.compat import AwkArray Index1D = Union[slice, int, str, np.int64, np.ndarray] Index = Union[Index1D, Tuple[Index1D, Index1D], spmatrix] diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 0773a5eca..f3c652091 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ 
-17,7 +17,7 @@ from anndata._core.aligned_mapping import AlignedMapping from anndata.utils import asarray -from ..compat import AwkArray +from anndata.compat import AwkArray def gen_vstr_recarray(m, n, dtype=None): @@ -355,6 +355,8 @@ def are_equal_dataframe(a, b, exact=False, elem_name=None): @assert_equal.register(AwkArray) def assert_equal_awkarray(a, b, exact=False, elem_name=None): + import awkward as ak + assert ak.all(a == b) From 77d5b6c14207b0c9e6f34482b3c0597241ba736c Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 29 Nov 2021 14:46:32 +0100 Subject: [PATCH 017/110] fix optional dep import --- anndata/_core/index.py | 2 +- anndata/compat/__init__.py | 3 ++- anndata/tests/test_concatenate.py | 6 +++--- 3 files changed, 6 insertions(+), 5 deletions(-) diff --git a/anndata/_core/index.py b/anndata/_core/index.py index c17c2e790..444adba81 100644 --- a/anndata/_core/index.py +++ b/anndata/_core/index.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd from scipy.sparse import spmatrix, issparse -from anndata.compat import AwkArray +from ..compat import AwkArray Index1D = Union[slice, int, str, np.int64, np.ndarray] Index = Union[Index1D, Tuple[Index1D, Index1D], spmatrix] diff --git a/anndata/compat/__init__.py b/anndata/compat/__init__.py index 8a2142837..b775882b9 100644 --- a/anndata/compat/__init__.py +++ b/anndata/compat/__init__.py @@ -11,7 +11,6 @@ import pandas as pd from ._overloaded_dict import _overloaded_uns, OverloadedDict -from .._core.index import _subset # try importing zarr, dask, and zappy from packaging import version @@ -230,6 +229,8 @@ def _find_sparse_matrices(d: Mapping, n: int, keys: tuple, paths: list): def _slice_uns_sparse_matrices(uns: MutableMapping, oidx: "Index1d", orig_n_obs: int): + from anndata._core.index import _subset + """slice sparse spatrices of n_obs × n_obs in self.uns""" if isinstance(oidx, slice) and len(range(*oidx.indices(orig_n_obs))) == orig_n_obs: return uns # slice of entire dimension is a no-op diff 
--git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index 86bf61196..5d328c5db 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -434,19 +434,19 @@ def get_obs_els(adata): adata1.obsm = { k: v for k, v in adata1.obsm.items() - if not isinstance(v, (pd.DataFrame, ak.Array)) + if not isinstance(v, (pd.DataFrame, AwkArray)) } adata2 = gen_adata((10, 5)) adata2.obsm = { k: v[:, : v.shape[1] // 2] for k, v in adata2.obsm.items() - if not isinstance(v, (pd.DataFrame, ak.Array)) + if not isinstance(v, (pd.DataFrame, AwkArray)) } adata3 = gen_adata((7, 3)) adata3.obsm = { k: v[:, : v.shape[1] // 3] for k, v in adata3.obsm.items() - if not isinstance(v, (pd.DataFrame, ak.Array)) + if not isinstance(v, (pd.DataFrame, AwkArray)) } joined = adata1.concatenate([adata2, adata3], join="outer", fill_value=fill_val) From 4aa3d26df066eb570857ac26828d04a787260fb7 Mon Sep 17 00:00:00 2001 From: giovp Date: Mon, 29 Nov 2021 14:48:57 +0100 Subject: [PATCH 018/110] resolve conflicts --- anndata/_core/views.py | 1 - 1 file changed, 1 deletion(-) diff --git a/anndata/_core/views.py b/anndata/_core/views.py index e9cfe7b4c..13a70f5dd 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -9,7 +9,6 @@ from scipy import sparse from .access import ElementRef -from ..logging import anndata_logger as logger from ..compat import ZappyArray, AwkArray From ccc28c26113e2e4ebd4ae6907b9497d1b7f771e6 Mon Sep 17 00:00:00 2001 From: giovp Date: Fri, 4 Mar 2022 16:55:46 +0100 Subject: [PATCH 019/110] draft IO for akward arrays --- anndata/_io/specs/methods.py | 30 ++++++++++++++++++++++++++++++ 1 file changed, 30 insertions(+) diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 71c216a2a..08890c51e 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -29,6 +29,7 @@ ) from anndata._io.utils import report_write_key_on_error, check_key, H5PY_V3 from anndata._warnings import 
OldFormatWarning +from anndata.compat import AwkArray from .registry import ( _REGISTRY, @@ -481,6 +482,35 @@ def read_sparse_partial(elem, *, items=None, indices=(slice(None), slice(None))) return SparseDataset(elem)[indices] +################# +# Awkward array # +################# + + +@_REGISTRY.register_write(H5Group, AwkArray, IOSpec("awkward-array", "0.1.0")) +@_REGISTRY.register_write(ZarrGroup, AwkArray, IOSpec("awkward-array", "0.1.0")) +def write_awkward(f, k, v, dataset_kwargs=MappingProxyType({})): + import awkward as ak + + group = f.create_group(k) + form, length, container = ak.to_buffers(v) + group.attrs["length"] = length + write_elem(group, "form", form.tojson(), dataset_kwargs=dataset_kwargs) + write_elem(group, "container", container, dataset_kwargs=dataset_kwargs) + + +@_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) +@_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0")) +def read_awkward(elem): + import awkward as ak + + form = read_elem(elem["form"]) + length = _read_attr(elem.attrs, "length") + container = read_elem(elem["container"]) + + return ak.from_buffers(form, length, container) + + ############## # DataFrames # ############## From e524389e82c42e18e22930ab3a06b995682b771b Mon Sep 17 00:00:00 2001 From: giovp Date: Fri, 4 Mar 2022 17:34:09 +0100 Subject: [PATCH 020/110] add awkward to docs and save form to attrs --- anndata/_io/specs/methods.py | 4 ++-- pyproject.toml | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 08890c51e..a467f4324 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -495,7 +495,7 @@ def write_awkward(f, k, v, dataset_kwargs=MappingProxyType({})): group = f.create_group(k) form, length, container = ak.to_buffers(v) group.attrs["length"] = length - write_elem(group, "form", form.tojson(), dataset_kwargs=dataset_kwargs) + group.attrs["form"] = form.tojson() 
write_elem(group, "container", container, dataset_kwargs=dataset_kwargs) @@ -504,7 +504,7 @@ def write_awkward(f, k, v, dataset_kwargs=MappingProxyType({})): def read_awkward(elem): import awkward as ak - form = read_elem(elem["form"]) + form = _read_attr(elem.attrs, "form") length = _read_attr(elem.attrs, "length") container = read_elem(elem["container"]) diff --git a/pyproject.toml b/pyproject.toml index 9dda20156..f4026ec55 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,6 +76,7 @@ doc = [ "scanpydoc>=0.7.3", "typing_extensions; python_version < '3.8'", "zarr", + "awkward", ] test = [ "loompy>=3.0.5", From a9281985163008d86c32494c6fc361614aa23409 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 19 Jul 2022 09:19:16 +0200 Subject: [PATCH 021/110] Update dependencies --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 13ead2865..ccf499922 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ doc = [ "scanpydoc>=0.7.7", "typing_extensions; python_version < '3.8'", "zarr", - "awkward", + "awkward>=1,<2", ] test = [ "loompy>=3.0.5", @@ -90,7 +90,7 @@ test = [ "boltons", "scanpy", "dask[array]", - "awkward", + "awkward>=1,<2", ] [tool.flit.sdist] From fee56eed9b1b575e232d6014faf9f0417a14ce30 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 19 Jul 2022 09:44:31 +0200 Subject: [PATCH 022/110] Update dim_len --- anndata/utils.py | 49 +++++++++++++++++++++++------------------------- 1 file changed, 23 insertions(+), 26 deletions(-) diff --git a/anndata/utils.py b/anndata/utils.py index 47d936b4f..be0e5dc0f 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -11,12 +11,6 @@ from ._core.sparse_dataset import SparseDataset import warnings -try: - with warnings.catch_warnings(): - warnings.simplefilter("ignore", UserWarning) - import awkward as ak -except ImportError: - ak = None logger = get_logger(__name__) @@ -42,11 +36,6 @@ def asarray_h5py_dataset(x): return 
x[...] -@asarray.register(ak.Array) -def asarray_awkward(x): - return ak.copy(x) - - @singledispatch def convert_to_dict(obj) -> dict: return dict(obj) @@ -72,6 +61,29 @@ def convert_to_dict_nonetype(obj: None): return dict() +@singledispatch +def dim_len(x, dim): + """\ + Return the size of an array in dimension `dim`. + + Raises a ValueError if `x` is an awkward array with variable length in the requested dimension. + """ + return x.shape[dim] + + +try: + import awkward as ak + + dim_len.register(ak.Array, ak.size) + + @asarray.register(ak.Array) + def asarray_awkward(x): + return x + +except ImportError: + pass + + def make_index_unique(index: pd.Index, join: str = "-"): """ Makes the index unique by appending a number string to each duplicate index element: @@ -267,18 +279,3 @@ def func(*_, **__): raise error return func - - -@singledispatch -def dim_len(x, dim): - return x.shape[dim] - - -@dim_len.register(ak.Array) -def dim_len_array(x, dim): - if dim == 0: - return len(x) - elif dim == 1: - return None - else: - raise IndexError() From 0775e53f674a4250cf3de92143359cee03393a16 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 19 Jul 2022 09:44:39 +0200 Subject: [PATCH 023/110] ignore vscode directory --- .gitignore | 1 + 1 file changed, 1 insertion(+) diff --git a/.gitignore b/.gitignore index 88e01a343..4053f744f 100644 --- a/.gitignore +++ b/.gitignore @@ -27,4 +27,5 @@ test.h5ad # IDEs /.idea/ +/.vscode/ From 4b89a9b6cfdcab8db3367be192e96a20e93c0d57 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 19 Jul 2022 10:00:25 +0200 Subject: [PATCH 024/110] Validate that awkward arrays align to axes --- anndata/_core/aligned_mapping.py | 24 +++++++++++++++++------- 1 file changed, 17 insertions(+), 7 deletions(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index 1ead38d31..8f5430463 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -4,25 +4,26 @@ from typing import Iterator, 
Mapping, Sequence # ABCs from typing import Tuple, List, Dict # Generic base types - import numpy as np import pandas as pd from scipy.sparse import spmatrix -from ..utils import deprecated, ensure_df_homogeneous, dim_len +from ..utils import deprecated, ensure_df_homogeneous, dim_len, import_function from . import raw, anndata from .views import as_view from .access import ElementRef from .index import _subset from anndata.compat import AwkArray +ak_to_regular = import_function("awkward", "to_regular") + OneDIdx = Union[Sequence[int], Sequence[bool], slice] TwoDIdx = Tuple[OneDIdx, OneDIdx] I = TypeVar("I", OneDIdx, TwoDIdx, covariant=True) # TODO: pd.DataFrame only allowed in AxisArrays? -V = Union[pd.DataFrame, spmatrix, np.ndarray] +V = Union[pd.DataFrame, spmatrix, np.ndarray, AwkArray] class AlignedMapping(cabc.MutableMapping, ABC): @@ -49,16 +50,25 @@ def _ipython_key_completions_(self) -> List[str]: def _validate_value(self, val: V, key: str) -> V: """Raises an error if value is invalid""" for i, axis in enumerate(self.axes): - if (not isinstance(val, AwkArray)) and ( # don't validate awk arrays - self.parent.shape[axis] != dim_len(val, i) - ): + if isinstance(val, AwkArray): + try: + val = ak_to_regular(val, i) + except ValueError: + raise ValueError( + f"Awkward array passed for key {key!r} is of variable length in dimension {i}." + "Dimensions aligned to AnnData axes must be fixed-length." + ) + + if self.parent.shape[axis] != dim_len(val, i): right_shape = tuple(self.parent.shape[a] for a in self.axes) + actual_shape = tuple(dim_len(val, a) for a in self.axes) raise ValueError( f"Value passed for key {key!r} is of incorrect shape. " f"Values of {self.attrname} must match dimensions " - f"{self.axes} of parent. Value had shape {val.shape} while " + f"{self.axes} of parent. Value had shape {actual_shape} while " f"it should have had {right_shape}." 
) + if not self._allow_df and isinstance(val, pd.DataFrame): name = self.attrname.title().rstrip("s") val = ensure_df_homogeneous(val, f"{name} {key!r}") From 9d5615734ad1bc5e85e51992dad6ac5dd6a67c74 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 19 Jul 2022 13:17:16 +0200 Subject: [PATCH 025/110] Fix reindexing during merge --- anndata/_core/merge.py | 29 ++++++++++++++++++++++------- anndata/tests/helpers.py | 15 ++++++++------- anndata/tests/test_concatenate.py | 16 ++++++++++++++++ anndata/utils.py | 10 +++++++++- 4 files changed, 55 insertions(+), 15 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index fd8e10566..d845052de 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -383,7 +383,18 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix: return out def _apply_to_awkward(self, el: AwkArray, *, axis, fill_value=None): - return el + try: + dim_len(el, axis) + except ValueError: + # Do not reindex variable-length dimensions + return el + else: + indexer = self.old_idx.get_indexer(self.new_idx) + if -1 in indexer: + raise ValueError( + "Outer join operations are currently not supported with AwkwardArrays" + ) + return pd.api.extensions.take(el, indexer, axis=axis, allow_fill=False) def merge_indices( @@ -449,6 +460,10 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): ) df.index = index return df + elif any(isinstance(a, AwkArray) for a in arrays): + import awkward as ak + + return ak.concatenate([f(a) for f, a in zip(reindexers, arrays)]) elif any(isinstance(a, sparse.spmatrix) for a in arrays): sparse_stack = (sparse.vstack, sparse.hstack)[axis] return sparse_stack( @@ -458,10 +473,6 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): ], format="csr", ) - elif any(isinstance(a, AwkArray) for a in arrays): - import awkward as ak - - return ak.concatenate([f(a) for f, a in zip(reindexers, arrays)]) else: return np.concatenate( [ 
@@ -498,7 +509,9 @@ def gen_inner_reindexers(els, new_index, axis: Literal[0, 1] = 0): ) reindexers = [Reindexer(df_indices(el), common_ind) for el in els] elif all(isinstance(el, AwkArray) for el in els if not_missing(el)): - reindexers = [gen_reindexer(pd.RangeIndex(0), pd.RangeIndex(0)) for _ in els] + # do not reindex awkward arrays + # TODO unintended behaviour? + reindexers = [lambda x: x for _ in els] else: min_ind = min(el.shape[alt_axis] for el in els) reindexers = [ @@ -517,7 +530,9 @@ def gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0): for el, shape in zip(els, shapes) ] elif all(isinstance(el, AwkArray) for el in els if not_missing(el)): - reindexers = [gen_reindexer(pd.RangeIndex(0), pd.RangeIndex(0)) for _ in els] + # do not reindex awkward arrays + # TODO unintended behaviour? + reindexers = [lambda x: x for _ in els] else: # if fill_value is None: # fill_value = default_fill_value(els) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 587cd2371..8c0201b09 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -57,14 +57,11 @@ def gen_typed_df(n, index=None): ) -def gen_awkward(n, index=None): +# TODO simulate variable-length non-aligned dimensions +def gen_awkward(m, n=None, dtype=np.int32): rng = np.random.default_rng(42) - arr = AwkArray( - [ - rng.standard_normal(size=(rng.integers(1, 10), rng.integers(1, 10))) - for _ in range(n) - ] - ) + dim = (m,) if n is None else (m, n) + arr = AwkArray(np.random.binomial(100, 0.005, dim).astype(dtype)) return arr @@ -136,6 +133,8 @@ def gen_adata( obs.rename(columns=dict(cat="obs_cat"), inplace=True) var.rename(columns=dict(cat="var_cat"), inplace=True) + # TODO test with awkward X + # TODO is AnnData not tested with dense matrices? 
if X_type is None: X = None else: @@ -154,6 +153,7 @@ def gen_adata( awk=gen_awkward(N), ) varm = {k: v for k, v in varm.items() if type(v) in varm_types} + # TODO test with awkward layer layers = dict( array=np.random.random((M, N)), sparse=sparse.random(M, N, format="csr") ) @@ -173,6 +173,7 @@ def gen_adata( nested_further=dict(array=np.arange(5)), ), # U_recarray=gen_vstr_recarray(N, 5, "U4") + # TODO Add awkward array to uns ) adata = AnnData( X=X, diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index b6b071b0b..9da476ded 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -448,6 +448,10 @@ def get_obs_els(adata): for k, v in adata3.obsm.items() if not isinstance(v, (pd.DataFrame, AwkArray)) } + # TODO no outer joins with awkward arrays + for tmp_ad in [adata1, adata2, adata3]: + del tmp_ad.varm["awk"] + joined = adata1.concatenate([adata2, adata3], join="outer", fill_value=fill_val) ptr = 0 @@ -986,6 +990,12 @@ def test_transposed_concat(array_type, axis, join_type, merge_strategy, fill_val lhs = gen_adata((10, 10), X_type=array_type) rhs = gen_adata((10, 12), X_type=array_type) + # TODO no outer joins with awkward arrays + if join_type == "outer": + for tmp_ad in [lhs, rhs]: + del tmp_ad.varm["awk"] + del tmp_ad.obsm["awk"] + a = concat([lhs, rhs], axis=axis, join=join_type, merge=merge_strategy) b = concat( [lhs.T, rhs.T], axis=abs(axis - 1), join=join_type, merge=merge_strategy @@ -1082,6 +1092,12 @@ def test_concat_size_0_dim(axis, join_type, merge_strategy, shape): alt_axis = 1 - axis dim = ("obs", "var")[axis] + if join_type == "outer": + # TODO outer joins are currently not supported with awkward arrays + for tmp_ad in [a, b]: + del tmp_ad.obsm["awk"] + del tmp_ad.varm["awk"] + expected_size = expected_shape(a, b, axis=axis, join=join_type) result = concat( {"a": a, "b": b}, diff --git a/anndata/utils.py b/anndata/utils.py index be0e5dc0f..f575190a7 100644 --- a/anndata/utils.py 
+++ b/anndata/utils.py @@ -74,7 +74,15 @@ def dim_len(x, dim): try: import awkward as ak - dim_len.register(ak.Array, ak.size) + # TODO use `.num`; ak.size is deprecated and doesn't handle missing dimensions. + # num will return one size per entry to accomodate for variable-length arrays + # so something like all(ak.num(x, d)[0] == ak.num(x, d)) + @dim_len.register(ak.Array) + def dim_len_awkward(x, dim): + ak.num( + x, dim + ) # just to throw an error if out-of-dimension due to weird behavior of ak.size + return ak.size(x, dim) @asarray.register(ak.Array) def asarray_awkward(x): From d14de3ef05334ecd104582cc05e216c4cf559520 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 19 Jul 2022 13:20:39 +0200 Subject: [PATCH 026/110] fix lint --- anndata/tests/helpers.py | 1 - 1 file changed, 1 deletion(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 8c0201b09..22c1ae8c6 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -59,7 +59,6 @@ def gen_typed_df(n, index=None): # TODO simulate variable-length non-aligned dimensions def gen_awkward(m, n=None, dtype=np.int32): - rng = np.random.default_rng(42) dim = (m,) if n is None else (m, n) arr = AwkArray(np.random.binomial(100, 0.005, dim).astype(dtype)) return arr From 4d62e7e402de40718f49090c773a45a66c9283ef Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 19 Jul 2022 13:36:00 +0200 Subject: [PATCH 027/110] remove duplicate import --- anndata/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/anndata/utils.py b/anndata/utils.py index f575190a7..7f46cef33 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -9,7 +9,6 @@ from .logging import get_logger from ._core.sparse_dataset import SparseDataset -import warnings logger = get_logger(__name__) From c4c1b3f216e5f3218e0bd0ebac8f59042997dbec Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 19 Jul 2022 14:57:34 +0200 Subject: [PATCH 028/110] Test different types of awkward arrays in different slots --- 
anndata/_core/aligned_mapping.py | 5 ++- anndata/_core/merge.py | 6 +++- anndata/tests/helpers.py | 51 +++++++++++++++++++++++------- anndata/tests/test_concatenate.py | 52 ++++++++++++++++++------------- 4 files changed, 80 insertions(+), 34 deletions(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index 8f5430463..f74c6496d 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -283,7 +283,10 @@ class LayersBase(AlignedMapping): def copy(self) -> "Layers": d = self._actual_class(self.parent) for k, v in self.items(): - d[k] = v.copy() + if isinstance(v, AwkArray): + d[k] = v + else: + d[k] = v.copy() return d diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index d845052de..081e6fa41 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -394,7 +394,11 @@ def _apply_to_awkward(self, el: AwkArray, *, axis, fill_value=None): raise ValueError( "Outer join operations are currently not supported with AwkwardArrays" ) - return pd.api.extensions.take(el, indexer, axis=axis, allow_fill=False) + # TODO is there no way to slice an awkward array programmatically? 
+ if axis == 0: + return el[indexer] + if axis == 1: + return el[:, indexer] def merge_indices( diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 22c1ae8c6..5830a3dbd 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -3,6 +3,7 @@ from typing import Tuple, Optional from collections.abc import Mapping import warnings +import random import h5py import numpy as np @@ -15,7 +16,7 @@ from anndata._core.views import ArrayView from anndata._core.sparse_dataset import SparseDataset from anndata._core.aligned_mapping import AlignedMapping -from anndata.utils import asarray +from anndata.utils import asarray, dim_len from anndata.compat import AwkArray @@ -57,11 +58,27 @@ def gen_typed_df(n, index=None): ) -# TODO simulate variable-length non-aligned dimensions -def gen_awkward(m, n=None, dtype=np.int32): - dim = (m,) if n is None else (m, n) - arr = AwkArray(np.random.binomial(100, 0.005, dim).astype(dtype)) - return arr +def gen_awkward(m, n=None, ragged=False, dtype=np.int32): + random.seed(123) + + def gen_element(ragged=False): + """Return array of random length if ragged is True, otherwise returns a single element of dtype""" + if ragged: + return np.array( + [random.random() * 100 for _ in range(random.randint(0, 10))], + dtype=dtype, + ) + else: + return dtype(random.random() * 100) + + awkward_pyobj = [] + for _ in range(m): + if n is None: + awkward_pyobj.append(gen_element(ragged=ragged)) + else: + awkward_pyobj.append([gen_element(ragged=ragged) for _ in range(n)]) + + return AwkArray(awkward_pyobj) def gen_typed_df_t2_size(m, n, index=None, columns=None) -> pd.DataFrame: @@ -98,7 +115,12 @@ def gen_adata( pd.DataFrame, AwkArray, ), - layers_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), + layers_types: "Collection[Type]" = ( + sparse.csr_matrix, + np.ndarray, + pd.DataFrame, + AwkArray, + ), ) -> AnnData: """\ Helper function to generate a random AnnData for testing purposes. 
@@ -132,8 +154,7 @@ def gen_adata( obs.rename(columns=dict(cat="obs_cat"), inplace=True) var.rename(columns=dict(cat="var_cat"), inplace=True) - # TODO test with awkward X - # TODO is AnnData not tested with dense matrices? + # TODO test with awkward X (see layers) if X_type is None: X = None else: @@ -143,6 +164,9 @@ def gen_adata( sparse=sparse.random(M, 100, format="csr"), df=gen_typed_df(M, obs_names), awk=gen_awkward(M), + awk_2d=gen_awkward(M, 20), + awk_2d_ragged=gen_awkward(M, ragged=True), + awk_3d_ragged=gen_awkward(M, 20, ragged=True), ) obsm = {k: v for k, v in obsm.items() if type(v) in obsm_types} varm = dict( @@ -150,11 +174,16 @@ def gen_adata( sparse=sparse.random(N, 100, format="csr"), df=gen_typed_df(N, var_names), awk=gen_awkward(N), + awk_2d=gen_awkward(N, 20), + awk_2d_ragged=gen_awkward(N, ragged=True), + awk_3d_ragged=gen_awkward(N, 20, ragged=True), ) varm = {k: v for k, v in varm.items() if type(v) in varm_types} - # TODO test with awkward layer layers = dict( - array=np.random.random((M, N)), sparse=sparse.random(M, N, format="csr") + array=np.random.random((M, N)), + sparse=sparse.random(M, N, format="csr"), + awk=gen_awkward(M, N), + # awk_ragged=gen_awkward(M, N, ragged=True), ) layers = {k: v for k, v in layers.items() if type(v) in layers_types} obsp = dict( diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index 9da476ded..b2b6fbc08 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -430,27 +430,30 @@ def get_obs_els(adata): **{f"obsm_{k}": adata.obsm[k] for k in adata.obsm}, } - adata1 = gen_adata((10, 10)) + # TODO outer joins are currently not supported with awkward arrays + types = { + k: (sparse.csr_matrix, np.ndarray, pd.DataFrame) + for k in ["obsm_types", "varm_types", "layers_types"] + } + + adata1 = gen_adata((10, 10), **types) adata1.obsm = { k: v for k, v in adata1.obsm.items() if not isinstance(v, (pd.DataFrame, AwkArray)) } - adata2 = 
gen_adata((10, 5)) + adata2 = gen_adata((10, 5), **types) adata2.obsm = { k: v[:, : v.shape[1] // 2] for k, v in adata2.obsm.items() if not isinstance(v, (pd.DataFrame, AwkArray)) } - adata3 = gen_adata((7, 3)) + adata3 = gen_adata((7, 3), **types) adata3.obsm = { k: v[:, : v.shape[1] // 3] for k, v in adata3.obsm.items() if not isinstance(v, (pd.DataFrame, AwkArray)) } - # TODO no outer joins with awkward arrays - for tmp_ad in [adata1, adata2, adata3]: - del tmp_ad.varm["awk"] joined = adata1.concatenate([adata2, adata3], join="outer", fill_value=fill_val) @@ -987,14 +990,17 @@ def test_concatenate_uns(unss, merge_strategy, result, value_gen): def test_transposed_concat(array_type, axis, join_type, merge_strategy, fill_val): - lhs = gen_adata((10, 10), X_type=array_type) - rhs = gen_adata((10, 12), X_type=array_type) - - # TODO no outer joins with awkward arrays + # TODO outer joins are currently not supported with awkward arrays if join_type == "outer": - for tmp_ad in [lhs, rhs]: - del tmp_ad.varm["awk"] - del tmp_ad.obsm["awk"] + types = { + k: (sparse.csr_matrix, np.ndarray, pd.DataFrame) + for k in ["obsm_types", "varm_types", "layers_types"] + } + else: + types = {} + + lhs = gen_adata((10, 10), X_type=array_type, **types) + rhs = gen_adata((10, 12), X_type=array_type, **types) a = concat([lhs, rhs], axis=axis, join=join_type, merge=merge_strategy) b = concat( @@ -1087,16 +1093,20 @@ def expected_shape(a, b, axis, join): ) def test_concat_size_0_dim(axis, join_type, merge_strategy, shape): # https://github.com/scverse/anndata/issues/526 - a = gen_adata((5, 7)) - b = gen_adata(shape) - alt_axis = 1 - axis - dim = ("obs", "var")[axis] + # TODO outer joins are currently not supported with awkward arrays if join_type == "outer": - # TODO outer joins are currently not supported with awkward arrays - for tmp_ad in [a, b]: - del tmp_ad.obsm["awk"] - del tmp_ad.varm["awk"] + types = { + k: (sparse.csr_matrix, np.ndarray, pd.DataFrame) + for k in ["obsm_types", 
"varm_types", "layers_types"] + } + else: + types = {} + + a = gen_adata((5, 7), **types) + b = gen_adata(shape, **types) + alt_axis = 1 - axis + dim = ("obs", "var")[axis] expected_size = expected_shape(a, b, axis=axis, join=join_type) result = concat( From 339bce8e1835fffdfbd2bfe9c619321a0b829f6d Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 19 Jul 2022 16:02:26 +0200 Subject: [PATCH 029/110] Better function to generate awkward arrays --- anndata/tests/helpers.py | 42 +++++++++++++++++++++++++--------------- 1 file changed, 26 insertions(+), 16 deletions(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 5830a3dbd..eb8c28eb5 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -4,6 +4,7 @@ from collections.abc import Mapping import warnings import random +import itertools import h5py import numpy as np @@ -11,6 +12,7 @@ from pandas.api.types import is_numeric_dtype import pytest from scipy import sparse +import awkward as ak from anndata import AnnData, Raw from anndata._core.views import ArrayView @@ -61,24 +63,32 @@ def gen_typed_df(n, index=None): def gen_awkward(m, n=None, ragged=False, dtype=np.int32): random.seed(123) - def gen_element(ragged=False): - """Return array of random length if ragged is True, otherwise returns a single element of dtype""" - if ragged: - return np.array( - [random.random() * 100 for _ in range(random.randint(0, 10))], - dtype=dtype, - ) - else: - return dtype(random.random() * 100) + def gen_ragged(): + """Return array of random length""" + return np.random.randint( + 0, 1000, size=(np.random.randint(0, 10),), dtype=dtype + ).tolist() + + shape = np.array((m,) if n is None else (m, n)) + if np.any(shape == 0): + # use empty numpy array, to pass the correct dimensions to + # ak.Array when one of the dimensions is 0 + np_arr = np.empty(shape, dtype=dtype) + elif ragged: + # build numpy array with list objects in the ragged dimension + np_arr = np.empty(shape, dtype=object) + for mi, 
ni in itertools.product(range(m), () if n is None else range(n)): + np_arr[mi, ni] = gen_ragged() + else: + # otherwise just build a regular numpy array + np_arr = np.random.randint(0, 1000, size=shape, dtype=dtype) - awkward_pyobj = [] - for _ in range(m): - if n is None: - awkward_pyobj.append(gen_element(ragged=ragged)) - else: - awkward_pyobj.append([gen_element(ragged=ragged) for _ in range(n)]) + arr = AwkArray(np_arr) + assert ak.num(arr, 0) == m + if n is not None: + assert ak.all(ak.num(arr, 1) == n) - return AwkArray(awkward_pyobj) + return arr def gen_typed_df_t2_size(m, n, index=None, columns=None) -> pd.DataFrame: From 012de5e6f965feac757baca3fc61f4b65079face Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 19 Jul 2022 16:02:43 +0200 Subject: [PATCH 030/110] Better dim_len for awkward arrays --- anndata/utils.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/anndata/utils.py b/anndata/utils.py index 7f46cef33..df605f159 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -73,15 +73,15 @@ def dim_len(x, dim): try: import awkward as ak - # TODO use `.num`; ak.size is deprecated and doesn't handle missing dimensions. 
- # num will return one size per entry to accomodate for variable-length arrays - # so something like all(ak.num(x, d)[0] == ak.num(x, d)) @dim_len.register(ak.Array) def dim_len_awkward(x, dim): - ak.num( - x, dim - ) # just to throw an error if out-of-dimension due to weird behavior of ak.size - return ak.size(x, dim) + dim_lengths = ak.num(x, dim) + if isinstance(dim_lengths, int): + return dim_lengths + elif ak.all(dim_lengths == dim_lengths[0]): + return dim_lengths[0] + else: + raise ValueError(f"Array is of variable length in dimension {dim}") @asarray.register(ak.Array) def asarray_awkward(x): From 788459892bd70907ed8a99898ad58963a224a73d Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 19 Jul 2022 18:46:43 +0200 Subject: [PATCH 031/110] Working out how to best check the dim_len --- anndata/_core/merge.py | 6 +++--- anndata/utils.py | 24 ++++++++++++++++++------ 2 files changed, 21 insertions(+), 9 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 081e6fa41..2e3879490 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -467,7 +467,7 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): elif any(isinstance(a, AwkArray) for a in arrays): import awkward as ak - return ak.concatenate([f(a) for f, a in zip(reindexers, arrays)]) + return ak.concatenate([f(a, axis=1 - axis) for f, a in zip(reindexers, arrays)]) elif any(isinstance(a, sparse.spmatrix) for a in arrays): sparse_stack = (sparse.vstack, sparse.hstack)[axis] return sparse_stack( @@ -515,7 +515,7 @@ def gen_inner_reindexers(els, new_index, axis: Literal[0, 1] = 0): elif all(isinstance(el, AwkArray) for el in els if not_missing(el)): # do not reindex awkward arrays # TODO unintended behaviour? 
- reindexers = [lambda x: x for _ in els] + reindexers = [lambda *args, **kwargs: args[0] for _ in els] else: min_ind = min(el.shape[alt_axis] for el in els) reindexers = [ @@ -536,7 +536,7 @@ def gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0): elif all(isinstance(el, AwkArray) for el in els if not_missing(el)): # do not reindex awkward arrays # TODO unintended behaviour? - reindexers = [lambda x: x for _ in els] + reindexers = [lambda *args, **kwargs: args[0] for _ in els] else: # if fill_value is None: # fill_value = default_fill_value(els) diff --git a/anndata/utils.py b/anndata/utils.py index df605f159..ce92cbba7 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -1,3 +1,4 @@ +from multiprocessing.sharedctypes import Value import warnings from functools import wraps, singledispatch from typing import Mapping, Any, Sequence, Union, Callable @@ -75,13 +76,24 @@ def dim_len(x, dim): @dim_len.register(ak.Array) def dim_len_awkward(x, dim): - dim_lengths = ak.num(x, dim) - if isinstance(dim_lengths, int): - return dim_lengths - elif ak.all(dim_lengths == dim_lengths[0]): - return dim_lengths[0] + if dim == 0: + try: + return x.type.length + except AttributeError: + return ValueError("The outermost type must be awkward.Array!") else: - raise ValueError(f"Array is of variable length in dimension {dim}") + t = x.type + for _ in range(dim): + if isinstance(t, ak.types.OptionType): + t = t.content + elif isinstance(t, (ak.types.RegularType, ak.types.ArrayType)): + t = t.type + else: + raise ValueError(f"Unsupported type in awkward array {t}") + try: + return t.size + except AttributeError: + raise ValueError("Array is of variable length in dimension {dim}") @asarray.register(ak.Array) def asarray_awkward(x): From e16ae35891e852b9d639428a292bec109d6a6c76 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Wed, 20 Jul 2022 08:02:37 +0200 Subject: [PATCH 032/110] Only accept awkward arrays that are "regular" in the aligned dimension The conversion is 
left to the user. Explicit is better than implicit. --- anndata/_core/aligned_mapping.py | 9 --------- anndata/tests/helpers.py | 7 +++++-- anndata/tests/test_base.py | 8 ++++++++ anndata/utils.py | 23 +++++++++++------------ 4 files changed, 24 insertions(+), 23 deletions(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index f74c6496d..fc9d8b9fb 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -50,15 +50,6 @@ def _ipython_key_completions_(self) -> List[str]: def _validate_value(self, val: V, key: str) -> V: """Raises an error if value is invalid""" for i, axis in enumerate(self.axes): - if isinstance(val, AwkArray): - try: - val = ak_to_regular(val, i) - except ValueError: - raise ValueError( - f"Awkward array passed for key {key!r} is of variable length in dimension {i}." - "Dimensions aligned to AnnData axes must be fixed-length." - ) - if self.parent.shape[axis] != dim_len(val, i): right_shape = tuple(self.parent.shape[a] for a in self.axes) actual_shape = tuple(dim_len(val, a) for a in self.axes) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index eb8c28eb5..fb540f597 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -84,9 +84,12 @@ def gen_ragged(): np_arr = np.random.randint(0, 1000, size=shape, dtype=dtype) arr = AwkArray(np_arr) - assert ak.num(arr, 0) == m if n is not None: - assert ak.all(ak.num(arr, 1) == n) + arr = ak.to_regular(arr, 1) + + assert dim_len(arr, 0) == m + if n is not None: + assert dim_len(arr, 1) == n return arr diff --git a/anndata/tests/test_base.py b/anndata/tests/test_base.py index 027e46b46..675d85d3b 100644 --- a/anndata/tests/test_base.py +++ b/anndata/tests/test_base.py @@ -607,3 +607,11 @@ def assert_eq_not_id(a, b): assert_eq_not_id(map_sprs.keys(), map_copy.keys()) for key in map_sprs.keys(): assert_eq_not_id(map_sprs[key], map_copy[key]) + + +def test_set_awkward(): + """Check if we can set .X, .layers, .obsm, 
.varm and .uns with different types + of awkward arrays and if error messages are properly raised when the dimensions do not align. + """ + # adata = gen_adata((10, 20), varm_types=tuple(), obsm_types=tuple()) + assert False, "TODO" diff --git a/anndata/utils.py b/anndata/utils.py index ce92cbba7..7db6b4f6a 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -80,20 +80,19 @@ def dim_len_awkward(x, dim): try: return x.type.length except AttributeError: - return ValueError("The outermost type must be awkward.Array!") - else: - t = x.type - for _ in range(dim): - if isinstance(t, ak.types.OptionType): - t = t.content - elif isinstance(t, (ak.types.RegularType, ak.types.ArrayType)): - t = t.type - else: - raise ValueError(f"Unsupported type in awkward array {t}") + raise ValueError("The outermost type must be awkward.Array!") + elif dim == 1: try: - return t.size + return x.type.type.size except AttributeError: - raise ValueError("Array is of variable length in dimension {dim}") + raise ValueError( + f"Array is of variable length in dimension {dim}.", + f"Try ak.to_regular(array, {dim}) before including the array in AnnData", + ) + else: + raise NotImplementedError( + "This check is currently only implemented for the first two dimensions. 
" + ) @asarray.register(ak.Array) def asarray_awkward(x): From 588b6af9282fc2b56f938ec0925a12bed7ca4b8f Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Fri, 12 Aug 2022 18:54:00 +0200 Subject: [PATCH 033/110] Switch to v2 API --- anndata/_core/merge.py | 4 ++-- anndata/_core/views.py | 2 +- anndata/_io/specs/methods.py | 4 ++-- anndata/compat/__init__.py | 2 +- anndata/tests/helpers.py | 4 ++-- anndata/utils.py | 6 +++--- 6 files changed, 11 insertions(+), 11 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 2e3879490..d4d6ec197 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -129,7 +129,7 @@ def equal_sparse(a, b) -> bool: @equal.register(AwkArray) def equal_awkward(a, b) -> bool: - import awkward as ak + import awkward._v2 as ak if dim_len(a, 0) == dim_len(b, 0): return ak.all(a == b) @@ -465,7 +465,7 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): df.index = index return df elif any(isinstance(a, AwkArray) for a in arrays): - import awkward as ak + import awkward._v2 as ak return ak.concatenate([f(a, axis=1 - axis) for f, a in zip(reindexers, arrays)]) elif any(isinstance(a, sparse.spmatrix) for a in arrays): diff --git a/anndata/_core/views.py b/anndata/_core/views.py index 96d771a64..3365cef37 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -122,7 +122,7 @@ def drop(self, *args, inplace: bool = False, **kw): class AwkwardArrayView(_ViewMixin, AwkArray): def copy(self, order: str = "C") -> np.ndarray: - import awkward as ak + import awkward._v2 as ak # we want a copy of an akward array return ak.copy(self) diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index a467f4324..32418c24e 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -490,7 +490,7 @@ def read_sparse_partial(elem, *, items=None, indices=(slice(None), slice(None))) @_REGISTRY.register_write(H5Group, AwkArray, IOSpec("awkward-array", "0.1.0")) 
@_REGISTRY.register_write(ZarrGroup, AwkArray, IOSpec("awkward-array", "0.1.0")) def write_awkward(f, k, v, dataset_kwargs=MappingProxyType({})): - import awkward as ak + import awkward._v2 as ak group = f.create_group(k) form, length, container = ak.to_buffers(v) @@ -502,7 +502,7 @@ def write_awkward(f, k, v, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0")) def read_awkward(elem): - import awkward as ak + import awkward._v2 as ak form = _read_attr(elem.attrs, "form") length = _read_attr(elem.attrs, "length") diff --git a/anndata/compat/__init__.py b/anndata/compat/__init__.py index bbfbc88ac..41d952fc6 100644 --- a/anndata/compat/__init__.py +++ b/anndata/compat/__init__.py @@ -41,7 +41,7 @@ def __repr__(): try: - from awkward import Array as AwkArray + from awkward._v2 import Array as AwkArray except ImportError: class AwkArray: diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index fb540f597..4e940fe3b 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -12,7 +12,7 @@ from pandas.api.types import is_numeric_dtype import pytest from scipy import sparse -import awkward as ak +import awkward._v2 as ak from anndata import AnnData, Raw from anndata._core.views import ArrayView @@ -406,7 +406,7 @@ def are_equal_dataframe(a, b, exact=False, elem_name=None): @assert_equal.register(AwkArray) def assert_equal_awkarray(a, b, exact=False, elem_name=None): - import awkward as ak + import awkward._v2 as ak assert ak.all(a == b) diff --git a/anndata/utils.py b/anndata/utils.py index 7db6b4f6a..1ec95712b 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -72,7 +72,7 @@ def dim_len(x, dim): try: - import awkward as ak + import awkward._v2 as ak @dim_len.register(ak.Array) def dim_len_awkward(x, dim): @@ -80,10 +80,10 @@ def dim_len_awkward(x, dim): try: return x.type.length except AttributeError: - raise 
ValueError("The outermost type must be awkward.Array!") + raise ValueError("The outermost type must be an `awkward.Array`!") elif dim == 1: try: - return x.type.type.size + return x.type.content.size except AttributeError: raise ValueError( f"Array is of variable length in dimension {dim}.", From e687e19344edc71a75f0e12e0ddc516258c75bf6 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Fri, 12 Aug 2022 19:55:07 +0200 Subject: [PATCH 034/110] WIP rewrite awkward array generation --- anndata/tests/helpers.py | 57 +++++++++++++++++++++++++++++++++++ anndata/tests/test_helpers.py | 14 ++++++++- 2 files changed, 70 insertions(+), 1 deletion(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 4e940fe3b..358b49cba 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -1,5 +1,6 @@ from functools import singledispatch, wraps from string import ascii_letters +from threading import current_thread from typing import Tuple, Optional from collections.abc import Mapping import warnings @@ -60,6 +61,62 @@ def gen_typed_df(n, index=None): ) +def _gen_awkward_inner(shape, rng, dtype): + # the maximum length a ragged dimension can take + MAX_RAGGED_DIM_LEN = 20 + if not len(shape): + # abort condition -> no dimension left, return an actual value instead + return dtype(rng.randrange(1000)) + else: + curr_dim_len = shape[0] + lil = [] + if curr_dim_len is None: + # ragged dimension, set random length + curr_dim_len = rng.randrange(MAX_RAGGED_DIM_LEN) + + for _ in range(curr_dim_len): + lil.append(_gen_awkward_inner(shape[1:], rng, dtype)) + + return lil + + +def gen_awkward2(shape, dtype=np.int32): + """Function to generate an awkward array with random values. + + Awkward array dimensions can either be fixed-length ("regular") or variable length ("ragged") + (the first dimension is always fixed-length). + + + Parameters + ---------- + shape + shape of the array to be generated. Any dimension specified as `None` will be simulated as ragged. 
+ """ + if shape[0] is None: + raise ValueError("The first dimension must be fixed-length.") + + rng = random.Random(123) + + if np.any(shape == 0): + # use empty numpy array, to pass the correct dimensions to + # ak.Array when one of the dimensions is 0 (the list-of-list approach + # does not work in that case because the list in the 0-dimension would be empty and all + # following dimensions would be lost). + # The size of the variable-length dimension is irrelevant in that case, we arbitrarily set it to 1 + np_arr = np.empty([1 if x is None else x for x in shape], dtype=dtype) + arr = AwkArray(np_arr) + else: + lil = _gen_awkward_inner(shape, rng, dtype) + arr = AwkArray(lil) + + # make fixed-length dimensions regular + for i, d in enumerate(shape): + if d is not None: + arr = ak.to_regular(arr, i) + + return arr + + def gen_awkward(m, n=None, ragged=False, dtype=np.int32): random.seed(123) diff --git a/anndata/tests/test_helpers.py b/anndata/tests/test_helpers.py index 1556b7bdb..b35511484 100644 --- a/anndata/tests/test_helpers.py +++ b/anndata/tests/test_helpers.py @@ -6,7 +6,8 @@ from scipy import sparse import anndata as ad -from anndata.tests.helpers import assert_equal, report_name, gen_adata +from anndata.tests.helpers import assert_equal, gen_awkward2, report_name, gen_adata +from anndata.utils import dim_len # Testing to see if all error types can have the key name appended. # Currently fails for 22/118 since they have required arguments. Not sure what to do about that. @@ -40,6 +41,17 @@ def reusable_adata(): return gen_adata((10, 10)) +@pytest.mark.parametrize("shape", [(4, 2), (100, 200, 7), (4, None), (0, 4), (4, 0)]) +def test_gen_awkward(shape): + arr = gen_awkward2(shape) + for i, s in enumerate(shape): + if s is None: + with pytest.raises(ValueError): + dim_len(arr, i) + else: + assert dim_len(arr, i) == s + + # Does this work for every warning? 
def test_report_name(): def raise_error(): From 41b14239490fa8ad6bc7efe27169e7b485a9e69b Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Fri, 12 Aug 2022 20:44:32 +0200 Subject: [PATCH 035/110] Improve awkward array generation and dim_len check --- anndata/tests/helpers.py | 1 + anndata/tests/test_helpers.py | 17 ++++++++++++++++- anndata/utils.py | 21 +++++++++++++++------ 3 files changed, 32 insertions(+), 7 deletions(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 358b49cba..e893ee0ff 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -96,6 +96,7 @@ def gen_awkward2(shape, dtype=np.int32): raise ValueError("The first dimension must be fixed-length.") rng = random.Random(123) + shape = np.array(shape) if np.any(shape == 0): # use empty numpy array, to pass the correct dimensions to diff --git a/anndata/tests/test_helpers.py b/anndata/tests/test_helpers.py index b35511484..a91accc3e 100644 --- a/anndata/tests/test_helpers.py +++ b/anndata/tests/test_helpers.py @@ -41,7 +41,22 @@ def reusable_adata(): return gen_adata((10, 10)) -@pytest.mark.parametrize("shape", [(4, 2), (100, 200, 7), (4, None), (0, 4), (4, 0)]) +@pytest.mark.parametrize( + "shape", + [ + (4, 2), + (100, 200, None), + (4, None), + (0, 4), + (4, 0), + (8, None, None), + (8, None, None, None), + (4, None, 8), + (100, 200, 4), + (4, 0, 0), + (0, 0, 0), + ], +) def test_gen_awkward(shape): arr = gen_awkward2(shape) for i, s in enumerate(shape): diff --git a/anndata/utils.py b/anndata/utils.py index 1ec95712b..815765e1c 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -77,22 +77,31 @@ def dim_len(x, dim): @dim_len.register(ak.Array) def dim_len_awkward(x, dim): if dim == 0: + # dimension 0 is a special case - it is always of `ArrayType` and has a fixed length. 
try: return x.type.length except AttributeError: raise ValueError("The outermost type must be an `awkward.Array`!") - elif dim == 1: + else: + arr_type = x.type + for _ in range(dim): + # we need to loop through the nested types for the other dimensions, e.g. + # ArrayType(RegularType(ListType(NumpyType('int64')), 200), 100) + try: + arr_type = arr_type.content + except AttributeError: + # RecordType and UnionType have multiple "contents" entries + raise NotImplementedError( + "This check is currently not implemented for RecordType and UnionType arrays. " + ) + try: - return x.type.content.size + return arr_type.size except AttributeError: raise ValueError( f"Array is of variable length in dimension {dim}.", f"Try ak.to_regular(array, {dim}) before including the array in AnnData", ) - else: - raise NotImplementedError( - "This check is currently only implemented for the first two dimensions. " - ) @asarray.register(ak.Array) def asarray_awkward(x): From fa8a3866801893ceb3ce1c96b6080b79489c56c7 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Fri, 12 Aug 2022 21:19:12 +0200 Subject: [PATCH 036/110] Switch to new awkward array generation in all tests --- anndata/tests/helpers.py | 87 +++++++++-------------------------- anndata/tests/test_helpers.py | 4 +- 2 files changed, 25 insertions(+), 66 deletions(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index e893ee0ff..7664ca5fc 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -23,6 +23,13 @@ from anndata.compat import AwkArray +GEN_ADATA_DEFAULT_TYPES = ( + sparse.csr_matrix, + np.ndarray, + pd.DataFrame, + AwkArray, +) + def gen_vstr_recarray(m, n, dtype=None): size = m * n @@ -80,7 +87,7 @@ def _gen_awkward_inner(shape, rng, dtype): return lil -def gen_awkward2(shape, dtype=np.int32): +def gen_awkward(shape, dtype=np.int32): """Function to generate an awkward array with random values. 
Awkward array dimensions can either be fixed-length ("regular") or variable length ("ragged") @@ -118,40 +125,6 @@ def gen_awkward2(shape, dtype=np.int32): return arr -def gen_awkward(m, n=None, ragged=False, dtype=np.int32): - random.seed(123) - - def gen_ragged(): - """Return array of random length""" - return np.random.randint( - 0, 1000, size=(np.random.randint(0, 10),), dtype=dtype - ).tolist() - - shape = np.array((m,) if n is None else (m, n)) - if np.any(shape == 0): - # use empty numpy array, to pass the correct dimensions to - # ak.Array when one of the dimensions is 0 - np_arr = np.empty(shape, dtype=dtype) - elif ragged: - # build numpy array with list objects in the ragged dimension - np_arr = np.empty(shape, dtype=object) - for mi, ni in itertools.product(range(m), () if n is None else range(n)): - np_arr[mi, ni] = gen_ragged() - else: - # otherwise just build a regular numpy array - np_arr = np.random.randint(0, 1000, size=shape, dtype=dtype) - - arr = AwkArray(np_arr) - if n is not None: - arr = ak.to_regular(arr, 1) - - assert dim_len(arr, 0) == m - if n is not None: - assert dim_len(arr, 1) == n - - return arr - - def gen_typed_df_t2_size(m, n, index=None, columns=None) -> pd.DataFrame: s = 0 df = pd.DataFrame() @@ -174,24 +147,9 @@ def gen_adata( X_dtype=np.float32, # obs_dtypes, # var_dtypes, - obsm_types: "Collection[Type]" = ( - sparse.csr_matrix, - np.ndarray, - pd.DataFrame, - AwkArray, - ), - varm_types: "Collection[Type]" = ( - sparse.csr_matrix, - np.ndarray, - pd.DataFrame, - AwkArray, - ), - layers_types: "Collection[Type]" = ( - sparse.csr_matrix, - np.ndarray, - pd.DataFrame, - AwkArray, - ), + obsm_types: "Collection[Type]" = GEN_ADATA_DEFAULT_TYPES, + varm_types: "Collection[Type]" = GEN_ADATA_DEFAULT_TYPES, + layers_types: "Collection[Type]" = GEN_ADATA_DEFAULT_TYPES, ) -> AnnData: """\ Helper function to generate a random AnnData for testing purposes. 
@@ -234,27 +192,27 @@ def gen_adata( array=np.random.random((M, 50)), sparse=sparse.random(M, 100, format="csr"), df=gen_typed_df(M, obs_names), - awk=gen_awkward(M), - awk_2d=gen_awkward(M, 20), - awk_2d_ragged=gen_awkward(M, ragged=True), - awk_3d_ragged=gen_awkward(M, 20, ragged=True), + awk=gen_awkward((M,)), + awk_2d=gen_awkward((M, 20)), + awk_2d_ragged=gen_awkward((M, None)), + awk_3d_ragged=gen_awkward((M, 20, None)), ) obsm = {k: v for k, v in obsm.items() if type(v) in obsm_types} varm = dict( array=np.random.random((N, 50)), sparse=sparse.random(N, 100, format="csr"), df=gen_typed_df(N, var_names), - awk=gen_awkward(N), - awk_2d=gen_awkward(N, 20), - awk_2d_ragged=gen_awkward(N, ragged=True), - awk_3d_ragged=gen_awkward(N, 20, ragged=True), + awk=gen_awkward((N,)), + awk_2d=gen_awkward((N, 20)), + awk_2d_ragged=gen_awkward((N, None)), + awk_3d_ragged=gen_awkward((N, 20, None)), ) varm = {k: v for k, v in varm.items() if type(v) in varm_types} layers = dict( array=np.random.random((M, N)), sparse=sparse.random(M, N, format="csr"), - awk=gen_awkward(M, N), - # awk_ragged=gen_awkward(M, N, ragged=True), + awk=gen_awkward((M, N)), + awk_ragged=gen_awkward((M, N, None)), ) layers = {k: v for k, v in layers.items() if type(v) in layers_types} obsp = dict( @@ -271,8 +229,9 @@ def gen_adata( scalar_float=3.0, nested_further=dict(array=np.arange(5)), ), + awkward_regular=gen_awkward((10, 5)), + awkward_ragged=gen_awkward((12, None, None)), # U_recarray=gen_vstr_recarray(N, 5, "U4") - # TODO Add awkward array to uns ) adata = AnnData( X=X, diff --git a/anndata/tests/test_helpers.py b/anndata/tests/test_helpers.py index a91accc3e..c94af6bfc 100644 --- a/anndata/tests/test_helpers.py +++ b/anndata/tests/test_helpers.py @@ -6,7 +6,7 @@ from scipy import sparse import anndata as ad -from anndata.tests.helpers import assert_equal, gen_awkward2, report_name, gen_adata +from anndata.tests.helpers import assert_equal, gen_awkward, report_name, gen_adata from anndata.utils 
import dim_len # Testing to see if all error types can have the key name appended. @@ -58,7 +58,7 @@ def reusable_adata(): ], ) def test_gen_awkward(shape): - arr = gen_awkward2(shape) + arr = gen_awkward(shape) for i, s in enumerate(shape): if s is None: with pytest.raises(ValueError): From 733937a5a7dba8066832c512ee81b1d56fdb96ab Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Fri, 12 Aug 2022 21:19:21 +0200 Subject: [PATCH 037/110] Fix test_transpose --- anndata/tests/test_transpose.py | 21 +++++++++++++++------ 1 file changed, 15 insertions(+), 6 deletions(-) diff --git a/anndata/tests/test_transpose.py b/anndata/tests/test_transpose.py index a7c010b3e..ebd426594 100644 --- a/anndata/tests/test_transpose.py +++ b/anndata/tests/test_transpose.py @@ -2,7 +2,13 @@ import pytest -from anndata.tests.helpers import gen_adata, assert_equal +from anndata.tests.helpers import GEN_ADATA_DEFAULT_TYPES, gen_adata, assert_equal +from anndata.compat import AwkArray + +# TODO +# transpose currently not implemented for awkward arrays +_types = [x for x in GEN_ADATA_DEFAULT_TYPES if x != AwkArray] +_types_kwargs = dict(obsm_types=_types, varm_types=_types, layers_types=_types) def test_transpose_orig(): @@ -10,7 +16,8 @@ def test_transpose_orig(): Original test for transpose, should be covered by more thorough tests below, but keeping around just in case. 
""" - adata = gen_adata((5, 3)) + adata = gen_adata((5, 3), **_types_kwargs) + adata.varp = {f"varp_{k}": v for k, v in adata.varp.items()} adata1 = adata.T adata1.uns["test123"] = 1 @@ -31,10 +38,12 @@ def _add_raw(adata, *, var_subset=slice(None)): # * Backed @pytest.fixture( params=[ - pytest.param(gen_adata((50, 20)), id="csr_X"), - pytest.param(gen_adata((50, 20), sparse.csc_matrix), id="csc_X"), - pytest.param(_add_raw(gen_adata((50, 20))), id="with_raw"), - pytest.param(gen_adata((20, 10), X_type=None), id="None_X"), + pytest.param(gen_adata((50, 20), **_types_kwargs), id="csr_X"), + pytest.param( + gen_adata((50, 20), sparse.csc_matrix, **_types_kwargs), id="csc_X" + ), + pytest.param(_add_raw(gen_adata((50, 20), **_types_kwargs)), id="with_raw"), + pytest.param(gen_adata((20, 10), X_type=None, **_types_kwargs), id="None_X"), ] ) def adata(request): From ed532a2ef6e7e7945def7d314c0cb2db98e776bf Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Fri, 12 Aug 2022 22:07:15 +0200 Subject: [PATCH 038/110] Fix/workaround more tests --- anndata/_core/merge.py | 12 +++++++++++- anndata/tests/test_x.py | 4 +++- 2 files changed, 14 insertions(+), 2 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index d4d6ec197..1539622de 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -467,7 +467,9 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): elif any(isinstance(a, AwkArray) for a in arrays): import awkward._v2 as ak - return ak.concatenate([f(a, axis=1 - axis) for f, a in zip(reindexers, arrays)]) + return ak.concatenate( + [f(a, axis=1 - axis) for f, a in zip(reindexers, arrays)], axis=axis + ) elif any(isinstance(a, sparse.spmatrix) for a in arrays): sparse_stack = (sparse.vstack, sparse.hstack)[axis] return sparse_stack( @@ -985,6 +987,14 @@ def concat( "not concatenating `.raw` attributes.", UserWarning, ) + + # TODO Workaround for https://github.com/scikit-hep/awkward/issues/1586 + for k, layer in 
layers.items(): + if isinstance(layer, AwkArray): + import awkward._v2 as ak + + layers[k] = ak.to_regular(layer, 1) + return AnnData( **{ "X": X, diff --git a/anndata/tests/test_x.py b/anndata/tests/test_x.py index fb333504c..4907af5cd 100644 --- a/anndata/tests/test_x.py +++ b/anndata/tests/test_x.py @@ -75,7 +75,9 @@ def test_init_X_as_none(): @pytest.mark.parametrize("shape", SINGULAR_SHAPES + [pytest.param((5, 3), id="(5, 3)")]) def test_transpose_with_X_as_none(shape): - adata = gen_adata(shape, X_type=lambda x: None) + from test_transpose import _types_kwargs + + adata = gen_adata(shape, X_type=lambda x: None, **_types_kwargs) adataT = adata.transpose() assert_equal(adataT.shape, shape[::-1]) assert_equal(adataT.obsp.keys(), adata.varp.keys()) From e5706c33ac612875a87694fa0adc068a869d9925 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sat, 13 Aug 2022 11:52:41 +0200 Subject: [PATCH 039/110] Add test for setting anndata slots to awkward arrays --- anndata/_core/aligned_mapping.py | 18 +++++++---- anndata/_core/anndata.py | 16 ++++++++-- anndata/tests/helpers.py | 3 +- anndata/tests/test_base.py | 51 +++++++++++++++++++++++++++++--- anndata/tests/test_helpers.py | 9 ++---- anndata/utils.py | 18 +++++++---- 6 files changed, 89 insertions(+), 26 deletions(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index fc9d8b9fb..5d367105c 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -53,12 +53,18 @@ def _validate_value(self, val: V, key: str) -> V: if self.parent.shape[axis] != dim_len(val, i): right_shape = tuple(self.parent.shape[a] for a in self.axes) actual_shape = tuple(dim_len(val, a) for a in self.axes) - raise ValueError( - f"Value passed for key {key!r} is of incorrect shape. " - f"Values of {self.attrname} must match dimensions " - f"{self.axes} of parent. Value had shape {actual_shape} while " - f"it should have had {right_shape}." 
- ) + if None in actual_shape: + raise ValueError( + f"The AwkwardArray is of variable length in dimension {i}.", + f"Try ak.to_regular(array, {i}) before including the array in AnnData", + ) + else: + raise ValueError( + f"Value passed for key {key!r} is of incorrect shape. " + f"Values of {self.attrname} must match dimensions " + f"{self.axes} of parent. Value had shape {actual_shape} while " + f"it should have had {right_shape}." + ) if not self._allow_df and isinstance(val, pd.DataFrame): name = self.attrname.title().rstrip("s") diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index fcc70c0e3..30cd551c1 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -45,7 +45,7 @@ ) from .sparse_dataset import SparseDataset from .. import utils -from ..utils import convert_to_dict, ensure_df_homogeneous, dim_len +from ..utils import convert_to_dict, ensure_df_homogeneous, dim_len, get_shape from ..logging import anndata_logger as logger from ..compat import ( ZarrArray, @@ -56,6 +56,7 @@ _move_adj_mtx, _overloaded_uns, OverloadedDict, + AwkArray, ) @@ -66,6 +67,7 @@ class StorageType(Enum): ZarrArray = ZarrArray ZappyArray = ZappyArray DaskArray = DaskArray + AwkArray = AwkArray @classmethod def classes(cls): @@ -664,10 +666,18 @@ def X(self, value: Optional[Union[np.ndarray, sparse.spmatrix]]): if ( np.isscalar(value) or (hasattr(value, "shape") and (self.shape == value.shape)) + or ( + isinstance(value, AwkArray) + and (self.shape == (dim_len(value, 0), dim_len(value, 1))) + ) or (self.n_vars == 1 and self.n_obs == len(value)) or (self.n_obs == 1 and self.n_vars == len(value)) ): - if not np.isscalar(value) and self.shape != value.shape: + if ( + not np.isscalar(value) + and not isinstance(value, AwkArray) + and self.shape != value.shape + ): # For assigning vector of values to 2d array or matrix # Not neccesary for row of 2d array value = value.reshape(self.shape) @@ -690,7 +700,7 @@ def X(self, value: Optional[Union[np.ndarray, 
sparse.spmatrix]]): self._X = value else: raise ValueError( - f"Data matrix has wrong shape {value.shape}, " + f"Data matrix has wrong shape {get_shape(value)}, " f"need to be {self.shape}." ) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 7664ca5fc..29699a895 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -212,7 +212,8 @@ def gen_adata( array=np.random.random((M, N)), sparse=sparse.random(M, N, format="csr"), awk=gen_awkward((M, N)), - awk_ragged=gen_awkward((M, N, None)), + # TODO + # awk_ragged=gen_awkward((M, N, None)), ) layers = {k: v for k, v in layers.items() if type(v) in layers_types} obsp = dict( diff --git a/anndata/tests/test_base.py b/anndata/tests/test_base.py index 675d85d3b..923106159 100644 --- a/anndata/tests/test_base.py +++ b/anndata/tests/test_base.py @@ -9,7 +9,7 @@ from scipy.sparse import csr_matrix, issparse from anndata import AnnData -from anndata.tests.helpers import assert_equal, gen_adata +from anndata.tests.helpers import assert_equal, gen_adata, gen_awkward # some test objects that we use below @@ -609,9 +609,52 @@ def assert_eq_not_id(a, b): assert_eq_not_id(map_sprs[key], map_copy[key]) -def test_set_awkward(): +@pytest.mark.parametrize( + "field,value,valid", + [ + ["X", gen_awkward((10, 20)), True], + ["X", gen_awkward((10, 20, 5)), True], + ["X", gen_awkward((10, 20, None)), True], + ["X", gen_awkward((10, None)), False], + ["X", gen_awkward((10, None, 20)), False], + ["X", gen_awkward((20, 10)), False], + ["layers", gen_awkward((10, 20)), True], + ["layers", gen_awkward((10, 20, 5)), True], + ["layers", gen_awkward((10, 20, None)), True], + ["layers", gen_awkward((10, None)), False], + ["layers", gen_awkward((10, None, 20)), False], + ["layers", gen_awkward((20, 10)), False], + ["obsm", gen_awkward((10, 5)), True], + ["obsm", gen_awkward((10, None)), True], + ["obsm", gen_awkward((10, None, None)), True], + ["obsm", gen_awkward((10, 5, None)), True], + ["obsm", gen_awkward((8, 
10)), False], + ["obsm", gen_awkward((8, None)), False], + ["varm", gen_awkward((20, 5)), True], + ["varm", gen_awkward((20, None)), True], + ["varm", gen_awkward((20, None, None)), True], + ["varm", gen_awkward((20, 5, None)), True], + ["varm", gen_awkward((8, 20)), False], + ["varm", gen_awkward((8, None)), False], + ["uns", gen_awkward((7,)), True], + ["uns", gen_awkward((7, None)), True], + ["uns", gen_awkward((7, None, None)), True], + ], +) +def test_set_awkward(field, value, valid): """Check if we can set .X, .layers, .obsm, .varm and .uns with different types of awkward arrays and if error messages are properly raised when the dimensions do not align. """ - # adata = gen_adata((10, 20), varm_types=tuple(), obsm_types=tuple()) - assert False, "TODO" + adata = gen_adata((10, 20), varm_types=(), obsm_types=(), layers_types=()) + + def _assign(): + if field == "X": + setattr(adata, field, value) + else: + getattr(adata, field)["test"] = value + + if not valid: + with pytest.raises(ValueError): + _assign() + else: + _assign() diff --git a/anndata/tests/test_helpers.py b/anndata/tests/test_helpers.py index c94af6bfc..e9c94c58d 100644 --- a/anndata/tests/test_helpers.py +++ b/anndata/tests/test_helpers.py @@ -7,7 +7,7 @@ import anndata as ad from anndata.tests.helpers import assert_equal, gen_awkward, report_name, gen_adata -from anndata.utils import dim_len +from anndata.utils import dim_len, get_shape # Testing to see if all error types can have the key name appended. # Currently fails for 22/118 since they have required arguments. Not sure what to do about that. @@ -59,12 +59,7 @@ def reusable_adata(): ) def test_gen_awkward(shape): arr = gen_awkward(shape) - for i, s in enumerate(shape): - if s is None: - with pytest.raises(ValueError): - dim_len(arr, i) - else: - assert dim_len(arr, i) == s + assert get_shape(arr) == shape # Does this work for every warning? 
diff --git a/anndata/utils.py b/anndata/utils.py index 815765e1c..d20f38b2d 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -66,11 +66,17 @@ def dim_len(x, dim): """\ Return the size of an array in dimension `dim`. - Raises a ValueError if `x` is an awkward array with variable length in the requested dimension. + Returns None if `x` is an awkward array with variable length in the requested dimension. """ return x.shape[dim] +@singledispatch +def get_shape(x): + """Return the shape of an array""" + return x.shape + + try: import awkward._v2 as ak @@ -98,10 +104,12 @@ def dim_len_awkward(x, dim): try: return arr_type.size except AttributeError: - raise ValueError( - f"Array is of variable length in dimension {dim}.", - f"Try ak.to_regular(array, {dim}) before including the array in AnnData", - ) + # the arrays is of variable length in the requested dimension + return None + + @get_shape.register(ak.Array) + def get_shape_awkward(x): + return tuple(dim_len(x, i) for i in range(x.ndim)) @asarray.register(ak.Array) def asarray_awkward(x): From fceab1b9fe4c1346f5f3bb5c795a6bdb960326ae Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sat, 13 Aug 2022 12:13:51 +0200 Subject: [PATCH 040/110] enable tests for 3d ragged array in layers --- anndata/tests/helpers.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 29699a895..7664ca5fc 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -212,8 +212,7 @@ def gen_adata( array=np.random.random((M, N)), sparse=sparse.random(M, N, format="csr"), awk=gen_awkward((M, N)), - # TODO - # awk_ragged=gen_awkward((M, N, None)), + awk_ragged=gen_awkward((M, N, None)), ) layers = {k: v for k, v in layers.items() if type(v) in layers_types} obsp = dict( From ef0637a6813a6741dcc4737a25dd4abe783699e0 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sat, 13 Aug 2022 12:32:24 +0200 Subject: [PATCH 041/110] Cleanup --- 
anndata/_core/aligned_mapping.py | 6 +++--- anndata/_core/merge.py | 7 ++----- 2 files changed, 5 insertions(+), 8 deletions(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index 5d367105c..5b1b61a4b 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -8,15 +8,13 @@ import pandas as pd from scipy.sparse import spmatrix -from ..utils import deprecated, ensure_df_homogeneous, dim_len, import_function +from ..utils import deprecated, ensure_df_homogeneous, dim_len from . import raw, anndata from .views import as_view from .access import ElementRef from .index import _subset from anndata.compat import AwkArray -ak_to_regular = import_function("awkward", "to_regular") - OneDIdx = Union[Sequence[int], Sequence[bool], slice] TwoDIdx = Tuple[OneDIdx, OneDIdx] @@ -96,6 +94,7 @@ def copy(self): d = self._actual_class(self.parent, self._axis) for k, v in self.items(): if isinstance(v, AwkArray): + # awkward arrays are immutable d[k] = v else: d[k] = v.copy() @@ -281,6 +280,7 @@ def copy(self) -> "Layers": d = self._actual_class(self.parent) for k, v in self.items(): if isinstance(v, AwkArray): + # awkward arrays are immutable d[k] = v else: d[k] = v.copy() diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 1539622de..acf5b0747 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -383,18 +383,15 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix: return out def _apply_to_awkward(self, el: AwkArray, *, axis, fill_value=None): - try: - dim_len(el, axis) - except ValueError: + if dim_len(el, axis) is None: # Do not reindex variable-length dimensions return el else: indexer = self.old_idx.get_indexer(self.new_idx) if -1 in indexer: - raise ValueError( + raise NotImplementedError( "Outer join operations are currently not supported with AwkwardArrays" ) - # TODO is there no way to slice an awkward array programmatically? 
if axis == 0: return el[indexer] if axis == 1: From 06608e97ac616cbafe5484ea20a8df1005d31f32 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sat, 13 Aug 2022 19:47:07 +0200 Subject: [PATCH 042/110] Fix that X could not be set when creating AnnData object from scratch. Apparently the checks are quite different than when adding a Layer. --- anndata/_core/anndata.py | 55 +++++++++++++++++++++++--------------- anndata/tests/test_base.py | 23 ++++++++++++++++ 2 files changed, 57 insertions(+), 21 deletions(-) diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 30cd551c1..9e675afc8 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -90,6 +90,7 @@ def _gen_keys_from_multicol_key(key_multicol, n_keys): return keys +@singledispatch def _check_2d_shape(X): """\ Check shape of array or sparse matrix. @@ -102,6 +103,16 @@ def _check_2d_shape(X): ) +@_check_2d_shape.register(AwkArray) +def _check_2d_shape_awkward(X): + shape = get_shape(X) + if len(shape) < 2 or None in shape[:2]: + raise ValueError( + "An awkward X needs to have at least 2 dimensions. The first two dimensions must be regular " + "(Try ak.to_regular(array, 1)). " + ) + + @singledispatch def _gen_dataframe(anno, length, index_names): if anno is None or len(anno) == 0: @@ -454,30 +465,32 @@ def _init_as_actual( raise ValueError("`shape` needs to be `None` if `X` is not `None`.") _check_2d_shape(X) # if type doesn’t match, a copy is made, otherwise, use a view - if dtype is None and X.dtype != np.float32: - warnings.warn( - f"X.dtype being converted to np.float32 from {X.dtype}. In the next " - "version of anndata (0.9) conversion will not be automatic. Pass " - "dtype explicitly to avoid this warning. 
Pass " - "`AnnData(X, dtype=X.dtype, ...)` to get the future behavour.", - FutureWarning, - stacklevel=3, - ) - dtype = np.float32 - elif dtype is None: - dtype = np.float32 - if issparse(X) or isinstance(X, ma.MaskedArray): - # TODO: maybe use view on data attribute of sparse matrix - # as in readwrite.read_10x_h5 - if X.dtype != np.dtype(dtype): + if not isinstance(X, AwkArray): + if dtype is None and X.dtype != np.float32: + warnings.warn( + f"X.dtype being converted to np.float32 from {X.dtype}. In the next " + "version of anndata (0.9) conversion will not be automatic. Pass " + "dtype explicitly to avoid this warning. Pass " + "`AnnData(X, dtype=X.dtype, ...)` to get the future behavour.", + FutureWarning, + stacklevel=3, + ) + dtype = np.float32 + elif dtype is None: + dtype = np.float32 + if issparse(X) or isinstance(X, ma.MaskedArray): + # TODO: maybe use view on data attribute of sparse matrix + # as in readwrite.read_10x_h5 + if X.dtype != np.dtype(dtype): + X = X.astype(dtype) + elif isinstance(X, ZarrArray): X = X.astype(dtype) - elif isinstance(X, ZarrArray): - X = X.astype(dtype) - else: # is np.ndarray or a subclass, convert to true np.ndarray - X = np.array(X, dtype, copy=False) + else: # is np.ndarray or a subclass, convert to true np.ndarray + X = np.array(X, dtype, copy=False) + # data matrix and shape self._X = X - self._n_obs, self._n_vars = self._X.shape + self._n_obs, self._n_vars = get_shape(self._X)[:2] else: self._X = None self._n_obs = len([] if obs is None else obs) diff --git a/anndata/tests/test_base.py b/anndata/tests/test_base.py index 923106159..84ed0a0ed 100644 --- a/anndata/tests/test_base.py +++ b/anndata/tests/test_base.py @@ -609,6 +609,29 @@ def assert_eq_not_id(a, b): assert_eq_not_id(map_sprs[key], map_copy[key]) +@pytest.mark.parametrize( + "value,valid", + [ + [gen_awkward((10, 20)), True], + [gen_awkward((10, 20, 5)), True], + [gen_awkward((10, 20, None)), True], + [gen_awkward((10, None)), False], + [gen_awkward((10, 
None, 20)), False], + ], +) +def test_build_awkward(value, valid): + """Test that building an awkward array with X being awkward from scratch works""" + + def _build(): + AnnData(X=value) + + if not valid: + with pytest.raises(ValueError): + _build() + else: + _build() + + @pytest.mark.parametrize( "field,value,valid", [ From 3c463639886009254ada8ade7f154c1606d51b70 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Fri, 19 Aug 2022 21:09:33 +0200 Subject: [PATCH 043/110] Remove code to make awkward array regular after merge. This is now done by the awkward array library. --- anndata/_core/merge.py | 7 ------- 1 file changed, 7 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index acf5b0747..f1ce76eb8 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -985,13 +985,6 @@ def concat( UserWarning, ) - # TODO Workaround for https://github.com/scikit-hep/awkward/issues/1586 - for k, layer in layers.items(): - if isinstance(layer, AwkArray): - import awkward._v2 as ak - - layers[k] = ak.to_regular(layer, 1) - return AnnData( **{ "X": X, From 285e3b3d4c432a2346eaa0f04c6d3cd82e9f89a9 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 29 Aug 2022 12:23:34 +0200 Subject: [PATCH 044/110] Do not explicitly copy awkward arrays --- anndata/_core/views.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/anndata/_core/views.py b/anndata/_core/views.py index 3365cef37..f43b1d508 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -122,10 +122,8 @@ def drop(self, *args, inplace: bool = False, **kw): class AwkwardArrayView(_ViewMixin, AwkArray): def copy(self, order: str = "C") -> np.ndarray: - import awkward._v2 as ak - - # we want a copy of an akward array - return ak.copy(self) + # awkward arrays are immutable, we don't need to make an explicit copy. 
+ return self @singledispatch From 7fa65dd39803f650a6d5f1a9b3d3e697e1e5ba01 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 29 Aug 2022 13:01:43 +0200 Subject: [PATCH 045/110] Implement transposing awkward arrays --- anndata/_core/anndata.py | 15 +++++++++------ anndata/tests/test_transpose.py | 26 ++++++++++---------------- anndata/utils.py | 24 ++++++++++++++++++++++++ 3 files changed, 43 insertions(+), 22 deletions(-) diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 9e675afc8..561804718 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -45,7 +45,13 @@ ) from .sparse_dataset import SparseDataset from .. import utils -from ..utils import convert_to_dict, ensure_df_homogeneous, dim_len, get_shape +from ..utils import ( + convert_to_dict, + ensure_df_homogeneous, + dim_len, + get_shape, + transpose_matrix, +) from ..logging import anndata_logger as logger from ..compat import ( ZarrArray, @@ -1321,11 +1327,8 @@ def transpose(self) -> "AnnData": "which is currently not implemented. Call `.copy()` before transposing." 
) - def t_csr(m: sparse.spmatrix) -> sparse.csr_matrix: - return m.T.tocsr() if sparse.isspmatrix_csr(m) else m.T - return AnnData( - X=t_csr(X) if X is not None else None, + X=transpose_matrix(X) if X is not None else None, obs=self.var, var=self.obs, # we're taking a private attributes here to be able to modify uns of the original object @@ -1335,7 +1338,7 @@ def t_csr(m: sparse.spmatrix) -> sparse.csr_matrix: obsp=self.varp.copy(), varp=self.obsp.copy(), filename=self.filename, - layers={k: t_csr(v) for k, v in self.layers.items()}, + layers={k: transpose_matrix(v) for k, v in self.layers.items()}, dtype=self.X.dtype.name if X is not None else "float32", ) diff --git a/anndata/tests/test_transpose.py b/anndata/tests/test_transpose.py index ebd426594..fc41a69a6 100644 --- a/anndata/tests/test_transpose.py +++ b/anndata/tests/test_transpose.py @@ -2,13 +2,8 @@ import pytest -from anndata.tests.helpers import GEN_ADATA_DEFAULT_TYPES, gen_adata, assert_equal -from anndata.compat import AwkArray - -# TODO -# transpose currently not implemented for awkward arrays -_types = [x for x in GEN_ADATA_DEFAULT_TYPES if x != AwkArray] -_types_kwargs = dict(obsm_types=_types, varm_types=_types, layers_types=_types) +from anndata.tests.helpers import gen_adata, assert_equal +from anndata.utils import transpose_matrix def test_transpose_orig(): @@ -16,7 +11,7 @@ def test_transpose_orig(): Original test for transpose, should be covered by more thorough tests below, but keeping around just in case. 
""" - adata = gen_adata((5, 3), **_types_kwargs) + adata = gen_adata((5, 3)) adata.varp = {f"varp_{k}": v for k, v in adata.varp.items()} adata1 = adata.T @@ -38,12 +33,10 @@ def _add_raw(adata, *, var_subset=slice(None)): # * Backed @pytest.fixture( params=[ - pytest.param(gen_adata((50, 20), **_types_kwargs), id="csr_X"), - pytest.param( - gen_adata((50, 20), sparse.csc_matrix, **_types_kwargs), id="csc_X" - ), - pytest.param(_add_raw(gen_adata((50, 20), **_types_kwargs)), id="with_raw"), - pytest.param(gen_adata((20, 10), X_type=None, **_types_kwargs), id="None_X"), + pytest.param(gen_adata((50, 20)), id="csr_X"), + pytest.param(gen_adata((50, 20), sparse.csc_matrix), id="csc_X"), + pytest.param(_add_raw(gen_adata((50, 20))), id="with_raw"), + pytest.param(gen_adata((20, 10), X_type=None), id="None_X"), ] ) def adata(request): @@ -63,12 +56,13 @@ def test_transposed_contents(adata): t = adata.T if adata.X is not None: - assert_equal(adata.X.T, t.X) + assert_equal(transpose_matrix(adata.X), t.X) else: assert adata.X is t.X is None assert_equal( - {k: v.T for k, v in adata.layers.items()}, {k: v for k, v in t.layers.items()} + {k: transpose_matrix(v) for k, v in adata.layers.items()}, + {k: v for k, v in t.layers.items()}, ) assert_equal(adata.obs, t.var) assert_equal(adata.var, t.obs) diff --git a/anndata/utils.py b/anndata/utils.py index d20f38b2d..c0a951af9 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -61,6 +61,17 @@ def convert_to_dict_nonetype(obj: None): return dict() +@singledispatch +def transpose_matrix(m): + return m.T + + +@transpose_matrix.register(sparse.csr_matrix) +@transpose_matrix.register(sparse.csr_array) +def transpose_matrix_csr(m): + return m.T.tocsr() + + @singledispatch def dim_len(x, dim): """\ @@ -80,6 +91,19 @@ def get_shape(x): try: import awkward._v2 as ak + @transpose_matrix.register(ak.Array) + def transpose_matrix_awkward(array): + """ + Compute the index over the flattened array that picks out the transposed elements, 
then re-assemble the array. + + Source: https://gitter.im/Scikit-HEP/awkward-array?at=6303924c443b7927a7af1199 by @agoose77 + """ + flat = ak.flatten(array, axis=1) + ix = np.arange(len(flat)).reshape(len(array), -1) + flat_transposed = flat[np.ravel(ix.T)] + transposed = ak.unflatten(flat_transposed, len(array), axis=0) + return transposed + @dim_len.register(ak.Array) def dim_len_awkward(x, dim): if dim == 0: From 32c44cff6dfa4f84b57afdf983acdd64dcb4637c Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 29 Aug 2022 13:10:00 +0200 Subject: [PATCH 046/110] Add docs stub and update type hints --- anndata/_core/anndata.py | 8 +++++--- docs/fileformat-prose.rst | 5 +++++ 2 files changed, 10 insertions(+), 3 deletions(-) diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 561804718..27c58a012 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -283,13 +283,15 @@ class AnnData(metaclass=utils.DeprecationMixinMeta): def __init__( self, - X: Optional[Union[np.ndarray, sparse.spmatrix, pd.DataFrame]] = None, + X: Optional[Union[np.ndarray, sparse.spmatrix, pd.DataFrame, AwkArray]] = None, obs: Optional[Union[pd.DataFrame, Mapping[str, Iterable[Any]]]] = None, var: Optional[Union[pd.DataFrame, Mapping[str, Iterable[Any]]]] = None, uns: Optional[Mapping[str, Any]] = None, - obsm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None, + obsm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]], AwkArray]] = None, varm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None, - layers: Optional[Mapping[str, Union[np.ndarray, sparse.spmatrix]]] = None, + layers: Optional[ + Mapping[str, Union[np.ndarray, sparse.spmatrix, AwkArray]] + ] = None, raw: Optional[Mapping[str, Any]] = None, dtype: Optional[Union[np.dtype, type, str]] = None, shape: Optional[Tuple[int, int]] = None, diff --git a/docs/fileformat-prose.rst b/docs/fileformat-prose.rst index da495d9f1..763719e28 100644 --- a/docs/fileformat-prose.rst +++ 
b/docs/fileformat-prose.rst @@ -100,6 +100,11 @@ indptr .. indices Dataset {41459314/Inf} .. indptr Dataset {38411/Inf} +AwkwardArrays +~~~~~~~~~~~~~ + +TODO + DataFrames ~~~~~~~~~~ From 5e1c1da3a0676f21d898697ac41d10c8d5b63b6b Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 29 Aug 2022 14:11:18 +0200 Subject: [PATCH 047/110] Fix: dtype not available during merge if both X are awkward --- anndata/_core/merge.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index f1ce76eb8..60ff00d6d 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -988,7 +988,7 @@ def concat( return AnnData( **{ "X": X, - "dtype": None if X is None else X.dtype, + "dtype": None if X is None or isinstance(X, AwkArray) else X.dtype, "layers": layers, dim: concat_annot, alt_dim: alt_annot, From 08154f7bf480e0cc74ffe3c85b0744b5c3096583 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 29 Aug 2022 19:07:11 +0200 Subject: [PATCH 048/110] Fix IO --- anndata/_io/specs/methods.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 32418c24e..dd54c5f8f 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -493,9 +493,9 @@ def write_awkward(f, k, v, dataset_kwargs=MappingProxyType({})): import awkward._v2 as ak group = f.create_group(k) - form, length, container = ak.to_buffers(v) + form, length, container = ak.to_buffers(ak.packed(v)) group.attrs["length"] = length - group.attrs["form"] = form.tojson() + group.attrs["form"] = form.to_json() write_elem(group, "container", container, dataset_kwargs=dataset_kwargs) From 71f047144f24cc02ed3acdda69e5f13c695a6614 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 30 Aug 2022 09:43:01 +0200 Subject: [PATCH 049/110] Request pre-release version of awkward --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml 
b/pyproject.toml index ccf499922..54fa2936a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -90,7 +90,7 @@ test = [ "boltons", "scanpy", "dask[array]", - "awkward>=1,<2", + "awkward>=1.9.0rc12,<2", ] [tool.flit.sdist] From 2e664091855582cf1928b7f2bcc64e40701570d1 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 30 Aug 2022 11:00:49 +0200 Subject: [PATCH 050/110] Exclude awkward layer in loom tests --- anndata/tests/test_readwrite.py | 11 ++++++++--- anndata/tests/test_x.py | 4 +--- 2 files changed, 9 insertions(+), 6 deletions(-) diff --git a/anndata/tests/test_readwrite.py b/anndata/tests/test_readwrite.py index 24d9db405..0105dcfe4 100644 --- a/anndata/tests/test_readwrite.py +++ b/anndata/tests/test_readwrite.py @@ -15,9 +15,9 @@ import anndata as ad from anndata.utils import asarray -from anndata.compat import _read_attr +from anndata.compat import _read_attr, AwkArray -from anndata.tests.helpers import gen_adata, assert_equal +from anndata.tests.helpers import GEN_ADATA_DEFAULT_TYPES, gen_adata, assert_equal HERE = Path(__file__).parent @@ -396,7 +396,12 @@ def test_readwrite_loom(typ, obsm_mapping, varm_mapping, tmp_path): @pytest.mark.skipif(not find_spec("loompy"), reason="Loompy is not installed") def test_readloom_deprecations(tmp_path): loom_pth = tmp_path / "test.loom" - adata_src = gen_adata((5, 10), obsm_types=[np.ndarray], varm_types=[np.ndarray]) + adata_src = gen_adata( + (5, 10), + obsm_types=[np.ndarray], + varm_types=[np.ndarray], + layers_types=[x for x in GEN_ADATA_DEFAULT_TYPES if x != AwkArray], + ) adata_src.write_loom(loom_pth, write_obsm_varm=True) # obsm_names -> obsm_mapping diff --git a/anndata/tests/test_x.py b/anndata/tests/test_x.py index 4907af5cd..fb333504c 100644 --- a/anndata/tests/test_x.py +++ b/anndata/tests/test_x.py @@ -75,9 +75,7 @@ def test_init_X_as_none(): @pytest.mark.parametrize("shape", SINGULAR_SHAPES + [pytest.param((5, 3), id="(5, 3)")]) def test_transpose_with_X_as_none(shape): - from test_transpose 
import _types_kwargs - - adata = gen_adata(shape, X_type=lambda x: None, **_types_kwargs) + adata = gen_adata(shape, X_type=lambda x: None) adataT = adata.transpose() assert_equal(adataT.shape, shape[::-1]) assert_equal(adataT.obsp.keys(), adata.varp.keys()) From 741af1c7daa900afd709d12f0d94a71a21506c3c Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 30 Aug 2022 13:55:54 +0200 Subject: [PATCH 051/110] Pull in only changes relevant to obsm/varm --- .gitignore | 1 + anndata/_core/aligned_mapping.py | 31 +++++++++++++++------ anndata/_core/anndata.py | 11 ++++---- anndata/_core/index.py | 9 +++++- anndata/_core/merge.py | 47 +++++++++++++++++++++++++++++-- anndata/_core/views.py | 13 ++++++++- anndata/_io/specs/methods.py | 30 ++++++++++++++++++++ anndata/compat/__init__.py | 13 ++++++++- anndata/utils.py | 48 ++++++++++++++++++++++++++++++++ docs/fileformat-prose.rst | 5 ++++ pyproject.toml | 2 ++ 11 files changed, 190 insertions(+), 20 deletions(-) diff --git a/.gitignore b/.gitignore index 88e01a343..4053f744f 100644 --- a/.gitignore +++ b/.gitignore @@ -27,4 +27,5 @@ test.h5ad # IDEs /.idea/ +/.vscode/ diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index 2c8430794..b0e541069 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -8,11 +8,12 @@ import pandas as pd from scipy.sparse import spmatrix -from ..utils import deprecated, ensure_df_homogeneous +from ..utils import deprecated, ensure_df_homogeneous, dim_len from . 
import raw, anndata from .views import as_view from .access import ElementRef from .index import _subset +from anndata.compat import AwkArray OneDIdx = Union[Sequence[int], Sequence[bool], slice] @@ -47,14 +48,22 @@ def _ipython_key_completions_(self) -> List[str]: def _validate_value(self, val: V, key: str) -> V: """Raises an error if value is invalid""" for i, axis in enumerate(self.axes): - if self.parent.shape[axis] != val.shape[i]: + if self.parent.shape[axis] != dim_len(val, i): right_shape = tuple(self.parent.shape[a] for a in self.axes) - raise ValueError( - f"Value passed for key {key!r} is of incorrect shape. " - f"Values of {self.attrname} must match dimensions " - f"{self.axes} of parent. Value had shape {val.shape} while " - f"it should have had {right_shape}." - ) + actual_shape = tuple(dim_len(val, a) for a in self.axes) + if None in actual_shape: + raise ValueError( + f"The AwkwardArray is of variable length in dimension {i}.", + f"Try ak.to_regular(array, {i}) before including the array in AnnData", + ) + else: + raise ValueError( + f"Value passed for key {key!r} is of incorrect shape. " + f"Values of {self.attrname} must match dimensions " + f"{self.axes} of parent. Value had shape {actual_shape} while " + f"it should have had {right_shape}." 
+ ) + if not self._allow_df and isinstance(val, pd.DataFrame): name = self.attrname.title().rstrip("s") val = ensure_df_homogeneous(val, f"{name} {key!r}") @@ -84,7 +93,11 @@ def parent(self) -> Union["anndata.AnnData", "raw.Raw"]: def copy(self): d = self._actual_class(self.parent, self._axis) for k, v in self.items(): - d[k] = v.copy() + if isinstance(v, AwkArray): + # awkward arrays are immutable + d[k] = v + else: + d[k] = v.copy() return d def _view(self, parent: "anndata.AnnData", subset_idx: I): diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 93e16334e..6bb175f7e 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -45,7 +45,7 @@ ) from .sparse_dataset import SparseDataset from .. import utils -from ..utils import convert_to_dict, ensure_df_homogeneous +from ..utils import convert_to_dict, ensure_df_homogeneous, dim_len from ..logging import anndata_logger as logger from ..compat import ( ZarrArray, @@ -56,6 +56,7 @@ _move_adj_mtx, _overloaded_uns, OverloadedDict, + AwkArray, ) @@ -268,8 +269,8 @@ def __init__( obs: Optional[Union[pd.DataFrame, Mapping[str, Iterable[Any]]]] = None, var: Optional[Union[pd.DataFrame, Mapping[str, Iterable[Any]]]] = None, uns: Optional[Mapping[str, Any]] = None, - obsm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None, - varm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None, + obsm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any], AwkArray]]] = None, + varm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any], AwkArray]]] = None, layers: Optional[Mapping[str, Union[np.ndarray, sparse.spmatrix]]] = None, raw: Optional[Mapping[str, Any]] = None, dtype: Optional[Union[np.dtype, type, str]] = None, @@ -1852,7 +1853,7 @@ def _check_dimensions(self, key=None): if "obsm" in key: obsm = self._obsm if ( - not all([o.shape[0] == self._n_obs for o in obsm.values()]) + not all([dim_len(o, 0) == self._n_obs for o in obsm.values()]) and len(obsm.dim_names) != 
self._n_obs ): raise ValueError( @@ -1862,7 +1863,7 @@ def _check_dimensions(self, key=None): if "varm" in key: varm = self._varm if ( - not all([v.shape[0] == self._n_vars for v in varm.values()]) + not all([dim_len(v, 0) == self._n_vars for v in varm.values()]) and len(varm.dim_names) != self._n_vars ): raise ValueError( diff --git a/anndata/_core/index.py b/anndata/_core/index.py index 8082d48c6..444adba81 100644 --- a/anndata/_core/index.py +++ b/anndata/_core/index.py @@ -7,7 +7,7 @@ import numpy as np import pandas as pd from scipy.sparse import spmatrix, issparse - +from ..compat import AwkArray Index1D = Union[slice, int, str, np.int64, np.ndarray] Index = Union[Index1D, Tuple[Index1D, Index1D], spmatrix] @@ -140,6 +140,13 @@ def _subset_df(df: pd.DataFrame, subset_idx: Index): return df.iloc[subset_idx] +@_subset.register(AwkArray) +def _subset_awkarray(a: AwkArray, subset_idx: Index): + if all(isinstance(x, cabc.Iterable) for x in subset_idx): + subset_idx = np.ix_(*subset_idx) + return a[subset_idx] + + # Registration for SparseDataset occurs in sparse_dataset.py @_subset.register(h5py.Dataset) def _subset_dataset(d, subset_idx): diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 6a5e13aff..7bc4a5a30 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -16,8 +16,8 @@ from scipy.sparse import spmatrix from .anndata import AnnData -from ..compat import Literal -from ..utils import asarray +from ..compat import Literal, AwkArray +from ..utils import asarray, dim_len T = TypeVar("T") @@ -127,6 +127,16 @@ def equal_sparse(a, b) -> bool: return False +@equal.register(AwkArray) +def equal_awkward(a, b) -> bool: + import awkward._v2 as ak + + if dim_len(a, 0) == dim_len(b, 0): + return ak.all(a == b) + else: + return False + + def as_sparse(x): if not isinstance(x, sparse.spmatrix): return sparse.csr_matrix(x) @@ -287,12 +297,14 @@ def apply(self, el, *, axis, fill_value=None): Missing values are to be replaced with `fill_value`. 
""" - if self.no_change and (el.shape[axis] == len(self.old_idx)): + if self.no_change and (dim_len(el, axis) == len(self.old_idx)): return el if isinstance(el, pd.DataFrame): return self._apply_to_df(el, axis=axis, fill_value=fill_value) elif isinstance(el, sparse.spmatrix): return self._apply_to_sparse(el, axis=axis, fill_value=fill_value) + elif isinstance(el, AwkArray): + return self._apply_to_awkward(el, axis=axis, fill_value=fill_value) else: return self._apply_to_array(el, axis=axis, fill_value=fill_value) @@ -370,6 +382,21 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix: return out + def _apply_to_awkward(self, el: AwkArray, *, axis, fill_value=None): + if dim_len(el, axis) is None: + # Do not reindex variable-length dimensions + return el + else: + indexer = self.old_idx.get_indexer(self.new_idx) + if -1 in indexer: + raise NotImplementedError( + "Outer join operations are currently not supported with AwkwardArrays" + ) + if axis == 0: + return el[indexer] + if axis == 1: + return el[:, indexer] + def merge_indices( inds: Iterable[pd.Index], join: Literal["inner", "outer"] @@ -434,6 +461,12 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): ) df.index = index return df + elif any(isinstance(a, AwkArray) for a in arrays): + import awkward._v2 as ak + + return ak.concatenate( + [f(a, axis=1 - axis) for f, a in zip(reindexers, arrays)], axis=axis + ) elif any(isinstance(a, sparse.spmatrix) for a in arrays): sparse_stack = (sparse.vstack, sparse.hstack)[axis] return sparse_stack( @@ -479,6 +512,10 @@ def gen_inner_reindexers(els, new_index, axis: Literal[0, 1] = 0): lambda x, y: x.intersection(y), (df_indices(el) for el in els) ) reindexers = [Reindexer(df_indices(el), common_ind) for el in els] + elif all(isinstance(el, AwkArray) for el in els if not_missing(el)): + # do not reindex awkward arrays + # TODO unintended behaviour? 
+ reindexers = [lambda *args, **kwargs: args[0] for _ in els] else: min_ind = min(el.shape[alt_axis] for el in els) reindexers = [ @@ -496,6 +533,10 @@ def gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0): else (lambda x: pd.DataFrame(index=range(shape))) for el, shape in zip(els, shapes) ] + elif all(isinstance(el, AwkArray) for el in els if not_missing(el)): + # do not reindex awkward arrays + # TODO unintended behaviour? + reindexers = [lambda *args, **kwargs: args[0] for _ in els] else: # if fill_value is None: # fill_value = default_fill_value(els) diff --git a/anndata/_core/views.py b/anndata/_core/views.py index 60e04bc77..f43b1d508 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -11,7 +11,7 @@ from anndata._warnings import ImplicitModificationWarning from .access import ElementRef -from ..compat import ZappyArray +from ..compat import ZappyArray, AwkArray class _SetItemMixin: @@ -120,6 +120,12 @@ def drop(self, *args, inplace: bool = False, **kw): df.drop(*args, inplace=True, **kw) +class AwkwardArrayView(_ViewMixin, AwkArray): + def copy(self, order: str = "C") -> np.ndarray: + # awkward arrays are immutable, we don't need to make an explicit copy. 
+ return self + + @singledispatch def as_view(obj, view_args): raise NotImplementedError(f"No view type has been registered for {type(obj)}") @@ -157,6 +163,11 @@ def as_view_zappy(z, view_args): return z +@as_view.register(AwkArray) +def as_view_awkarray(array, view_args): + return AwkwardArrayView(array, view_args=view_args) + + def _resolve_idxs(old, new, adata): t = tuple(_resolve_idx(old[i], new[i], adata.shape[i]) for i in (0, 1)) return t diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 71c216a2a..dd54c5f8f 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -29,6 +29,7 @@ ) from anndata._io.utils import report_write_key_on_error, check_key, H5PY_V3 from anndata._warnings import OldFormatWarning +from anndata.compat import AwkArray from .registry import ( _REGISTRY, @@ -481,6 +482,35 @@ def read_sparse_partial(elem, *, items=None, indices=(slice(None), slice(None))) return SparseDataset(elem)[indices] +################# +# Awkward array # +################# + + +@_REGISTRY.register_write(H5Group, AwkArray, IOSpec("awkward-array", "0.1.0")) +@_REGISTRY.register_write(ZarrGroup, AwkArray, IOSpec("awkward-array", "0.1.0")) +def write_awkward(f, k, v, dataset_kwargs=MappingProxyType({})): + import awkward._v2 as ak + + group = f.create_group(k) + form, length, container = ak.to_buffers(ak.packed(v)) + group.attrs["length"] = length + group.attrs["form"] = form.to_json() + write_elem(group, "container", container, dataset_kwargs=dataset_kwargs) + + +@_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) +@_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0")) +def read_awkward(elem): + import awkward._v2 as ak + + form = _read_attr(elem.attrs, "form") + length = _read_attr(elem.attrs, "length") + container = read_elem(elem["container"]) + + return ak.from_buffers(form, length, container) + + ############## # DataFrames # ############## diff --git a/anndata/compat/__init__.py 
b/anndata/compat/__init__.py index 5ccd3571b..41d952fc6 100644 --- a/anndata/compat/__init__.py +++ b/anndata/compat/__init__.py @@ -11,7 +11,6 @@ import pandas as pd from ._overloaded_dict import _overloaded_uns, OverloadedDict -from .._core.index import _subset class Empty: @@ -41,6 +40,16 @@ def __repr__(): return "mock zarr.core.Group" +try: + from awkward._v2 import Array as AwkArray +except ImportError: + + class AwkArray: + @staticmethod + def __repr__(): + return "mock awkward.highlevel.Array" + + try: from zappy.base import ZappyArray except ImportError: @@ -248,6 +257,8 @@ def _find_sparse_matrices(d: Mapping, n: int, keys: tuple, paths: list): def _slice_uns_sparse_matrices(uns: MutableMapping, oidx: "Index1d", orig_n_obs: int): + from anndata._core.index import _subset + """slice sparse spatrices of n_obs × n_obs in self.uns""" if isinstance(oidx, slice) and len(range(*oidx.indices(orig_n_obs))) == orig_n_obs: return uns # slice of entire dimension is a no-op diff --git a/anndata/utils.py b/anndata/utils.py index bc00b0218..9a4abe6bc 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -59,6 +59,54 @@ def convert_to_dict_nonetype(obj: None): return dict() +@singledispatch +def dim_len(x, dim): + """\ + Return the size of an array in dimension `dim`. + + Returns None if `x` is an awkward array with variable length in the requested dimension. + """ + return x.shape[dim] + + +try: + import awkward._v2 as ak + + @dim_len.register(ak.Array) + def dim_len_awkward(x, dim): + if dim == 0: + # dimension 0 is a special case - it is always of `ArrayType` and has a fixed length. + try: + return x.type.length + except AttributeError: + raise ValueError("The outermost type must be an `awkward.Array`!") + else: + arr_type = x.type + for _ in range(dim): + # we need to loop through the nested types for the other dimensions, e.g. 
+ # ArrayType(RegularType(ListType(NumpyType('int64')), 200), 100) + try: + arr_type = arr_type.content + except AttributeError: + # RecordType and UnionType have multiple "contents" entries + raise NotImplementedError( + "This check is currently not implemented for RecordType and UnionType arrays. " + ) + + try: + return arr_type.size + except AttributeError: + # the arrays is of variable length in the requested dimension + return None + + @asarray.register(ak.Array) + def asarray_awkward(x): + return x + +except ImportError: + pass + + def make_index_unique(index: pd.Index, join: str = "-"): """ Makes the index unique by appending a number string to each duplicate index element: diff --git a/docs/fileformat-prose.rst b/docs/fileformat-prose.rst index da495d9f1..763719e28 100644 --- a/docs/fileformat-prose.rst +++ b/docs/fileformat-prose.rst @@ -100,6 +100,11 @@ indptr .. indices Dataset {41459314/Inf} .. indptr Dataset {38411/Inf} +AwkwardArrays +~~~~~~~~~~~~~ + +TODO + DataFrames ~~~~~~~~~~ diff --git a/pyproject.toml b/pyproject.toml index 690f869a7..aa618523e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,6 +76,7 @@ doc = [ "scanpydoc>=0.7.7", "typing_extensions; python_version < '3.8'", "zarr", + "awkward>=1.9.0rc12,<2", ] test = [ "loompy>=3.0.5", @@ -89,6 +90,7 @@ test = [ "boltons", "scanpy", "dask[array]", + "awkward>=1.9.0rc12,<2", ] [tool.flit.sdist] From 7f7ebb621f24358a64c281b6d1cf5ab1a6ac5313 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 30 Aug 2022 14:05:16 +0200 Subject: [PATCH 052/110] Update tests --- anndata/tests/helpers.py | 93 +++++++++++++++++++++++++++++++++-- anndata/tests/test_base.py | 38 +++++++++++++- anndata/tests/test_helpers.py | 25 +++++++++- 3 files changed, 151 insertions(+), 5 deletions(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index e3ded19ec..e88e2a485 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -10,12 +10,15 @@ from pandas.api.types import 
is_numeric_dtype import pytest from scipy import sparse +import awkward._v2 as ak from anndata import AnnData, Raw from anndata._core.views import ArrayView from anndata._core.sparse_dataset import SparseDataset from anndata._core.aligned_mapping import AlignedMapping -from anndata.utils import asarray +from anndata.utils import asarray, dim_len + +from anndata.compat import AwkArray def gen_vstr_recarray(m, n, dtype=None): @@ -55,6 +58,63 @@ def gen_typed_df(n, index=None): ) +def _gen_awkward_inner(shape, rng, dtype): + # the maximum length a ragged dimension can take + MAX_RAGGED_DIM_LEN = 20 + if not len(shape): + # abort condition -> no dimension left, return an actual value instead + return dtype(rng.randrange(1000)) + else: + curr_dim_len = shape[0] + lil = [] + if curr_dim_len is None: + # ragged dimension, set random length + curr_dim_len = rng.randrange(MAX_RAGGED_DIM_LEN) + + for _ in range(curr_dim_len): + lil.append(_gen_awkward_inner(shape[1:], rng, dtype)) + + return lil + + +def gen_awkward(shape, dtype=np.int32): + """Function to generate an awkward array with random values. + + Awkward array dimensions can either be fixed-length ("regular") or variable length ("ragged") + (the first dimension is always fixed-length). + + + Parameters + ---------- + shape + shape of the array to be generated. Any dimension specified as `None` will be simulated as ragged. + """ + if shape[0] is None: + raise ValueError("The first dimension must be fixed-length.") + + rng = random.Random(123) + shape = np.array(shape) + + if np.any(shape == 0): + # use empty numpy array, to pass the correct dimensions to + # ak.Array when one of the dimensions is 0 (the list-of-list approach + # does not work in that case because the list in the 0-dimension would be empty and all + # following dimensions would be lost). 
+ # The size of the variable-length dimension is irrelevant in that case, we arbitrarily set it to 1 + np_arr = np.empty([1 if x is None else x for x in shape], dtype=dtype) + arr = AwkArray(np_arr) + else: + lil = _gen_awkward_inner(shape, rng, dtype) + arr = AwkArray(lil) + + # make fixed-length dimensions regular + for i, d in enumerate(shape): + if d is not None: + arr = ak.to_regular(arr, i) + + return arr + + def gen_typed_df_t2_size(m, n, index=None, columns=None) -> pd.DataFrame: s = 0 df = pd.DataFrame() @@ -77,8 +137,18 @@ def gen_adata( X_dtype=np.float32, # obs_dtypes, # var_dtypes, - obsm_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), - varm_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), + obsm_types: "Collection[Type]" = ( + sparse.csr_matrix, + np.ndarray, + pd.DataFrame, + AwkArray, + ), + varm_types: "Collection[Type]" = ( + sparse.csr_matrix, + np.ndarray, + pd.DataFrame, + AwkArray, + ), layers_types: "Collection[Type]" = (sparse.csr_matrix, np.ndarray, pd.DataFrame), ) -> AnnData: """\ @@ -121,12 +191,20 @@ def gen_adata( array=np.random.random((M, 50)), sparse=sparse.random(M, 100, format="csr"), df=gen_typed_df(M, obs_names), + awk=gen_awkward((M,)), + awk_2d=gen_awkward((M, 20)), + awk_2d_ragged=gen_awkward((M, None)), + awk_3d_ragged=gen_awkward((M, 20, None)), ) obsm = {k: v for k, v in obsm.items() if type(v) in obsm_types} varm = dict( array=np.random.random((N, 50)), sparse=sparse.random(N, 100, format="csr"), df=gen_typed_df(N, var_names), + awk=gen_awkward((N,)), + awk_2d=gen_awkward((N, 20)), + awk_2d_ragged=gen_awkward((N, None)), + awk_3d_ragged=gen_awkward((N, 20, None)), ) varm = {k: v for k, v in varm.items() if type(v) in varm_types} layers = dict( @@ -147,6 +225,8 @@ def gen_adata( scalar_float=3.0, nested_further=dict(array=np.arange(5)), ), + awkward_regular=gen_awkward((10, 5)), + awkward_ragged=gen_awkward((12, None, None)), # U_recarray=gen_vstr_recarray(N, 5, "U4") ) 
adata = AnnData( @@ -337,6 +417,13 @@ def are_equal_dataframe(a, b, exact=False, elem_name=None): ) +@assert_equal.register(AwkArray) +def assert_equal_awkarray(a, b, exact=False, elem_name=None): + import awkward._v2 as ak + + assert ak.all(a == b) + + @assert_equal.register(Mapping) def assert_equal_mapping(a, b, exact=False, elem_name=None): assert set(a.keys()) == set(b.keys()), format_msg(elem_name) diff --git a/anndata/tests/test_base.py b/anndata/tests/test_base.py index 027e46b46..fffa146e4 100644 --- a/anndata/tests/test_base.py +++ b/anndata/tests/test_base.py @@ -9,7 +9,7 @@ from scipy.sparse import csr_matrix, issparse from anndata import AnnData -from anndata.tests.helpers import assert_equal, gen_adata +from anndata.tests.helpers import assert_equal, gen_adata, gen_awkward # some test objects that we use below @@ -607,3 +607,39 @@ def assert_eq_not_id(a, b): assert_eq_not_id(map_sprs.keys(), map_copy.keys()) for key in map_sprs.keys(): assert_eq_not_id(map_sprs[key], map_copy[key]) + + +@pytest.mark.parametrize( + "field,value,valid", + [ + ["obsm", gen_awkward((10, 5)), True], + ["obsm", gen_awkward((10, None)), True], + ["obsm", gen_awkward((10, None, None)), True], + ["obsm", gen_awkward((10, 5, None)), True], + ["obsm", gen_awkward((8, 10)), False], + ["obsm", gen_awkward((8, None)), False], + ["varm", gen_awkward((20, 5)), True], + ["varm", gen_awkward((20, None)), True], + ["varm", gen_awkward((20, None, None)), True], + ["varm", gen_awkward((20, 5, None)), True], + ["varm", gen_awkward((8, 20)), False], + ["varm", gen_awkward((8, None)), False], + ["uns", gen_awkward((7,)), True], + ["uns", gen_awkward((7, None)), True], + ["uns", gen_awkward((7, None, None)), True], + ], +) +def test_set_awkward(field, value, valid): + """Check if we can set .X, .layers, .obsm, .varm and .uns with different types + of awkward arrays and if error messages are properly raised when the dimensions do not align. 
+ """ + adata = gen_adata((10, 20), varm_types=(), obsm_types=(), layers_types=()) + + def _assign(): + getattr(adata, field)["test"] = value + + if not valid: + with pytest.raises(ValueError): + _assign() + else: + _assign() diff --git a/anndata/tests/test_helpers.py b/anndata/tests/test_helpers.py index 1556b7bdb..249de25b8 100644 --- a/anndata/tests/test_helpers.py +++ b/anndata/tests/test_helpers.py @@ -6,7 +6,8 @@ from scipy import sparse import anndata as ad -from anndata.tests.helpers import assert_equal, report_name, gen_adata +from anndata.tests.helpers import assert_equal, gen_awkward, report_name, gen_adata +from anndata.utils import dim_len # Testing to see if all error types can have the key name appended. # Currently fails for 22/118 since they have required arguments. Not sure what to do about that. @@ -40,6 +41,28 @@ def reusable_adata(): return gen_adata((10, 10)) +@pytest.mark.parametrize( + "shape", + [ + (4, 2), + (100, 200, None), + (4, None), + (0, 4), + (4, 0), + (8, None, None), + (8, None, None, None), + (4, None, 8), + (100, 200, 4), + (4, 0, 0), + (0, 0, 0), + ], +) +def test_gen_awkward(shape): + arr = gen_awkward(shape) + for i, s in enumerate(shape): + assert dim_len(arr, i) == s + + # Does this work for every warning? 
def test_report_name(): def raise_error(): From 771b2ab7ca7308f83d549558743cb748f7515254 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 30 Aug 2022 14:14:02 +0200 Subject: [PATCH 053/110] Fix type hints --- anndata/_core/anndata.py | 4 ++-- anndata/tests/helpers.py | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py index 6bb175f7e..27797fdf5 100644 --- a/anndata/_core/anndata.py +++ b/anndata/_core/anndata.py @@ -269,8 +269,8 @@ def __init__( obs: Optional[Union[pd.DataFrame, Mapping[str, Iterable[Any]]]] = None, var: Optional[Union[pd.DataFrame, Mapping[str, Iterable[Any]]]] = None, uns: Optional[Mapping[str, Any]] = None, - obsm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any], AwkArray]]] = None, - varm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any], AwkArray]]] = None, + obsm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None, + varm: Optional[Union[np.ndarray, Mapping[str, Sequence[Any]]]] = None, layers: Optional[Mapping[str, Union[np.ndarray, sparse.spmatrix]]] = None, raw: Optional[Mapping[str, Any]] = None, dtype: Optional[Union[np.dtype, type, str]] = None, diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index e88e2a485..d8a6385f3 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -11,6 +11,7 @@ import pytest from scipy import sparse import awkward._v2 as ak +import random from anndata import AnnData, Raw from anndata._core.views import ArrayView From 492260344d492364f49c2dfe2f40eaf990e16703 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Wed, 31 Aug 2022 08:29:37 +0200 Subject: [PATCH 054/110] Update error message in algined mapping --- anndata/_core/aligned_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index b0e541069..cbe072301 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py 
@@ -51,7 +51,7 @@ def _validate_value(self, val: V, key: str) -> V: if self.parent.shape[axis] != dim_len(val, i): right_shape = tuple(self.parent.shape[a] for a in self.axes) actual_shape = tuple(dim_len(val, a) for a in self.axes) - if None in actual_shape: + if actual_shape[i] is None and isinstance(val, AwkArray): raise ValueError( f"The AwkwardArray is of variable length in dimension {i}.", f"Try ak.to_regular(array, {i}) before including the array in AnnData", From c5c53358258149450c66680419ea91b6580d6e74 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Wed, 31 Aug 2022 08:42:27 +0200 Subject: [PATCH 055/110] Use compat module to support both awkward v1.9rc and 2.x --- anndata/_core/merge.py | 4 ++-- anndata/_io/specs/methods.py | 4 ++-- anndata/compat/__init__.py | 16 +++++++++++++++- anndata/tests/helpers.py | 2 +- anndata/utils.py | 2 +- 5 files changed, 21 insertions(+), 7 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 7bc4a5a30..6b20e46d1 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -129,7 +129,7 @@ def equal_sparse(a, b) -> bool: @equal.register(AwkArray) def equal_awkward(a, b) -> bool: - import awkward._v2 as ak + from ..compat import awkward as ak if dim_len(a, 0) == dim_len(b, 0): return ak.all(a == b) @@ -462,7 +462,7 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): df.index = index return df elif any(isinstance(a, AwkArray) for a in arrays): - import awkward._v2 as ak + from ..compat import awkward as ak return ak.concatenate( [f(a, axis=1 - axis) for f, a in zip(reindexers, arrays)], axis=axis diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index dd54c5f8f..9cb8f2c27 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -490,7 +490,7 @@ def read_sparse_partial(elem, *, items=None, indices=(slice(None), slice(None))) @_REGISTRY.register_write(H5Group, AwkArray, IOSpec("awkward-array", "0.1.0")) 
@_REGISTRY.register_write(ZarrGroup, AwkArray, IOSpec("awkward-array", "0.1.0")) def write_awkward(f, k, v, dataset_kwargs=MappingProxyType({})): - import awkward._v2 as ak + from anndata.compat import awkward as ak group = f.create_group(k) form, length, container = ak.to_buffers(ak.packed(v)) @@ -502,7 +502,7 @@ def write_awkward(f, k, v, dataset_kwargs=MappingProxyType({})): @_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) @_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0")) def read_awkward(elem): - import awkward._v2 as ak + from anndata.compat import awkward as ak form = _read_attr(elem.attrs, "form") length = _read_attr(elem.attrs, "length") diff --git a/anndata/compat/__init__.py b/anndata/compat/__init__.py index 41d952fc6..38fd9f3aa 100644 --- a/anndata/compat/__init__.py +++ b/anndata/compat/__init__.py @@ -41,7 +41,21 @@ def __repr__(): try: - from awkward._v2 import Array as AwkArray + try: + from importlib.metadata import version + except ImportError: + from importlib_metadata import version + + # Ensure compatibility with both 1.9rc and 2.x releases of awkward + # TODO Once 2.x is released, we can require it as a minimal dependency and remove this code. 
+ major, _ = version("awkward").split(".", maxsplit=1) + if int(major) < 2: + import awkward._v2 as awkward + else: + import awkward + + AwkArray = awkward.Array + except ImportError: class AwkArray: diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index d8a6385f3..68f665a8a 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -10,7 +10,7 @@ from pandas.api.types import is_numeric_dtype import pytest from scipy import sparse -import awkward._v2 as ak +from anndata.compat import awkward as ak import random from anndata import AnnData, Raw diff --git a/anndata/utils.py b/anndata/utils.py index 9a4abe6bc..dd2a3f780 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -70,7 +70,7 @@ def dim_len(x, dim): try: - import awkward._v2 as ak + from .compat import awkward as ak @dim_len.register(ak.Array) def dim_len_awkward(x, dim): From c3ccf2f14caa54da8163ade381483caa99d1e8e0 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Wed, 31 Aug 2022 08:53:25 +0200 Subject: [PATCH 056/110] restructure tests --- anndata/tests/test_awkward.py | 39 +++++++++++++++++++++++++++++++++++ anndata/tests/test_base.py | 36 -------------------------------- 2 files changed, 39 insertions(+), 36 deletions(-) create mode 100644 anndata/tests/test_awkward.py diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py new file mode 100644 index 000000000..16c176093 --- /dev/null +++ b/anndata/tests/test_awkward.py @@ -0,0 +1,39 @@ +"""Tests related to awkward arrays""" +import pytest +from anndata.tests.helpers import gen_adata, gen_awkward + + +@pytest.mark.parametrize( + "field,value,valid", + [ + ["obsm", gen_awkward((10, 5)), True], + ["obsm", gen_awkward((10, None)), True], + ["obsm", gen_awkward((10, None, None)), True], + ["obsm", gen_awkward((10, 5, None)), True], + ["obsm", gen_awkward((8, 10)), False], + ["obsm", gen_awkward((8, None)), False], + ["varm", gen_awkward((20, 5)), True], + ["varm", gen_awkward((20, None)), True], + ["varm", 
gen_awkward((20, None, None)), True], + ["varm", gen_awkward((20, 5, None)), True], + ["varm", gen_awkward((8, 20)), False], + ["varm", gen_awkward((8, None)), False], + ["uns", gen_awkward((7,)), True], + ["uns", gen_awkward((7, None)), True], + ["uns", gen_awkward((7, None, None)), True], + ], +) +def test_set_awkward(field, value, valid): + """Check if we can set .X, .layers, .obsm, .varm and .uns with different types + of awkward arrays and if error messages are properly raised when the dimensions do not align. + """ + adata = gen_adata((10, 20), varm_types=(), obsm_types=(), layers_types=()) + + def _assign(): + getattr(adata, field)["test"] = value + + if not valid: + with pytest.raises(ValueError): + _assign() + else: + _assign() diff --git a/anndata/tests/test_base.py b/anndata/tests/test_base.py index fffa146e4..637702a4d 100644 --- a/anndata/tests/test_base.py +++ b/anndata/tests/test_base.py @@ -607,39 +607,3 @@ def assert_eq_not_id(a, b): assert_eq_not_id(map_sprs.keys(), map_copy.keys()) for key in map_sprs.keys(): assert_eq_not_id(map_sprs[key], map_copy[key]) - - -@pytest.mark.parametrize( - "field,value,valid", - [ - ["obsm", gen_awkward((10, 5)), True], - ["obsm", gen_awkward((10, None)), True], - ["obsm", gen_awkward((10, None, None)), True], - ["obsm", gen_awkward((10, 5, None)), True], - ["obsm", gen_awkward((8, 10)), False], - ["obsm", gen_awkward((8, None)), False], - ["varm", gen_awkward((20, 5)), True], - ["varm", gen_awkward((20, None)), True], - ["varm", gen_awkward((20, None, None)), True], - ["varm", gen_awkward((20, 5, None)), True], - ["varm", gen_awkward((8, 20)), False], - ["varm", gen_awkward((8, None)), False], - ["uns", gen_awkward((7,)), True], - ["uns", gen_awkward((7, None)), True], - ["uns", gen_awkward((7, None, None)), True], - ], -) -def test_set_awkward(field, value, valid): - """Check if we can set .X, .layers, .obsm, .varm and .uns with different types - of awkward arrays and if error messages are properly raised when 
the dimensions do not align. - """ - adata = gen_adata((10, 20), varm_types=(), obsm_types=(), layers_types=()) - - def _assign(): - getattr(adata, field)["test"] = value - - if not valid: - with pytest.raises(ValueError): - _assign() - else: - _assign() From a8e1648cb2c9dd000496fdf0da75fa039c5bad7d Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Wed, 31 Aug 2022 10:18:21 +0200 Subject: [PATCH 057/110] Add tests for copies and view --- anndata/_core/aligned_mapping.py | 5 ++-- anndata/_core/merge.py | 9 +++++++ anndata/_core/views.py | 7 ++--- anndata/tests/test_awkward.py | 45 ++++++++++++++++++++++++++++++++ 4 files changed, 61 insertions(+), 5 deletions(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index cbe072301..db3811877 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -94,8 +94,9 @@ def copy(self): d = self._actual_class(self.parent, self._axis) for k, v in self.items(): if isinstance(v, AwkArray): - # awkward arrays are immutable - d[k] = v + from ..compat import awkward as ak + + d[k] = ak.copy(v) else: d[k] = v.copy() return d diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 055b89e40..a41bcc715 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -520,6 +520,15 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): elif any(isinstance(a, AwkArray) for a in arrays): from ..compat import awkward as ak + if not all( + # TODO need to test MissingVal and shape 0 case. + isinstance(a, AwkArray) or a is MissingVal or 0 in a.shape + for a in arrays + ): + raise NotImplementedError( + "Cannot concatenate an AwkwardArray with other array types." 
+ ) + return ak.concatenate( [f(a, axis=1 - axis) for f, a in zip(reindexers, arrays)], axis=axis ) diff --git a/anndata/_core/views.py b/anndata/_core/views.py index f43b1d508..fb202f35e 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -121,9 +121,10 @@ def drop(self, *args, inplace: bool = False, **kw): class AwkwardArrayView(_ViewMixin, AwkArray): - def copy(self, order: str = "C") -> np.ndarray: - # awkward arrays are immutable, we don't need to make an explicit copy. - return self + def copy(self, order: str = "C") -> AwkArray: + from ..compat import awkward as ak + + return ak.copy(self) @singledispatch diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index 16c176093..4a00fd740 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -1,6 +1,11 @@ """Tests related to awkward arrays""" import pytest +import numpy as np +import numpy.testing as npt + from anndata.tests.helpers import gen_adata, gen_awkward +from anndata.compat import awkward as ak +from anndata import ImplicitModificationWarning @pytest.mark.parametrize( @@ -37,3 +42,43 @@ def _assign(): _assign() else: _assign() + + +@pytest.mark.parametrize("key", ["obsm", "varm", "uns"]) +def test_copy(key): + """Check that modifying a copy does not modify the original""" + adata = gen_adata((3, 3), varm_types=(), obsm_types=(), layers_types=()) + getattr(adata, key)["awk"] = ak.Array([{"a": [1], "b": [2], "c": [3]}] * 3) + adata_copy = adata.copy() + getattr(adata_copy, key)["awk"]["c"] = np.full((3, 1), 4) + getattr(adata_copy, key)["awk"]["d"] = np.full((3, 1), 5) + + # values in copy were correctly set + npt.assert_equal(getattr(adata_copy, key)["awk"]["c"], np.full((3, 1), 4)) + npt.assert_equal(getattr(adata_copy, key)["awk"]["d"], np.full((3, 1), 5)) + + # values in original were not updated + npt.assert_equal(getattr(adata, key)["awk"]["c"], np.full((3, 1), 3)) + with pytest.raises(IndexError): + getattr(adata, key)["awk"]["d"] + + 
+@pytest.mark.parametrize("key", ["obsm", "varm"]) +def test_view(key): + """Check that modifying a view does not modify the original""" + adata = gen_adata((3, 3), varm_types=(), obsm_types=(), layers_types=()) + getattr(adata, key)["awk"] = ak.Array([{"a": [1], "b": [2], "c": [3]}] * 3) + adata_view = adata[:2, :2] + + with pytest.warns(ImplicitModificationWarning, match="initializing view as actual"): + getattr(adata_view, key)["awk"]["c"] = np.full((2, 1), 4) + getattr(adata_view, key)["awk"]["d"] = np.full((2, 1), 5) + + # values in view were correctly set + npt.assert_equal(getattr(adata_view, key)["awk"]["c"], np.full((2, 1), 4)) + npt.assert_equal(getattr(adata_view, key)["awk"]["d"], np.full((2, 1), 5)) + + # values in original were not updated + npt.assert_equal(getattr(adata, key)["awk"]["c"], np.full((3, 1), 3)) + with pytest.raises(IndexError): + getattr(adata, key)["awk"]["d"] From c9a6417c9071a064cb4a07c081f07a26026e38f7 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Wed, 31 Aug 2022 10:47:59 +0200 Subject: [PATCH 058/110] Remove unused import --- anndata/tests/test_base.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anndata/tests/test_base.py b/anndata/tests/test_base.py index 637702a4d..027e46b46 100644 --- a/anndata/tests/test_base.py +++ b/anndata/tests/test_base.py @@ -9,7 +9,7 @@ from scipy.sparse import csr_matrix, issparse from anndata import AnnData -from anndata.tests.helpers import assert_equal, gen_adata, gen_awkward +from anndata.tests.helpers import assert_equal, gen_adata # some test objects that we use below From d8369994e4bdacf13e4f4e2f86aa34ee493b92d9 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Fri, 2 Sep 2022 08:55:26 +0200 Subject: [PATCH 059/110] Fix how actual shape is computed in aligned mapping --- anndata/_core/aligned_mapping.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index db3811877..dd885686e 100644
--- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -50,7 +50,7 @@ def _validate_value(self, val: V, key: str) -> V: for i, axis in enumerate(self.axes): if self.parent.shape[axis] != dim_len(val, i): right_shape = tuple(self.parent.shape[a] for a in self.axes) - actual_shape = tuple(dim_len(val, a) for a in self.axes) + actual_shape = tuple(dim_len(val, a) for a, _ in enumerate(self.axes)) if actual_shape[i] is None and isinstance(val, AwkArray): raise ValueError( f"The AwkwardArray is of variable length in dimension {i}.", From ed95d8f8ddd3508c7f7a36d7cc24641ce1582760 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Fri, 2 Sep 2022 09:29:47 +0200 Subject: [PATCH 060/110] Attempt to support views with ak.behavior --- anndata/_core/views.py | 25 +++++++++++++++---------- 1 file changed, 15 insertions(+), 10 deletions(-) diff --git a/anndata/_core/views.py b/anndata/_core/views.py index fb202f35e..1707b42ac 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -120,13 +120,6 @@ def drop(self, *args, inplace: bool = False, **kw): df.drop(*args, inplace=True, **kw) -class AwkwardArrayView(_ViewMixin, AwkArray): - def copy(self, order: str = "C") -> AwkArray: - from ..compat import awkward as ak - - return ak.copy(self) - - @singledispatch def as_view(obj, view_args): raise NotImplementedError(f"No view type has been registered for {type(obj)}") @@ -164,9 +157,21 @@ def as_view_zappy(z, view_args): return z -@as_view.register(AwkArray) -def as_view_awkarray(array, view_args): - return AwkwardArrayView(array, view_args=view_args) +try: + from ..compat import awkward as ak + + @ak.behaviors.mixins.mixin_class(ak.behavior) + class AwkwardArrayView(_ViewMixin, AwkArray): + def copy(self, order: str = "C") -> AwkArray: + + return ak.copy(self) + + @as_view.register(AwkArray) + def as_view_awkarray(array, view_args): + return ak.with_name(array, name="AwkwardArrayView") + +except ImportError: + pass def _resolve_idxs(old, new, 
adata): From f7edc67ed238990256617f06122a4e0270730a0c Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Fri, 2 Sep 2022 13:10:03 +0200 Subject: [PATCH 061/110] Use shallow copy --- anndata/_core/aligned_mapping.py | 5 ++--- anndata/_core/views.py | 5 ++--- 2 files changed, 4 insertions(+), 6 deletions(-) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index dd885686e..dcab04aff 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -1,5 +1,6 @@ from abc import ABC, abstractmethod from collections import abc as cabc +from copy import copy from typing import Union, Optional, Type, ClassVar, TypeVar # Special types from typing import Iterator, Mapping, Sequence # ABCs from typing import Tuple, List, Dict # Generic base types @@ -94,9 +95,7 @@ def copy(self): d = self._actual_class(self.parent, self._axis) for k, v in self.items(): if isinstance(v, AwkArray): - from ..compat import awkward as ak - - d[k] = ak.copy(v) + d[k] = copy(v) else: d[k] = v.copy() return d diff --git a/anndata/_core/views.py b/anndata/_core/views.py index 1707b42ac..cf96c769a 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -1,5 +1,5 @@ from contextlib import contextmanager -from copy import deepcopy +from copy import copy, deepcopy from functools import reduce, singledispatch, wraps from typing import Any, KeysView, Optional, Sequence, Tuple import warnings @@ -163,8 +163,7 @@ def as_view_zappy(z, view_args): @ak.behaviors.mixins.mixin_class(ak.behavior) class AwkwardArrayView(_ViewMixin, AwkArray): def copy(self, order: str = "C") -> AwkArray: - - return ak.copy(self) + return copy(self) @as_view.register(AwkArray) def as_view_awkarray(array, view_args): From 5a1d056140d297c3d1a79e853b489d4d8e9955c8 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sat, 3 Sep 2022 19:34:14 +0200 Subject: [PATCH 062/110] Add dim_len_awkward function including tests --- anndata/tests/test_awkward.py | 53 +++++++++++++++++++ 
anndata/utils.py | 98 ++++++++++++++++++++++++++--------- 2 files changed, 126 insertions(+), 25 deletions(-) diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index 4a00fd740..22661a6e2 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -6,6 +6,59 @@ from anndata.tests.helpers import gen_adata, gen_awkward from anndata.compat import awkward as ak from anndata import ImplicitModificationWarning +from anndata.utils import dim_len + + +@pytest.mark.parametrize( + "array,shape", + [ + # numpy array + [ak.Array(np.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5))), (2, 3, 4, 5)], + # record + [ak.Array([{"a": 1, "b": 2}, {"a": 1, "b": 3}]), (2,)], + # ListType, variable length + [ak.Array([[1], [2, 3], [4, 5, 6]]), (3, None)], + # ListType, happens to have the same length, but is not regular + [ak.Array([[2], [3], [4]]), (3, None)], + # RegularType + nested ListType + [ak.to_regular(ak.Array([[[1, 2], [3]], [[2], [3, 4, 5]]]), 1), (2, 2, None)], + # nested record + [ + ak.to_regular(ak.Array([[{"a": 0}, {"b": 1}], [{"c": 2}, {"d": 3}]]), 1), + (2, 2), + ], + # mixed types (variable length) + [ak.Array([[1, 2], ["a"]]), (2, None)], + # mixed types (but regular) + [ak.to_regular(ak.Array([[1, 2], ["a", "b"]]), 1), (2, 2)], + # zero-size edge cases + [ak.Array(np.ones((0, 7))), (0, 7)], + [ak.Array(np.ones((7, 0))), (7, 0)], + # UnionType of two regular types with different dimensions + [ + ak.concatenate([ak.Array(np.ones((2, 2))), ak.Array(np.ones((2, 3)))]), + (4, None), + ], + # UnionType of two regular types with same dimension + [ + ak.concatenate( + [ + ak.Array(np.ones((2, 2))), + ak.Array(np.array([["a", "a"], ["a", "a"]])), + ] + ), + (4, 2), + ], + ], +) +def test_dim_len(array, shape): + """Test that dim_len returns the right value for awkward arrays.""" + for axis, size in enumerate(shape): + assert size == dim_len(array, axis) + + # Requesting the size for an axis higher than the array has dimensions should 
raise a TypeError + with pytest.raises(TypeError): + dim_len(array, len(shape)) @pytest.mark.parametrize( diff --git a/anndata/utils.py b/anndata/utils.py index dd2a3f780..3b5e10d4f 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -60,44 +60,92 @@ def convert_to_dict_nonetype(obj: None): @singledispatch -def dim_len(x, dim): +def dim_len(x, axis): """\ - Return the size of an array in dimension `dim`. + Return the size of an array in dimension `axis`. Returns None if `x` is an awkward array with variable length in the requested dimension. """ - return x.shape[dim] + return x.shape[axis] try: from .compat import awkward as ak @dim_len.register(ak.Array) - def dim_len_awkward(x, dim): - if dim == 0: - # dimension 0 is a special case - it is always of `ArrayType` and has a fixed length. - try: - return x.type.length - except AttributeError: - raise ValueError("The outermost type must be an `awkward.Array`!") + def dim_len_awkward(array, axis): + """Get the length of an awkward array in a given dimension + + Returns None if the dimension is of variable length. + + Code adapted from @jpivarski's solution in https://github.com/scikit-hep/awkward/discussions/1654#discussioncomment-3521574 + """ + if axis < 0: # negative axis is another can of worms... maybe later + raise NotImplementedError("Does not support negative axis") + elif axis == 0: + return len(array) else: - arr_type = x.type - for _ in range(dim): - # we need to loop through the nested types for the other dimensions, e.g. - # ArrayType(RegularType(ListType(NumpyType('int64')), 200), 100) - try: - arr_type = arr_type.content - except AttributeError: - # RecordType and UnionType have multiple "contents" entries - raise NotImplementedError( - "This check is currently not implemented for RecordType and UnionType arrays. 
" + + def size_at_depth(layout, depth, lateral_context, **kwargs): + if layout.is_NumpyType: + # if it's an embedded rectilinear array, we have to deal with its shape + # which might not be 1-dimensional + if layout.is_UnknownType: + shape = (0,) + else: + shape = layout.shape + numpy_axis = lateral_context["axis"] - depth + 1 + if not (1 <= numpy_axis < len(shape)): + raise TypeError(f"axis={lateral_context['axis']} is too deep") + lateral_context["out"] = shape[numpy_axis] + return layout.nplike.empty(1) + + elif layout.is_ListType and depth == lateral_context["axis"]: + if layout.is_RegularType: + # if it's a regular list, you want the size + lateral_context["out"] = layout.size + else: + # if it's an irregular list, you want a null token + lateral_context["out"] = -1 + return layout.nplike.empty(1) + + elif layout.is_RecordType: + # if it's a record, you want to stop descent with an error + raise TypeError( + f"axis={lateral_context['axis']} is too deep, reaches record" ) - try: - return arr_type.size - except AttributeError: - # the arrays is of variable length in the requested dimension - return None + elif layout.is_UnionType: + # if it's a union, you could get the result of each union branch + # separately and see if they're all the same; if not, it's an error + result = None + for content in layout.contents: + context = {"axis": lateral_context["axis"]} + ak.transform( + size_at_depth, + content, + lateral_context=context, + return_array=False, + ) + if result is None: + result = context["out"] + elif result != context["out"]: + # Union branches have different lengths -> return null token + lateral_context["out"] = -1 + return layout.nplike.empty(1) + lateral_context["out"] = result + return layout.nplike.empty(1) + + # communicate with the recursive function using a context (lateral) + context = {"axis": axis} + + # "transform" but we don't care what kind of array it returns + ak.transform( + size_at_depth, array, lateral_context=context, 
return_array=False + ) + + # Use `None` as null token. + return None if context["out"] == -1 else context["out"] @asarray.register(ak.Array) def asarray_awkward(x): From 83effadf3b5a015d8125e4a7496e76bd3be66a59 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sat, 3 Sep 2022 19:55:45 +0200 Subject: [PATCH 063/110] Test that assigning an awkward v1 arrays fails --- anndata/compat/__init__.py | 5 +++-- anndata/tests/test_awkward.py | 18 +++++++++++++++++- 2 files changed, 20 insertions(+), 3 deletions(-) diff --git a/anndata/compat/__init__.py b/anndata/compat/__init__.py index 38fd9f3aa..3387732ff 100644 --- a/anndata/compat/__init__.py +++ b/anndata/compat/__init__.py @@ -48,8 +48,9 @@ def __repr__(): # Ensure compatibility with both 1.9rc and 2.x releases of awkward # TODO Once 2.x is released, we can require it as a minimal dependency and remove this code. - major, _ = version("awkward").split(".", maxsplit=1) - if int(major) < 2: + awkward_version, _ = version("awkward").split(".", maxsplit=1) + awkward_version = int(awkward_version) + if awkward_version < 2: import awkward._v2 as awkward else: import awkward diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index 22661a6e2..3b276a0da 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -4,9 +4,10 @@ import numpy.testing as npt from anndata.tests.helpers import gen_adata, gen_awkward -from anndata.compat import awkward as ak +from anndata.compat import awkward as ak, awkward_version from anndata import ImplicitModificationWarning from anndata.utils import dim_len +from anndata import AnnData @pytest.mark.parametrize( @@ -97,6 +98,21 @@ def _assign(): _assign() +@pytest.mark.parametrize("key", ["obsm", "varm", "uns"]) +@pytest.mark.skipif( + awkward_version >= 2, reason="This test is only applies for awkward versions 1.x" +) +def test_no_awkward_v1(key): + """Assigning an awkward v1 array to anndata should fail""" + import awkward as akv1 + + v1arr = 
akv1.Array([1, 2, 3, 4]) + + adata = AnnData(np.ones((4, 4))) + with pytest.raises(AttributeError): + getattr(adata, key)["test"] = v1arr + + @pytest.mark.parametrize("key", ["obsm", "varm", "uns"]) def test_copy(key): """Check that modifying a copy does not modify the original""" From 21a4b5fd6dbfd4fe7a61dcce660e740d61af3c6e Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sat, 3 Sep 2022 20:32:01 +0200 Subject: [PATCH 064/110] Add stub for element-wise IO tests --- anndata/tests/helpers.py | 3 ++- anndata/tests/test_awkward.py | 23 +++++++++++++++++++++-- 2 files changed, 23 insertions(+), 3 deletions(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 68f665a8a..3a8640d43 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -420,8 +420,9 @@ def are_equal_dataframe(a, b, exact=False, elem_name=None): @assert_equal.register(AwkArray) def assert_equal_awkarray(a, b, exact=False, elem_name=None): - import awkward._v2 as ak + from anndata.compat import awkward as ak + assert a.type == b.type, "type mismatch" assert ak.all(a == b) diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index 3b276a0da..df8ad90ab 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -3,11 +3,11 @@ import numpy as np import numpy.testing as npt -from anndata.tests.helpers import gen_adata, gen_awkward +from anndata.tests.helpers import assert_equal, gen_adata, gen_awkward from anndata.compat import awkward as ak, awkward_version from anndata import ImplicitModificationWarning from anndata.utils import dim_len -from anndata import AnnData +from anndata import AnnData, read_h5ad @pytest.mark.parametrize( @@ -151,3 +151,22 @@ def test_view(key): npt.assert_equal(getattr(adata, key)["awk"]["c"], np.full((3, 1), 3)) with pytest.raises(IndexError): getattr(adata, key)["awk"]["d"] + + +@pytest.mark.parametrize( + "array", + [ + # numpy array + ak.Array(np.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5))), + ], 
+) +def test_awkward_io(tmp_path, array): + assert False, "add more test cases!" + adata = AnnData() + adata.uns["awk"] = array + adata_path = tmp_path / "adata.h5ad" + adata.write_h5ad(adata_path) + + adata2 = read_h5ad(adata_path) + + assert_equal(adata.uns["awk"], adata2.uns["awk"]) From 4ff585198693dda4a208d9da07f6041df9c290b3 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sun, 4 Sep 2022 17:27:01 +0200 Subject: [PATCH 065/110] Restructure dim_len_awkward --- anndata/utils.py | 103 ++++++++++++++++++++++++----------------------- 1 file changed, 52 insertions(+), 51 deletions(-) diff --git a/anndata/utils.py b/anndata/utils.py index 3b5e10d4f..4aead0954 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -72,6 +72,54 @@ def dim_len(x, axis): try: from .compat import awkward as ak + def _size_at_depth(layout, depth, lateral_context, **kwargs): + """Callback function for dim_len_awkward, resolving the dim_len for a given level""" + if layout.is_NumpyType: + # if it's an embedded rectilinear array, we have to deal with its shape + # which might not be 1-dimensional + if layout.is_UnknownType: + shape = (0,) + else: + shape = layout.shape + numpy_axis = lateral_context["axis"] - depth + 1 + if not (1 <= numpy_axis < len(shape)): + raise TypeError(f"axis={lateral_context['axis']} is too deep") + lateral_context["out"] = shape[numpy_axis] + return layout.nplike.empty(1) + + elif layout.is_ListType and depth == lateral_context["axis"]: + if layout.is_RegularType: + # if it's a regular list, you want the size + lateral_context["out"] = layout.size + else: + # if it's an irregular list, you want a null token + lateral_context["out"] = -1 + return layout.nplike.empty(1) + + elif layout.is_RecordType: + # if it's a record, you want to stop descent with an error + raise TypeError( + f"axis={lateral_context['axis']} is too deep, reaches record" + ) + + elif layout.is_UnionType: + # if it's a union, you could get the result of each union branch + # separately and see if
they're all the same; if not, it's an error + result = None + for content in layout.contents: + context = {"axis": lateral_context["axis"]} + ak.transform( + _size_at_depth, content, lateral_context=context, return_array=False + ) + if result is None: + result = context["out"] + elif result != context["out"]: + # Union branches have different lengths -> return null token + lateral_context["out"] = -1 + return layout.nplike.empty(1) + lateral_context["out"] = result + return layout.nplike.empty(1) + @dim_len.register(ak.Array) def dim_len_awkward(array, axis): """Get the length of an awkward array in a given dimension @@ -86,62 +134,15 @@ def dim_len_awkward(array, axis): return len(array) else: - def size_at_depth(layout, depth, lateral_context, **kwargs): - if layout.is_NumpyType: - # if it's an embedded rectilinear array, we have to deal with its shape - # which might not be 1-dimensional - if layout.is_UnknownType: - shape = (0,) - else: - shape = layout.shape - numpy_axis = lateral_context["axis"] - depth + 1 - if not (1 <= numpy_axis < len(shape)): - raise TypeError(f"axis={lateral_context['axis']} is too deep") - lateral_context["out"] = shape[numpy_axis] - return layout.nplike.empty(1) - - elif layout.is_ListType and depth == lateral_context["axis"]: - if layout.is_RegularType: - # if it's a regular list, you want the size - lateral_context["out"] = layout.size - else: - # if it's an irregular list, you want a null token - lateral_context["out"] = -1 - return layout.nplike.empty(1) - - elif layout.is_RecordType: - # if it's a record, you want to stop descent with an error - raise TypeError( - f"axis={lateral_context['axis']} is too deep, reaches record" - ) - - elif layout.is_UnionType: - # if it's a union, you could get the result of each union branch - # separately and see if they're all the same; if not, it's an error - result = None - for content in layout.contents: - context = {"axis": lateral_context["axis"]} - ak.transform( - size_at_depth, - content, 
- lateral_context=context, - return_array=False, - ) - if result is None: - result = context["out"] - elif result != context["out"]: - # Union branches have different lengths -> return null token - lateral_context["out"] = -1 - return layout.nplike.empty(1) - lateral_context["out"] = result - return layout.nplike.empty(1) - # communicate with the recursive function using a context (lateral) context = {"axis": axis} # "transform" but we don't care what kind of array it returns ak.transform( - size_at_depth, array, lateral_context=context, return_array=False + _size_at_depth, + array, + lateral_context=context, + return_array=False, ) # Use `None` as null token. From 2c59b19e638bf6e0af0f556296b9a43258cc1012 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sun, 4 Sep 2022 17:27:17 +0200 Subject: [PATCH 066/110] Add more test cases for awkward IO --- anndata/tests/helpers.py | 2 +- anndata/tests/test_awkward.py | 23 ++++++++++++++++++++++- 2 files changed, 23 insertions(+), 2 deletions(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 3a8640d43..997681a1c 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -423,7 +423,7 @@ def assert_equal_awkarray(a, b, exact=False, elem_name=None): from anndata.compat import awkward as ak assert a.type == b.type, "type mismatch" - assert ak.all(a == b) + assert ak.to_json(a) == ak.to_json(b) @assert_equal.register(Mapping) diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index df8ad90ab..31b157fb3 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -158,10 +158,31 @@ def test_view(key): [ # numpy array ak.Array(np.arange(2 * 3 * 4 * 5).reshape((2, 3, 4, 5))), + # record + ak.Array([{"a": 1, "b": 2}, {"a": 1, "b": 3}]), + # ListType, variable length + ak.Array([[1], [2, 3], [4, 5, 6]]), + # RegularType + nested ListType + ak.to_regular(ak.Array([[[1, 2], [3]], [[2], [3, 4, 5]]]), 1), + # nested record + ak.to_regular(ak.Array([[{"a": 
0}, {"b": 1}], [{"c": 2}, {"d": 3}]]), 1), + # mixed types (variable length) + ak.Array([[1, 2], ["a"]]), + # zero-size edge cases + ak.Array(np.ones((0, 7))), + ak.Array(np.ones((7, 0))), + # UnionType of two regular types with different dimensions + ak.concatenate([ak.Array(np.ones((2, 2))), ak.Array(np.ones((2, 3)))]), + # UnionType of two regular types with same dimension + ak.concatenate( + [ + ak.Array(np.ones((2, 2))), + ak.Array(np.array([["a", "a"], ["a", "a"]])), + ] + ), ], ) def test_awkward_io(tmp_path, array): - assert False, "add more test cases!" adata = AnnData() adata.uns["awk"] = array adata_path = tmp_path / "adata.h5ad" From 988579ea19f144b8083fdd0d4304e2cce3b88371 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sun, 4 Sep 2022 19:20:42 +0200 Subject: [PATCH 067/110] WIP add tests for concatenating AwkArrays with missing values --- anndata/_core/merge.py | 16 ++++--- anndata/tests/test_awkward.py | 82 ++++++++++++++++++++++++++++++++++- 2 files changed, 92 insertions(+), 6 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index a41bcc715..0358e020a 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -521,9 +521,7 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): from ..compat import awkward as ak if not all( - # TODO need to test MissingVal and shape 0 case. - isinstance(a, AwkArray) or a is MissingVal or 0 in a.shape - for a in arrays + isinstance(a, AwkArray) or a is MissingVal or 0 in a.shape for a in arrays ): raise NotImplementedError( "Cannot concatenate an AwkwardArray with other array types." 
@@ -577,7 +575,11 @@ def gen_inner_reindexers(els, new_index, axis: Literal[0, 1] = 0): lambda x, y: x.intersection(y), (df_indices(el) for el in els) ) reindexers = [Reindexer(df_indices(el), common_ind) for el in els] - elif all(isinstance(el, AwkArray) for el in els if not_missing(el)): + elif any(isinstance(el, AwkArray) for el in els if not_missing(el)): + if not all(isinstance(el, AwkArray) for el in els if not_missing(el)): + raise NotImplementedError( + "Cannot concatenate an AwkwardArray with other array types." + ) # do not reindex awkward arrays # TODO unintended behaviour? reindexers = [lambda *args, **kwargs: args[0] for _ in els] @@ -598,7 +600,11 @@ def gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0): else (lambda x: pd.DataFrame(index=range(shape))) for el, shape in zip(els, shapes) ] - elif all(isinstance(el, AwkArray) for el in els if not_missing(el)): + elif any(isinstance(el, AwkArray) for el in els if not_missing(el)): + if not all(isinstance(el, AwkArray) for el in els if not_missing(el)): + raise NotImplementedError( + "Cannot concatenate an AwkwardArray with other array types." + ) # do not reindex awkward arrays # TODO unintended behaviour? reindexers = [lambda *args, **kwargs: args[0] for _ in els] diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index 31b157fb3..969d003c3 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -8,6 +8,8 @@ from anndata import ImplicitModificationWarning from anndata.utils import dim_len from anndata import AnnData, read_h5ad +import anndata +import pandas as pd @pytest.mark.parametrize( @@ -83,7 +85,7 @@ def test_dim_len(array, shape): ], ) def test_set_awkward(field, value, valid): - """Check if we can set .X, .layers, .obsm, .varm and .uns with different types + """Check if we can set obsm, .varm and .uns with different types of awkward arrays and if error messages are properly raised when the dimensions do not align. 
""" adata = gen_adata((10, 20), varm_types=(), obsm_types=(), layers_types=()) @@ -191,3 +193,81 @@ def test_awkward_io(tmp_path, array): adata2 = read_h5ad(adata_path) assert_equal(adata.uns["awk"], adata2.uns["awk"]) + + +@pytest.mark.parametrize( + "arrays,expected", + [ + [ + [ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), None], + ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}, {}, {}, {}]), + ], + [ + [None, ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}])], + ak.Array([{}, {}, {}, {"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + ], + [ + [ + None, + ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + pd.DataFrame(), + ], + ak.Array([{}, {}, {}, {"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + ], + [ + [ + ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + pd.DataFrame(), + ], + ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + ], + [ + [ + ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + pd.DataFrame().assign(a=[3, 4], b=[5, 6]), + ], + NotImplementedError, + ], + [ + [ + ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + np.ones((3, 2)), + ], + NotImplementedError, + ], + ], +) +@pytest.mark.parametrize("key", ["obsm", "varm"]) +@pytest.mark.parametrize("join", ["outer", "inner"]) +def test_concat_mixed_types(key, arrays, expected, join): + """Test that concatenation of AwkwardArrays with arbitrary types, but zero length dimension + or missing values works.""" + axis = 0 if key == "obsm" else 1 + + to_concat = [] + for a in arrays: + shape = np.array([3, 3]) # default shape (in case of missing array) + if a is not None: + length = dim_len(a, 0) + shape[axis] = length + + tmp_adata = gen_adata( + tuple(shape), varm_types=(), obsm_types=(), layers_types=() + ) + + if a is not None: + if isinstance(a, pd.DataFrame): + a.set_index( + tmp_adata.obs_names if key == "obsm" else tmp_adata.var_names, + inplace=True, + ) + getattr(tmp_adata, 
key)["test"] = a + + to_concat.append(tmp_adata) + + if isinstance(expected, type) and issubclass(expected, Exception): + with pytest.raises(expected): + anndata.concat(to_concat, axis=axis, join=join) + else: + result = anndata.concat(to_concat, axis=axis, join=join) + assert_equal(getattr(result, key)["test"], expected) From 504cae172e19c322abcc88673e3e0a5e8b3a612c Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sun, 4 Sep 2022 20:52:39 +0200 Subject: [PATCH 068/110] Fix AwkwardArrayView --- anndata/_core/views.py | 64 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 61 insertions(+), 3 deletions(-) diff --git a/anndata/_core/views.py b/anndata/_core/views.py index cf96c769a..44af97236 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -1,5 +1,6 @@ from contextlib import contextmanager from copy import copy, deepcopy +from enum import Enum from functools import reduce, singledispatch, wraps from typing import Any, KeysView, Optional, Sequence, Tuple import warnings @@ -159,16 +160,73 @@ def as_view_zappy(z, view_args): try: from ..compat import awkward as ak + import weakref + + # Registry to store weak references from AwkwardArrayViews to their parent AnnData container + _registry = weakref.WeakValueDictionary() + + class _PARAM_NAMES(Enum): + """Keynames used to store attributes of ElementRef as params in an awkward array""" + + parent = "_view_args_parent" + attrname = "_view_args_attrname" + keys = "_view_args_keys" - @ak.behaviors.mixins.mixin_class(ak.behavior) class AwkwardArrayView(_ViewMixin, AwkArray): - def copy(self, order: str = "C") -> AwkArray: - return copy(self) + @property + def _view_args(self): + """Override _view_args to retrieve the values from awkward arrays parameters. + + Awkward arrays cannot be subclassed like other python objects. Instead subclasses need + to be attached as "behavior". These "behaviors" cannot take any additional parameters (as we do + for other data types to store `_view_args`). 
Therefore, we need to store `_view_args` using awkward's + parameter mechanism. These parameters need to be json-serializable, which is why we can't store + ElementRef directly. The reference to the parent AnnDataView object is stored as a key of a + WeakValueDictionary which holds weak references to all AnnDataViews. + """ + parent_key = self.layout.parameter(_PARAM_NAMES.parent) + attrname = self.layout.parameter(_PARAM_NAMES.attrname) + keys = self.layout.parameter(_PARAM_NAMES.keys) + if parent_key is None or attrname is None or keys is None: + raise KeyError( + "AwkwardArrayView does not hold reference to original AnnData object." + ) + else: + try: + parent = _registry[parent_key] + except KeyError: + raise KeyError( + "AwkwardArrayView has invalid reference to original AnnData object." + ) + else: + return ElementRef(parent, attrname, keys) + + def __copy__(self) -> AwkArray: + """ + Turn the AwkwardArrayView into an actual AwkwardArray with no special behavior. + + Need to override __copy__ instead of `.copy()` as awkward arrays don't implement `.copy()` + and are copied using python's standard copy mechanism in `aligned_mapping.py`. 
+ """ + array = self + # makes a shallow copy and removes the reference to the original AnnData object + for param in _PARAM_NAMES: + array = ak.with_parameter(self, param, None) + array = ak.with_name(array, "") + return array @as_view.register(AwkArray) def as_view_awkarray(array, view_args): + parent, attrname, keys = view_args + parent_key = f"target-{id(parent)}" + _registry[parent_key] = parent + array = ak.with_parameter(array, _PARAM_NAMES.parent, parent_key) + array = ak.with_parameter(array, _PARAM_NAMES.attrname, attrname) + array = ak.with_parameter(array, _PARAM_NAMES.keys, keys) return ak.with_name(array, name="AwkwardArrayView") + ak.behavior["*", "AwkwardArrayView"] = AwkwardArrayView + except ImportError: pass From 4dc08269cf1b049b324e6ee9cb7af0288d33b90f Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Sun, 4 Sep 2022 20:58:16 +0200 Subject: [PATCH 069/110] Simplify awkward array view code --- anndata/_core/views.py | 24 +++++++----------------- 1 file changed, 7 insertions(+), 17 deletions(-) diff --git a/anndata/_core/views.py b/anndata/_core/views.py index 44af97236..93d06d58a 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -164,13 +164,7 @@ def as_view_zappy(z, view_args): # Registry to store weak references from AwkwardArrayViews to their parent AnnData container _registry = weakref.WeakValueDictionary() - - class _PARAM_NAMES(Enum): - """Keynames used to store attributes of ElementRef as params in an awkward array""" - - parent = "_view_args_parent" - attrname = "_view_args_attrname" - keys = "_view_args_keys" + _PARAM_NAME = "_view_args" class AwkwardArrayView(_ViewMixin, AwkArray): @property @@ -181,12 +175,10 @@ def _view_args(self): to be attached as "behavior". These "behaviors" cannot take any additional parameters (as we do for other data types to store `_view_args`). Therefore, we need to store `_view_args` using awkward's parameter mechanism. 
These parameters need to be json-serializable, which is why we can't store - ElementRef directly. The reference to the parent AnnDataView object is stored as a key of a - WeakValueDictionary which holds weak references to all AnnDataViews. + ElementRef directly, but need to replace the reference to the parent AnnDataView container with a weak + reference. """ - parent_key = self.layout.parameter(_PARAM_NAMES.parent) - attrname = self.layout.parameter(_PARAM_NAMES.attrname) - keys = self.layout.parameter(_PARAM_NAMES.keys) + parent_key, attrname, keys = self.layout.parameter(_PARAM_NAME) if parent_key is None or attrname is None or keys is None: raise KeyError( "AwkwardArrayView does not hold reference to original AnnData object." @@ -210,8 +202,8 @@ def __copy__(self) -> AwkArray: """ array = self # makes a shallow copy and removes the reference to the original AnnData object - for param in _PARAM_NAMES: - array = ak.with_parameter(self, param, None) + array = ak.with_parameter(self, _PARAM_NAME, None) + # TODO what's the proper way of getting rid of a name? 
array = ak.with_name(array, "") return array @@ -220,9 +212,7 @@ def as_view_awkarray(array, view_args): parent, attrname, keys = view_args parent_key = f"target-{id(parent)}" _registry[parent_key] = parent - array = ak.with_parameter(array, _PARAM_NAMES.parent, parent_key) - array = ak.with_parameter(array, _PARAM_NAMES.attrname, attrname) - array = ak.with_parameter(array, _PARAM_NAMES.keys, keys) + array = ak.with_parameter(array, _PARAM_NAME, (parent_key, attrname, keys)) return ak.with_name(array, name="AwkwardArrayView") ak.behavior["*", "AwkwardArrayView"] = AwkwardArrayView From 3ab564668c9b23fef4f9e00e1f5c0aef6fafce02 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 5 Sep 2022 10:18:18 +0200 Subject: [PATCH 070/110] Use None to remove name from awkward array --- anndata/_core/views.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/anndata/_core/views.py b/anndata/_core/views.py index 93d06d58a..6aa3f775b 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -203,8 +203,7 @@ def __copy__(self) -> AwkArray: array = self # makes a shallow copy and removes the reference to the original AnnData object array = ak.with_parameter(self, _PARAM_NAME, None) - # TODO what's the proper way of getting rid of a name? 
- array = ak.with_name(array, "") + array = ak.with_name(array, None) return array @as_view.register(AwkArray) From 371f66eb5fc95e8b231cbf121854a1577ca15421 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 5 Sep 2022 19:09:29 +0200 Subject: [PATCH 071/110] Mark test_no_awkward_v1 as xfail for uns --- anndata/tests/test_awkward.py | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index 969d003c3..7d7f190f0 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -100,7 +100,16 @@ def _assign(): _assign() -@pytest.mark.parametrize("key", ["obsm", "varm", "uns"]) +@pytest.mark.parametrize( + "key", + [ + "obsm", + "varm", + pytest.param( + "uns", marks=pytest.mark.xfail(reason="No checks for `uns` are implemented") + ), + ], +) @pytest.mark.skipif( awkward_version >= 2, reason="This test is only applies for awkward versions 1.x" ) From d2eaf664799be582cd85997485e3d1cc2c71ca65 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 6 Sep 2022 09:00:36 +0200 Subject: [PATCH 072/110] Add test for categorical arrays --- anndata/tests/helpers.py | 2 +- anndata/tests/test_awkward.py | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 997681a1c..45e7ab524 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -423,7 +423,7 @@ def assert_equal_awkarray(a, b, exact=False, elem_name=None): from anndata.compat import awkward as ak assert a.type == b.type, "type mismatch" - assert ak.to_json(a) == ak.to_json(b) + assert ak.to_list(a) == ak.to_list(b) @assert_equal.register(Mapping) diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index 7d7f190f0..d27bdcbbd 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -191,6 +191,8 @@ def test_view(key): ak.Array(np.array([["a", "a"], ["a", "a"]])), ] ), + # categorical array + 
ak.operations.to_categorical(ak.Array([["a", "b", "c"], ["a", "b"]])), ], ) def test_awkward_io(tmp_path, array): From 7b571672a300b7db83318fe926d3f8a507946c7d Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 6 Sep 2022 09:03:18 +0200 Subject: [PATCH 073/110] Update docs/fileformat-prose.rst Co-authored-by: Isaac Virshup --- docs/fileformat-prose.rst | 70 ++++++++++++++++++++++++++++++++++++++- 1 file changed, 69 insertions(+), 1 deletion(-) diff --git a/docs/fileformat-prose.rst b/docs/fileformat-prose.rst index 763719e28..746cae1fc 100644 --- a/docs/fileformat-prose.rst +++ b/docs/fileformat-prose.rst @@ -103,7 +103,75 @@ indptr AwkwardArrays ~~~~~~~~~~~~~ -TODO +Ragged arrays are supported in ``anndata`` through the `Awkward +Array `__ library. For storage on disk, we +break down the awkward array into it’s constituent arrays using +```ak.to_buffers`` `__, +then writing these arrays using ``anndata``\ ’s methods. + +The container of arrays is stored in a group called ``"container"`` + +.. code:: python + + >>> import zarr + >>> z = zarr.open("airr.zarr", "r") + >>> awkward_group = z["obsm/airr"] + >>> awkward_group.tree() + +:: + + airr + └── container + ├── node0-offsets (17,) int64 + ├── node2-offsets (40,) int64 + ├── node3-data (117,) uint8 + ├── node4-offsets (40,) int64 + └── node5-data (117,) uint8 + +The length of the array is saved to it’s own ``"length"`` attribute, +while metadata for the array structure is serialized and saved to the +“form” attribute. + +.. code:: python + + >>> dict(awkward_group.attrs) + +.. 
code:: json + + {'encoding-type': 'awkward-array', + 'encoding-version': '0.1.0', + 'form': '{"class": "ListOffsetArray", "offsets": "i64", "content": {"class": ' + '"RecordArray", "contents": {"locus": {"class": "ListOffsetArray", ' + '"offsets": "i64", "content": {"class": "NumpyArray", "primitive": ' + '"uint8", "inner_shape": [], "has_identifier": false, "parameters": ' + '{"__array__": "char"}, "form_key": "node3"}, "has_identifier": ' + 'false, "parameters": {"__array__": "string"}, "form_key": "node2"}, ' + '"junction_aa": {"class": "ListOffsetArray", "offsets": "i64", ' + '"content": {"class": "NumpyArray", "primitive": "uint8", ' + '"inner_shape": [], "has_identifier": false, "parameters": ' + '{"__array__": "char"}, "form_key": "node5"}, "has_identifier": ' + 'false, "parameters": {"__array__": "string"}, "form_key": "node4"}}, ' + '"has_identifier": false, "parameters": {}, "form_key": "node1"}, ' + '"has_identifier": false, "parameters": {}, "form_key": "node0"}', + 'length': 16} + +These can be read back as awkward arrays using the +```ak.from_buffers`` `__ +function: + +.. code:: python + + >>> import awkward._v2 as ak + >>> from anndata.experimental import read_elem + >>> ak.from_buffers( + ... awkward_group.attrs["form"], + ... awkward_group.attrs["length"], + ... {k: read_elem(v) for k, v in awkward_group["container"].items()} + ... 
) + +:: + + DataFrames ~~~~~~~~~~ From b3678b6224e527a4091b506f1f908641686a61c8 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 6 Sep 2022 13:13:28 +0200 Subject: [PATCH 074/110] Update anndata/_core/aligned_mapping.py Co-authored-by: Isaac Virshup --- anndata/_core/aligned_mapping.py | 1 + 1 file changed, 1 insertion(+) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index dcab04aff..f0b1f4708 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -95,6 +95,7 @@ def copy(self): d = self._actual_class(self.parent, self._axis) for k, v in self.items(): if isinstance(v, AwkArray): + # Shallow copy since awkward array buffers are immutable d[k] = copy(v) else: d[k] = v.copy() From d523c8962b6a15c3bfefba8d45e9a0170b02c34a Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 6 Sep 2022 13:16:10 +0200 Subject: [PATCH 075/110] Update anndata/tests/helpers.py Co-authored-by: Isaac Virshup --- anndata/tests/helpers.py | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 45e7ab524..da5bfae6d 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -422,8 +422,9 @@ def are_equal_dataframe(a, b, exact=False, elem_name=None): def assert_equal_awkarray(a, b, exact=False, elem_name=None): from anndata.compat import awkward as ak - assert a.type == b.type, "type mismatch" - assert ak.to_list(a) == ak.to_list(b) + if exact: + assert a.type == b.type, format_msg(elem_name) + assert ak.to_list(a) == ak.to_list(b), format_msg(elem_name) @assert_equal.register(Mapping) From 222998df3069c2f88da15e7ad1d37618bfe1302b Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Tue, 6 Sep 2022 13:16:37 +0200 Subject: [PATCH 076/110] Update awkward tests to use assert_equal with exact=True --- anndata/tests/test_awkward.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anndata/tests/test_awkward.py 
b/anndata/tests/test_awkward.py index d27bdcbbd..719856fd2 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -203,7 +203,7 @@ def test_awkward_io(tmp_path, array): adata2 = read_h5ad(adata_path) - assert_equal(adata.uns["awk"], adata2.uns["awk"]) + assert_equal(adata.uns["awk"], adata2.uns["awk"], exact=True) @pytest.mark.parametrize( @@ -281,4 +281,4 @@ def test_concat_mixed_types(key, arrays, expected, join): anndata.concat(to_concat, axis=axis, join=join) else: result = anndata.concat(to_concat, axis=axis, join=join) - assert_equal(getattr(result, key)["test"], expected) + assert_equal(getattr(result, key)["test"], expected, exact=True) From 3fc9817ac202efac58f31c0288e3518ed06a202c Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 8 Sep 2022 12:15:48 +0200 Subject: [PATCH 077/110] Bump required version --- pyproject.toml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index aa618523e..ea7988bb6 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -76,7 +76,7 @@ doc = [ "scanpydoc>=0.7.7", "typing_extensions; python_version < '3.8'", "zarr", - "awkward>=1.9.0rc12,<2", + "awkward>=1.10.0rc2,<2", ] test = [ "loompy>=3.0.5", @@ -90,7 +90,7 @@ test = [ "boltons", "scanpy", "dask[array]", - "awkward>=1.9.0rc12,<2", + "awkward>=1.10.0rc2,<2", ] [tool.flit.sdist] From 6a6657b9f1c8abd861fcc6d9b17a12876f57de8a Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 8 Sep 2022 16:30:42 +0200 Subject: [PATCH 078/110] Update categorical syntax, add new categorical test --- anndata/tests/test_awkward.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index 719856fd2..7cbee0ab3 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -192,7 +192,8 @@ def test_view(key): ] ), # categorical array - ak.operations.to_categorical(ak.Array([["a", "b", "c"], ["a", "b"]])), + 
ak.to_categorical(ak.Array([["a", "b", "c"], ["a", "b"]])), + ak.to_categorical(ak.Array([[1, 1, 2], [3, 3]])), ], ) def test_awkward_io(tmp_path, array): From 7ac4a0c88a7fab0bc986b35ee7242e3a6497a2ab Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 13 Sep 2022 20:54:26 +0200 Subject: [PATCH 079/110] Start concat tests for awkward --- anndata/tests/test_concatenate.py | 38 +++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index 22cd35621..4e22a62ea 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -666,6 +666,44 @@ def test_concatenate_with_raw(): assert adata_all.raw is None +def test_concatenate_awkward(): + import awkward._v2 as ak + + a = ak.Array([[{"a": 1, "b": "foo"}], [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}]]) + b = ak.Array( + [ + [{"a": 4}, {"a": 5}], + [{"a": 6}], + [{"a": 7}], + ] + ) + + adata_a = AnnData(np.zeros((2, 0), dtype=float), obsm={"awk": a}) + adata_b = AnnData(np.zeros((3, 0), dtype=float), obsm={"awk": b}) + + inner_result = ak.Array( + [ + [{"a": 1}], + [{"a": 2}, {"a": 3}], + [{"a": 4}, {"a": 5}], + [{"a": 6}], + [{"a": 7}], + ] + ) + outer_result = ak.Array( + [ + [{"a": 1, "b": "foo"}], + [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}], + [{"a": 4, "b": None}, {"a": 5, "b": None}], + [{"a": 6, "b": None}], + [{"a": 7, "b": None}], + ] + ) + + assert_equal(inner_result, concat([adata_a, adata_b], join="inner").obsm["awk"]) + assert_equal(outer_result, concat([adata_a, adata_b], join="outer").obsm["awk"]) + + def test_pairwise_concat(axis, array_type): dim_sizes = [[100, 200, 50], [50, 50, 50]] if axis: From 0340151e0f7bd58d5f4ae8f7068da89c389df505 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 26 Sep 2022 20:10:30 +0200 Subject: [PATCH 080/110] Add release notes --- docs/release-notes/0.9.0.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/docs/release-notes/0.9.0.rst 
b/docs/release-notes/0.9.0.rst index b8c919820..c2d21d272 100644 --- a/docs/release-notes/0.9.0.rst +++ b/docs/release-notes/0.9.0.rst @@ -4,6 +4,7 @@ .. rubric:: Features * Unordered categorical columns are no longer cast to object during :func:`anndata.concat` :pr:`763` :smaller:`ivirshup` +* `obsm`, `varm` and `uns` can now hold `AwkwardArrays `__ :pr:`647` :smaller:`giovp, grst, ivirshup` .. rubric:: Bug fixes From 02365a6559f16c86943ee440fa4e148e39eafb61 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 26 Sep 2022 20:18:40 +0200 Subject: [PATCH 081/110] Add testcases for dim_len with awkward arrays of strings --- anndata/tests/test_awkward.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index 7cbee0ab3..c1ab341c2 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -52,6 +52,10 @@ ), (4, 2), ], + # Array of string types + [ak.Array(["a", "b", "c"]), (3,)], + [ak.Array([["a", "b"], ["c", "d"], ["e", "f"]]), (3, None)], + [ak.to_regular(ak.Array([["a", "b"], ["c", "d"], ["e", "f"]]), 1), (3, 2)], ], ) def test_dim_len(array, shape): From c20cc310d3bc469ec71e0001ba0bc5b31502a380 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 26 Sep 2022 21:06:17 +0200 Subject: [PATCH 082/110] Fix dim_len for arrays of strings --- anndata/utils.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/anndata/utils.py b/anndata/utils.py index 4aead0954..4f459ceef 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -88,6 +88,12 @@ def _size_at_depth(layout, depth, lateral_context, **kwargs): return layout.nplike.empty(1) elif layout.is_ListType and depth == lateral_context["axis"]: + if layout.parameter("__array__") in ("string", "bytestring"): + # Strings are implemented like an array of lists of uint8 (ListType(NumpyType(...))) + # which results in an extra hierarchy-level that shouldn't show up in dim_len + # See 
https://github.com/scikit-hep/awkward/discussions/1654#discussioncomment-3736747 + raise TypeError(f"axis={lateral_context['axis']} is too deep") + if layout.is_RegularType: # if it's a regular list, you want the size lateral_context["out"] = layout.size From 2d024f1d18b66c83a638470d0b28c0c81fbf89da Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Wed, 14 Dec 2022 09:34:09 +0000 Subject: [PATCH 083/110] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- anndata/tests/test_helpers.py | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/anndata/tests/test_helpers.py b/anndata/tests/test_helpers.py index 612b0ac49..150cab56f 100644 --- a/anndata/tests/test_helpers.py +++ b/anndata/tests/test_helpers.py @@ -6,7 +6,13 @@ from scipy import sparse import anndata as ad -from anndata.tests.helpers import assert_equal, gen_awkward, report_name, gen_adata, asarray +from anndata.tests.helpers import ( + assert_equal, + gen_awkward, + report_name, + gen_adata, + asarray, +) from anndata.utils import dim_len # Testing to see if all error types can have the key name appended. From 50a8dc34eb048948be06f64418a7a6f3272fc2cb Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 2 Jan 2023 18:02:07 +0100 Subject: [PATCH 084/110] Awkward v2 fixes Several functions changed until the stable awkward v2 version was released. 
--- anndata/_io/specs/methods.py | 2 +- anndata/compat/__init__.py | 14 +------------- anndata/tests/test_awkward.py | 26 +------------------------- anndata/tests/test_concatenate.py | 2 +- anndata/utils.py | 25 +++++++++++++------------ docs/fileformat-prose.rst | 2 +- pyproject.toml | 4 ++-- 7 files changed, 20 insertions(+), 55 deletions(-) diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 7070d7566..f2f131673 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -506,7 +506,7 @@ def write_awkward(f, k, v, dataset_kwargs=MappingProxyType({})): from anndata.compat import awkward as ak group = f.create_group(k) - form, length, container = ak.to_buffers(ak.packed(v)) + form, length, container = ak.to_buffers(ak.to_packed(v)) group.attrs["length"] = length group.attrs["form"] = form.to_json() write_elem(group, "container", container, dataset_kwargs=dataset_kwargs) diff --git a/anndata/compat/__init__.py b/anndata/compat/__init__.py index 552f991c2..1bedd3493 100644 --- a/anndata/compat/__init__.py +++ b/anndata/compat/__init__.py @@ -43,19 +43,7 @@ def __repr__(): try: - try: - from importlib.metadata import version - except ImportError: - from importlib_metadata import version - - # Ensure compatibility with both 1.9rc and 2.x releases of awkward - # TODO Once 2.x is released, we can require it as a minimal dependency and remove this code. 
- awkward_version, _ = version("awkward").split(".", maxsplit=1) - awkward_version = int(awkward_version) - if awkward_version < 2: - import awkward._v2 as awkward - else: - import awkward + import awkward AwkArray = awkward.Array diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index c1ab341c2..cecd1e4b9 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -4,7 +4,7 @@ import numpy.testing as npt from anndata.tests.helpers import assert_equal, gen_adata, gen_awkward -from anndata.compat import awkward as ak, awkward_version +from anndata.compat import awkward as ak from anndata import ImplicitModificationWarning from anndata.utils import dim_len from anndata import AnnData, read_h5ad @@ -104,30 +104,6 @@ def _assign(): _assign() -@pytest.mark.parametrize( - "key", - [ - "obsm", - "varm", - pytest.param( - "uns", marks=pytest.mark.xfail(reason="No checks for `uns` are implemented") - ), - ], -) -@pytest.mark.skipif( - awkward_version >= 2, reason="This test is only applies for awkward versions 1.x" -) -def test_no_awkward_v1(key): - """Assigning an awkward v1 array to anndata should fail""" - import awkward as akv1 - - v1arr = akv1.Array([1, 2, 3, 4]) - - adata = AnnData(np.ones((4, 4))) - with pytest.raises(AttributeError): - getattr(adata, key)["test"] = v1arr - - @pytest.mark.parametrize("key", ["obsm", "varm", "uns"]) def test_copy(key): """Check that modifying a copy does not modify the original""" diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index c71a4ec93..ae224bf13 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -681,7 +681,7 @@ def test_concatenate_with_raw(): def test_concatenate_awkward(): - import awkward._v2 as ak + import awkward as ak a = ak.Array([[{"a": 1, "b": "foo"}], [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}]]) b = ak.Array( diff --git a/anndata/utils.py b/anndata/utils.py index 4f459ceef..0a3b34407 100644 --- 
a/anndata/utils.py +++ b/anndata/utils.py @@ -74,10 +74,10 @@ def dim_len(x, axis): def _size_at_depth(layout, depth, lateral_context, **kwargs): """Callback function for dim_len_awkward, resolving the dim_len for a given level""" - if layout.is_NumpyType: + if layout.is_numpy: # if it's an embedded rectilinear array, we have to deal with its shape # which might not be 1-dimensional - if layout.is_UnknownType: + if layout.is_unknown: shape = (0,) else: shape = layout.shape @@ -85,46 +85,48 @@ def _size_at_depth(layout, depth, lateral_context, **kwargs): if not (1 <= numpy_axis < len(shape)): raise TypeError(f"axis={lateral_context['axis']} is too deep") lateral_context["out"] = shape[numpy_axis] - return layout.nplike.empty(1) + return ak.contents.EmptyArray() - elif layout.is_ListType and depth == lateral_context["axis"]: + elif layout.is_list and depth == lateral_context["axis"]: if layout.parameter("__array__") in ("string", "bytestring"): # Strings are implemented like an array of lists of uint8 (ListType(NumpyType(...))) # which results in an extra hierarchy-level that shouldn't show up in dim_len # See https://github.com/scikit-hep/awkward/discussions/1654#discussioncomment-3736747 raise TypeError(f"axis={lateral_context['axis']} is too deep") - if layout.is_RegularType: + if layout.is_regular: # if it's a regular list, you want the size lateral_context["out"] = layout.size else: # if it's an irregular list, you want a null token lateral_context["out"] = -1 - return layout.nplike.empty(1) + return ak.contents.EmptyArray() - elif layout.is_RecordType: + elif layout.is_record: # if it's a record, you want to stop descent with an error raise TypeError( f"axis={lateral_context['axis']} is too deep, reaches record" ) - elif layout.is_UnionType: + elif layout.is_union: # if it's a union, you could get the result of each union branch # separately and see if they're all the same; if not, it's an error result = None for content in layout.contents: context = {"axis": 
lateral_context["axis"]} ak.transform( - _size_at_depth, content, lateral_context=context, return_array=False + _size_at_depth, + content, + lateral_context=context, ) if result is None: result = context["out"] elif result != context["out"]: # Union branches have different lengths -> return null token lateral_context["out"] = -1 - return layout.nplike.empty(1) + return ak.contents.EmptyArray() lateral_context["out"] = result - return layout.nplike.empty(1) + return ak.contents.EmptyArray() @dim_len.register(ak.Array) def dim_len_awkward(array, axis): @@ -148,7 +150,6 @@ def dim_len_awkward(array, axis): _size_at_depth, array, lateral_context=context, - return_array=False, ) # Use `None` as null token. diff --git a/docs/fileformat-prose.rst b/docs/fileformat-prose.rst index 01fedd24a..5cadc8d23 100644 --- a/docs/fileformat-prose.rst +++ b/docs/fileformat-prose.rst @@ -161,7 +161,7 @@ function: .. code:: python - >>> import awkward._v2 as ak + >>> import awkward as ak >>> from anndata.experimental import read_elem >>> ak.from_buffers( ... 
awkward_group.attrs["form"], diff --git a/pyproject.toml b/pyproject.toml index 9d7e309d8..30abdb8a5 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,7 @@ doc = [ "nbsphinx", "scanpydoc>=0.7.7", "zarr", - "awkward>=1.10.0rc2,<2", + "awkward>=2.0.5", "IPython", # For syntax highlighting in notebooks ] test = [ @@ -86,7 +86,7 @@ test = [ "boltons", "scanpy", "dask[array]", - "awkward>=1.10.0rc2,<2", + "awkward>=2.0.5", ] [tool.flit.sdist] From fe27b749962db47083eb021d02b63919996ee730 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 2 Jan 2023 20:41:35 +0100 Subject: [PATCH 085/110] Exclude awkward arrays from fill_value concat test --- anndata/tests/test_concatenate.py | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index ae224bf13..a5f17edbc 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -25,7 +25,7 @@ GEN_ADATA_DASK_ARGS, ) from anndata.utils import asarray -from anndata.compat import DaskArray +from anndata.compat import DaskArray, AwkArray @singledispatch @@ -444,20 +444,27 @@ def get_obs_els(adata): adata1 = gen_adata((10, 10)) adata1.obsm = { - k: v for k, v in adata1.obsm.items() if not isinstance(v, pd.DataFrame) + k: v + for k, v in adata1.obsm.items() + if not isinstance(v, (pd.DataFrame, AwkArray)) } adata2 = gen_adata((10, 5)) adata2.obsm = { k: v[:, : v.shape[1] // 2] for k, v in adata2.obsm.items() - if not isinstance(v, pd.DataFrame) + if not isinstance(v, (pd.DataFrame, AwkArray)) } adata3 = gen_adata((7, 3)) adata3.obsm = { k: v[:, : v.shape[1] // 3] for k, v in adata3.obsm.items() - if not isinstance(v, pd.DataFrame) + if not isinstance(v, (pd.DataFrame, AwkArray)) } + # remove AwkArrays from adata.var, as outer joins are not yet implemented for them + for tmp_ad in [adata1, adata2, adata3]: + for k in [k for k, v in tmp_ad.varm.items() if isinstance(v, AwkArray)]: + del tmp_ad.varm[k] 
+ joined = adata1.concatenate([adata2, adata3], join="outer", fill_value=fill_val) ptr = 0 From 2aed5b6ad534b2206967cdbd4f153459acca72c3 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 2 Jan 2023 20:52:29 +0100 Subject: [PATCH 086/110] fix flake8 --- anndata/_core/index.py | 3 --- anndata/_core/views.py | 2 +- 2 files changed, 1 insertion(+), 4 deletions(-) diff --git a/anndata/_core/index.py b/anndata/_core/index.py index 1c54491ee..859c1bcdd 100644 --- a/anndata/_core/index.py +++ b/anndata/_core/index.py @@ -9,9 +9,6 @@ from scipy.sparse import spmatrix, issparse from ..compat import AwkArray, DaskArray, Index, Index1D -Index1D = Union[slice, int, str, np.int64, np.ndarray] -Index = Union[Index1D, Tuple[Index1D, Index1D], spmatrix] - def _normalize_indices( index: Optional[Index], names0: pd.Index, names1: pd.Index diff --git a/anndata/_core/views.py b/anndata/_core/views.py index d47dc6a05..b3228388e 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -1,5 +1,5 @@ from contextlib import contextmanager -from copy import copy, deepcopy +from copy import deepcopy from enum import Enum from functools import reduce, singledispatch, wraps from typing import Any, KeysView, Optional, Sequence, Tuple From a58982092fd948ab8240e972749eca2325ff3c03 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Fri, 27 Jan 2023 19:46:27 +0100 Subject: [PATCH 087/110] Add IO testcase for AIRR data --- anndata/tests/test_awkward.py | 29 +++++++++++++++++++++++++++++ pyproject.toml | 2 +- 2 files changed, 30 insertions(+), 1 deletion(-) diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index cecd1e4b9..5b6c2be4a 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -174,6 +174,35 @@ def test_view(key): # categorical array ak.to_categorical(ak.Array([["a", "b", "c"], ["a", "b"]])), ak.to_categorical(ak.Array([[1, 1, 2], [3, 3]])), + # tyical record type with AIRR data consisting of different dtypes + ak.Array( + [ + 
[ + { + "v_call": "TRV1", + "junction_aa": "ADDEEKK", + "productive": True, + "locus": None, + "consensus_count": 3, + }, + { + "v_call": "TRV2", + "productive": False, + "locus": "TRA", + "consensus_count": 4, + }, + ], + [ + { + "v_call": None, + "junction_aa": "ADDEKK", + "productive": None, + "locus": "IGK", + "consensus_count": 3, + } + ], + ] + ), ], ) def test_awkward_io(tmp_path, array): diff --git a/pyproject.toml b/pyproject.toml index 30abdb8a5..ef898d12e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,7 @@ doc = [ "nbsphinx", "scanpydoc>=0.7.7", "zarr", - "awkward>=2.0.5", + "awkward>=2.0.6", "IPython", # For syntax highlighting in notebooks ] test = [ From cd1a4517995bd6b5ded44abf6f6acbb1b361fa19 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 30 Jan 2023 15:38:51 +0100 Subject: [PATCH 088/110] Fix link --- docs/fileformat-prose.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/docs/fileformat-prose.md b/docs/fileformat-prose.md index 0055c006c..a66fc50e8 100644 --- a/docs/fileformat-prose.md +++ b/docs/fileformat-prose.md @@ -321,7 +321,7 @@ values **Experimental** Support for ragged arrays via awkward array is considered experimental under the 0.9.0 release series. -Please direct feedback on it's implementation to [https://github.com/scverse/anndata](). +Please direct feedback on it's implementation to [https://github.com/scverse/anndata](https://github.com/scverse/anndata). 
``` Ragged arrays are supported in `anndata` through the [Awkward From 9b2ff6136f5f983abec4dcd7ddb6c6b91a396de0 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 30 Jan 2023 21:15:27 +0100 Subject: [PATCH 089/110] Get inner join working for concatenation --- anndata/_core/merge.py | 38 ++++++++++++++------------ anndata/tests/test_concatenate.py | 44 ++++++++++++++++--------------- 2 files changed, 44 insertions(+), 38 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 9c1ff1641..3fa349b21 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -481,19 +481,21 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix: return out def _apply_to_awkward(self, el: AwkArray, *, axis, fill_value=None): - if dim_len(el, axis) is None: - # Do not reindex variable-length dimensions + import awkward as ak + + if self.no_change: return el + elif axis == 1: # Indexing by field + if self.new_idx.isin(self.old_idx).all(): # inner join + return el[self.new_idx] + else: # outer join + for field in self.new_idx.difference(self.old_idx): + el = ak.with_field(el, None, field) + return el else: - indexer = self.old_idx.get_indexer(self.new_idx) - if -1 in indexer: - raise NotImplementedError( - "Outer join operations are currently not supported with AwkwardArrays" - ) - if axis == 0: - return el[indexer] - if axis == 1: - return el[:, indexer] + raise NotImplementedError( + "Reindexing along axis 0 is not yet implemented for Awkward Arrays" + ) def merge_indices( @@ -624,9 +626,10 @@ def gen_inner_reindexers(els, new_index, axis: Literal[0, 1] = 0): raise NotImplementedError( "Cannot concatenate an AwkwardArray with other array types." ) - # do not reindex awkward arrays - # TODO unintended behaviour? 
- reindexers = [lambda *args, **kwargs: args[0] for _ in els] + common_keys = intersect_keys(el.fields for el in els) + reindexers = [ + Reindexer(pd.Index(el.fields), pd.Index(list(common_keys))) for el in els + ] else: min_ind = min(el.shape[alt_axis] for el in els) reindexers = [ @@ -649,9 +652,10 @@ def gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0): raise NotImplementedError( "Cannot concatenate an AwkwardArray with other array types." ) - # do not reindex awkward arrays - # TODO unintended behaviour? - reindexers = [lambda *args, **kwargs: args[0] for _ in els] + all_keys = union_keys(el.fields for el in els) + reindexers = [ + Reindexer(pd.Index(el.fields), pd.Index(list(all_keys))) for el in els + ] else: # if fill_value is None: # fill_value = default_fill_value(els) diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index a5f17edbc..95f191af2 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -687,7 +687,7 @@ def test_concatenate_with_raw(): assert adata_all.raw is None -def test_concatenate_awkward(): +def test_concatenate_awkward(join_type): import awkward as ak a = ak.Array([[{"a": 1, "b": "foo"}], [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}]]) @@ -702,27 +702,29 @@ def test_concatenate_awkward(): adata_a = AnnData(np.zeros((2, 0), dtype=float), obsm={"awk": a}) adata_b = AnnData(np.zeros((3, 0), dtype=float), obsm={"awk": b}) - inner_result = ak.Array( - [ - [{"a": 1}], - [{"a": 2}, {"a": 3}], - [{"a": 4}, {"a": 5}], - [{"a": 6}], - [{"a": 7}], - ] - ) - outer_result = ak.Array( - [ - [{"a": 1, "b": "foo"}], - [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}], - [{"a": 4, "b": None}, {"a": 5, "b": None}], - [{"a": 6, "b": None}], - [{"a": 7, "b": None}], - ] - ) + if join_type == "inner": + expected = ak.Array( + [ + [{"a": 1}], + [{"a": 2}, {"a": 3}], + [{"a": 4}, {"a": 5}], + [{"a": 6}], + [{"a": 7}], + ] + ) + elif join_type == "outer": + expected = ak.Array( + [ + 
[{"a": 1, "b": "foo"}], + [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}], + [{"a": 4, "b": None}, {"a": 5, "b": None}], + [{"a": 6, "b": None}], + [{"a": 7, "b": None}], + ] + ) - assert_equal(inner_result, concat([adata_a, adata_b], join="inner").obsm["awk"]) - assert_equal(outer_result, concat([adata_a, adata_b], join="outer").obsm["awk"]) + result = concat([adata_a, adata_b], join=join_type).obsm["awk"] + assert_equal(expected, result) def test_pairwise_concat(axis, array_type): From 52a804a1daf33938f6986268704c32e6e07b411f Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 31 Jan 2023 17:21:40 +0100 Subject: [PATCH 090/110] Bump some concatenation cases to a later PR --- anndata/tests/test_awkward.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index 5b6c2be4a..eaddf9a8b 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -233,14 +233,14 @@ def test_awkward_io(tmp_path, array): ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), pd.DataFrame(), ], - ak.Array([{}, {}, {}, {"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + NotImplementedError, # TODO: ak.Array([{}, {}, {}, {"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), ], [ [ ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), pd.DataFrame(), ], - ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + NotImplementedError, # TODO: ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), ], [ [ From 75e7526b9e2535b6c828ff9ea3adad1d8f3fbf66 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 1 Feb 2023 11:36:46 +0100 Subject: [PATCH 091/110] Generate empty arrays for outer join --- anndata/_core/merge.py | 16 ++++++++++++++-- 1 file changed, 14 insertions(+), 2 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 3fa349b21..2f96512f2 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -648,13 +648,25 @@ def 
gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0): for el, shape in zip(els, shapes) ] elif any(isinstance(el, AwkArray) for el in els if not_missing(el)): + import awkward as ak + if not all(isinstance(el, AwkArray) for el in els if not_missing(el)): raise NotImplementedError( "Cannot concatenate an AwkwardArray with other array types." ) - all_keys = union_keys(el.fields for el in els) + all_keys = union_keys(el.fields for el in els if not_missing(el)) reindexers = [ - Reindexer(pd.Index(el.fields), pd.Index(list(all_keys))) for el in els + (lambda x: x) + if not_missing(el) + else ( + lambda x: ak.pad_none( + # TODO: Do we need to specify the fields? + ak.Array({k: None for k in all_keys})[0:0], + shape, + 0, + ) + ) + for el, shape in zip(els, shapes) ] else: # if fill_value is None: From d3d1d2627226ffe5d6c4c455ce50c3f327c8e79c Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Wed, 1 Feb 2023 13:56:13 +0100 Subject: [PATCH 092/110] Raise NotImplementedError when creating a view of an awkward array with custom behavior --- anndata/_core/views.py | 14 +++++++++++--- anndata/tests/test_awkward.py | 11 +++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) diff --git a/anndata/_core/views.py b/anndata/_core/views.py index b3228388e..24f26ad01 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -215,7 +215,7 @@ def __copy__(self) -> AwkArray: array = self # makes a shallow copy and removes the reference to the original AnnData object array = ak.with_parameter(self, _PARAM_NAME, None) - array = ak.with_name(array, None) + array = ak.with_parameter(array, "__array__", None) return array @as_view.register(AwkArray) @@ -223,10 +223,18 @@ def as_view_awkarray(array, view_args): parent, attrname, keys = view_args parent_key = f"target-{id(parent)}" _registry[parent_key] = parent + # TODO: See https://github.com/scverse/anndata/pull/647#discussion_r963494798_ for more details and + # possible strategies to stack behaviors. 
+ if type(array).__name__ != "Array": + raise NotImplementedError( + "Cannot create a view of an awkward array with __array__ parameter. " + "Please open an issue in the AnnData repo and describe your use-case." + ) array = ak.with_parameter(array, _PARAM_NAME, (parent_key, attrname, keys)) - return ak.with_name(array, name="AwkwardArrayView") + array = ak.with_parameter(array, "__array__", "AwkwardArrayView") + return array - ak.behavior["*", "AwkwardArrayView"] = AwkwardArrayView + ak.behavior["AwkwardArrayView"] = AwkwardArrayView except ImportError: pass diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index eaddf9a8b..b5be91c3a 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -144,6 +144,17 @@ def test_view(key): getattr(adata, key)["awk"]["d"] +def test_view_of_awkward_array_with_custom_behavior(): + """Currently can't create view of arrays with custom __name__ (in this case "string") + See https://github.com/scverse/anndata/pull/647#discussion_r963494798_""" + adata = gen_adata((3, 3), varm_types=(), obsm_types=(), layers_types=()) + adata.obsm["awk_string"] = ak.Array(["AAA", "BBB", "CCC"]) + adata_view = adata[:2] + + with pytest.raises(NotImplementedError): + adata_view.obsm["awk_string"] + + @pytest.mark.parametrize( "array", [ From 77e395385622f6f4da5b1bce1e42644f63892669 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Wed, 1 Feb 2023 14:20:31 +0100 Subject: [PATCH 093/110] Add warning when setting awkward array in aligned mapping --- anndata/_core/aligned_mapping.py | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py index f0b1f4708..77b20ca27 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -14,6 +14,7 @@ from .views import as_view from .access import ElementRef from .index import _subset +from ..logging import anndata_logger as logger from anndata.compat import AwkArray @@ -48,6 
+49,11 @@ def _ipython_key_completions_(self) -> List[str]: def _validate_value(self, val: V, key: str) -> V: """Raises an error if value is invalid""" + if isinstance(val, AwkArray): + logger.warn( + "Support for Awkward Arrays is currently experimental. " + "Behavior may change in the future. Please report any issues you may encounter!" + ) for i, axis in enumerate(self.axes): if self.parent.shape[axis] != dim_len(val, i): right_shape = tuple(self.parent.shape[a] for a in self.axes) From cfe200e107dd6fb30734837e744ff111069f7bc6 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 1 Feb 2023 16:00:52 +0100 Subject: [PATCH 094/110] Get much more of concatenation 'working' --- anndata/_core/merge.py | 32 +++++-------- anndata/tests/test_awkward.py | 79 +++++++++++++++++++++++-------- anndata/tests/test_concatenate.py | 1 + 3 files changed, 74 insertions(+), 38 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 2f96512f2..d8cb17029 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -573,9 +573,7 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None): "Cannot concatenate an AwkwardArray with other array types." ) - return ak.concatenate( - [f(a, axis=1 - axis) for f, a in zip(reindexers, arrays)], axis=axis - ) + return ak.concatenate([f(a) for f, a in zip(reindexers, arrays)], axis=axis) elif any(isinstance(a, sparse.spmatrix) for a in arrays): sparse_stack = (sparse.vstack, sparse.hstack)[axis] return sparse_stack( @@ -654,24 +652,20 @@ def gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0): raise NotImplementedError( "Cannot concatenate an AwkwardArray with other array types." ) - all_keys = union_keys(el.fields for el in els if not_missing(el)) - reindexers = [ - (lambda x: x) - if not_missing(el) - else ( - lambda x: ak.pad_none( - # TODO: Do we need to specify the fields? 
- ak.Array({k: None for k in all_keys})[0:0], - shape, - 0, + # all_keys = union_keys(el.fields for el in els if not_missing(el)) + reindexers = [] + for el in els: + if not_missing(el): + reindexers.append(lambda x: x) + else: + reindexers.append( + lambda x: ak.pad_none( + ak.Array([]), + len(x), + 0, + ) ) - ) - for el, shape in zip(els, shapes) - ] else: - # if fill_value is None: - # fill_value = default_fill_value(els) - max_col = max(el.shape[1] for el in els if not_missing(el)) orig_cols = [el.shape[1] if not_missing(el) else 0 for el in els] reindexers = [ diff --git a/anndata/tests/test_awkward.py b/anndata/tests/test_awkward.py index b5be91c3a..6df635fa1 100644 --- a/anndata/tests/test_awkward.py +++ b/anndata/tests/test_awkward.py @@ -227,56 +227,92 @@ def test_awkward_io(tmp_path, array): assert_equal(adata.uns["awk"], adata2.uns["awk"], exact=True) +# @pytest.mark.parametrize("join", ["outer", "inner"]) @pytest.mark.parametrize( - "arrays,expected", + "arrays,join,expected", [ - [ + pytest.param( [ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), None], - ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}, {}, {}, {}]), - ], - [ + "inner", + None, + id="awk:recordoflists_null-inner", + ), + pytest.param( + [ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), None], + "outer", + ak.Array( + [{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}, None, None, None] + ), + # maybe should return: ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}, {}, {}, {}]), + id="awk:recordoflists_null-outer", + ), + pytest.param( + [ak.Array([[{"a": 1}, {"a": 2}], []]), None], + "outer", + ak.Array([[{"a": 1}, {"a": 2}], [], None, None, None]), + # maybe should return: ak.Array([[{"a": 1}, {"a": 2}], [], [], []]), + id="awk:listofrecords_null-outer", + ), + pytest.param( [None, ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}])], - ak.Array([{}, {}, {}, {"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), - ], - [ + "inner", + 
None, + id="null_awk-inner", + ), + pytest.param( + [None, ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}])], + "outer", + ak.Array( + [None, None, None, {"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}] + ), + # maybe should return: ak.Array([{}, {}, {}, {"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), + id="null_awk:recordoflists-outer", + ), + pytest.param( [ None, ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), pd.DataFrame(), ], + "outer", NotImplementedError, # TODO: ak.Array([{}, {}, {}, {"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), - ], - [ + id="null_awk_empty-pd", + ), + pytest.param( [ ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), pd.DataFrame(), ], + "outer", NotImplementedError, # TODO: ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), - ], - [ + id="awk_empty-pd", + ), + pytest.param( [ ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), pd.DataFrame().assign(a=[3, 4], b=[5, 6]), ], + "outer", # TODO: Should try inner too if implemented NotImplementedError, - ], - [ + ), + pytest.param( [ ak.Array([{"a": [1, 2], "b": [1, 2]}, {"a": [3], "b": [4]}]), np.ones((3, 2)), ], + "outer", NotImplementedError, - ], + ), ], ) @pytest.mark.parametrize("key", ["obsm", "varm"]) -@pytest.mark.parametrize("join", ["outer", "inner"]) def test_concat_mixed_types(key, arrays, expected, join): """Test that concatenation of AwkwardArrays with arbitrary types, but zero length dimension or missing values works.""" axis = 0 if key == "obsm" else 1 to_concat = [] + cell_id, gene_id = 0, 0 for a in arrays: shape = np.array([3, 3]) # default shape (in case of missing array) if a is not None: @@ -286,7 +322,10 @@ def test_concat_mixed_types(key, arrays, expected, join): tmp_adata = gen_adata( tuple(shape), varm_types=(), obsm_types=(), layers_types=() ) - + prev_cell_id, prev_gene_id = cell_id, gene_id + cell_id, gene_id = cell_id + shape[0], gene_id + shape[1] + tmp_adata.obs_names = 
pd.RangeIndex(prev_cell_id, cell_id).astype(str) + tmp_adata.var_names = pd.RangeIndex(prev_gene_id, gene_id).astype(str) if a is not None: if isinstance(a, pd.DataFrame): a.set_index( @@ -301,5 +340,7 @@ def test_concat_mixed_types(key, arrays, expected, join): with pytest.raises(expected): anndata.concat(to_concat, axis=axis, join=join) else: - result = anndata.concat(to_concat, axis=axis, join=join) - assert_equal(getattr(result, key)["test"], expected, exact=True) + print(to_concat) + result_adata = anndata.concat(to_concat, axis=axis, join=join) + result = getattr(result_adata, key).get("test", None) + assert_equal(expected, result, exact=True) diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index 95f191af2..5d1990406 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -724,6 +724,7 @@ def test_concatenate_awkward(join_type): ) result = concat([adata_a, adata_b], join=join_type).obsm["awk"] + # Currently failing while https://github.com/scikit-hep/awkward/issues/2182 is resolved assert_equal(expected, result) From cf4ad0381e9ea6541a7b720c945971bf4e247ee5 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Wed, 1 Feb 2023 18:32:06 +0100 Subject: [PATCH 095/110] Use warning instead of logging --- anndata/__init__.py | 7 ++++++- anndata/_core/aligned_mapping.py | 16 +++++++++++++--- anndata/_warnings.py | 6 ++++++ 3 files changed, 25 insertions(+), 4 deletions(-) diff --git a/anndata/__init__.py b/anndata/__init__.py index ad7e4c68b..ba7861dad 100644 --- a/anndata/__init__.py +++ b/anndata/__init__.py @@ -18,7 +18,12 @@ read_mtx, read_zarr, ) - from ._warnings import OldFormatWarning, WriteWarning, ImplicitModificationWarning + from ._warnings import ( + OldFormatWarning, + WriteWarning, + ImplicitModificationWarning, + ExperimentalFeatureWarning, + ) # backwards compat / shortcut for default format from ._io import read_h5ad as read diff --git a/anndata/_core/aligned_mapping.py 
b/anndata/_core/aligned_mapping.py index 77b20ca27..b99325727 100644 --- a/anndata/_core/aligned_mapping.py +++ b/anndata/_core/aligned_mapping.py @@ -4,6 +4,7 @@ from typing import Union, Optional, Type, ClassVar, TypeVar # Special types from typing import Iterator, Mapping, Sequence # ABCs from typing import Tuple, List, Dict # Generic base types +import warnings import numpy as np import pandas as pd @@ -14,8 +15,8 @@ from .views import as_view from .access import ElementRef from .index import _subset -from ..logging import anndata_logger as logger from anndata.compat import AwkArray +from anndata._warnings import ExperimentalFeatureWarning OneDIdx = Union[Sequence[int], Sequence[bool], slice] @@ -50,9 +51,18 @@ def _ipython_key_completions_(self) -> List[str]: def _validate_value(self, val: V, key: str) -> V: """Raises an error if value is invalid""" if isinstance(val, AwkArray): - logger.warn( + warnings.warn( "Support for Awkward Arrays is currently experimental. " - "Behavior may change in the future. Please report any issues you may encounter!" + "Behavior may change in the future. 
Please report any issues you may encounter!", + ExperimentalFeatureWarning, + # stacklevel=3, + ) + # Prevent from showing up every time an awkward array is used + # You'd think `once` works, but it doesn't at the repl and in notebooks + warnings.filterwarnings( + "ignore", + category=ExperimentalFeatureWarning, + message="Support for Awkward Arrays is currently experimental.*", ) for i, axis in enumerate(self.axes): if self.parent.shape[axis] != dim_len(val, i): diff --git a/anndata/_warnings.py b/anndata/_warnings.py index 9409f6aab..5bc0c461c 100644 --- a/anndata/_warnings.py +++ b/anndata/_warnings.py @@ -21,3 +21,9 @@ class ImplicitModificationWarning(UserWarning): """ pass + + +class ExperimentalFeatureWarning(Warning): + """Raised when an unstable experimental feature is used.""" + + pass From 46d553fb4fe3d7fa57ed5db709c04806a009efaa Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Thu, 2 Feb 2023 13:07:11 +0100 Subject: [PATCH 096/110] extend todo comment about views --- anndata/_core/views.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/anndata/_core/views.py b/anndata/_core/views.py index 8baa80f10..72ae8eb07 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -254,6 +254,8 @@ def as_view_awkarray(array, view_args): _registry[parent_key] = parent # TODO: See https://github.com/scverse/anndata/pull/647#discussion_r963494798_ for more details and # possible strategies to stack behaviors. + # A better solution might be based on xarray-style "attrs", once this is implemented + # https://github.com/scikit-hep/awkward/issues/1391#issuecomment-1412297114 if type(array).__name__ != "Array": raise NotImplementedError( "Cannot create a view of an awkward array with __array__ parameter. 
" From e8eeb54ca4128acf984c04e7a256bbd1a8df0e21 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 2 Feb 2023 14:34:13 +0100 Subject: [PATCH 097/110] Fix IO, and to_memory for views of awkward arrays --- anndata/_core/file_backing.py | 12 +++++++++++- anndata/_io/specs/methods.py | 6 ++++++ 2 files changed, 17 insertions(+), 1 deletion(-) diff --git a/anndata/_core/file_backing.py b/anndata/_core/file_backing.py index ce161335d..f7f42bb3b 100644 --- a/anndata/_core/file_backing.py +++ b/anndata/_core/file_backing.py @@ -8,7 +8,7 @@ from . import anndata from .sparse_dataset import SparseDataset -from ..compat import ZarrArray, DaskArray +from ..compat import ZarrArray, DaskArray, AwkArray class AnnDataFileManager: @@ -123,3 +123,13 @@ def _(x, copy=True): @to_memory.register(Mapping) def _(x: Mapping, copy=True): return {k: to_memory(v, copy=copy) for k, v in x.items()} + + +@to_memory.register(AwkArray) +def _(x, copy=True): + from copy import copy + + if copy: + return copy(x) + else: + return x diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index f2f131673..8a6d7da56 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -502,6 +502,12 @@ def read_sparse_partial(elem, *, items=None, indices=(slice(None), slice(None))) @_REGISTRY.register_write(H5Group, AwkArray, IOSpec("awkward-array", "0.1.0")) @_REGISTRY.register_write(ZarrGroup, AwkArray, IOSpec("awkward-array", "0.1.0")) +@_REGISTRY.register_write( + H5Group, views.AwkwardArrayView, IOSpec("awkward-array", "0.1.0") +) +@_REGISTRY.register_write( + ZarrGroup, views.AwkwardArrayView, IOSpec("awkward-array", "0.1.0") +) def write_awkward(f, k, v, dataset_kwargs=MappingProxyType({})): from anndata.compat import awkward as ak From 5ab0708d942d30111b19d69eca24de7f20260e6b Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 2 Feb 2023 14:49:49 +0100 Subject: [PATCH 098/110] Removed a number of test cases that we're not targeting This fixed a number of 
tests because we had a 1d awkward array being generated, and we currently don't support 1d arrays in obsm well. Tracked in #652. --- anndata/tests/helpers.py | 6 ------ 1 file changed, 6 deletions(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index 409c8d9dd..d7b62e058 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -206,10 +206,7 @@ def gen_adata( array=np.random.random((M, 50)), sparse=sparse.random(M, 100, format="csr"), df=gen_typed_df(M, obs_names), - awk=gen_awkward((M,)), - awk_2d=gen_awkward((M, 20)), awk_2d_ragged=gen_awkward((M, None)), - awk_3d_ragged=gen_awkward((M, 20, None)), da=da.random.random((M, 50)), ) obsm = {k: v for k, v in obsm.items() if type(v) in obsm_types} @@ -217,10 +214,7 @@ def gen_adata( array=np.random.random((N, 50)), sparse=sparse.random(N, 100, format="csr"), df=gen_typed_df(N, var_names), - awk=gen_awkward((N,)), - awk_2d=gen_awkward((N, 20)), awk_2d_ragged=gen_awkward((N, None)), - awk_3d_ragged=gen_awkward((N, 20, None)), da=da.random.random((N, 50)), ) varm = {k: v for k, v in varm.items() if type(v) in varm_types} From 5b39691334973d42842ebf8bfd7f9c6f7d38ffbe Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 2 Feb 2023 15:39:22 +0100 Subject: [PATCH 099/110] Implement outer indexing on axis 0 of an awkward array --- anndata/_core/merge.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index d8cb17029..55cc12003 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -489,13 +489,12 @@ def _apply_to_awkward(self, el: AwkArray, *, axis, fill_value=None): if self.new_idx.isin(self.old_idx).all(): # inner join return el[self.new_idx] else: # outer join - for field in self.new_idx.difference(self.old_idx): - el = ak.with_field(el, None, field) - return el + # TODO: this code isn't actually hit, we should refactor + raise Exception("This should be unreachable, please open an issue.") else: 
- raise NotImplementedError( - "Reindexing along axis 0 is not yet implemented for Awkward Arrays" - ) + if len(self.new_idx) > len(self.old_idx): + el = ak.pad_none(el, 1, axis=axis) # axis == 0 + return el[self.old_idx.get_indexer(self.new_idx)] def merge_indices( From 45a99585a3530776f61157826656c4cb4d225410 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 2 Feb 2023 16:27:15 +0100 Subject: [PATCH 100/110] Fix gen_awkward when one of the dimensions has size 0 --- anndata/tests/helpers.py | 16 ++++++++-------- anndata/tests/test_helpers.py | 31 ++++++++++++++++++------------- 2 files changed, 26 insertions(+), 21 deletions(-) diff --git a/anndata/tests/helpers.py b/anndata/tests/helpers.py index d7b62e058..bcd607ce8 100644 --- a/anndata/tests/helpers.py +++ b/anndata/tests/helpers.py @@ -109,16 +109,16 @@ def gen_awkward(shape, dtype=np.int32): shape = np.array(shape) if np.any(shape == 0): - # use empty numpy array, to pass the correct dimensions to - # ak.Array when one of the dimensions is 0 (the list-of-list approach - # does not work in that case because the list in the 0-dimension would be empty and all - # following dimensions would be lost). 
- # The size of the variable-length dimension is irrelevant in that case, we arbitrarily set it to 1 - np_arr = np.empty([1 if x is None else x for x in shape], dtype=dtype) - arr = AwkArray(np_arr) + # use empty numpy array for fixed dimensions, then add empty singletons for ragged dimensions + var_dims = [i for i, s in enumerate(shape) if s is None] + shape = [s for s in shape if s is not None] + arr = ak.Array(np.empty(shape, dtype=dtype)) + for d in var_dims: + arr = ak.singletons(arr, axis=d - 1) + return arr else: lil = _gen_awkward_inner(shape, rng, dtype) - arr = AwkArray(lil) + arr = ak.values_astype(AwkArray(lil), dtype) # make fixed-length dimensions regular for i, d in enumerate(shape): diff --git a/anndata/tests/test_helpers.py b/anndata/tests/test_helpers.py index 150cab56f..f0fbcf656 100644 --- a/anndata/tests/test_helpers.py +++ b/anndata/tests/test_helpers.py @@ -48,25 +48,30 @@ def reusable_adata(): @pytest.mark.parametrize( - "shape", + "shape, datashape", [ - (4, 2), - (100, 200, None), - (4, None), - (0, 4), - (4, 0), - (8, None, None), - (8, None, None, None), - (4, None, 8), - (100, 200, 4), - (4, 0, 0), - (0, 0, 0), + [(4, 2), "4 * 2 * int32"], + [(100, 200, None), "100 * 200 * var * int32"], + [(4, None), "4 * var * int32"], + [(0, 4), "0 * 4 * int32"], + [(4, 0), "4 * 0 * int32"], + [(8, None, None), "8 * var * var * int32"], + [(8, None, None, None), "8 * var * var * var * int32"], + [(4, None, 8), "4 * var * 8 * int32"], + [(100, 200, 4), "100 * 200 * 4 * int32"], + [(4, 0, 0), "4 * 0 * 0 * int32"], + [(0, 0, 0), "0 * 0 * 0 * int32"], + [(0, None), "0 * var * int32"], ], ) -def test_gen_awkward(shape): +def test_gen_awkward(shape, datashape): + import awkward as ak + arr = gen_awkward(shape) for i, s in enumerate(shape): assert dim_len(arr, i) == s + arr_type = ak.types.from_datashape(datashape) + assert arr.type == arr_type # Does this work for every warning? 
From 94aa4efd0a560f1e8be17af3962c6c4360e8acb1 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 2 Feb 2023 17:10:27 +0100 Subject: [PATCH 101/110] Fix equality function for awkward arrays. Was throwing an error when the arrays weren't broadcastable. --- anndata/_core/merge.py | 17 ++++++++++++++--- 1 file changed, 14 insertions(+), 3 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 55cc12003..c6c6ecfc5 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -158,10 +158,21 @@ def equal_sparse(a, b) -> bool: def equal_awkward(a, b) -> bool: from ..compat import awkward as ak - if dim_len(a, 0) == dim_len(b, 0): - return ak.all(a == b) - else: + if dim_len(a, 0) != dim_len(b, 0): + return False + + a_form, _, a_buffers = ak.to_buffers(ak.to_packed(a)) + b_form, _, b_buffers = ak.to_buffers(ak.to_packed(b)) + + if a_form != b_form: return False + if set(a_buffers.keys()) != set(b_buffers.keys()): + return False + for k in a_buffers.keys(): + if not np.array_equal(a_buffers[k], b_buffers[k], equal_nan=True): + return False + + return True def as_sparse(x): From 99853d5c064b88db269e8c5536af096e5569f6db Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 2 Feb 2023 18:02:13 +0100 Subject: [PATCH 102/110] Modify outer concatenation test to accept current behaviour of awkward array --- anndata/tests/test_concatenate.py | 36 ++++++++++++++++++++++++------- 1 file changed, 28 insertions(+), 8 deletions(-) diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index 5d1990406..569977b4e 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -713,18 +713,38 @@ def test_concatenate_awkward(join_type): ] ) elif join_type == "outer": - expected = ak.Array( - [ - [{"a": 1, "b": "foo"}], - [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}], - [{"a": 4, "b": None}, {"a": 5, "b": None}], - [{"a": 6, "b": None}], - [{"a": 7, "b": None}], + # TODO: This is what we would like to 
return, but waiting on: + # * https://github.com/scikit-hep/awkward/issues/2182 and awkward 2.1.0 + # * https://github.com/scikit-hep/awkward/issues/2173 + # expected = ak.Array( + # [ + # [{"a": 1, "b": "foo"}], + # [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}], + # [{"a": 4, "b": None}, {"a": 5, "b": None}], + # [{"a": 6, "b": None}], + # [{"a": 7, "b": None}], + # ] + # ) + expected = ak.concatenate( + [ # I don't think I can construct a UnionArray directly + ak.Array( + [ + [{"a": 1, "b": "foo"}], + [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}], + ] + ), + ak.Array( + [ + [{"a": 4}, {"a": 5}], + [{"a": 6}], + [{"a": 7}], + ] + ), ] ) result = concat([adata_a, adata_b], join=join_type).obsm["awk"] - # Currently failing while https://github.com/scikit-hep/awkward/issues/2182 is resolved + assert_equal(expected, result) From 96bfe31d8f8bcab5ff6cca57536af9191ff42bbb Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Thu, 2 Feb 2023 22:19:09 +0100 Subject: [PATCH 103/110] Add tests for mixed type concatenation with awkward arrays --- anndata/tests/test_concatenate.py | 33 +++++++++++++++++++++++++++++++ 1 file changed, 33 insertions(+) diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index 569977b4e..d67508854 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -748,6 +748,39 @@ def test_concatenate_awkward(join_type): assert_equal(expected, result) +@pytest.mark.parametrize( + "other", + [ + pd.DataFrame({"a": [4, 5, 6], "b": ["foo", "bar", "baz"]}, index=list("cde")), + np.ones((3, 2)), + sparse.random(3, 100, format="csr"), + ], +) +def test_awkward_does_not_mix(join_type, other): + import awkward as ak + + awk = ak.Array( + [[{"a": 1, "b": "foo"}], [{"a": 2, "b": "bar"}, {"a": 3, "b": "baz"}]] + ) + + adata_a = AnnData( + np.zeros((2, 3), dtype=float), + obs=pd.DataFrame(index=list("ab")), + obsm={"val": awk}, + ) + adata_b = AnnData( + np.zeros((3, 3), dtype=float), + 
obs=pd.DataFrame(index=list("cde")), + obsm={"val": other}, + ) + + with pytest.raises( + NotImplementedError, + match="Cannot concatenate an AwkwardArray with other array types", + ): + concat([adata_a, adata_b], join=join_type) + + def test_pairwise_concat(axis, array_type): dim_sizes = [[100, 200, 50], [50, 50, 50]] if axis: From 4a6d119e0f8df0e32040c03e15eb086b83d5cb9a Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Fri, 3 Feb 2023 14:21:54 +0100 Subject: [PATCH 104/110] Add warning about outer joins --- anndata/_core/merge.py | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index c6c6ecfc5..ac92d131c 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -18,7 +18,7 @@ Literal, ) import typing -from warnings import warn +from warnings import warn, filterwarnings from natsort import natsorted import numpy as np @@ -30,6 +30,7 @@ from ..compat import AwkArray, DaskArray from ..utils import asarray, dim_len from .index import _subset, make_slice +from anndata._warnings import ExperimentalFeatureWarning T = TypeVar("T") @@ -662,6 +663,17 @@ def gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0): raise NotImplementedError( "Cannot concatenate an AwkwardArray with other array types." ) + warn( + "Outer joins on awkward.Arrays will have different return values in the future." 
+ "For details, and to offer input, please see:\n\n\t" + "https://github.com/scverse/anndata/issues/898", + ExperimentalFeatureWarning, + ) + filterwarnings( + "ignore", + category=ExperimentalFeatureWarning, + message=r"Outer joins on awkward.Arrays will have different return values.*", + ) # all_keys = union_keys(el.fields for el in els if not_missing(el)) reindexers = [] for el in els: From 4243ccc26290ef8b7bcb96abd797470a90caae9e Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Fri, 3 Feb 2023 14:54:39 +0100 Subject: [PATCH 105/110] Call ak._util.arrays_approx_equal instead of rolling our own --- anndata/_core/merge.py | 16 +--------------- 1 file changed, 1 insertion(+), 15 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index ac92d131c..7abe33b71 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -159,21 +159,7 @@ def equal_sparse(a, b) -> bool: def equal_awkward(a, b) -> bool: from ..compat import awkward as ak - if dim_len(a, 0) != dim_len(b, 0): - return False - - a_form, _, a_buffers = ak.to_buffers(ak.to_packed(a)) - b_form, _, b_buffers = ak.to_buffers(ak.to_packed(b)) - - if a_form != b_form: - return False - if set(a_buffers.keys()) != set(b_buffers.keys()): - return False - for k in a_buffers.keys(): - if not np.array_equal(a_buffers[k], b_buffers[k], equal_nan=True): - return False - - return True + return ak._util.arrays_approx_equal(a, b) def as_sparse(x): From 5ad915a90af311e5c1796640d392f1325263c322 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Mon, 6 Feb 2023 11:29:15 +0100 Subject: [PATCH 106/110] update awkward to 2.0.7 (unfortunately: errors) --- anndata/_core/merge.py | 2 +- pyproject.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py index 7abe33b71..0f44c2326 100644 --- a/anndata/_core/merge.py +++ b/anndata/_core/merge.py @@ -159,7 +159,7 @@ def equal_sparse(a, b) -> bool: def equal_awkward(a, b) -> bool: from ..compat 
import awkward as ak - return ak._util.arrays_approx_equal(a, b) + return ak.almost_equal(a, b) def as_sparse(x): diff --git a/pyproject.toml b/pyproject.toml index 3eff27cea..2fa1d10cf 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -71,7 +71,7 @@ doc = [ "nbsphinx", "scanpydoc>=0.7.7", "zarr", - "awkward>=2.0.6", + "awkward>=2.0.7", "IPython", # For syntax highlighting in notebooks "myst_parser", ] From 07246cc6da6e347057c5dfcca4c8204155e86f23 Mon Sep 17 00:00:00 2001 From: Gregor Sturm Date: Mon, 6 Feb 2023 18:26:38 +0100 Subject: [PATCH 107/110] remove unnecessary checks from AwkwardArrayView --- anndata/_core/views.py | 15 ++------------- 1 file changed, 2 insertions(+), 13 deletions(-) diff --git a/anndata/_core/views.py b/anndata/_core/views.py index 72ae8eb07..514337f83 100644 --- a/anndata/_core/views.py +++ b/anndata/_core/views.py @@ -220,19 +220,8 @@ def _view_args(self): reference. """ parent_key, attrname, keys = self.layout.parameter(_PARAM_NAME) - if parent_key is None or attrname is None or keys is None: - raise KeyError( - "AwkwardArrayView does not hold reference to original AnnData object." - ) - else: - try: - parent = _registry[parent_key] - except KeyError: - raise KeyError( - "AwkwardArrayView has invalid reference to original AnnData object." 
- ) - else: - return ElementRef(parent, attrname, keys) + parent = _registry[parent_key] + return ElementRef(parent, attrname, keys) def __copy__(self) -> AwkArray: """ From fb137af1022d4d038606b46dd5c31f72d7ef0083 Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 7 Feb 2023 19:32:44 +0100 Subject: [PATCH 108/110] Workaround scikit-hep/awkward#2209 --- anndata/tests/test_concatenate.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/anndata/tests/test_concatenate.py b/anndata/tests/test_concatenate.py index d67508854..f3dfb5ed7 100644 --- a/anndata/tests/test_concatenate.py +++ b/anndata/tests/test_concatenate.py @@ -1258,6 +1258,23 @@ def test_concat_size_0_dim(axis, join_type, merge_strategy, shape): alt_axis = 1 - axis dim = ("obs", "var")[axis] + # TODO: Remove, see: https://github.com/scverse/anndata/issues/905 + import awkward as ak + + if ( + (join_type == "inner") + and (merge_strategy in ("same", "unique")) + and ((axis, shape.index(0)) in [(0, 1), (1, 0)]) + and ak.__version__ == "2.0.7" # indicates if a release has happened + ): + aligned_mapping = (b.obsm, b.varm)[1 - axis] + to_remove = [] + for k, v in aligned_mapping.items(): + if isinstance(v, ak.Array): + to_remove.append(k) + for k in to_remove: + aligned_mapping.pop(k) + expected_size = expected_shape(a, b, axis=axis, join=join_type) result = concat( {"a": a, "b": b}, From 6e326379fb99eefc854c05e696466e258c3ac3de Mon Sep 17 00:00:00 2001 From: "pre-commit-ci[bot]" <66853113+pre-commit-ci[bot]@users.noreply.github.com> Date: Tue, 7 Feb 2023 18:33:44 +0000 Subject: [PATCH 109/110] [pre-commit.ci] auto fixes from pre-commit.com hooks for more information, see https://pre-commit.ci --- anndata/utils.py | 1 - 1 file changed, 1 deletion(-) diff --git a/anndata/utils.py b/anndata/utils.py index 0a3b34407..d9233b2b2 100644 --- a/anndata/utils.py +++ b/anndata/utils.py @@ -141,7 +141,6 @@ def dim_len_awkward(array, axis): elif axis == 0: return len(array) else: - # communicate 
with the recursive function using a context (lateral) context = {"axis": axis} From 3883bb01ea7ece7c1ad6382aac938c78739c229d Mon Sep 17 00:00:00 2001 From: Isaac Virshup Date: Tue, 7 Feb 2023 20:21:59 +0100 Subject: [PATCH 110/110] Removed extra layer of nesting from on-disk format for awkward arrays --- anndata/_io/specs/methods.py | 5 +++-- docs/fileformat-prose.md | 2 +- 2 files changed, 4 insertions(+), 3 deletions(-) diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py index 8a6d7da56..e1789a987 100644 --- a/anndata/_io/specs/methods.py +++ b/anndata/_io/specs/methods.py @@ -515,7 +515,8 @@ def write_awkward(f, k, v, dataset_kwargs=MappingProxyType({})): form, length, container = ak.to_buffers(ak.to_packed(v)) group.attrs["length"] = length group.attrs["form"] = form.to_json() - write_elem(group, "container", container, dataset_kwargs=dataset_kwargs) + for k, v in container.items(): + write_elem(group, k, v, dataset_kwargs=dataset_kwargs) @_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0")) @@ -525,7 +526,7 @@ def read_awkward(elem): form = _read_attr(elem.attrs, "form") length = _read_attr(elem.attrs, "length") - container = read_elem(elem["container"]) + container = {k: read_elem(elem[k]) for k in elem.keys()} return ak.from_buffers(form, length, container) diff --git a/docs/fileformat-prose.md b/docs/fileformat-prose.md index a66fc50e8..c459a4329 100644 --- a/docs/fileformat-prose.md +++ b/docs/fileformat-prose.md @@ -390,7 +390,7 @@ function: >>> ak.from_buffers( ... awkward_group.attrs["form"], ... awkward_group.attrs["length"], -... {k: read_elem(v) for k, v in awkward_group["container"].items()} +... {k: read_elem(v) for k, v in awkward_group.items()} ... ) ```