scverse · ivirshup · Feb 7, 2023 · Nov 14, 2021 · Nov 14, 2021 · Nov 14, 2021
diff --git a/.gitignore b/.gitignore
@@ -27,4 +27,5 @@ test.h5ad
 
 # IDEs
 /.idea/
+/.vscode/
 
diff --git a/anndata/_core/aligned_mapping.py b/anndata/_core/aligned_mapping.py
@@ -1,5 +1,6 @@
 from abc import ABC, abstractmethod
 from collections import abc as cabc
+from copy import copy
 from typing import Union, Optional, Type, ClassVar, TypeVar  # Special types
 from typing import Iterator, Mapping, Sequence  # ABCs
 from typing import Tuple, List, Dict  # Generic base types
@@ -8,11 +9,12 @@
 import pandas as pd
 from scipy.sparse import spmatrix
 
-from ..utils import deprecated, ensure_df_homogeneous
+from ..utils import deprecated, ensure_df_homogeneous, dim_len
 from . import raw, anndata
 from .views import as_view
 from .access import ElementRef
 from .index import _subset
+from anndata.compat import AwkArray
 
 
 OneDIdx = Union[Sequence[int], Sequence[bool], slice]
@@ -47,14 +49,22 @@ def _ipython_key_completions_(self) -> List[str]:
     def _validate_value(self, val: V, key: str) -> V:
         """Raises an error if value is invalid"""
         for i, axis in enumerate(self.axes):
-            if self.parent.shape[axis] != val.shape[i]:
+            if self.parent.shape[axis] != dim_len(val, i):
                 right_shape = tuple(self.parent.shape[a] for a in self.axes)
-                raise ValueError(
-                    f"Value passed for key {key!r} is of incorrect shape. "
-                    f"Values of {self.attrname} must match dimensions "
-                    f"{self.axes} of parent. Value had shape {val.shape} while "
-                    f"it should have had {right_shape}."
-                )
+                actual_shape = tuple(dim_len(val, a) for a, _ in enumerate(self.axes))
+                if actual_shape[i] is None and isinstance(val, AwkArray):
+                    # Adjacent literals (no comma) so ValueError gets ONE message string.
+                    raise ValueError(
+                        f"The AwkwardArray is of variable length in dimension {i}. "
+                        f"Try ak.to_regular(array, {i}) before including the array in AnnData"
+                    )
+                raise ValueError(
+                    f"Value passed for key {key!r} is of incorrect shape. "
+                    f"Values of {self.attrname} must match dimensions "
+                    f"{self.axes} of parent. Value had shape {actual_shape} while "
+                    f"it should have had {right_shape}."
+                )
+
         if not self._allow_df and isinstance(val, pd.DataFrame):
             name = self.attrname.title().rstrip("s")
             val = ensure_df_homogeneous(val, f"{name} {key!r}")
@@ -84,7 +94,10 @@ def parent(self) -> Union["anndata.AnnData", "raw.Raw"]:
     def copy(self):
         d = self._actual_class(self.parent, self._axis)
         for k, v in self.items():
-            d[k] = v.copy()
+            # Awkward arrays do not implement `.copy()`, so they are copied
+            # through the stdlib copy protocol instead.
+            is_awkward = isinstance(v, AwkArray)
+            d[k] = copy(v) if is_awkward else v.copy()
         return d
 
     def _view(self, parent: "anndata.AnnData", subset_idx: I):

diff --git a/anndata/_core/anndata.py b/anndata/_core/anndata.py
@@ -45,7 +45,7 @@
 )
 from .sparse_dataset import SparseDataset
 from .. import utils
-from ..utils import convert_to_dict, ensure_df_homogeneous
+from ..utils import convert_to_dict, ensure_df_homogeneous, dim_len
 from ..logging import anndata_logger as logger
 from ..compat import (
     ZarrArray,
@@ -56,6 +56,7 @@
     _move_adj_mtx,
     _overloaded_uns,
     OverloadedDict,
+    AwkArray,
 )
 
 
@@ -1852,7 +1853,7 @@ def _check_dimensions(self, key=None):
         if "obsm" in key:
             obsm = self._obsm
             if (
-                not all([o.shape[0] == self._n_obs for o in obsm.values()])
+                not all([dim_len(o, 0) == self._n_obs for o in obsm.values()])
                 and len(obsm.dim_names) != self._n_obs
             ):
                 raise ValueError(
@@ -1862,7 +1863,7 @@ def _check_dimensions(self, key=None):
         if "varm" in key:
             varm = self._varm
             if (
-                not all([v.shape[0] == self._n_vars for v in varm.values()])
+                not all([dim_len(v, 0) == self._n_vars for v in varm.values()])
                 and len(varm.dim_names) != self._n_vars
             ):
                 raise ValueError(

diff --git a/anndata/_core/index.py b/anndata/_core/index.py
@@ -7,7 +7,7 @@
 import numpy as np
 import pandas as pd
 from scipy.sparse import spmatrix, issparse
-
+from ..compat import AwkArray
 
 Index1D = Union[slice, int, str, np.int64, np.ndarray]
 Index = Union[Index1D, Tuple[Index1D, Index1D], spmatrix]
@@ -140,6 +140,13 @@ def _subset_df(df: pd.DataFrame, subset_idx: Index):
     return df.iloc[subset_idx]
 
 
+@_subset.register(AwkArray)
+def _subset_awkarray(a: AwkArray, subset_idx: Index):
+    # When every indexer is iterable, combine them into an open mesh (np.ix_).
+    use_mesh = all(isinstance(x, cabc.Iterable) for x in subset_idx)
+    return a[np.ix_(*subset_idx) if use_mesh else subset_idx]
+
+
 # Registration for SparseDataset occurs in sparse_dataset.py
 @_subset.register(h5py.Dataset)
 def _subset_dataset(d, subset_idx):

diff --git a/anndata/_core/merge.py b/anndata/_core/merge.py
@@ -17,8 +17,8 @@
 from scipy.sparse import spmatrix
 
 from .anndata import AnnData
-from ..compat import Literal
-from ..utils import asarray
+from ..compat import Literal, AwkArray
+from ..utils import asarray, dim_len
 
 T = TypeVar("T")
 
@@ -128,6 +128,16 @@ def equal_sparse(a, b) -> bool:
         return False
 
 
+@equal.register(AwkArray)
+def equal_awkward(a, b) -> bool:
+    """Compare elementwise, but only when the axis-0 lengths agree."""
+    from ..compat import awkward as ak
+
+    if dim_len(a, 0) != dim_len(b, 0):
+        return False
+    return ak.all(a == b)
+
+
 def as_sparse(x):
     if not isinstance(x, sparse.spmatrix):
         return sparse.csr_matrix(x)
@@ -341,12 +351,14 @@ def apply(self, el, *, axis, fill_value=None):
 
         Missing values are to be replaced with `fill_value`.
         """
-        if self.no_change and (el.shape[axis] == len(self.old_idx)):
+        if self.no_change and (dim_len(el, axis) == len(self.old_idx)):
             return el
         if isinstance(el, pd.DataFrame):
             return self._apply_to_df(el, axis=axis, fill_value=fill_value)
         elif isinstance(el, sparse.spmatrix):
             return self._apply_to_sparse(el, axis=axis, fill_value=fill_value)
+        elif isinstance(el, AwkArray):
+            return self._apply_to_awkward(el, axis=axis, fill_value=fill_value)
         else:
             return self._apply_to_array(el, axis=axis, fill_value=fill_value)
 
@@ -424,6 +436,21 @@ def _apply_to_sparse(self, el: spmatrix, *, axis, fill_value=None) -> spmatrix:
 
         return out
 
+    def _apply_to_awkward(self, el: AwkArray, *, axis, fill_value=None):
+        """Reindex `el` along `axis`; `fill_value` is accepted but never used."""
+        if dim_len(el, axis) is None:
+            # Do not reindex variable-length dimensions
+            return el
+        indexer = self.old_idx.get_indexer(self.new_idx)
+        if -1 in indexer:  # -1 marks labels of new_idx absent from old_idx
+            raise NotImplementedError(
+                "Outer join operations are currently not supported with AwkwardArrays"
+            )
+        if axis not in (0, 1):
+            # previously fell through and returned None without any error
+            raise ValueError(f"axis must be 0 or 1, got {axis!r}")
+        return el[indexer] if axis == 0 else el[:, indexer]
+
 
 def merge_indices(
     inds: Iterable[pd.Index], join: Literal["inner", "outer"]
@@ -490,6 +517,19 @@ def concat_arrays(arrays, reindexers, axis=0, index=None, fill_value=None):
         )
         df.index = index
         return df
+    elif any(isinstance(a, AwkArray) for a in arrays):
+        from ..compat import awkward as ak
+
+        if not all(
+            isinstance(a, AwkArray) or a is MissingVal or 0 in a.shape for a in arrays
+        ):
+            raise NotImplementedError(
+                "Cannot concatenate an AwkwardArray with other array types."
+            )
+
+        return ak.concatenate(
+            [f(a, axis=1 - axis) for f, a in zip(reindexers, arrays)], axis=axis
+        )
     elif any(isinstance(a, sparse.spmatrix) for a in arrays):
         sparse_stack = (sparse.vstack, sparse.hstack)[axis]
         return sparse_stack(
@@ -535,6 +575,14 @@ def gen_inner_reindexers(els, new_index, axis: Literal[0, 1] = 0):
             lambda x, y: x.intersection(y), (df_indices(el) for el in els)
         )
         reindexers = [Reindexer(df_indices(el), common_ind) for el in els]
+    elif any(isinstance(el, AwkArray) for el in els if not_missing(el)):
+        if not all(isinstance(el, AwkArray) for el in els if not_missing(el)):
+            raise NotImplementedError(
+                "Cannot concatenate an AwkwardArray with other array types."
+            )
+        # do not reindex awkward arrays
+        # TODO unintended behaviour?
+        reindexers = [lambda *args, **kwargs: args[0] for _ in els]
     else:
         min_ind = min(el.shape[alt_axis] for el in els)
         reindexers = [
@@ -552,6 +600,14 @@ def gen_outer_reindexers(els, shapes, new_index: pd.Index, *, axis=0):
             else (lambda x: pd.DataFrame(index=range(shape)))
             for el, shape in zip(els, shapes)
         ]
+    elif any(isinstance(el, AwkArray) for el in els if not_missing(el)):
+        if not all(isinstance(el, AwkArray) for el in els if not_missing(el)):
+            raise NotImplementedError(
+                "Cannot concatenate an AwkwardArray with other array types."
+            )
+        # do not reindex awkward arrays
+        # TODO unintended behaviour?
+        reindexers = [lambda *args, **kwargs: args[0] for _ in els]
     else:
         # if fill_value is None:
         # fill_value = default_fill_value(els)

diff --git a/anndata/_core/views.py b/anndata/_core/views.py
@@ -1,5 +1,6 @@
 from contextlib import contextmanager
-from copy import deepcopy
+from copy import copy, deepcopy
+from enum import Enum
 from functools import reduce, singledispatch, wraps
 from typing import Any, KeysView, Optional, Sequence, Tuple
 import warnings
@@ -11,7 +12,7 @@
 
 from anndata._warnings import ImplicitModificationWarning
 from .access import ElementRef
-from ..compat import ZappyArray
+from ..compat import ZappyArray, AwkArray
 
 
 class _SetItemMixin:
@@ -157,6 +158,68 @@ def as_view_zappy(z, view_args):
     return z
 
 
+try:
+    from ..compat import awkward as ak
+    import weakref
+
+    # Registry to store weak references from AwkwardArrayViews to their parent AnnData container
+    _registry = weakref.WeakValueDictionary()
+    _PARAM_NAME = "_view_args"
+
+    class AwkwardArrayView(_ViewMixin, AwkArray):
+        @property
+        def _view_args(self):
+            """Override _view_args to retrieve the values from awkward arrays parameters.
+
+            Awkward arrays cannot be subclassed like other python objects. Instead subclasses need
+            to be attached as "behavior". These "behaviors" cannot take any additional parameters (as we do
+            for other data types to store `_view_args`). Therefore, we need to store `_view_args` using awkward's
+            parameter mechanism. These parameters need to be json-serializable, which is why we can't store
+            ElementRef directly, but need to replace the reference to the parent AnnDataView container with a weak
+            reference.
+            """
+            # A missing parameter is presumably returned as None; map that onto the
+            # KeyError below instead of failing while unpacking.
+            ref = self.layout.parameter(_PARAM_NAME) or (None, None, None)
+            parent_key, attrname, keys = ref
+            if parent_key is None or attrname is None or keys is None:
+                raise KeyError(
+                    "AwkwardArrayView does not hold reference to original AnnData object."
+                )
+            try:
+                parent = _registry[parent_key]
+            except KeyError:
+                raise KeyError(
+                    "AwkwardArrayView has invalid reference to original AnnData object."
+                ) from None
+            return ElementRef(parent, attrname, keys)
+
+        def __copy__(self) -> AwkArray:
+            """
+            Turn the AwkwardArrayView into an actual AwkwardArray with no special behavior.
+
+            Need to override __copy__ instead of `.copy()` as awkward arrays don't implement `.copy()`
+            and are copied using python's standard copy mechanism in `aligned_mapping.py`.
+            """
+            # makes a shallow copy and removes the reference to the original AnnData object
+            array = ak.with_parameter(self, _PARAM_NAME, None)
+            array = ak.with_name(array, None)
+            return array
+
+    @as_view.register(AwkArray)
+    def as_view_awkarray(array, view_args):
+        parent, attrname, keys = view_args
+        parent_key = f"target-{id(parent)}"  # registry key stored instead of `parent`
+        _registry[parent_key] = parent
+        tagged = ak.with_parameter(array, _PARAM_NAME, (parent_key, attrname, keys))
+        return ak.with_name(tagged, name="AwkwardArrayView")
+
+    ak.behavior["*", "AwkwardArrayView"] = AwkwardArrayView
+
+except ImportError:
+    pass
+
+
 def _resolve_idxs(old, new, adata):
     t = tuple(_resolve_idx(old[i], new[i], adata.shape[i]) for i in (0, 1))
     return t

diff --git a/anndata/_io/specs/methods.py b/anndata/_io/specs/methods.py
@@ -29,6 +29,7 @@
 )
 from anndata._io.utils import report_write_key_on_error, check_key, H5PY_V3
 from anndata._warnings import OldFormatWarning
+from anndata.compat import AwkArray
 
 from .registry import (
     _REGISTRY,
@@ -481,6 +482,35 @@ def read_sparse_partial(elem, *, items=None, indices=(slice(None), slice(None)))
     return SparseDataset(elem)[indices]
 
 
+#################
+# Awkward array #
+#################
+
+
+@_REGISTRY.register_write(H5Group, AwkArray, IOSpec("awkward-array", "0.1.0"))
+@_REGISTRY.register_write(ZarrGroup, AwkArray, IOSpec("awkward-array", "0.1.0"))
+def write_awkward(f, k, v, dataset_kwargs=MappingProxyType({})):
+    """Write `v` into a new group `k` of `f`: two attrs plus a "container" element."""
+    from anndata.compat import awkward as ak
+    grp = f.create_group(k)
+    form, length, buffers = ak.to_buffers(ak.packed(v))
+    grp.attrs["length"] = length
+    grp.attrs["form"] = form.to_json()
+    write_elem(grp, "container", buffers, dataset_kwargs=dataset_kwargs)
+
+
+@_REGISTRY.register_read(H5Group, IOSpec("awkward-array", "0.1.0"))
+@_REGISTRY.register_read(ZarrGroup, IOSpec("awkward-array", "0.1.0"))
+def read_awkward(elem):
+    """Rebuild an array from the "form"/"length" attrs and the "container" element."""
+    from anndata.compat import awkward as ak
+
+    attrs = elem.attrs
+    form, length = _read_attr(attrs, "form"), _read_attr(attrs, "length")
+    buffers = read_elem(elem["container"])
+    return ak.from_buffers(form, length, buffers)
+
+
 ##############
 # DataFrames #
 ##############