Merge branch 'ig/read_remote_dispatched' of github.com:scverse/anndata into ig/read_remote_dispatched
ilan-gold committed Sep 4, 2023
2 parents b50378c + d24678f commit be37320
Showing 26 changed files with 381 additions and 135 deletions.
12 changes: 6 additions & 6 deletions .azure-pipelines.yml
@@ -13,13 +13,13 @@ jobs:
vmImage: "ubuntu-22.04"
strategy:
matrix:
Python310:
python.version: "3.10"
Python3.11:
python.version: "3.11"
RUN_COVERAGE: yes
Python38:
Python3.8:
python.version: "3.8"
PreRelease:
python.version: "3.10"
python.version: "3.11"
PRERELEASE_DEPENDENCIES: yes
steps:
- task: UsePythonVersion@0
@@ -87,8 +87,8 @@ jobs:
steps:
- task: UsePythonVersion@0
inputs:
versionSpec: "3.10"
displayName: "Use Python 3.10"
versionSpec: "3.11"
displayName: "Use Python 3.11"

- script: |
python -m pip install --upgrade pip
23 changes: 11 additions & 12 deletions .github/workflows/benchmark.yml
@@ -11,12 +11,12 @@ jobs:
runs-on: ${{ matrix.os }}
defaults:
run:
shell: bash -e {0} # -e to fail on error
shell: bash -el {0} # -e to fail on error, -l for mamba

strategy:
fail-fast: false
matrix:
python: ["3.10"]
python: ["3.11"]
os: [ubuntu-latest]

env:
@@ -33,12 +33,15 @@
if: ${{ github.ref_name != 'main' }}
# Errors on main branch

- name: Set up Python ${{ matrix.python }}
uses: actions/setup-python@v4
- uses: mamba-org/setup-micromamba@v1
with:
python-version: ${{ matrix.python }}
cache: "pip"
cache-dependency-path: "**/pyproject.toml"
environment-name: asv
cache-environment: true
create-args: >-
python=3.11
asv
mamba
packaging
- name: Cache datasets
uses: actions/cache@v3
@@ -47,12 +50,8 @@
~/.cache
key: benchmark-state-${{ hashFiles('benchmarks/**') }}

- name: Install dependencies
run: |
pip install asv
- name: Quick benchmark run
working-directory: ${{ env.ASV_DIR }}
run: |
asv machine --yes
asv run -qev --strict
asv run --quick --show-stderr --verbose
2 changes: 1 addition & 1 deletion .github/workflows/test-gpu.yml
@@ -51,7 +51,7 @@ jobs:
micromamba-version: "1.3.1-0"
environment-name: anndata-gpu-ci
create-args: >-
python=3.10
python=3.11
cupy
numba
pytest
4 changes: 2 additions & 2 deletions .pre-commit-config.yaml
@@ -5,12 +5,12 @@ repos:
- id: black
- repo: https://github.com/astral-sh/ruff-pre-commit
# Ruff version.
rev: "v0.0.282"
rev: "v0.0.285"
hooks:
- id: ruff
args: ["--fix"]
- repo: https://github.com/pre-commit/mirrors-prettier
rev: v3.0.1
rev: v3.0.2
hooks:
- id: prettier
- repo: https://github.com/pre-commit/pre-commit-hooks
2 changes: 1 addition & 1 deletion .readthedocs.yml
@@ -2,7 +2,7 @@ version: 2
build:
os: ubuntu-20.04
tools:
python: "3.10"
python: "3.11"
sphinx:
configuration: docs/conf.py
fail_on_warning: true # do not change or you will be fired
4 changes: 2 additions & 2 deletions README.md
@@ -3,8 +3,8 @@
[![Coverage](https://codecov.io/gh/scverse/anndata/branch/main/graph/badge.svg?token=IN1mJN1Wi8)](https://codecov.io/gh/scverse/anndata)
[![Docs](https://readthedocs.com/projects/icb-anndata/badge/?version=latest)](https://anndata.readthedocs.io)
[![PyPI](https://img.shields.io/pypi/v/anndata.svg)](https://pypi.org/project/anndata)
[![PyPIDownloadsMonth](https://img.shields.io/pypi/dm/scanpy?logo=PyPI&color=blue)](https://pypi.org/project/anndata)
[![PyPIDownloadsTotal](https://pepy.tech/badge/anndata)](https://pepy.tech/project/anndata)
[![Downloads](https://static.pepy.tech/badge/anndata/month)](https://pepy.tech/project/anndata)
[![Downloads](https://static.pepy.tech/badge/anndata)](https://pepy.tech/project/anndata)
[![Stars](https://img.shields.io/github/stars/scverse/anndata?logo=GitHub&color=yellow)](https://github.com/scverse/anndata/stargazers)
[![Powered by NumFOCUS](https://img.shields.io/badge/powered%20by-NumFOCUS-orange.svg?style=flat&colorA=E1523D&colorB=007D8A)](http://numfocus.org)

7 changes: 7 additions & 0 deletions anndata/__init__.py
@@ -12,6 +12,13 @@
"anndata is not correctly installed. Please install it, e.g. with pip."
)

# Allowing notes to be added to exceptions. See: https://github.com/scverse/anndata/issues/868
import sys

if sys.version_info < (3, 11):
# Backport package for exception groups
import exceptiongroup # noqa: F401

from ._core.anndata import AnnData
from ._core.merge import concat
from ._core.raw import Raw
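
Note (not part of the diff): a rough sketch of what the `exceptiongroup` backport supplies on Python < 3.11 — an `ExceptionGroup` class so several errors can be raised together, something the built-in exceptions only gained in 3.11. The error messages below are made up for illustration.

```python
import sys

if sys.version_info < (3, 11):
    # Backport: provides ExceptionGroup on interpreters that predate it.
    from exceptiongroup import ExceptionGroup

errors = [ValueError("bad obs names"), TypeError("X is not numeric")]
# One raise carrying several independent failures:
raise ExceptionGroup("AnnData validation failed", errors)
```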
13 changes: 9 additions & 4 deletions anndata/_core/aligned_mapping.py
@@ -246,12 +246,17 @@ def _validate_value(self, val: V, key: str) -> V:
if (
hasattr(val, "index")
and isinstance(val.index, cabc.Collection)
and not (val.index == self.dim_names).all()
and not val.index.equals(self.dim_names)
):
# Could probably also re-order index if it’s contained
raise ValueError(
f"value.index does not match parent’s axis {self.axes[0]} names"
)
try:
pd.testing.assert_index_equal(val.index, self.dim_names)
except AssertionError as e:
msg = f"value.index does not match parent’s axis {self.axes[0]} names:\n{e}"
raise ValueError(msg) from None
else:
msg = "Index.equals and pd.testing.assert_index_equal disagree"
raise AssertionError(msg)
return super()._validate_value(val, key)

@property
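
Note (not part of the diff): the new check leans on two pandas behaviours — `Index.equals` is an order-sensitive comparison that never raises on a length mismatch (unlike the old element-wise `==`), and `pd.testing.assert_index_equal` produces the detailed message that gets folded into the `ValueError`. A minimal illustration:

```python
import pandas as pd

dim_names = pd.Index(["cell_0", "cell_1", "cell_2"])
val_index = pd.Index(["cell_0", "cell_2", "cell_1"])  # same labels, different order

val_index.equals(dim_names)  # False; also False (not an error) for unequal lengths

try:
    pd.testing.assert_index_equal(val_index, dim_names)
except AssertionError as e:
    # The assertion message spells out exactly where the indices differ.
    print(f"value.index does not match parent's axis names:\n{e}")
```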
10 changes: 7 additions & 3 deletions anndata/_core/anndata.py
@@ -1,6 +1,8 @@
"""\
Main class and helper functions.
"""
from __future__ import annotations

import warnings
import collections.abc as cabc
from collections import OrderedDict
@@ -19,7 +21,7 @@
import numpy as np
from numpy import ma
import pandas as pd
from pandas.api.types import infer_dtype, is_string_dtype, is_categorical_dtype
from pandas.api.types import infer_dtype, is_string_dtype
from scipy import sparse
from scipy.sparse import issparse, csr_matrix
from anndata._core.anndata_base import AbstractAnnData
@@ -1119,9 +1121,11 @@ def __getitem__(self, index: Index) -> "AnnData":
oidx, vidx = self._normalize_indices(index)
return AnnData(self, oidx=oidx, vidx=vidx, asview=True)

def _remove_unused_categories(self, df_full, df_sub, uns):
def _remove_unused_categories(
self, df_full: pd.DataFrame, df_sub: pd.DataFrame, uns: dict[str, Any]
):
for k in df_full:
if not is_categorical_dtype(df_full[k]):
if not isinstance(df_full[k].dtype, pd.CategoricalDtype):
continue
all_categories = df_full[k].cat.categories
with pd.option_context("mode.chained_assignment", None):
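
Note (not part of the diff): `pd.api.types.is_categorical_dtype` is deprecated in recent pandas releases; the `isinstance` check against `pd.CategoricalDtype` used above is the forward-compatible equivalent. A minimal illustration:

```python
import pandas as pd

s = pd.Series(["a", "b", "a"], dtype="category")

# Deprecated (warns on newer pandas):
#   pd.api.types.is_categorical_dtype(s)
# Replacement used in the diff:
isinstance(s.dtype, pd.CategoricalDtype)                   # True
isinstance(pd.Series([1, 2]).dtype, pd.CategoricalDtype)   # False
```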
40 changes: 20 additions & 20 deletions anndata/_core/merge.py
@@ -4,27 +4,25 @@
from __future__ import annotations

from collections import OrderedDict
from collections.abc import Mapping, MutableSet
from functools import reduce, singledispatch
from itertools import repeat
from operator import and_, or_, sub
from typing import (
Any,
from collections.abc import (
Callable,
Collection,
Mapping,
MutableSet,
Iterable,
Optional,
Tuple,
TypeVar,
Union,
Literal,
Sequence,
)
from functools import reduce, singledispatch
from itertools import repeat
from operator import and_, or_, sub
from typing import Any, Optional, TypeVar, Union, Literal
import typing
from warnings import warn, filterwarnings

from natsort import natsorted
import numpy as np
import pandas as pd
from pandas.api.extensions import ExtensionDtype
from scipy import sparse
from scipy.sparse import spmatrix
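
Note (not part of the diff): the import reshuffle moves the abstract container types (`Callable`, `Mapping`, `Sequence`, …) to `collections.abc`, whose classes also work in runtime `isinstance` checks; the corresponding `typing` aliases are documented as deprecated. A small sketch:

```python
from collections.abc import Callable, Iterable, Mapping, Sequence

def apply_all(funcs: Iterable[Callable[[int], int]], x: int) -> list[int]:
    # The abc classes serve both as annotations and as runtime checks.
    return [f(x) for f in funcs]

isinstance({"a": 1}, Mapping)    # True
isinstance((1, 2, 3), Sequence)  # True
```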

@@ -211,7 +209,7 @@ def unify_dtypes(dfs: Iterable[pd.DataFrame]) -> list[pd.DataFrame]:
df_dtypes = [dict(df.dtypes) for df in dfs]
columns = reduce(lambda x, y: x.union(y), [df.columns for df in dfs])

dtypes = {col: list() for col in columns}
dtypes: dict[str, list[np.dtype | ExtensionDtype]] = {col: [] for col in columns}
for col in columns:
for df in df_dtypes:
dtypes[col].append(df.get(col, None))
@@ -235,7 +233,9 @@ def unify_dtypes(dfs: Iterable[pd.DataFrame]) -> list[pd.DataFrame]:
return dfs


def try_unifying_dtype(col: list) -> pd.core.dtypes.base.ExtensionDtype | None:
def try_unifying_dtype(
col: Sequence[np.dtype | ExtensionDtype],
) -> pd.core.dtypes.base.ExtensionDtype | None:
"""
If dtypes can be unified, returns the dtype they would be unified to.
@@ -248,26 +248,26 @@ def try_unifying_dtype(col: list) -> pd.core.dtypes.base.ExtensionDtype | None:
A list of dtypes to unify. Can be numpy/ pandas dtypes, or None (which denotes
a missing value)
"""
dtypes = set()
dtypes: set[pd.CategoricalDtype] = set()
# Categorical
if any([pd.api.types.is_categorical_dtype(x) for x in col]):
if any(isinstance(dtype, pd.CategoricalDtype) for dtype in col):
ordered = False
for dtype in col:
if pd.api.types.is_categorical_dtype(dtype):
if isinstance(dtype, pd.CategoricalDtype):
dtypes.add(dtype)
ordered = ordered | dtype.ordered
elif not pd.isnull(dtype):
return False
if len(dtypes) > 0 and not ordered:
categories = reduce(
lambda x, y: x.union(y),
[x.categories for x in dtypes if not pd.isnull(x)],
[dtype.categories for dtype in dtypes if not pd.isnull(dtype)],
)

return pd.CategoricalDtype(natsorted(categories), ordered=False)
# Boolean
elif all([pd.api.types.is_bool_dtype(x) or x is None for x in col]):
if any([x is None for x in col]):
elif all(pd.api.types.is_bool_dtype(dtype) or dtype is None for dtype in col):
if any(dtype is None for dtype in col):
return pd.BooleanDtype()
else:
return None
@@ -942,7 +942,7 @@ def merge_outer(mappings, batch_keys, *, join_index="-", merge=merge_unique):
return out


def _resolve_dim(*, dim: str = None, axis: int = None) -> Tuple[int, str]:
def _resolve_dim(*, dim: str = None, axis: int = None) -> tuple[int, str]:
_dims = ("obs", "var")
if (dim is None and axis is None) or (dim is not None and axis is not None):
raise ValueError(
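
Note (not part of the diff): to make the categorical branch of `try_unifying_dtype` concrete, here is a hand-rolled sketch (not from the codebase) of unifying two categorical dtypes into a single unordered dtype whose categories are the natsorted union:

```python
import pandas as pd
from natsort import natsorted  # same helper merge.py imports

a = pd.CategoricalDtype(["sample10", "sample2"])
b = pd.CategoricalDtype(["sample2", "sample1"])

categories = set(a.categories) | set(b.categories)
unified = pd.CategoricalDtype(natsorted(categories), ordered=False)
# CategoricalDtype(categories=['sample1', 'sample2', 'sample10'], ordered=False)
```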
43 changes: 35 additions & 8 deletions anndata/_io/specs/methods.py
@@ -38,6 +38,18 @@
H5File = h5py.File


####################
# Dask utils #
####################

try:
from dask.utils import SerializableLock as Lock
except ImportError:
from threading import Lock

# to fix https://github.com/dask/distributed/issues/780
GLOBAL_LOCK = Lock()

####################
# Dispatch methods #
####################
@@ -270,7 +282,7 @@ def read_anndata(elem, _reader):
@_REGISTRY.register_write(H5Group, Raw, IOSpec("raw", "0.1.0"))
@_REGISTRY.register_write(ZarrGroup, Raw, IOSpec("raw", "0.1.0"))
def write_raw(f, k, raw, _writer, dataset_kwargs=MappingProxyType({})):
g = f.create_group(k)
g = f.require_group(k)
_writer.write_elem(g, "X", raw.X, dataset_kwargs=dataset_kwargs)
_writer.write_elem(g, "var", raw.var, dataset_kwargs=dataset_kwargs)
_writer.write_elem(g, "varm", dict(raw.varm), dataset_kwargs=dataset_kwargs)
@@ -290,7 +302,7 @@ def read_mapping(elem, _reader):
@_REGISTRY.register_write(H5Group, dict, IOSpec("dict", "0.1.0"))
@_REGISTRY.register_write(ZarrGroup, dict, IOSpec("dict", "0.1.0"))
def write_mapping(f, k, v, _writer, dataset_kwargs=MappingProxyType({})):
g = f.create_group(k)
g = f.require_group(k)
for sub_k, sub_v in v.items():
_writer.write_elem(g, sub_k, sub_v, dataset_kwargs=dataset_kwargs)

@@ -331,9 +343,24 @@ def write_basic(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})):


@_REGISTRY.register_write(ZarrGroup, DaskArray, IOSpec("array", "0.2.0"))
def write_basic_dask_zarr(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})):
import dask.array as da

g = f.require_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)
da.store(elem, g, lock=GLOBAL_LOCK)


# Adding this separately because h5py isn't serializable
# https://github.com/pydata/xarray/issues/4242
@_REGISTRY.register_write(H5Group, DaskArray, IOSpec("array", "0.2.0"))
def write_basic_dask(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})):
def write_basic_dask_h5(f, k, elem, _writer, dataset_kwargs=MappingProxyType({})):
import dask.array as da
import dask.config as dc

if dc.get("scheduler", None) == "dask.distributed":
raise ValueError(
"Cannot write dask arrays to hdf5 when using distributed scheduler"
)

g = f.require_dataset(k, shape=elem.shape, dtype=elem.dtype, **dataset_kwargs)
da.store(elem, g)
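
Note (not part of the diff): both dask writers follow the same pattern — pre-create the target dataset, then let `da.store` fill it block by block; the zarr path additionally passes the shared lock, while the hdf5 path refuses to run under the distributed scheduler because open h5py handles cannot be serialized. A hypothetical zarr-flavoured usage sketch:

```python
import dask.array as da
import zarr

group = zarr.group()  # in-memory store, for illustration only
x = da.random.random((1000, 100), chunks=(250, 100))

# Pre-create the dataset, then stream the dask blocks into it.
target = group.require_dataset("X", shape=x.shape, dtype=x.dtype)
da.store(x, target, lock=True)  # lock=True enables dask's default locking
```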
@@ -459,7 +486,7 @@ def write_sparse_compressed(
fmt: Literal["csr", "csc"],
dataset_kwargs=MappingProxyType({}),
):
g = f.create_group(key)
g = f.require_group(key)
g.attrs["shape"] = value.shape

# Allow resizing for hdf5
@@ -548,7 +575,7 @@ def read_sparse_partial(elem, *, items=None, indices=(slice(None), slice(None)))
def write_awkward(f, k, v, _writer, dataset_kwargs=MappingProxyType({})):
from anndata.compat import awkward as ak

group = f.create_group(k)
group = f.require_group(k)
form, length, container = ak.to_buffers(ak.to_packed(v))
group.attrs["length"] = length
group.attrs["form"] = form.to_json()
@@ -582,7 +609,7 @@ def write_dataframe(f, key, df, _writer, dataset_kwargs=MappingProxyType({})):
for reserved in ("_index",):
if reserved in df.columns:
raise ValueError(f"{reserved!r} is a reserved name for dataframe columns.")
group = f.create_group(key)
group = f.require_group(key)
col_names = [check_key(c) for c in df.columns]
group.attrs["column-order"] = col_names

@@ -701,7 +728,7 @@ def read_partial_dataframe_0_1_0(
@_REGISTRY.register_write(H5Group, pd.Categorical, IOSpec("categorical", "0.2.0"))
@_REGISTRY.register_write(ZarrGroup, pd.Categorical, IOSpec("categorical", "0.2.0"))
def write_categorical(f, k, v, _writer, dataset_kwargs=MappingProxyType({})):
g = f.create_group(k)
g = f.require_group(k)
g.attrs["ordered"] = bool(v.ordered)

_writer.write_elem(g, "codes", v.codes, dataset_kwargs=dataset_kwargs)
@@ -748,7 +775,7 @@ def read_partial_categorical(elem, *, items=None, indices=(slice(None),)):
ZarrGroup, pd.arrays.BooleanArray, IOSpec("nullable-boolean", "0.1.0")
)
def write_nullable_integer(f, k, v, _writer, dataset_kwargs=MappingProxyType({})):
g = f.create_group(k)
g = f.require_group(k)
if v._mask is not None:
_writer.write_elem(g, "mask", v._mask, dataset_kwargs=dataset_kwargs)
_writer.write_elem(g, "values", v._data, dataset_kwargs=dataset_kwargs)
(diffs for the remaining changed files not loaded)
