diff --git a/anndata/anndata.py b/anndata/anndata.py
index 22eb90fb8..d6e5de813 100644
--- a/anndata/anndata.py
+++ b/anndata/anndata.py
@@ -237,6 +237,38 @@ class AnnData(IndexMixin):
 
         ad = AnnData(np.ones((2, 2)))
         ad[:, 0].X == ad.X[:, 0]
+
+    Examples
+    --------
+    >>> adata1 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
+    ...                  {'smp_names': ['s1', 's2'],
+    ...                   'anno1': ['c1', 'c2']},
+    ...                  {'var_names': ['a', 'b', 'c']})
+    >>> adata2 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
+    ...                  {'smp_names': ['s3', 's4'],
+    ...                   'anno1': ['c3', 'c4']},
+    ...                  {'var_names': ['b', 'c', 'd']})
+    >>> adata3 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
+    ...                  {'smp_names': ['s5', 's6'],
+    ...                   'anno2': ['d3', 'd4']},
+    ...                  {'var_names': ['b', 'c', 'd']})
+
+    >>> adata = adata1.concatenate([adata2, adata3])
+    >>> adata.X
+    [[ 2.  3.]
+     [ 5.  6.]
+     [ 1.  2.]
+     [ 4.  5.]
+     [ 1.  2.]
+     [ 4.  5.]]
+    >>> adata.smp
+       anno1 anno2 batch
+    s1    c1   NaN     0
+    s2    c2   NaN     0
+    s3    c3   NaN     1
+    s4    c4   NaN     1
+    s5   NaN    d3     2
+    s6   NaN    d4     2
     """
 
     def __init__(self, data, smp=None, var=None, uns=None, smpm=None, varm=None,
@@ -608,6 +640,48 @@ def copy(self):
         return AnnData(self._data.copy(), self._smp.copy(), self._var.copy(),
                        self._uns.copy(), self._smpm.copy(), self._varm.copy())
 
+    def concatenate(self, adatas):
+        """Concatenate along the samples axis after intersecting the variable names.
+
+        Only variables present in `self` and in every passed AnnData are kept.
+        Variable names are assumed to be unique within each AnnData, and the
+        kept variables come out sorted (see `np.intersect1d`). The `.var`,
+        `.varm`, and `.uns` attributes of the passed adatas are ignored.
+
+        Parameters
+        ----------
+        adatas : AnnData or iterable of AnnData
+            AnnData matrices to concatenate with.
+
+        Returns
+        -------
+        adata : AnnData
+            The concatenated AnnData, where `adata.smp['batch']` stores a
+            categorical variable labeling the batch: `'0'` for `self`, then
+            `'1'`, `'2'`, ... for the passed adatas in order.
+        """
+        # Accept a single AnnData or any iterable (list, tuple, generator);
+        # a real list is needed for `len` and for `[self] + adatas` below.
+        adatas = [adatas] if isinstance(adatas, AnnData) else list(adatas)
+        joint_variables = self.var_names
+        for adata2 in adatas:
+            joint_variables = np.intersect1d(
+                joint_variables, adata2.var_names, assume_unique=True)
+        adatas_to_concat = []
+        categories = [str(i) for i in range(len(adatas)+1)]
+        for i, ad in enumerate([self] + adatas):
+            ad = ad[:, joint_variables]
+            ad.smp['batch'] = pd.Categorical(
+                ad.n_smps*[categories[i]], categories=categories)
+            adatas_to_concat.append(ad)
+        X = np.concatenate([ad.X for ad in adatas_to_concat])
+        smp = pd.concat([ad.smp for ad in adatas_to_concat])
+        smpm = np.concatenate([ad.smpm for ad in adatas_to_concat])
+        var = adatas_to_concat[0].var
+        varm = adatas_to_concat[0].varm
+        uns = adatas_to_concat[0].uns
+        return AnnData(X, smp, var, uns, smpm, varm)
+
     def __contains__(self, key):
         raise AttributeError('AnnData has no attribute __contains__, don\'t check `in adata`.')
 
diff --git a/anndata/tests/ann_data.py b/anndata/tests/ann_data.py
index 45455b0ec..3384f6417 100644
--- a/anndata/tests/ann_data.py
+++ b/anndata/tests/ann_data.py
@@ -179,6 +179,27 @@ def test_n_smps():
     assert adata1.n_smps == 2
 
 
+def test_concatenate():
+    adata1 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
+                     {'smp_names': ['s1', 's2'],
+                      'anno1': ['c1', 'c2']},
+                     {'var_names': ['a', 'b', 'c']})
+    adata2 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
+                     {'smp_names': ['s3', 's4'],
+                      'anno1': ['c3', 'c4']},
+                     {'var_names': ['b', 'c', 'd']})
+    adata3 = AnnData(np.array([[1, 2, 3], [4, 5, 6]]),
+                     {'smp_names': ['s5', 's6'],
+                      'anno2': ['d3', 'd4']},
+                     {'var_names': ['b', 'c', 'd']})
+    adata = adata1.concatenate([adata2, adata3])
+    assert adata.n_vars == 2
+    assert adata.n_smps == 6
+    assert adata.smp_keys() == ['anno1', 'anno2', 'batch']
+    # any iterable of AnnData is accepted, not only a list
+    assert adata1.concatenate((adata2, adata3)).n_vars == 2
+
+
 # TODO: remove logging and actually test values
 # from scanpy import logging as logg
 
diff --git a/setup.py b/setup.py
index c7f82b04a..8bff96a11 100644
--- a/setup.py
+++ b/setup.py
@@ -17,10 +17,10 @@
     name=package_name,
     version=versioneer.get_version(),
     cmdclass=versioneer.get_cmdclass(),
-    description='Class for storing an annotated data matrix.',
+    description='An annotated data matrix.',
     long_description=readme,
     url='http://github.com/theislab/anndata',
-    author='Alex Wolf, Philipp Angerer',
+    author='Philipp Angerer, Alex Wolf',
     author_email='alex.wolf@helmholtz-muenchen.de',
     license='BSD-3-Clause',
     install_requires=requires,