
Add coordinate-based coactivation-based parcellation class #533

Draft · wants to merge 27 commits into base: main
Changes from 7 commits
Commits (27)
08ae309
Add n option to get_studies_by_coordinate
tsalo Jun 30, 2021
88d5c05
Mock up maybe passable CoordCBP class.
tsalo Jun 30, 2021
3ce90f1
Fix docstrings.
tsalo Jun 30, 2021
40f4585
Don't calculate distances.
tsalo Jul 1, 2021
530048f
Remove extra newline.
tsalo Jul 1, 2021
8382704
Clustering should be fairly good. Still need to implement metrics.
tsalo Jul 1, 2021
c79b251
Add empty methods for the different metrics.
tsalo Jul 1, 2021
3e2d580
Work on filter-selection metric. Far from done.
tsalo Jul 1, 2021
3365d5c
Add a couple of metrics.
tsalo Jul 1, 2021
7081604
Improve metric documentation.
tsalo Jul 2, 2021
12cddec
VI should be fairly good now.
tsalo Jul 2, 2021
7023291
Merge branch 'main' into coord-cbp
tsalo Jul 9, 2021
f37751a
Draft voxel misclassification metric.
tsalo Jul 9, 2021
f555790
Merge branch 'main' into coord-cbp
tsalo Jul 12, 2021
0896745
Draft another metric.
tsalo Jul 14, 2021
f9611db
Little cleanup.
tsalo Jul 14, 2021
909c822
Move ratio calculations to main fit method.
tsalo Jul 14, 2021
65857d1
More cleanup.
tsalo Jul 14, 2021
65b9675
More work and debugging.
tsalo Jul 14, 2021
118e718
Mention re-labeling (not implemented).
tsalo Jul 15, 2021
267a135
Merge branch 'main' into coord-cbp
tsalo Dec 21, 2021
530a4de
Remove added line.
tsalo Dec 21, 2021
e58a9d9
Merge branch 'main' into coord-cbp
tsalo Apr 20, 2022
a1bcf0f
Merge remote-tracking branch 'upstream/main' into jperaza-coord-cbp
JulioAPeraza Jul 25, 2022
1c352f7
Update test_dataset.py
JulioAPeraza Oct 4, 2022
917e943
Merge branch 'main' into coord-cbp
JulioAPeraza Oct 4, 2022
0c60dd5
Update utils.py
JulioAPeraza Oct 4, 2022
41 changes: 32 additions & 9 deletions nimare/dataset.py
@@ -548,7 +548,8 @@ def get_studies_by_label(self, labels=None, label_threshold=0.001):
the labels above the threshold, it will be returned.
Default is None.
label_threshold : :obj:`float`, optional
Default is 0.5.
Default is 0.001, which corresponds to terms appearing at least once in abstracts
for the commonly used Neurosynth TF-IDF feature set.

Returns
-------
@@ -600,27 +601,49 @@ def get_studies_by_mask(self, mask):
found_ids = list(self.coordinates.loc[distances, "id"].unique())
return found_ids

def get_studies_by_coordinate(self, xyz, r=20):
def get_studies_by_coordinate(self, xyz, r=None, n=None):
"""Extract list of studies with at least one focus within radius of requested coordinates.

.. versionchanged:: 0.0.9

New option (``n``) added to identify the n studies closest to the coordinates.
Default value for r changed to None.

Parameters
----------
xyz : (X x 3) array_like
List of coordinates against which to find studies.
r : :obj:`float`, optional
Radius (in mm) within which to find studies. Default is 20mm.
Radius (in mm) within which to find studies. Mutually exclusive with ``n``.
Default is None.
n : :obj:`int`, optional
Number of closest studies to identify. Mutually exclusive with ``r``.
Default is None.

Returns
-------
found_ids : :obj:`list`
A list of IDs from the Dataset with at least one focus within
radius r of requested coordinates.
A list of IDs from the Dataset matching the search criterion.
"""
if n and r:
raise ValueError("Only one of 'r' and 'n' may be provided.")
elif not n and not r:
raise ValueError("Either 'r' or 'n' must be provided.")

from scipy.spatial.distance import cdist

xyz = np.array(xyz)
temp_coords = self.coordinates.copy()

xyz = np.atleast_2d(xyz)
assert xyz.shape[1] == 3 and xyz.ndim == 2
distances = cdist(xyz, self.coordinates[["x", "y", "z"]].values)
distances = np.any(distances <= r, axis=0)
found_ids = list(self.coordinates.loc[distances, "id"].unique())
distances = cdist(xyz, temp_coords[["x", "y", "z"]].values)
distances = np.min(distances, axis=0)
temp_coords["distance"] = distances
min_dist_ser = temp_coords.groupby("id")["distance"].min()

if r:
found_ids = min_dist_ser.loc[min_dist_ser <= r].index.tolist()
elif n:
found_ids = min_dist_ser.nsmallest(n).index.tolist()

return found_ids
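
For reference, a minimal usage sketch of the updated method (not part of this diff; the dataset file name and coordinates are placeholders):

```python
from nimare.dataset import Dataset

dset = Dataset("dataset.json")  # placeholder path

# Radius-based search: studies with at least one focus within 10 mm.
ids_r = dset.get_studies_by_coordinate([[0, -52, 26]], r=10)

# Nearest-study search: the 5 studies with foci closest to the coordinate.
ids_n = dset.get_studies_by_coordinate([[0, -52, 26]], n=5)

# Passing both criteria, or neither, raises a ValueError.
```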
223 changes: 223 additions & 0 deletions nimare/parcellate.py
@@ -0,0 +1,223 @@
"""Parcellation tools."""
import datetime
import inspect
import logging
import os
from tempfile import mkstemp

import numpy as np
from sklearn.cluster import KMeans

from .base import NiMAREBase
from .extract.utils import _get_dataset_dir
from .meta.base import CBMAEstimator
from .meta.cbma.ale import ALE
from .results import MetaResult
from .utils import add_metadata_to_dataframe, check_type, listify, use_memmap, vox2mm

LGR = logging.getLogger(__name__)


class CoordCBP(NiMAREBase):
"""Perform coordinate-based coactivation-based parcellation.

.. versionadded:: 0.0.10

Parameters
----------
target_mask : :obj:`nibabel.nifti1.Nifti1Image`
Mask of the parcellation target.
Currently must be in the same space/resolution as the Dataset mask.
n_clusters : :obj:`list` of :obj:`int`
Number of clusters to evaluate in clustering.
Metrics will be calculated for each cluster-count in the list, to allow users to select
the optimal cluster solution.
r : :obj:`float` or :obj:`list` of :obj:`float` or None, optional
Radius (in mm) within which to find studies.
If a list of values is provided, then MACMs and clustering will be performed across all
values, and a selection procedure will be performed to identify the optimal ``r``.
Mutually exclusive with ``n``. Default is None.
n : :obj:`int` or :obj:`list` of :obj:`int` or None, optional
Number of closest studies to identify.
If a list of values is provided, then MACMs and clustering will be performed across all
values, and a selection procedure will be performed to identify the optimal ``n``.
Mutually exclusive with ``r``. Default is None.
meta_estimator : :obj:`nimare.meta.base.CBMAEstimator`, optional
CBMA Estimator with which to run the MACMs.
Default is :obj:`nimare.meta.cbma.ale.ALE`.
target_image : :obj:`str`, optional
Name of meta-analysis results image to use for clustering.
Default is "ale", which is specific to the ALE estimator.
"""

_required_inputs = {"coordinates": ("coordinates", None)}

def __init__(
self,
target_mask,
n_clusters,
r=None,
n=None,
meta_estimator=None,
target_image="ale",
*args,
**kwargs,
):
super().__init__(*args, **kwargs)

if meta_estimator is None:
meta_estimator = ALE()
else:
meta_estimator = check_type(meta_estimator, CBMAEstimator)

if r and n:
raise ValueError("Only one of 'r' and 'n' may be provided.")
elif not r and not n:
raise ValueError("Either 'r' or 'n' must be provided.")

self.meta_estimator = meta_estimator
self.target_image = target_image
self.n_clusters = listify(n_clusters)
self.filter_selection = isinstance(r, list) or isinstance(n, list)
self.filter_type = "r" if r else "n"
self.r = listify(r)
self.n = listify(n)

def _preprocess_input(self, dataset):
"""Mask required input images using either the dataset's mask or the estimator's.

Also, insert required metadata into coordinates DataFrame.
"""
super()._preprocess_input(dataset)

# All extra (non-ijk) parameters for a kernel should be overrideable as
# parameters to __init__, so we can access them with get_params()
kt_args = list(self.meta_estimator.kernel_transformer.get_params().keys())

# Integrate "sample_size" from metadata into DataFrame so that
# kernel_transformer can access it.
if "sample_size" in kt_args:
self.inputs_["coordinates"] = add_metadata_to_dataframe(
dataset,
self.inputs_["coordinates"],
metadata_field="sample_sizes",
target_column="sample_size",
filter_func=np.mean,
)

@use_memmap(LGR, n_files=1)
def _fit(self, dataset):
"""Perform coordinate-based coactivation-based parcellation on dataset.

Parameters
----------
dataset : :obj:`nimare.dataset.Dataset`
Dataset to analyze.
"""
self.dataset = dataset
self.masker = self.masker or dataset.masker

# Loop through voxels in target_mask, selecting studies for each and running MACMs (no MCC)
target_ijk = np.vstack(np.where(self.target_mask.get_fdata()))
target_xyz = vox2mm(target_ijk, self.masker.mask_img.affine)
n_target_voxels = target_xyz.shape[1]
n_mask_voxels = self.masker.transform(self.masker.mask_img).shape[0]

n_filters = len(getattr(self, self.filter_type))
labels = np.zeros((n_filters, len(self.n_clusters), n_target_voxels), dtype=int)
kwargs = {"r": None, "n": None}

# Use a memmapped 2D array
start_time = datetime.datetime.now().strftime("%Y%m%dT%H%M%S")
dataset_dir = _get_dataset_dir("temporary_files", data_dir=None)
_, memmap_filename = mkstemp(
prefix=self.__class__.__name__,
suffix=start_time,
dir=dataset_dir,
)
data = np.memmap(
memmap_filename,
dtype=float,
mode="w+",
shape=(n_target_voxels, n_mask_voxels),
)

for i_filter in range(n_filters):
kwargs[self.filter_type] = getattr(self, self.filter_type)[i_filter]
for j_coord in range(n_target_voxels):
xyz = target_xyz[:, j_coord]
macm_ids = dataset.get_studies_by_coordinate(xyz, **kwargs)
coord_dset = dataset.slice(macm_ids)

# This seems like a somewhat inelegant solution
# Check if the meta method is a pairwise estimator
if "dataset2" in inspect.getfullargspec(self.meta_estimator.fit).args:
unselected_ids = sorted(list(set(dataset.ids) - set(macm_ids)))
unselected_dset = dataset.slice(unselected_ids)
self.meta_estimator.fit(coord_dset, unselected_dset)
else:
self.meta_estimator.fit(coord_dset)

data[j_coord, :] = self.meta_estimator.results.get_map(
self.target_image,
return_type="array",
) # data is overwritten across filters

# Perform clustering
for j_cluster, cluster_count in enumerate(self.n_clusters):
kmeans = KMeans(
n_clusters=cluster_count,
init="k-means++",
n_init=10,
random_state=0,
algorithm="elkan",
).fit(data)
labels[i_filter, j_cluster, :] = kmeans.labels_

# Clean up MACM data memmap
LGR.info(f"Removing temporary file: {memmap_filename}")
os.remove(memmap_filename)

images = {"labels": labels}
return images

def _filter_selection(self):
Review comment from the author:
Chase 2020:

We implemented a two-step procedure that involved a decision on those filter sizes to be included in the final analysis and subsequently a decision on the optimal cluster solution. In the first step, we examined the consistency of the cluster assignment for the individual voxels across the cluster solutions of the co-occurrence maps performed at different filter sizes. We selected a filter range with the lowest number of deviants, that is, number of voxels that were assigned differently compared with the solution from the majority of filters. In other words, we identified those filter sizes which produced solutions most similar to the consensus-solution across all filter sizes. For example, the proportion of deviants for the second parcellation is illustrated in Figure S1; this shows the borders of the filter range to be used for subsequent steps was based on the Z-scores of the number of deviants.

I interpret this to mean the following (a rough code sketch of steps 1-3 appears after the list):

  1. Derive mode array of label assignments for each cluster count across filter sizes.
    • I assume this means mode of each voxel determined independently, rather than mode of full set of assignments.
    • What if label numbers don't match? E.g., label 1 in filter size 1 is most similar to label 2 in filter size 2.
    • I assume we should do some kind of synchronization, unless there's some inherent order to KMeans labels?
  2. Count number of voxels that don't match mode for each filter size.
  3. Calculate proportion of deviants in each cluster solution and filter size.
  4. Calculate weighted z-score for each filter size (across cluster solutions) somehow?
    • What is it weighted by?
  5. Select range of filter sizes with lowest z-scores.
    • How? Is there some kind of threshold? Figure S1 grabs range with z-scores < -0.5. No clue if that's a meaningful threshold or something like 2 standard deviations from avg z-score or what.
    • What if there are multiple dips? Does amplitude (z-scores of filters below threshold) or width (number of filters below threshold) matter more?
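
A rough sketch of steps 1-3 under this reading (assuming the (n_filters, n_clusters, n_voxels) `labels` array built in `_fit`, and setting aside the label-matching question from step 1):

```python
import numpy as np
from scipy import stats


def deviant_proportions(labels):
    """Proportion of deviant voxels per filter size, for each cluster solution.

    Assumes labels are already synchronized across filter sizes; KMeans label
    numbering is arbitrary, so the relabeling question above still applies.
    """
    n_filters, n_solutions, _ = labels.shape
    props = np.zeros((n_filters, n_solutions))
    for j in range(n_solutions):
        # Step 1: voxel-wise modal label across filter sizes.
        mode_labels = stats.mode(labels[:, j, :], axis=0).mode.squeeze()
        # Steps 2-3: fraction of voxels deviating from the mode, per filter.
        props[:, j] = (labels[:, j, :] != mode_labels).mean(axis=1)
    return props


# Step 4 might then be something like z-scoring the per-filter averages:
# z_scores = stats.zscore(deviant_proportions(labels).mean(axis=1))
```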

pass

def _silhouette(self):
pass

def _voxel_misclassification(self):
pass

def _variation_of_information(self):
pass

def _nondominant_voxel_percentage(self):
pass

def _cluster_distance_ratio(self):
pass

def fit(self, dataset, drop_invalid=True):
"""Perform coordinate-based coactivation-based parcellation on dataset.

Parameters
----------
dataset : :obj:`nimare.dataset.Dataset`
Dataset to analyze.
drop_invalid : :obj:`bool`, optional
Whether to automatically ignore any studies without the required data or not.
Default is True.
"""
self._validate_input(dataset, drop_invalid=drop_invalid)
self._preprocess_input(dataset)
maps = self._fit(dataset)

if hasattr(self, "masker") and self.masker is not None:
masker = self.masker
else:
masker = dataset.masker

self.results = MetaResult(self, masker, maps)
return self.results
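
For reference, a minimal end-to-end sketch of the class as drafted (not part of this diff; the dataset file, mask path, and filter values are placeholders):

```python
import nibabel as nib

from nimare.dataset import Dataset
from nimare.parcellate import CoordCBP

dset = Dataset("dataset.json")  # placeholder path
target_mask = nib.load("roi_mask.nii.gz")  # must match the Dataset's space/resolution

parc = CoordCBP(
    target_mask=target_mask,
    n_clusters=[2, 3, 4],  # evaluate 2-, 3-, and 4-cluster solutions
    n=[50, 100, 150],      # compare three nearest-study filter sizes
)
results = parc.fit(dset)

# As drafted, the returned labels array has shape
# (n_filters, n_clusters, n_target_voxels).
labels = results.maps["labels"]
```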
11 changes: 10 additions & 1 deletion nimare/tests/test_dataset.py
@@ -3,6 +3,7 @@

import nibabel as nib
import numpy as np
import pytest

import nimare
from nimare import dataset
@@ -23,7 +24,15 @@ def test_dataset_smoke():
assert isinstance(dset.get_images(imtype="beta"), list)
assert isinstance(dset.get_metadata(field="sample_sizes"), list)
assert isinstance(dset.get_studies_by_label("cogat_cognitive_control"), list)
assert isinstance(dset.get_studies_by_coordinate(np.array([[20, 20, 20]])), list)
assert isinstance(dset.get_studies_by_coordinate([20, 20, 20], r=20), list)
assert len(dset.get_studies_by_coordinate([20, 20, 20], n=2)) == 2
assert len(dset.get_studies_by_coordinate([20, 20, 20], n=3)) == 3
with pytest.raises(ValueError):
dset.get_studies_by_coordinate([20, 20, 20])

with pytest.raises(ValueError):
dset.get_studies_by_coordinate([20, 20, 20], r=20, n=4)

mask_data = np.zeros(dset.masker.mask_img.shape, int)
mask_data[40, 40, 40] = 1
mask_img = nib.Nifti1Image(mask_data, dset.masker.mask_img.affine)