
Add coordinate-based coactivation-based parcellation class #533

Draft · wants to merge 27 commits into base: main
Changes from 7 commits
Commits (27)
08ae309
Add n option to get_studies_by_coordinate
tsalo Jun 30, 2021
88d5c05
Mock up maybe passable CoordCBP class.
tsalo Jun 30, 2021
3ce90f1
Fix docstrings.
tsalo Jun 30, 2021
40f4585
Don't calculate distances.
tsalo Jul 1, 2021
530048f
Remove extra newline.
tsalo Jul 1, 2021
8382704
Clustering should be fairly good. Still need to implement metrics.
tsalo Jul 1, 2021
c79b251
Add empty methods for the different metrics.
tsalo Jul 1, 2021
3e2d580
Work on filter-selection metric. Far from done.
tsalo Jul 1, 2021
3365d5c
Add a couple of metrics.
tsalo Jul 1, 2021
7081604
Improve metric documentation.
tsalo Jul 2, 2021
12cddec
VI should be fairly good now.
tsalo Jul 2, 2021
7023291
Merge branch 'main' into coord-cbp
tsalo Jul 9, 2021
f37751a
Draft voxel misclassification metric.
tsalo Jul 9, 2021
f555790
Merge branch 'main' into coord-cbp
tsalo Jul 12, 2021
0896745
Draft another metric.
tsalo Jul 14, 2021
f9611db
Little cleanup.
tsalo Jul 14, 2021
909c822
Move ratio calculations to main fit method.
tsalo Jul 14, 2021
65857d1
More cleanup.
tsalo Jul 14, 2021
65b9675
More work and debugging.
tsalo Jul 14, 2021
118e718
Mention re-labeling (not implemented).
tsalo Jul 15, 2021
267a135
Merge branch 'main' into coord-cbp
tsalo Dec 21, 2021
530a4de
Remove added line.
tsalo Dec 21, 2021
e58a9d9
Merge branch 'main' into coord-cbp
tsalo Apr 20, 2022
a1bcf0f
Merge remote-tracking branch 'upstream/main' into jperaza-coord-cbp
JulioAPeraza Jul 25, 2022
1c352f7
Update test_dataset.py
JulioAPeraza Oct 4, 2022
917e943
Merge branch 'main' into coord-cbp
JulioAPeraza Oct 4, 2022
0c60dd5
Update utils.py
JulioAPeraza Oct 4, 2022
41 changes: 32 additions & 9 deletions nimare/dataset.py
@@ -548,7 +548,8 @@ def get_studies_by_label(self, labels=None, label_threshold=0.001):
the labels above the threshold, it will be returned.
Default is None.
label_threshold : :obj:`float`, optional
Default is 0.5.
Default is 0.001, which corresponds to terms appearing at least once in abstracts
for the commonly used Neurosynth TF-IDF feature set.

Returns
-------
@@ -600,27 +601,49 @@ def get_studies_by_mask(self, mask):
found_ids = list(self.coordinates.loc[distances, "id"].unique())
return found_ids

def get_studies_by_coordinate(self, xyz, r=20):
def get_studies_by_coordinate(self, xyz, r=None, n=None):
"""Extract list of studies with at least one focus within radius of requested coordinates.

.. versionchanged:: 0.0.9

New option (``n``) added to identify the n studies closest to the coordinates.
Default value for r changed to None.

Parameters
----------
xyz : (X x 3) array_like
List of coordinates against which to find studies.
r : :obj:`float`, optional
Radius (in mm) within which to find studies. Default is 20mm.
Radius (in mm) within which to find studies. Mutually exclusive with ``n``.
Default is None.
n : :obj:`int`, optional
Number of closest studies to identify. Mutually exclusive with ``r``.
Default is None.

Returns
-------
found_ids : :obj:`list`
A list of IDs from the Dataset with at least one focus within
radius r of requested coordinates.
A list of IDs from the Dataset matching the search criterion.
"""
if n and r:
raise ValueError("Only one of 'r' and 'n' may be provided.")
elif not n and not r:
raise ValueError("Either 'r' or 'n' must be provided.")

from scipy.spatial.distance import cdist

xyz = np.array(xyz)
temp_coords = self.coordinates.copy()

xyz = np.atleast_2d(xyz)
assert xyz.shape[1] == 3 and xyz.ndim == 2
distances = cdist(xyz, self.coordinates[["x", "y", "z"]].values)
distances = np.any(distances <= r, axis=0)
found_ids = list(self.coordinates.loc[distances, "id"].unique())
distances = cdist(xyz, temp_coords[["x", "y", "z"]].values)
distances = np.min(distances, axis=0)
temp_coords["distance"] = distances
min_dist_ser = temp_coords.groupby("id")["distance"].min()

if r:
found_ids = min_dist_ser.loc[min_dist_ser <= r].index.tolist()
elif n:
found_ids = min_dist_ser.nsmallest(n).index.tolist()

return found_ids
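
For reference, a minimal usage sketch of the updated method (not part of this diff; the dataset file name and coordinates are placeholders):

```python
from nimare.dataset import Dataset

dset = Dataset("dataset.json")  # placeholder path

# Radius-based search: studies with at least one focus within 10 mm.
ids_r = dset.get_studies_by_coordinate([[0, -52, 26]], r=10)

# Nearest-study search: the 5 studies with foci closest to the coordinate.
ids_n = dset.get_studies_by_coordinate([[0, -52, 26]], n=5)

# Passing both criteria, or neither, raises a ValueError.
```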
223 changes: 223 additions & 0 deletions nimare/parcellate.py
@@ -0,0 +1,223 @@
"""Parcellation tools."""
import datetime
import inspect
import logging
import os
from tempfile import mkstemp

import numpy as np
from sklearn.cluster import KMeans

from .base import NiMAREBase
from .extract.utils import _get_dataset_dir
from .meta.base import CBMAEstimator
from .meta.cbma.ale import ALE
from .results import MetaResult
from .utils import add_metadata_to_dataframe, check_type, listify, use_memmap, vox2mm

LGR = logging.getLogger(__name__)


class CoordCBP(NiMAREBase):
"""Perform coordinate-based coactivation-based parcellation.

.. versionadded:: 0.0.10

Parameters
----------
target_mask : :obj:`nibabel.nifti1.Nifti1Image`
Mask of the parcellation target.
Currently must be in the same space/resolution as the Dataset mask.
n_clusters : :obj:`list` of :obj:`int`
Number of clusters to evaluate in clustering.
Metrics will be calculated for each cluster-count in the list, to allow users to select
the optimal cluster solution.
r : :obj:`float` or :obj:`list` of :obj:`float` or None, optional
Radius (in mm) within which to find studies.
If a list of values is provided, then MACMs and clustering will be performed across all
values, and a selection procedure will be performed to identify the optimal ``r``.
Mutually exclusive with ``n``. Default is None.
n : :obj:`int` or :obj:`list` of :obj:`int` or None, optional
Number of closest studies to identify.
If a list of values is provided, then MACMs and clustering will be performed across all
values, and a selection procedure will be performed to identify the optimal ``n``.
Mutually exclusive with ``r``. Default is None.
meta_estimator : :obj:`nimare.meta.base.CBMAEstimator`, optional
CBMA Estimator with which to run the MACMs.
Default is :obj:`nimare.meta.cbma.ale.ALE`.
target_image : :obj:`str`, optional
Name of meta-analysis results image to use for clustering.
Default is "ale", which is specific to the ALE estimator.
"""

_required_inputs = {"coordinates": ("coordinates", None)}

def __init__(
self,
target_mask,
n_clusters,
r=None,
n=None,
meta_estimator=None,
target_image="ale",
*args,
**kwargs,
):
super().__init__(*args, **kwargs)

if meta_estimator is None:
meta_estimator = ALE()
else:
meta_estimator = check_type(meta_estimator, CBMAEstimator)

if r and n:
raise ValueError("Only one of 'r' and 'n' may be provided.")
elif not r and not n:
raise ValueError("Either 'r' or 'n' must be provided.")

self.meta_estimator = meta_estimator
self.target_image = target_image
self.n_clusters = listify(n_clusters)
self.filter_selection = isinstance(r, list) or isinstance(n, list)
self.filter_type = "r" if r else "n"
self.r = listify(r)
self.n = listify(n)

def _preprocess_input(self, dataset):
"""Mask required input images using either the dataset's mask or the estimator's.

Also, insert required metadata into coordinates DataFrame.
"""
super()._preprocess_input(dataset)

# All extra (non-ijk) parameters for a kernel should be overrideable as
# parameters to __init__, so we can access them with get_params()
kt_args = list(self.meta_estimator.kernel_transformer.get_params().keys())

# Integrate "sample_size" from metadata into DataFrame so that
# kernel_transformer can access it.
if "sample_size" in kt_args:
self.inputs_["coordinates"] = add_metadata_to_dataframe(
dataset,
self.inputs_["coordinates"],
metadata_field="sample_sizes",
target_column="sample_size",
filter_func=np.mean,
)

@use_memmap(LGR, n_files=1)
def _fit(self, dataset):
"""Perform coordinate-based coactivation-based parcellation on dataset.

Parameters
----------
dataset : :obj:`nimare.dataset.Dataset`
Dataset to analyze.
"""
self.dataset = dataset
self.masker = self.masker or dataset.masker

# Loop through voxels in target_mask, selecting studies for each and running MACMs (no MCC)
target_ijk = np.vstack(np.where(self.target_mask.get_fdata()))
target_xyz = vox2mm(target_ijk, self.masker.mask_img.affine)
n_target_voxels = target_xyz.shape[1]
n_mask_voxels = self.masker.transform(self.masker.mask_img).shape[0]

n_filters = len(getattr(self, self.filter_type))
labels = np.zeros((n_filters, len(self.n_clusters), n_target_voxels), dtype=int)
kwargs = {"r": None, "n": None}

# Use a memmapped 2D array
start_time = datetime.datetime.now().strftime("%Y%m%dT%H%M%S")
dataset_dir = _get_dataset_dir("temporary_files", data_dir=None)
_, memmap_filename = mkstemp(
prefix=self.__class__.__name__,
suffix=start_time,
dir=dataset_dir,
)
data = np.memmap(
memmap_filename,
dtype=float,
mode="w+",
shape=(n_target_voxels, n_mask_voxels),
)

for i_filter in range(n_filters):
kwargs[self.filter_type] = getattr(self, self.filter_type)[i_filter]
for j_coord in range(n_target_voxels):
xyz = target_xyz[:, j_coord]
macm_ids = dataset.get_studies_by_coordinate(xyz, **kwargs)
coord_dset = dataset.slice(macm_ids)

# This seems like a somewhat inelegant solution
# Check if the meta method is a pairwise estimator
if "dataset2" in inspect.getfullargspec(self.meta_estimator.fit).args:
unselected_ids = sorted(list(set(dataset.ids) - set(macm_ids)))
unselected_dset = dataset.slice(unselected_ids)
self.meta_estimator.fit(coord_dset, unselected_dset)
else:
self.meta_estimator.fit(coord_dset)

data[j_coord, :] = self.meta_estimator.results.get_map(
self.target_image,
return_type="array",
) # data is overwritten across filters

# Perform clustering
for j_cluster, cluster_count in enumerate(self.n_clusters):
kmeans = KMeans(
n_clusters=cluster_count,
init="k-means++",
n_init=10,
random_state=0,
algorithm="elkan",
).fit(data)
labels[i_filter, j_cluster, :] = kmeans.labels_

# Clean up MACM data memmap
LGR.info(f"Removing temporary file: {memmap_filename}")
os.remove(memmap_filename)

images = {"labels": labels}
return images

def _filter_selection(self):
Review comment from the author:
Chase 2020:

We implemented a two-step procedure that involved a decision on those filter sizes to be included in the final analysis and subsequently a decision on the optimal cluster solution. In the first step, we examined the consistency of the cluster assignment for the individual voxels across the cluster solutions of the co-occurrence maps performed at different filter sizes. We selected a filter range with the lowest number of deviants, that is, number of voxels that were assigned differently compared with the solution from the majority of filters. In other words, we identified those filter sizes which produced solutions most similar to the consensus-solution across all filter sizes. For example, the proportion of deviants for the second parcellation is illustrated in Figure S1; this shows the borders of the filter range to be used for subsequent steps was based on the Z-scores of the number of deviants.

I interpret this to mean the following (a rough code sketch of steps 1-3 appears after the list):

  1. Derive mode array of label assignments for each cluster count across filter sizes.
    • I assume this means mode of each voxel determined independently, rather than mode of full set of assignments.
    • What if label numbers don't match? E.g., label 1 in filter size 1 is most similar to label 2 in filter size 2.
    • I assume we should do some kind of synchronization, unless there's some inherent order to KMeans labels?
  2. Count number of voxels that don't match mode for each filter size.
  3. Calculate proportion of deviants in each cluster solution and filter size.
  4. Calculate weighted z-score for each filter size (across cluster solutions) somehow?
    • What is it weighted by?
  5. Select range of filter sizes with lowest z-scores.
    • How? Is there some kind of threshold? Figure S1 grabs range with z-scores < -0.5. No clue if that's a meaningful threshold or something like 2 standard deviations from avg z-score or what.
    • What if there are multiple dips? Does amplitude (z-scores of filters below threshold) or width (number of filters below threshold) matter more?
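
A rough sketch of steps 1-3 under this reading (assuming the (n_filters, n_clusters, n_voxels) `labels` array built in `_fit`, and setting aside the label-matching question from step 1):

```python
import numpy as np
from scipy import stats


def deviant_proportions(labels):
    """Proportion of deviant voxels per filter size, for each cluster solution.

    Assumes labels are already synchronized across filter sizes; KMeans label
    numbering is arbitrary, so the relabeling question above still applies.
    """
    n_filters, n_solutions, _ = labels.shape
    props = np.zeros((n_filters, n_solutions))
    for j in range(n_solutions):
        # Step 1: voxel-wise modal label across filter sizes.
        mode_labels = stats.mode(labels[:, j, :], axis=0).mode.squeeze()
        # Steps 2-3: fraction of voxels deviating from the mode, per filter.
        props[:, j] = (labels[:, j, :] != mode_labels).mean(axis=1)
    return props


# Step 4 might then be something like z-scoring the per-filter averages:
# z_scores = stats.zscore(deviant_proportions(labels).mean(axis=1))
```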

pass

def _silhouette(self):
pass

def _voxel_misclassification(self):
pass

def _variation_of_information(self):
pass

def _nondominant_voxel_percentage(self):
pass

def _cluster_distance_ratio(self):
pass

def fit(self, dataset, drop_invalid=True):
"""Perform coordinate-based coactivation-based parcellation on dataset.

Parameters
----------
dataset : :obj:`nimare.dataset.Dataset`
Dataset to analyze.
drop_invalid : :obj:`bool`, optional
Whether to automatically ignore any studies without the required data or not.
Default is True.
"""
self._validate_input(dataset, drop_invalid=drop_invalid)
self._preprocess_input(dataset)
maps = self._fit(dataset)

if hasattr(self, "masker") and self.masker is not None:
masker = self.masker
else:
masker = dataset.masker

self.results = MetaResult(self, masker, maps)
return self.results
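
For reference, a minimal end-to-end sketch of the class as drafted (not part of this diff; the dataset file, mask path, and filter values are placeholders):

```python
import nibabel as nib

from nimare.dataset import Dataset
from nimare.parcellate import CoordCBP

dset = Dataset("dataset.json")  # placeholder path
target_mask = nib.load("roi_mask.nii.gz")  # must match the Dataset's space/resolution

parc = CoordCBP(
    target_mask=target_mask,
    n_clusters=[2, 3, 4],  # evaluate 2-, 3-, and 4-cluster solutions
    n=[50, 100, 150],      # compare three nearest-study filter sizes
)
results = parc.fit(dset)

# As drafted, the returned labels array has shape
# (n_filters, n_clusters, n_target_voxels).
labels = results.maps["labels"]
```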
11 changes: 10 additions & 1 deletion nimare/tests/test_dataset.py
@@ -3,6 +3,7 @@

import nibabel as nib
import numpy as np
import pytest

import nimare
from nimare import dataset
@@ -23,7 +24,15 @@ def test_dataset_smoke():
assert isinstance(dset.get_images(imtype="beta"), list)
assert isinstance(dset.get_metadata(field="sample_sizes"), list)
assert isinstance(dset.get_studies_by_label("cogat_cognitive_control"), list)
assert isinstance(dset.get_studies_by_coordinate(np.array([[20, 20, 20]])), list)
assert isinstance(dset.get_studies_by_coordinate([20, 20, 20], r=20), list)
assert len(dset.get_studies_by_coordinate([20, 20, 20], n=2)) == 2
assert len(dset.get_studies_by_coordinate([20, 20, 20], n=3)) == 3
with pytest.raises(ValueError):
dset.get_studies_by_coordinate([20, 20, 20])

with pytest.raises(ValueError):
dset.get_studies_by_coordinate([20, 20, 20], r=20, n=4)

mask_data = np.zeros(dset.masker.mask_img.shape, int)
mask_data[40, 40, 40] = 1
mask_img = nib.Nifti1Image(mask_data, dset.masker.mask_img.affine)