scikit-tda · pulquero · Feb 13, 2021 · Feb 13, 2021 · Mar 19, 2021 · deargle
diff --git a/examples/nearest_node.py b/examples/nearest_node.py
@@ -0,0 +1,32 @@
+"""
+nearest_nodes example based on breast cancer data.
+"""
+
+from plot_breast_cancer import *
+from sklearn import neighbors, preprocessing
+
+# simulate a new incoming patient: a random known record scaled by 5%
+i = np.random.randint(len(X))
+new_patient_data = 1.05*X[i]
+new_patient_data = new_patient_data.reshape(1, -1)
+
+# re-use lens1 model
+newlens1 = model.decision_function(new_patient_data)
+
+# re-construct lens2 model: min-max scaled L2 norm of each row
+X_norm = np.linalg.norm(X, axis=1)
+scaler = preprocessing.MinMaxScaler()
+scaler.fit(X_norm.reshape(-1, 1))
+
+# one norm per row -> column vector, the same layout the scaler was fitted on
+newlens2 = scaler.transform(np.linalg.norm(new_patient_data, axis=1).reshape(-1, 1))
+newlens = np.c_[newlens1, newlens2]
+
+# find nearest nodes
+nn = neighbors.NearestNeighbors(n_neighbors=3)
+node_ids = mapper.nearest_nodes(newlens, new_patient_data, graph, mapper.cover, lens, X, nn)
+
+print("Nearest nodes:")
+for node_id in node_ids:
+    diags = y[graph['nodes'][node_id]]
+    print("  {}: diagnosis {:.1f}%".format(node_id, np.sum(diags)*100.0/len(diags)))
diff --git a/kmapper/cover.py b/kmapper/cover.py
@@ -243,9 +243,9 @@ def transform_single(self, data, center, i=0):
 
         return hypercube
 
-    def transform(self, data, centers=None):
-        """Find entries of all hypercubes. If `centers=None`, then use `self.centers_` as computed in `self.fit`.
-
+    def transform(self, data, centers=None, return_centers=False):
+        """ Find entries of all hypercubes. If `centers=None`, then use `self.centers_` as computed in `self.fit`.
+            
             Empty hypercubes are removed from the result
 
         Parameters
@@ -255,12 +255,15 @@ def transform(self, data, centers=None):
             Data to find in entries in cube. Warning: first column must be index column.
         centers: list of array-like
             Center points for all cubes as returned by `self.fit`. Default is to use `self.centers_`.
+        return_centers: boolean
+            Whether to also return the kept center IDs.
 
         Returns
         =========
         hypercubes: list of array-like
             list of entries in each hypercube in `data`.
-
+        center_ids: array-like
+            list of center IDs kept.
         """
 
         centers = centers or self.centers_
@@ -269,30 +272,38 @@ def transform(self, data, centers=None):
         ]
 
         # Clean out any empty cubes (common in high dimensions)
-        hypercubes = [cube for cube in hypercubes if len(cube)]
-        return hypercubes
+        trimmed_hypercubes = [cube for cube in hypercubes if len(cube)]
+        if return_centers:
+            trimmed_cube_ids = np.array([i for i, cube in enumerate(hypercubes) if len(cube)])
+            return trimmed_hypercubes, trimmed_cube_ids
+        else:
+            return trimmed_hypercubes
 
     def fit_transform(self, data):
         self.fit(data)
         return self.transform(data)
 
-    def find(self, data_point):
-        """Finds the hypercubes that contain the given data point.
+    def find(self, data_point, centers=None):
+        """ Finds the hypercubes that contain the given data point.
+            If `centers=None`, then use `self.centers_` as computed in `self.fit`.
 
         Parameters
         ===========
 
         data_point: array-like
             The data point to locate.
+        centers: list of array-like
+            Center points for all cubes as returned by `self.fit`. Default is to use `self.centers_`.
 
         Returns
         =========
         cube_ids: list of int
-            list of hypercube indices, empty if the data point is outside the cover.
+            list of hypercube indices (w.r.t. `self.fit`), empty if the data point is outside the cover.
 
         """
         cube_ids = []
-        for i, center in enumerate(self.centers_):
+        centers = centers or self.centers_
+        for i, center in enumerate(centers):
             lower_bounds, upper_bounds = center - self.radius_, center + self.radius_
             if np.all(data_point >= lower_bounds) and np.all(
                 data_point <= upper_bounds

diff --git a/kmapper/kmapper.py b/kmapper/kmapper.py
@@ -951,6 +951,90 @@ def data_from_cluster_id(self, cluster_id, graph, data):
         else:
             return np.array([])
 
+    def find_nodes(self, cube_ids, graph, cover, lens):
+        """Return the graph nodes (clusters) that live in the given hypercubes.
+
+          Parameters
+          ----------
+          cube_ids : list of int
+              Hypercube indices w.r.t. the fitted cover; may be empty.
+          graph : dict
+              The resulting dictionary after applying map().
+          cover : kmapper.Cover
+              The cover used to build `graph`.
+          lens: Numpy Array
+              Lower dimensional representation of data.
+
+          Returns
+          -------
+          nodes : dict
+              cluster membership indexed by cluster ID (subset of `graph["nodes"]`).
+
+        """
+        # cover.transform expects the first column to be the row index
+        indexed_lens = np.c_[np.arange(lens.shape[0]), lens]
+        _, cube_id_mapping = cover.transform(indexed_lens, return_centers=True)
+
+        # positions of the requested cubes among the kept (non-empty) cubes; node
+        # IDs use that position as cube number. np.isin also copes with no cube_ids.
+        kept = np.flatnonzero(np.isin(cube_id_mapping, cube_ids))
+        prefixes = tuple("cube{}_".format(i) for i in kept)
+        return {
+            node_id: members for node_id, members in graph["nodes"].items()
+            if node_id.startswith(prefixes)
+        }
+
+    def nearest_nodes(self, newlens, newdata, graph, cover, lens, data, nn):
+        """Returns the nodes nearest to the `newdata` using the given NearestNeighbors algorithm
+
+          Parameters
+          ----------
+          newlens : Numpy array
+              Lens values of `newdata`, one row per point; a row-count mismatch raises Exception.
+          newdata : Numpy array
+              New dataset. Accepts both 1-D and 2-D array.
+          graph : dict
+              The resulting dictionary after applying map().
+          cover : kmapper.Cover
+              The cover used to build `graph`.
+          lens: Numpy Array
+              Lower dimensional representation of data.
+          data : Numpy array
+              Original dataset.
+          nn : NearestNeighbors
+              Scikit-learn NearestNeighbors instance to use.
+
+          Returns
+          -------
+          node_ids : numpy array
+              Node IDs; empty when no hypercube or node matches.
+
+        """
+        # Promote a single point to one row *before* comparing row counts:
+        # for 1-D input shape[0] is the number of features, not of rows.
+        if newdata.ndim == 1:
+            newlens, newdata = np.atleast_2d(newlens), newdata[np.newaxis]
+        if newlens.shape[0] != newdata.shape[0]:
+            raise Exception("newlens and newdata must have the same number of rows.")
+
+        cube_ids = np.concatenate([cover.find(row) for row in newlens])
+        if len(cube_ids) == 0:
+            return np.empty((0,))
+
+        nodes = self.find_nodes(cube_ids, graph, cover, lens)
+        if len(nodes) == 0:
+            return np.empty((0,))
+
+        # Pool the members of every candidate node, remembering each row's node ID.
+        nn_data, nn_cluster_ids = [], []
+        for cluster_id, cluster_members in nodes.items():
+            nn_data.append(data[cluster_members])
+            nn_cluster_ids.extend([cluster_id] * len(cluster_members))
+        nn_data, nn_cluster_ids = np.vstack(nn_data), np.array(nn_cluster_ids)
+        nn.fit(nn_data)
+        nn_ids = nn.kneighbors(newdata, return_distance=False)
+        return np.unique(nn_cluster_ids[nn_ids])
+
     def _process_projection_tuple(self, projection):
         # Detect if projection is a tuple (for prediction functions)
         # TODO: multi-label models

diff --git a/test/test_mapper.py b/test/test_mapper.py
@@ -74,6 +74,54 @@ def test_wrong_id(self):
         mems = mapper.data_from_cluster_id("new node", graph, data)
         np.testing.assert_array_equal(mems, np.array([]))
 
+    def test_find_nodes(self):
+        """Nodes found via a member's hypercubes must match the graph's own nodes."""
+        mapper = KeplerMapper(verbose=1)
+        points = np.random.rand(100, 2)
+        graph = mapper.map(points)
+
+        # pick a data point that exists in the graph: last member of the last node
+        last_members = list(graph["nodes"].values())[-1]
+        probe = points[last_members[-1]]
+
+        found = mapper.find_nodes(mapper.cover.find(probe), graph, mapper.cover, points)
+        assert len(found) > 0
+        for node_id in found:
+            np.testing.assert_array_equal(found[node_id], graph["nodes"][node_id])
+
+    def test_node_not_found(self):
+        """A hypercube index that is not in the cover yields no nodes."""
+        mapper = KeplerMapper(verbose=1)
+        points = np.random.rand(100, 2)
+        graph = mapper.map(points)
+        found = mapper.find_nodes([999], graph, mapper.cover, points)
+        assert len(found) == 0
+
+    def test_nearest_nodes_1(self):
+        mapper = KeplerMapper(verbose=1)
+        data = np.random.rand(100, 2)
+        graph = mapper.map(data)
+        nn = neighbors.NearestNeighbors(n_neighbors=1)
+        expected_id, members = list(graph["nodes"].items())[-1]
+        newdata = data[members[-1]]
+        node_ids = mapper.nearest_nodes(newdata, newdata, graph, mapper.cover, data, data, nn)
+        # assert_array_equal also compares shapes, so an empty result cannot pass vacuously
+        np.testing.assert_array_equal(node_ids, [expected_id])
+
+    def test_nearest_nodes_2(self):
+        mapper = KeplerMapper(verbose=1)
+        data = np.random.rand(100, 2)
+
+        graph = mapper.map(data)
+        nn = neighbors.NearestNeighbors(n_neighbors=1)
+        # take the last member of each of the first two nodes as the query points
+        (cluster_id1, members1), (cluster_id2, members2) = list(graph["nodes"].items())[:2]
+        newdata = np.vstack([data[members1[-1]], data[members2[-1]]])
+        node_ids = mapper.nearest_nodes(newdata, newdata, graph, mapper.cover, data, data, nn)
+        # nearest_nodes returns np.unique output (sorted), so compare against the
+        # sorted IDs; assert_array_equal also checks the shape, so a result of the
+        # wrong length fails with a clear message.
+        np.testing.assert_array_equal(node_ids, sorted([cluster_id1, cluster_id2]))
 
 class TestMap:
     def test_simplices(self):
@@ -94,6 +142,22 @@ def test_simplices(self):
         assert len(nodes) == 3
         assert len(edges) == 3
 
+    def test_nodes(self):
+        """Check the node count and the `cube<i>_cluster0` naming of the node IDs."""
+        mapper = KeplerMapper()
+        X = np.random.rand(100, 2)
+        lens = mapper.fit_transform(X)
+
+        cover = Cover(n_cubes=3, perc_overlap=0.75)
+        clusterer = cluster.DBSCAN(metric="euclidean", min_samples=3)
+        graph = mapper.map(lens, X=X, cover=cover, clusterer=clusterer)
+
+        node_ids = list(graph["nodes"])
+        assert len(node_ids) == 3
+        # verify cluster ID format
+        for i, cluster_id in enumerate(node_ids):
+            assert cluster_id == "cube{}_cluster0".format(i)
+
     def test_precomputed(self):
         mapper = KeplerMapper()