From 39d6a096cc596fae12a5e8dd3cb1f3a694773ca7 Mon Sep 17 00:00:00 2001
From: Mark Hale
Date: Sat, 13 Feb 2021 18:24:10 +0000
Subject: [PATCH 1/3] added clusters_from_cover to kmapper.

---
 kmapper/kmapper.py  | 23 +++++++++++++++++++++++
 test/test_mapper.py | 34 ++++++++++++++++++++++++++++++++++
 2 files changed, 57 insertions(+)

diff --git a/kmapper/kmapper.py b/kmapper/kmapper.py
index cffa4c20..355c33d0 100644
--- a/kmapper/kmapper.py
+++ b/kmapper/kmapper.py
@@ -951,6 +951,29 @@ def data_from_cluster_id(self, cluster_id, graph, data):
         else:
             return np.array([])
 
+    def clusters_from_cover(self, cube_ids, graph):
+        """Returns the clusters and their members from the subset of the cover spanned by the given cube_ids.
+
+        Parameters
+        ----------
+        cube_ids : list of int
+            List of hypercube indices.
+        graph : dict
+            The resulting dictionary after applying map().
+
+        Returns
+        -------
+        clusters : dict
+            Cluster membership indexed by cluster ID (a subset of `graph["nodes"]`).
+
+        """
+        clusters = {}
+        cluster_id_prefixes = tuple(["cube" + str(i) + "_" for i in cube_ids])
+        for cluster_id, cluster_members in graph["nodes"].items():
+            if cluster_id.startswith(cluster_id_prefixes):
+                clusters[cluster_id] = cluster_members
+        return clusters
+
     def _process_projection_tuple(self, projection):
         # Detect if projection is a tuple (for prediction functions)
         # TODO: multi-label models
diff --git a/test/test_mapper.py b/test/test_mapper.py
index 6551cf4f..45713551 100644
--- a/test/test_mapper.py
+++ b/test/test_mapper.py
@@ -74,6 +74,24 @@ def test_wrong_id(self):
         mems = mapper.data_from_cluster_id("new node", graph, data)
         np.testing.assert_array_equal(mems, np.array([]))
 
+    def test_clusters_from_cover(self):
+        mapper = KeplerMapper(verbose=1)
+        data = np.random.rand(100, 2)
+
+        graph = mapper.map(data)
+        cube_ids = mapper.cover.find(data[0])
+        mems = mapper.clusters_from_cover(cube_ids, graph)
+        assert len(mems) > 0
+        for cluster_id, cluster_members in mems.items():
+            np.testing.assert_array_equal(cluster_members, graph["nodes"][cluster_id])
+
+    def test_no_clusters_from_cover(self):
+        mapper = KeplerMapper(verbose=1)
+        data = np.random.rand(100, 2)
+
+        graph = mapper.map(data)
+        mems = mapper.clusters_from_cover([999], graph)
+        assert len(mems) == 0
 
 class TestMap:
     def test_simplices(self):
@@ -94,6 +112,22 @@ def test_simplices(self):
         assert len(nodes) == 3
         assert len(edges) == 3
 
+    def test_nodes(self):
+        mapper = KeplerMapper()
+
+        X = np.random.rand(100, 2)
+        lens = mapper.fit_transform(X)
+        graph = mapper.map(
+            lens,
+            X=X,
+            cover=Cover(n_cubes=3, perc_overlap=0.75),
+            clusterer=cluster.DBSCAN(metric="euclidean", min_samples=3),
+        )
+        assert len(graph["nodes"]) == 3
+        for i, cluster_id in enumerate(graph["nodes"]):
+            # verify the cluster ID naming format
+            assert cluster_id == "cube{}_cluster0".format(i)
+
     def test_precomputed(self):
         mapper = KeplerMapper()
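A minimal usage sketch of the new helper (not part of the patch), mirroring test_clusters_from_cover above; as in the test, the random `data` doubles as the lens:

    import numpy as np
    from kmapper import KeplerMapper

    mapper = KeplerMapper()
    data = np.random.rand(100, 2)
    graph = mapper.map(data)

    # hypercubes whose bounds contain the first data point
    cube_ids = mapper.cover.find(data[0])

    # clusters drawn from those hypercubes, keyed by cluster ID
    clusters = mapper.clusters_from_cover(cube_ids, graph)
    for cluster_id, members in clusters.items():
        print(cluster_id, len(members))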
From aded0178b4c74c6e230da89e055716c602e3de05 Mon Sep 17 00:00:00 2001
From: Mark Hale
Date: Sat, 13 Feb 2021 18:24:10 +0000
Subject: [PATCH 2/3] added nearest_nodes and find_nodes to kmapper.

---
 kmapper/cover.py    | 31 +++++++++++++-------
 kmapper/kmapper.py  | 69 ++++++++++++++++++++++++++++++++++++++++++---
 test/test_mapper.py | 40 ++++++++++++++++++++++----
 3 files changed, 121 insertions(+), 19 deletions(-)

diff --git a/kmapper/cover.py b/kmapper/cover.py
index a2e56664..202d7b9e 100644
--- a/kmapper/cover.py
+++ b/kmapper/cover.py
@@ -243,9 +243,9 @@ def transform_single(self, data, center, i=0):
 
         return hypercube
 
-    def transform(self, data, centers=None):
-        """Find entries of all hypercubes. If `centers=None`, then use `self.centers_` as computed in `self.fit`.
-
+    def transform(self, data, centers=None, return_centers=False):
+        """Find entries of all hypercubes. If `centers=None`, then use `self.centers_` as computed in `self.fit`.
+
         Empty hypercubes are removed from the result
 
         Parameters
@@ -255,12 +255,15 @@ def transform(self, data, centers=None):
             Data to find in entries in cube. Warning: first column must be index column.
         centers: list of array-like
             Center points for all cubes as returned by `self.fit`. Default is to use `self.centers_`.
+        return_centers: boolean
+            Whether to also return the IDs of the kept (non-empty) cubes.
 
         Returns
         =========
         hypercubes: list of array-like
             list of entries in each hypercube in `data`.
-
+        center_ids: array-like
+            list of the kept center IDs; only returned when `return_centers=True`.
 
         """
         centers = centers or self.centers_
@@ -269,30 +272,38 @@ def transform(self, data, centers=None):
         ]
 
         # Clean out any empty cubes (common in high dimensions)
-        hypercubes = [cube for cube in hypercubes if len(cube)]
-        return hypercubes
+        trimmed_hypercubes = [cube for cube in hypercubes if len(cube)]
+        if return_centers:
+            trimmed_cube_ids = np.array([i for i, cube in enumerate(hypercubes) if len(cube)])
+            return trimmed_hypercubes, trimmed_cube_ids
+        else:
+            return trimmed_hypercubes
 
     def fit_transform(self, data):
        self.fit(data)
        return self.transform(data)
 
-    def find(self, data_point):
-        """Finds the hypercubes that contain the given data point.
+    def find(self, data_point, centers=None):
+        """Finds the hypercubes that contain the given data point.
+        If `centers=None`, then use `self.centers_` as computed in `self.fit`.
 
         Parameters
         ===========
         data_point: array-like
             The data point to locate.
+        centers: list of array-like
+            Center points for all cubes as returned by `self.fit`. Default is to use `self.centers_`.
 
         Returns
         =========
         cube_ids: list of int
             list of hypercube indices (w.r.t. `self.fit`), empty if the data point is outside the cover.
 
         """
         cube_ids = []
-        for i, center in enumerate(self.centers_):
+        centers = centers or self.centers_
+        for i, center in enumerate(centers):
             lower_bounds, upper_bounds = center - self.radius_, center + self.radius_
             if np.all(data_point >= lower_bounds) and np.all(
                 data_point <= upper_bounds
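To make the new flag concrete, a small sketch (not part of the patch) of how `transform(..., return_centers=True)` pairs each kept hypercube with the fit-time index of the cube that produced it; the synthetic lens and the index-column convention follow the docstring above:

    import numpy as np
    from kmapper import Cover

    lens = np.random.rand(100, 1)
    ids = np.arange(lens.shape[0])
    data = np.c_[ids, lens]  # first column must be the index column

    cover = Cover(n_cubes=10, perc_overlap=0.5)
    cover.fit(data)

    hypercubes, cube_ids = cover.transform(data, return_centers=True)
    # cube_ids[k] is the index (w.r.t. fit) of the cube yielding hypercubes[k]
    assert len(hypercubes) == len(cube_ids)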
diff --git a/kmapper/kmapper.py b/kmapper/kmapper.py
index 355c33d0..7afaee29 100644
--- a/kmapper/kmapper.py
+++ b/kmapper/kmapper.py
@@ -951,7 +951,7 @@ def data_from_cluster_id(self, cluster_id, graph, data):
         else:
             return np.array([])
 
-    def clusters_from_cover(self, cube_ids, graph):
+    def find_nodes(self, cube_ids, graph, cover, lens):
         """Returns the clusters and their members from the subset of the cover spanned by the given cube_ids.
 
         Parameters
         ----------
         cube_ids : list of int
             List of hypercube indices.
         graph : dict
             The resulting dictionary after applying map().
-
+        cover : kmapper.Cover
+            The cover used to build `graph`.
+        lens : Numpy array
+            Lower dimensional representation of data.
+
         Returns
         -------
-        clusters : dict
+        nodes : dict
             Cluster membership indexed by cluster ID (a subset of `graph["nodes"]`).
 
         """
+        lens_ids = np.array([x for x in range(lens.shape[0])])
+        lens = np.c_[lens_ids, lens]
+        _, cube_id_mapping = cover.transform(lens, return_centers=True)
+
+        # map fit-time cube IDs onto the trimmed (non-empty) cube numbering
+        transformed_cube_ids = np.concatenate([np.flatnonzero(cube_id_mapping == cube_id) for cube_id in cube_ids])
+
         clusters = {}
-        cluster_id_prefixes = tuple(["cube" + str(i) + "_" for i in cube_ids])
+        cluster_id_prefixes = tuple(["cube" + str(i) + "_" for i in transformed_cube_ids])
         for cluster_id, cluster_members in graph["nodes"].items():
             if cluster_id.startswith(cluster_id_prefixes):
                 clusters[cluster_id] = cluster_members
         return clusters
 
+    def nearest_nodes(self, newlens, newdata, graph, cover, lens, data, nn):
+        """Returns the nodes nearest to `newdata` using the given NearestNeighbors algorithm.
+
+        Parameters
+        ----------
+        newlens : Numpy array
+            Lower dimensional representation of `newdata`. Accepts both 1-D and 2-D arrays.
+        newdata : Numpy array
+            New dataset. Accepts both 1-D and 2-D arrays.
+        graph : dict
+            The resulting dictionary after applying map().
+        cover : kmapper.Cover
+            The cover used to build `graph`.
+        lens : Numpy array
+            Lower dimensional representation of the original data.
+        data : Numpy array
+            Original dataset.
+        nn : NearestNeighbors
+            Scikit-learn NearestNeighbors instance to use.
+
+        Returns
+        -------
+        node_ids : numpy array
+            Node IDs.
+
+        """
+        if newlens.shape[0] != newdata.shape[0]:
+            raise Exception("newlens and newdata must have the same number of rows.")
+
+        if len(newdata.shape) == 1:
+            newlens = newlens[np.newaxis]
+            newdata = newdata[np.newaxis]
+
+        cube_ids = np.concatenate([cover.find(row) for row in newlens])
+        if len(cube_ids) == 0:
+            return np.empty((0,))
+
+        nodes = self.find_nodes(cube_ids, graph, cover, lens)
+        if len(nodes) == 0:
+            return np.empty((0,))
+
+        # pool the members of the candidate nodes, remembering each row's node ID
+        nn_data = []
+        nn_cluster_ids = []
+        for cluster_id, cluster_members in nodes.items():
+            cluster_data = data[cluster_members]
+            nn_data.append(cluster_data)
+            nn_cluster_ids.append([cluster_id] * len(cluster_data))
+        nn_data = np.vstack(nn_data)
+        nn_cluster_ids = np.concatenate(nn_cluster_ids)
+        nn.fit(nn_data)
+        nn_ids = nn.kneighbors(newdata, return_distance=False)
+        return np.unique(nn_cluster_ids[nn_ids])
+
     def _process_projection_tuple(self, projection):
         # Detect if projection is a tuple (for prediction functions)
         # TODO: multi-label models
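A brief sketch (not part of the patch) of how find_nodes ties Cover.find to graph nodes, mirroring the updated tests below; as in the tests, `data` serves as its own lens:

    import numpy as np
    from kmapper import KeplerMapper

    mapper = KeplerMapper()
    data = np.random.rand(100, 2)
    graph = mapper.map(data)

    # locate the cubes covering a point known to be in the graph
    members = list(graph["nodes"].values())[-1]
    cube_ids = mapper.cover.find(data[members[-1]])

    # resolve those cubes to cluster memberships
    nodes = mapper.find_nodes(cube_ids, graph, mapper.cover, data)
    assert all(node_id in graph["nodes"] for node_id in nodes)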
diff --git a/test/test_mapper.py b/test/test_mapper.py
index 45713551..a4ccba4d 100644
--- a/test/test_mapper.py
+++ b/test/test_mapper.py
@@ -74,25 +74,55 @@ def test_wrong_id(self):
         mems = mapper.data_from_cluster_id("new node", graph, data)
         np.testing.assert_array_equal(mems, np.array([]))
 
-    def test_clusters_from_cover(self):
+    def test_find_nodes(self):
         mapper = KeplerMapper(verbose=1)
         data = np.random.rand(100, 2)
 
         graph = mapper.map(data)
-        cube_ids = mapper.cover.find(data[0])
-        mems = mapper.clusters_from_cover(cube_ids, graph)
+        # pick a data point that exists in the graph
+        _, members = list(graph["nodes"].items())[-1]
+        data_point = data[members[-1]]
+
+        cube_ids = mapper.cover.find(data_point)
+        mems = mapper.find_nodes(cube_ids, graph, mapper.cover, data)
         assert len(mems) > 0
         for cluster_id, cluster_members in mems.items():
             np.testing.assert_array_equal(cluster_members, graph["nodes"][cluster_id])
 
-    def test_no_clusters_from_cover(self):
+    def test_node_not_found(self):
         mapper = KeplerMapper(verbose=1)
         data = np.random.rand(100, 2)
 
         graph = mapper.map(data)
-        mems = mapper.clusters_from_cover([999], graph)
+        mems = mapper.find_nodes([999], graph, mapper.cover, data)
         assert len(mems) == 0
 
+    def test_nearest_nodes_1(self):
+        mapper = KeplerMapper(verbose=1)
+        data = np.random.rand(100, 2)
+
+        graph = mapper.map(data)
+        nn = neighbors.NearestNeighbors(n_neighbors=1)
+        expected_id, members = list(graph["nodes"].items())[-1]
+        newdata = data[members[-1]]
+        node_ids = mapper.nearest_nodes(newdata, newdata, graph, mapper.cover, data, data, nn)
+        assert all(node_ids == [expected_id]), node_ids
+
+    def test_nearest_nodes_2(self):
+        mapper = KeplerMapper(verbose=1)
+        data = np.random.rand(100, 2)
+
+        graph = mapper.map(data)
+        nn = neighbors.NearestNeighbors(n_neighbors=1)
+        expected_clusters = list(graph["nodes"].items())[:2]
+        cluster_id1 = expected_clusters[0][0]
+        cluster_id2 = expected_clusters[1][0]
+        newdata1 = data[expected_clusters[0][1][-1]]
+        newdata2 = data[expected_clusters[1][1][-1]]
+        newdata = np.vstack([newdata1, newdata2])
+        node_ids = mapper.nearest_nodes(newdata, newdata, graph, mapper.cover, data, data, nn)
+        assert all(node_ids == [cluster_id1, cluster_id2]), node_ids
+
 class TestMap:
     def test_simplices(self):

From 0bd7b00f2a3d7e703f2652eb088818505123f1f8 Mon Sep 17 00:00:00 2001
From: Mark Hale
Date: Fri, 19 Mar 2021 19:49:32 +0000
Subject: [PATCH 3/3] Added nearest_node example.

---
 examples/nearest_node.py | 32 ++++++++++++++++++++++++++++++++
 1 file changed, 32 insertions(+)
 create mode 100644 examples/nearest_node.py

diff --git a/examples/nearest_node.py b/examples/nearest_node.py
new file mode 100644
index 00000000..5f430422
--- /dev/null
+++ b/examples/nearest_node.py
@@ -0,0 +1,32 @@
+"""
+nearest_nodes example based on breast cancer data.
+"""
+
+from plot_breast_cancer import *
+from sklearn import neighbors, preprocessing
+
+# new patient data incoming
+i = np.random.randint(len(X))
+new_patient_data = 1.05 * X[i]
+new_patient_data = new_patient_data.reshape(1, -1)
+
+# re-use the lens1 model
+newlens1 = model.decision_function(new_patient_data)
+
+# re-construct the lens2 model
+X_norm = np.linalg.norm(X, axis=1)
+scaler = preprocessing.MinMaxScaler()
+scaler.fit(X_norm.reshape(-1, 1))
+
+newlens2 = scaler.transform(np.linalg.norm(new_patient_data, axis=1).reshape(1, -1))
+
+newlens = np.c_[newlens1, newlens2]
+
+# find the nearest nodes
+nn = neighbors.NearestNeighbors(n_neighbors=3)
+node_ids = mapper.nearest_nodes(newlens, new_patient_data, graph, mapper.cover, lens, X, nn)
+
+print("Nearest nodes:")
+for node_id in node_ids:
+    diags = y[graph["nodes"][node_id]]
+    print("  {}: diagnosis {:.1f}%".format(node_id, np.sum(diags) * 100.0 / len(diags)))
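One caveat when adapting this example: nearest_nodes returns an empty array when the new point's lens values fall outside the fitted cover, so downstream code may want a guard along these lines (illustrative only):

    if len(node_ids) == 0:
        print("New patient data falls outside the cover; no nearby nodes found.")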