diff --git a/libpysal/weights/distance.py b/libpysal/weights/distance.py index 2e9552cd0..4ac0efb83 100644 --- a/libpysal/weights/distance.py +++ b/libpysal/weights/distance.py @@ -12,6 +12,49 @@ import scipy.sparse as sp import numpy as np +def duplicated(array): + """Identify duplicate rows in an array + Parameters + ---------- + array : np.ndarray + (n,k) + Returns + ------- + duplicate : np.ndarray + (n, 3) + First column indicates if the row is a duplicate + Second column indicates if the row is a duplicate of a row with + a lower index + Third column contains the index of the first row that + duplicates current row + Examples + --------- + >>> a = np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5],[1,1,1], + [2,2,2], [1,1,1]]) + >>> duplicated(a) + array([[1, 0, 0], + [1, 0, 0], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0], + [1, 1, 0], + [1, 1, 1], + [1, 1, 0]]) + >>> duplicated(a)[:,0].any() + True + """ + array = np.asarray(array) + n = array.shape[0] + duplicate = np.zeros((n,3), dtype=int) + unq, count = np.unique(array, axis=0, return_counts=True) + repeated_groups = unq[count > 1] + for repeated_group in repeated_groups: + repeated_idx = np.argwhere(np.all(array == repeated_group, axis=1)) + duplicate[repeated_idx, 0] = 1 + duplicate[repeated_idx[1:], 1] = 1 + duplicate[repeated_idx[1:], 2] = repeated_idx[0] + + return duplicate def knnW(data, k=2, p=2, ids=None, radius=None, distance_metric='euclidean'): """ @@ -21,6 +64,7 @@ def knnW(data, k=2, p=2, ids=None, radius=None, distance_metric='euclidean'): return KNN(data, k=k, p=p, ids=ids, radius=radius, distance_metric=distance_metric) + class KNN(W): """ Creates nearest neighbor weights matrix based on k nearest @@ -50,6 +94,27 @@ class KNN(W): instance Weights object with binary weights + See Also + -------- + :class:`libpysal.weights.weights.W` + + Notes + ----- + + Ties between neighbors of equal distance are arbitrarily broken. + + Coincident points can cause challenges for distance based weights since the + distance separating a pair of coincident points is 0 by definition. We + handle this situation as follows. Define `P` as the set of indices for all + points in a data set. The first record in a set of duplicates (i.e., points + with same coordinates) is defined as the coincident seed and the remaining + points that are coincident with the seed are coincident duplicates. Define + `D` as the set of indices for the coincident duplicates. Initial neighbors + are identified using the set `S = P\D` (i.e., the coincident duplicates are + not included initially). Then, each coincident duplicate has its neighbors + set equal to that of its coincident seed. + + Examples -------- >>> import libpysal @@ -75,17 +140,28 @@ class KNN(W): >>> 0 in wnn2.neighbors False - Notes - ----- + coincident points + >>> points = [(10, 10), (20, 10), (10,10), (20,10), (40, 10), + (15, 20), (30, 20), (30, 30)] + >>> wknn2 = KNN.from_array(points, 2) + >>> wknn2.neighbors + {0: [1, 5], + 1: [0, 5], + 4: [6, 1], + 5: [1, 0], + 6: [7, 1], + 7: [6, 5], + 2: [1, 5], + 3: [0, 5]} + + - Ties between neighbors of equal distance are arbitrarily broken. - See Also - -------- - :class:`libpysal.weights.weights.W` """ def __init__(self, data, k=2, p=2, ids=None, radius=None, distance_metric='euclidean', **kwargs): + + if radius is not None: distance_metric='arc' if isKDTree(data): @@ -94,21 +170,46 @@ def __init__(self, data, k=2, p=2, ids=None, radius=None, else: self.kdtree = KDTree(data, radius=radius, distance_metric=distance_metric) self.data = self.kdtree.data + + duplicates = duplicated(self.data) + coincident = duplicates[:,1].any() + + self.duplicates = duplicates self.k = k self.p = p - this_nnq = self.kdtree.query(self.data, k=k+1, p=p) + data = self.data + if coincident: + duplicate_ids = np.nonzero(duplicates[:,1]) + data = self.data[np.nonzero(duplicates[:,1]==0)] + self.kdtree = KDTree(data, radius=radius, distance_metric=distance_metric) + + this_nnq = self.kdtree.query(data, k=k+1, p=p) to_weight = this_nnq[1] if ids is None: ids = list(range(to_weight.shape[0])) neighbors = {} - for i,row in enumerate(to_weight): - row = row.tolist() - row.remove(i) - row = [ids[j] for j in row] - focal = ids[i] - neighbors[focal] = row + if coincident: + unique_ids = np.nonzero(duplicates[:,1]==0)[0] + for i, row in enumerate(to_weight): + row = row.tolist() + row.remove(i) + row = [unique_ids[j] for j in row] + focal = unique_ids[i] + neighbors[focal] = row + for row in duplicate_ids[0]: + neighbors[row] = neighbors[duplicates[row, 2]] + n = self.data.shape[0] + ids = list(range(n)) + else: + for i, row in enumerate(to_weight): + row = row.tolist() + row.remove(i) + row = [ids[j] for j in row] + focal = ids[i] + neighbors[focal] = row + W.__init__(self, neighbors, id_order=ids, **kwargs) @classmethod @@ -249,15 +350,18 @@ def from_dataframe(cls, df, geom_col='geometry', ids=None, *args, **kwargs): if iterable, a list of ids to use for the W if None, df.index is used. + See Also -------- :class:`libpysal.weights.weights.W` """ pts = get_points_array(df[geom_col]) + if ids is None: ids = df.index.tolist() elif isinstance(ids, str): ids = df[ids].tolist() + return cls(pts, *args, ids=ids, **kwargs) def reweight(self, k=None, p=None, new_data=None, new_ids=None, inplace=True): diff --git a/libpysal/weights/tests/test_distance.py b/libpysal/weights/tests/test_distance.py index 455b5e294..d11d8660a 100644 --- a/libpysal/weights/tests/test_distance.py +++ b/libpysal/weights/tests/test_distance.py @@ -20,6 +20,9 @@ class Distance_Mixin(object): arc_path = pysal_examples.get_path('stl_hom.shp') points = [(10, 10), (20, 10), (40, 10), (15, 20), (30, 20), (30, 30)] + coincident_points = [(10, 10), (20, 10), (10,10), + (20,10), (40, 10), (15, 20), + (30, 20), (30, 30)] euclidean_kdt = KDTree(points, distance_metric='euclidean') polygon_f = psopen(polygon_path) # our file handler @@ -74,6 +77,10 @@ def setUp(self): self.known_w2 = [1, 3, 9, 12] self.known_wi3 = 40 self.known_w3 = [31, 38, 45, 49] + self.known_coincident_neighbors = {0: [1, 5], 1: [0, 5], + 4: [6, 1], 5: [1, 0], + 6: [7, 1], 7: [6, 5], + 2: [1, 5], 3: [0, 5]} ########################## # Classmethod tests # @@ -94,16 +101,24 @@ def test_from_array(self): w = d.KNN.from_array(self.poly_centroids, k=4) self.assertEqual(w.neighbors[self.known_wi0], self.known_w0) self.assertEqual(w.neighbors[self.known_wi1], self.known_w1) + w = d.KNN.from_array(self.coincident_points) + self.assertEqual(w.neighbors, self.known_coincident_neighbors) def test_from_shapefile(self): w = d.KNN.from_shapefile(self.polygon_path, k=4) self.assertEqual(w.neighbors[self.known_wi0], self.known_w0) self.assertEqual(w.neighbors[self.known_wi1], self.known_w1) + ########################## # Function/User tests # ########################## + def test_duplicated(self): + p = self.coincident_points + self.assertTrue(d.duplicated(p)[:,0].any()) + + def test_reweight(self): w = d.KNN(self.points, k=2) new_point = [(21,21)] diff --git a/notebooks/knn_coincident.ipynb b/notebooks/knn_coincident.ipynb new file mode 100644 index 000000000..aa7838f02 --- /dev/null +++ b/notebooks/knn_coincident.ipynb @@ -0,0 +1,795 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "import libpysal\n", + "import geopandas\n", + "import pandas\n", + "import copy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "6" + ] + }, + "execution_count": 28, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "points = [(10, 10), (20, 10), (40, 10), (15, 20), (30, 20), (30, 30)]\n", + "\n", + "from libpysal.weights import KNN\n", + "\n", + "wknn2 = KNN.from_array(points, 2)\n", + "wknn2.n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: [1, 3], 1: [0, 3], 2: [4, 1], 3: [1, 0], 4: [5, 1], 5: [4, 3]}" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wknn2.neighbors" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# create duplicate points\n", + "points = [(10, 10), (20, 10), (10,10), (20,10), (40, 10), (15, 20), (30, 20), (30, 30)]\n", + "\n", + "wknn2 = KNN.from_array(points, 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: [1, 5],\n", + " 1: [0, 5],\n", + " 4: [6, 1],\n", + " 5: [1, 0],\n", + " 6: [7, 1],\n", + " 7: [6, 5],\n", + " 2: [1, 5],\n", + " 3: [0, 5]}" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wknn2.neighbors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "balt = libpysal.examples.load_example('Baltimore')" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "import geopandas" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "gdf = geopandas.read_file(balt.get_path(\"baltim.shp\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(211, 18)" + ] + }, + "execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gdf.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "w1 = libpysal.weights.KNN.from_dataframe(gdf, k=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + " | STATION | \n", + "PRICE | \n", + "NROOM | \n", + "DWELL | \n", + "NBATH | \n", + "PATIO | \n", + "FIREPL | \n", + "AC | \n", + "BMENT | \n", + "NSTOR | \n", + "GAR | \n", + "AGE | \n", + "CITCOU | \n", + "LOTSZ | \n", + "SQFT | \n", + "X | \n", + "Y | \n", + "geometry | \n", + "
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | \n", + "1 | \n", + "47.0 | \n", + "4.0 | \n", + "0.0 | \n", + "1.0 | \n", + "0.0 | \n", + "0.0 | \n", + "0.0 | \n", + "2.0 | \n", + "3.0 | \n", + "0.0 | \n", + "148.0 | \n", + "0.0 | \n", + "5.70 | \n", + "11.25 | \n", + "907.0 | \n", + "534.0 | \n", + "POINT (907.000 534.000) | \n", + "
1 | \n", + "2 | \n", + "113.0 | \n", + "7.0 | \n", + "1.0 | \n", + "2.5 | \n", + "1.0 | \n", + "1.0 | \n", + "1.0 | \n", + "2.0 | \n", + "2.0 | \n", + "2.0 | \n", + "9.0 | \n", + "1.0 | \n", + "279.51 | \n", + "28.92 | \n", + "922.0 | \n", + "574.0 | \n", + "POINT (922.000 574.000) | \n", + "
2 | \n", + "3 | \n", + "165.0 | \n", + "7.0 | \n", + "1.0 | \n", + "2.5 | \n", + "1.0 | \n", + "1.0 | \n", + "0.0 | \n", + "3.0 | \n", + "2.0 | \n", + "2.0 | \n", + "23.0 | \n", + "1.0 | \n", + "70.64 | \n", + "30.62 | \n", + "920.0 | \n", + "581.0 | \n", + "POINT (920.000 581.000) | \n", + "
3 | \n", + "4 | \n", + "104.3 | \n", + "7.0 | \n", + "1.0 | \n", + "2.5 | \n", + "1.0 | \n", + "1.0 | \n", + "1.0 | \n", + "2.0 | \n", + "2.0 | \n", + "2.0 | \n", + "5.0 | \n", + "1.0 | \n", + "174.63 | \n", + "26.12 | \n", + "923.0 | \n", + "578.0 | \n", + "POINT (923.000 578.000) | \n", + "
4 | \n", + "5 | \n", + "62.5 | \n", + "7.0 | \n", + "1.0 | \n", + "1.5 | \n", + "1.0 | \n", + "1.0 | \n", + "0.0 | \n", + "2.0 | \n", + "2.0 | \n", + "0.0 | \n", + "19.0 | \n", + "1.0 | \n", + "107.80 | \n", + "22.04 | \n", + "918.0 | \n", + "574.0 | \n", + "POINT (918.000 574.000) | \n", + "