From d212f6f43c07699b8549a16ec74ea328422307e8 Mon Sep 17 00:00:00 2001 From: Serge Rey Date: Sun, 3 May 2020 09:06:08 -0700 Subject: [PATCH 1/6] ENH: handle coincident points in the case of KNN.from_dataframe --- libpysal/weights/distance.py | 48 ++++++++++++++++++++++++++++++++++-- 1 file changed, 46 insertions(+), 2 deletions(-) diff --git a/libpysal/weights/distance.py b/libpysal/weights/distance.py index 2e9552cd0..97d6c8182 100644 --- a/libpysal/weights/distance.py +++ b/libpysal/weights/distance.py @@ -12,6 +12,23 @@ import scipy.sparse as sp import numpy as np +try: + import pandas + PANDAS=True +except ImportError: + PANDAS=False + + + +def _check_duplicates(data): + if PANDAS: + df = pandas.DataFrame(data) + return df.duplicated() + else: + print('pandas is required for _check_duplicates') + return None + + def knnW(data, k=2, p=2, ids=None, radius=None, distance_metric='euclidean'): """ @@ -21,6 +38,7 @@ def knnW(data, k=2, p=2, ids=None, radius=None, distance_metric='euclidean'): return KNN(data, k=k, p=p, ids=ids, radius=radius, distance_metric=distance_metric) + class KNN(W): """ Creates nearest neighbor weights matrix based on k nearest @@ -86,6 +104,8 @@ class KNN(W): """ def __init__(self, data, k=2, p=2, ids=None, radius=None, distance_metric='euclidean', **kwargs): + + if radius is not None: distance_metric='arc' if isKDTree(data): @@ -249,16 +269,40 @@ def from_dataframe(cls, df, geom_col='geometry', ids=None, *args, **kwargs): if iterable, a list of ids to use for the W if None, df.index is used. + Notes + ----- + In the case of coincident points, the first record in a set of duplicates (i.e., points with same coordinates) is defined as the coincident seed and the remaining points in the set are coincident duplicates. Initial weights are defined on the set of unique+coincident seed points (i.e., the coincident duplicates are not included initially). Then each coincident point has its neighbors set equal to that of its coincident seed. 
+ See Also -------- :class:`libpysal.weights.weights.W` """ - pts = get_points_array(df[geom_col]) + duplicate = df[geom_col].duplicated() + coincident = duplicate.any() + if coincident: + df['coincident'] = duplicate.values + pts = get_points_array(df[~duplicate][geom_col]) + else: + pts = get_points_array(df[geom_col]) + if ids is None: ids = df.index.tolist() elif isinstance(ids, str): ids = df[ids].tolist() - return cls(pts, *args, ids=ids, **kwargs) + + if coincident: + ids = [idx for j,idx in enumerate(ids) if not duplicate.values[j]] + df.reset_index(inplace=True) + tmp = cls(pts, *args, ids=ids, **kwargs) + neighbors = copy.deepcopy(tmp.neighbors) + for index, value in df[df['coincident']].iterrows(): + match = value['index'] + neighbors[index] = neighbors[match] + w = W(neighbors=neighbors) + w.k = k + w.p = p + return w + return cls(pts, *args, ids=ids, coincident=coincident, **kwargs) def reweight(self, k=None, p=None, new_data=None, new_ids=None, inplace=True): """ From 9a4d5f0225176dd4d414180f9ce6466764cc6aeb Mon Sep 17 00:00:00 2001 From: Serge Rey Date: Sun, 3 May 2020 10:46:48 -0700 Subject: [PATCH 2/6] Move coincident handling into constructor --- libpysal/weights/distance.py | 87 +++++++++++++++++++++++++++++++----- 1 file changed, 76 insertions(+), 11 deletions(-) diff --git a/libpysal/weights/distance.py b/libpysal/weights/distance.py index 97d6c8182..2b65838b0 100644 --- a/libpysal/weights/distance.py +++ b/libpysal/weights/distance.py @@ -18,7 +18,48 @@ except ImportError: PANDAS=False - +def duplicated(array): + """Identify duplicate rows in an array + Parameters + ---------- + array : np.ndarray + (n,k) + Returns + ------- + duplicate : np.ndarray + (n, 3) + First column indicates if the row is a duplicate + Second column indicates if the row is a duplicate of a row with + a lower index + Third column contains the index of the first row that + duplicates current row + Examples + --------- + >>> a = 
np.array([[1,1,1],[2,2,2],[3,3,3],[4,4,4],[5,5,5],[1,1,1], + [2,2,2], [1,1,1]]) + >>> duplicated(a) + array([[1, 0, 0], + [1, 0, 0], + [0, 0, 0], + [0, 0, 0], + [0, 0, 0], + [1, 1, 0], + [1, 1, 1], + [1, 1, 0]]) + >>> duplicated(a)[:,0].any() + True + """ + n = array.shape[0] + duplicate = np.zeros((n,3), dtype=int) + unq, count = np.unique(array, axis=0, return_counts=True) + repeated_groups = unq[count > 1] + for repeated_group in repeated_groups: + repeated_idx = np.argwhere(np.all(array == repeated_group, axis=1)) + duplicate[repeated_idx, 0] = 1 + duplicate[repeated_idx[1:], 1] = 1 + duplicate[repeated_idx[1:], 2] = repeated_idx[0] + + return duplicate def _check_duplicates(data): if PANDAS: @@ -114,21 +155,47 @@ def __init__(self, data, k=2, p=2, ids=None, radius=None, else: self.kdtree = KDTree(data, radius=radius, distance_metric=distance_metric) self.data = self.kdtree.data + + duplicates = duplicated(self.data) + coincident = duplicates[:,1].any() + + self.duplicates = duplicates self.k = k self.p = p - this_nnq = self.kdtree.query(self.data, k=k+1, p=p) + data = self.data + if coincident: + duplicate_ids = np.nonzero(duplicates[:,1]) + data = self.data[np.nonzero(duplicates[:,1]==0)] + self.kdtree = KDTree(data, radius=radius, distance_metric=distance_metric) + + this_nnq = self.kdtree.query(data, k=k+1, p=p) to_weight = this_nnq[1] if ids is None: ids = list(range(to_weight.shape[0])) neighbors = {} - for i,row in enumerate(to_weight): - row = row.tolist() - row.remove(i) - row = [ids[j] for j in row] - focal = ids[i] - neighbors[focal] = row + if coincident: + unique_ids = np.nonzero(duplicates[:,1]==0)[0] + print(unique_ids) + for i, row in enumerate(to_weight): + row = row.tolist() + row.remove(i) + row = [unique_ids[j] for j in row] + focal = unique_ids[i] + neighbors[focal] = row + print(neighbors) + for row in duplicate_ids[0]: + neighbors[row] = neighbors[duplicates[row, 2]] + n = self.data.shape[0] + ids = list(range(n)) + else: + for i,row in 
enumerate(to_weight): + row = row.tolist() + row.remove(i) + row = [ids[j] for j in row] + focal = ids[i] + neighbors[focal] = row W.__init__(self, neighbors, id_order=ids, **kwargs) @classmethod @@ -299,10 +366,8 @@ def from_dataframe(cls, df, geom_col='geometry', ids=None, *args, **kwargs): match = value['index'] neighbors[index] = neighbors[match] w = W(neighbors=neighbors) - w.k = k - w.p = p return w - return cls(pts, *args, ids=ids, coincident=coincident, **kwargs) + return cls(pts, *args, ids=ids, **kwargs) def reweight(self, k=None, p=None, new_data=None, new_ids=None, inplace=True): """ From a5e0dc18c21c979cfbdc212f5c8760994d1ddbcb Mon Sep 17 00:00:00 2001 From: Serge Rey Date: Sun, 3 May 2020 11:08:05 -0700 Subject: [PATCH 3/6] numpydoc format --- libpysal/weights/distance.py | 73 ++++++++++++------------------------ 1 file changed, 25 insertions(+), 48 deletions(-) diff --git a/libpysal/weights/distance.py b/libpysal/weights/distance.py index 2b65838b0..f15d01870 100644 --- a/libpysal/weights/distance.py +++ b/libpysal/weights/distance.py @@ -61,16 +61,6 @@ def duplicated(array): return duplicate -def _check_duplicates(data): - if PANDAS: - df = pandas.DataFrame(data) - return df.duplicated() - else: - print('pandas is required for _check_duplicates') - return None - - - def knnW(data, k=2, p=2, ids=None, radius=None, distance_metric='euclidean'): """ This is deprecated. Use the pysal.weights.KNN class instead. @@ -109,6 +99,22 @@ class KNN(W): instance Weights object with binary weights + See Also + -------- + :class:`libpysal.weights.weights.W` + + Notes + ----- + + Ties between neighbors of equal distance are arbitrarily broken. + + In the case of coincident points, the first record in a set of duplicates + (i.e., points with same coordinates) is defined as the coincident seed and + the remaining points in the set are coincident duplicates. 
Initial neighbors are identified using the the set of unique+coincident seed points (i.e., the + coincident duplicates are not included initially). Then, each coincident + duplicate has its neighbors set equal to that of its coincident seed. + + Examples -------- >>> import libpysal @@ -133,15 +139,6 @@ class KNN(W): {1: 1.0, 4: 1.0} >>> 0 in wnn2.neighbors False - - Notes - ----- - - Ties between neighbors of equal distance are arbitrarily broken. - - See Also - -------- - :class:`libpysal.weights.weights.W` """ def __init__(self, data, k=2, p=2, ids=None, radius=None, distance_metric='euclidean', **kwargs): @@ -177,25 +174,24 @@ def __init__(self, data, k=2, p=2, ids=None, radius=None, neighbors = {} if coincident: unique_ids = np.nonzero(duplicates[:,1]==0)[0] - print(unique_ids) for i, row in enumerate(to_weight): row = row.tolist() row.remove(i) row = [unique_ids[j] for j in row] focal = unique_ids[i] neighbors[focal] = row - print(neighbors) for row in duplicate_ids[0]: neighbors[row] = neighbors[duplicates[row, 2]] n = self.data.shape[0] ids = list(range(n)) else: - for i,row in enumerate(to_weight): - row = row.tolist() - row.remove(i) - row = [ids[j] for j in row] - focal = ids[i] - neighbors[focal] = row + for i, row in enumerate(to_weight): + row = row.tolist() + row.remove(i) + row = [ids[j] for j in row] + focal = ids[i] + neighbors[focal] = row + W.__init__(self, neighbors, id_order=ids, **kwargs) @classmethod @@ -336,37 +332,18 @@ def from_dataframe(cls, df, geom_col='geometry', ids=None, *args, **kwargs): if iterable, a list of ids to use for the W if None, df.index is used. - Notes - ----- - In the case of coincident points, the first record in a set of duplicates (i.e., points with same coordinates) is defined as the coincident seed and the remaining points in the set are coincident duplicates. Initial weights are defined on the set of unique+coincident seed points (i.e., the coincident duplicates are not included initially). 
Then each coincident point has its neighbors set equal to that of its coincident seed. - + See Also -------- :class:`libpysal.weights.weights.W` """ - duplicate = df[geom_col].duplicated() - coincident = duplicate.any() - if coincident: - df['coincident'] = duplicate.values - pts = get_points_array(df[~duplicate][geom_col]) - else: - pts = get_points_array(df[geom_col]) + pts = get_points_array(df[geom_col]) if ids is None: ids = df.index.tolist() elif isinstance(ids, str): ids = df[ids].tolist() - if coincident: - ids = [idx for j,idx in enumerate(ids) if not duplicate.values[j]] - df.reset_index(inplace=True) - tmp = cls(pts, *args, ids=ids, **kwargs) - neighbors = copy.deepcopy(tmp.neighbors) - for index, value in df[df['coincident']].iterrows(): - match = value['index'] - neighbors[index] = neighbors[match] - w = W(neighbors=neighbors) - return w return cls(pts, *args, ids=ids, **kwargs) def reweight(self, k=None, p=None, new_data=None, new_ids=None, inplace=True): From 2552d9ed56282f73efbe876e8576bd64fef7bec8 Mon Sep 17 00:00:00 2001 From: Serge Rey Date: Sun, 3 May 2020 11:38:41 -0700 Subject: [PATCH 4/6] tests for coincident points --- libpysal/weights/tests/test_distance.py | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/libpysal/weights/tests/test_distance.py b/libpysal/weights/tests/test_distance.py index 455b5e294..d11d8660a 100644 --- a/libpysal/weights/tests/test_distance.py +++ b/libpysal/weights/tests/test_distance.py @@ -20,6 +20,9 @@ class Distance_Mixin(object): arc_path = pysal_examples.get_path('stl_hom.shp') points = [(10, 10), (20, 10), (40, 10), (15, 20), (30, 20), (30, 30)] + coincident_points = [(10, 10), (20, 10), (10,10), + (20,10), (40, 10), (15, 20), + (30, 20), (30, 30)] euclidean_kdt = KDTree(points, distance_metric='euclidean') polygon_f = psopen(polygon_path) # our file handler @@ -74,6 +77,10 @@ def setUp(self): self.known_w2 = [1, 3, 9, 12] self.known_wi3 = 40 self.known_w3 = [31, 38, 45, 49] + 
self.known_coincident_neighbors = {0: [1, 5], 1: [0, 5], + 4: [6, 1], 5: [1, 0], + 6: [7, 1], 7: [6, 5], + 2: [1, 5], 3: [0, 5]} ########################## # Classmethod tests # @@ -94,16 +101,24 @@ def test_from_array(self): w = d.KNN.from_array(self.poly_centroids, k=4) self.assertEqual(w.neighbors[self.known_wi0], self.known_w0) self.assertEqual(w.neighbors[self.known_wi1], self.known_w1) + w = d.KNN.from_array(self.coincident_points) + self.assertEqual(w.neighbors, self.known_coincident_neighbors) def test_from_shapefile(self): w = d.KNN.from_shapefile(self.polygon_path, k=4) self.assertEqual(w.neighbors[self.known_wi0], self.known_w0) self.assertEqual(w.neighbors[self.known_wi1], self.known_w1) + ########################## # Function/User tests # ########################## + def test_duplicated(self): + p = self.coincident_points + self.assertTrue(d.duplicated(p)[:,0].any()) + + def test_reweight(self): w = d.KNN(self.points, k=2) new_point = [(21,21)] From c391ce2fea9e01e710f729549504f2d5233626e2 Mon Sep 17 00:00:00 2001 From: Serge Rey Date: Sun, 3 May 2020 11:39:30 -0700 Subject: [PATCH 5/6] add coincident nb --- notebooks/knn_coincident.ipynb | 795 +++++++++++++++++++++++++++++++++ 1 file changed, 795 insertions(+) create mode 100644 notebooks/knn_coincident.ipynb diff --git a/notebooks/knn_coincident.ipynb b/notebooks/knn_coincident.ipynb new file mode 100644 index 000000000..aa7838f02 --- /dev/null +++ b/notebooks/knn_coincident.ipynb @@ -0,0 +1,795 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 27, + "metadata": {}, + "outputs": [], + "source": [ + "import libpysal\n", + "import geopandas\n", + "import pandas\n", + "import copy" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 28, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "6" + ] + }, + "execution_count": 28, + "metadata": {}, + 
"output_type": "execute_result" + } + ], + "source": [ + "points = [(10, 10), (20, 10), (40, 10), (15, 20), (30, 20), (30, 30)]\n", + "\n", + "from libpysal.weights import KNN\n", + "\n", + "wknn2 = KNN.from_array(points, 2)\n", + "wknn2.n" + ] + }, + { + "cell_type": "code", + "execution_count": 29, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: [1, 3], 1: [0, 3], 2: [4, 1], 3: [1, 0], 4: [5, 1], 5: [4, 3]}" + ] + }, + "execution_count": 29, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wknn2.neighbors" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": {}, + "outputs": [], + "source": [ + "# create duplicate points\n", + "points = [(10, 10), (20, 10), (10,10), (20,10), (40, 10), (15, 20), (30, 20), (30, 30)]\n", + "\n", + "wknn2 = KNN.from_array(points, 2)" + ] + }, + { + "cell_type": "code", + "execution_count": 31, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: [1, 5],\n", + " 1: [0, 5],\n", + " 4: [6, 1],\n", + " 5: [1, 0],\n", + " 6: [7, 1],\n", + " 7: [6, 5],\n", + " 2: [1, 5],\n", + " 3: [0, 5]}" + ] + }, + "execution_count": 31, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "wknn2.neighbors" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [] + }, + { + "cell_type": "code", + "execution_count": 32, + "metadata": {}, + "outputs": [], + "source": [ + "balt = libpysal.examples.load_example('Baltimore')" + ] + }, + { + "cell_type": "code", + "execution_count": 33, + "metadata": {}, + "outputs": [], + "source": [ + "import geopandas" + ] + }, + { + "cell_type": "code", + "execution_count": 34, + "metadata": {}, + "outputs": [], + "source": [ + "gdf = geopandas.read_file(balt.get_path(\"baltim.shp\"))" + ] + }, + { + "cell_type": "code", + "execution_count": 35, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(211, 18)" + ] + }, + 
"execution_count": 35, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gdf.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 36, + "metadata": {}, + "outputs": [], + "source": [ + "w1 = libpysal.weights.KNN.from_dataframe(gdf, k=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 37, + "metadata": {}, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
STATIONPRICENROOMDWELLNBATHPATIOFIREPLACBMENTNSTORGARAGECITCOULOTSZSQFTXYgeometry
0147.04.00.01.00.00.00.02.03.00.0148.00.05.7011.25907.0534.0POINT (907.000 534.000)
12113.07.01.02.51.01.01.02.02.02.09.01.0279.5128.92922.0574.0POINT (922.000 574.000)
23165.07.01.02.51.01.00.03.02.02.023.01.070.6430.62920.0581.0POINT (920.000 581.000)
34104.37.01.02.51.01.01.02.02.02.05.01.0174.6326.12923.0578.0POINT (923.000 578.000)
4562.57.01.01.51.01.00.02.02.00.019.01.0107.8022.04918.0574.0POINT (918.000 574.000)
\n", + "
" + ], + "text/plain": [ + " STATION PRICE NROOM DWELL NBATH PATIO FIREPL AC BMENT NSTOR GAR \\\n", + "0 1 47.0 4.0 0.0 1.0 0.0 0.0 0.0 2.0 3.0 0.0 \n", + "1 2 113.0 7.0 1.0 2.5 1.0 1.0 1.0 2.0 2.0 2.0 \n", + "2 3 165.0 7.0 1.0 2.5 1.0 1.0 0.0 3.0 2.0 2.0 \n", + "3 4 104.3 7.0 1.0 2.5 1.0 1.0 1.0 2.0 2.0 2.0 \n", + "4 5 62.5 7.0 1.0 1.5 1.0 1.0 0.0 2.0 2.0 0.0 \n", + "\n", + " AGE CITCOU LOTSZ SQFT X Y geometry \n", + "0 148.0 0.0 5.70 11.25 907.0 534.0 POINT (907.000 534.000) \n", + "1 9.0 1.0 279.51 28.92 922.0 574.0 POINT (922.000 574.000) \n", + "2 23.0 1.0 70.64 30.62 920.0 581.0 POINT (920.000 581.000) \n", + "3 5.0 1.0 174.63 26.12 923.0 578.0 POINT (923.000 578.000) \n", + "4 19.0 1.0 107.80 22.04 918.0 574.0 POINT (918.000 574.000) " + ] + }, + "execution_count": 37, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gdf.head()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": {}, + "outputs": [], + "source": [ + "gdf1 = gdf.iloc[[0,1,2,3,4]]" + ] + }, + { + "cell_type": "code", + "execution_count": 39, + "metadata": {}, + "outputs": [], + "source": [ + "gdf2 = pandas.concat([gdf, gdf1, gdf1, gdf1])" + ] + }, + { + "cell_type": "code", + "execution_count": 40, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "0 False\n", + "1 False\n", + "2 False\n", + "3 False\n", + "4 False\n", + " ... 
\n", + "0 True\n", + "1 True\n", + "2 True\n", + "3 True\n", + "4 True\n", + "Length: 226, dtype: bool" + ] + }, + "execution_count": 40, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gdf2.duplicated()" + ] + }, + { + "cell_type": "code", + "execution_count": 41, + "metadata": {}, + "outputs": [ + { + "name": "stderr", + "output_type": "stream", + "text": [ + "/home/serge/Dropbox/p/pysal/src/subpackages/libpysal/libpysal/weights/weights.py:172: UserWarning: The weights matrix is not fully connected: \n", + " There are 10 disconnected components.\n", + " warnings.warn(message)\n" + ] + } + ], + "source": [ + "w2 = libpysal.weights.KNN.from_dataframe(gdf2, k=2)" + ] + }, + { + "cell_type": "code", + "execution_count": 42, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "226" + ] + }, + "execution_count": 42, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w2.n" + ] + }, + { + "cell_type": "code", + "execution_count": 43, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 43, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w1.neighbors[0] == w2.neighbors[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 44, + "metadata": {}, + "outputs": [], + "source": [ + "W1 = w1.full()[0]\n", + "W2 = w2.full()[0]" + ] + }, + { + "cell_type": "code", + "execution_count": 45, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "True" + ] + }, + "execution_count": 45, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "(W1 == W2[:w1.n,:w1.n]).all()" + ] + }, + { + "cell_type": "code", + "execution_count": 46, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(226, 226)" + ] + }, + "execution_count": 46, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "W2.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 
47, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "(211, 211)" + ] + }, + "execution_count": 47, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "W1.shape" + ] + }, + { + "cell_type": "code", + "execution_count": 48, + "metadata": {}, + "outputs": [ + { + "data": { + "text/plain": [ + "{0: 2,\n", + " 1: 2,\n", + " 2: 2,\n", + " 3: 2,\n", + " 4: 2,\n", + " 5: 2,\n", + " 6: 2,\n", + " 7: 2,\n", + " 8: 2,\n", + " 9: 2,\n", + " 10: 2,\n", + " 11: 2,\n", + " 12: 2,\n", + " 13: 2,\n", + " 14: 2,\n", + " 15: 2,\n", + " 16: 2,\n", + " 17: 2,\n", + " 18: 2,\n", + " 19: 2,\n", + " 20: 2,\n", + " 21: 2,\n", + " 22: 2,\n", + " 23: 2,\n", + " 24: 2,\n", + " 25: 2,\n", + " 26: 2,\n", + " 27: 2,\n", + " 28: 2,\n", + " 29: 2,\n", + " 30: 2,\n", + " 31: 2,\n", + " 32: 2,\n", + " 33: 2,\n", + " 34: 2,\n", + " 35: 2,\n", + " 36: 2,\n", + " 37: 2,\n", + " 38: 2,\n", + " 39: 2,\n", + " 40: 2,\n", + " 41: 2,\n", + " 42: 2,\n", + " 43: 2,\n", + " 44: 2,\n", + " 45: 2,\n", + " 46: 2,\n", + " 47: 2,\n", + " 48: 2,\n", + " 49: 2,\n", + " 50: 2,\n", + " 51: 2,\n", + " 52: 2,\n", + " 53: 2,\n", + " 54: 2,\n", + " 55: 2,\n", + " 56: 2,\n", + " 57: 2,\n", + " 58: 2,\n", + " 59: 2,\n", + " 60: 2,\n", + " 61: 2,\n", + " 62: 2,\n", + " 63: 2,\n", + " 64: 2,\n", + " 65: 2,\n", + " 66: 2,\n", + " 67: 2,\n", + " 68: 2,\n", + " 69: 2,\n", + " 70: 2,\n", + " 71: 2,\n", + " 72: 2,\n", + " 73: 2,\n", + " 74: 2,\n", + " 75: 2,\n", + " 76: 2,\n", + " 77: 2,\n", + " 78: 2,\n", + " 79: 2,\n", + " 80: 2,\n", + " 81: 2,\n", + " 82: 2,\n", + " 83: 2,\n", + " 84: 2,\n", + " 85: 2,\n", + " 86: 2,\n", + " 87: 2,\n", + " 88: 2,\n", + " 89: 2,\n", + " 90: 2,\n", + " 91: 2,\n", + " 92: 2,\n", + " 93: 2,\n", + " 94: 2,\n", + " 95: 2,\n", + " 96: 2,\n", + " 97: 2,\n", + " 98: 2,\n", + " 99: 2,\n", + " 100: 2,\n", + " 101: 2,\n", + " 102: 2,\n", + " 103: 2,\n", + " 104: 2,\n", + " 105: 2,\n", + " 106: 2,\n", + " 107: 2,\n", + " 108: 2,\n", + " 109: 2,\n", + " 110: 
2,\n", + " 111: 2,\n", + " 112: 2,\n", + " 113: 2,\n", + " 114: 2,\n", + " 115: 2,\n", + " 116: 2,\n", + " 117: 2,\n", + " 118: 2,\n", + " 119: 2,\n", + " 120: 2,\n", + " 121: 2,\n", + " 122: 2,\n", + " 123: 2,\n", + " 124: 2,\n", + " 125: 2,\n", + " 126: 2,\n", + " 127: 2,\n", + " 128: 2,\n", + " 129: 2,\n", + " 130: 2,\n", + " 131: 2,\n", + " 132: 2,\n", + " 133: 2,\n", + " 134: 2,\n", + " 135: 2,\n", + " 136: 2,\n", + " 137: 2,\n", + " 138: 2,\n", + " 139: 2,\n", + " 140: 2,\n", + " 141: 2,\n", + " 142: 2,\n", + " 143: 2,\n", + " 144: 2,\n", + " 145: 2,\n", + " 146: 2,\n", + " 147: 2,\n", + " 148: 2,\n", + " 149: 2,\n", + " 150: 2,\n", + " 151: 2,\n", + " 152: 2,\n", + " 153: 2,\n", + " 154: 2,\n", + " 155: 2,\n", + " 156: 2,\n", + " 157: 2,\n", + " 158: 2,\n", + " 159: 2,\n", + " 160: 2,\n", + " 161: 2,\n", + " 162: 2,\n", + " 163: 2,\n", + " 164: 2,\n", + " 165: 2,\n", + " 166: 2,\n", + " 167: 2,\n", + " 168: 2,\n", + " 169: 2,\n", + " 170: 2,\n", + " 171: 2,\n", + " 172: 2,\n", + " 173: 2,\n", + " 174: 2,\n", + " 175: 2,\n", + " 176: 2,\n", + " 177: 2,\n", + " 178: 2,\n", + " 179: 2,\n", + " 180: 2,\n", + " 181: 2,\n", + " 182: 2,\n", + " 183: 2,\n", + " 184: 2,\n", + " 185: 2,\n", + " 186: 2,\n", + " 187: 2,\n", + " 188: 2,\n", + " 189: 2,\n", + " 190: 2,\n", + " 191: 2,\n", + " 192: 2,\n", + " 193: 2,\n", + " 194: 2,\n", + " 195: 2,\n", + " 196: 2,\n", + " 197: 2,\n", + " 198: 2,\n", + " 199: 2,\n", + " 200: 2,\n", + " 201: 2,\n", + " 202: 2,\n", + " 203: 2,\n", + " 204: 2,\n", + " 205: 2,\n", + " 206: 2,\n", + " 207: 2,\n", + " 208: 2,\n", + " 209: 2,\n", + " 210: 2,\n", + " 211: 2,\n", + " 212: 2,\n", + " 213: 2,\n", + " 214: 2,\n", + " 215: 2,\n", + " 216: 2,\n", + " 217: 2,\n", + " 218: 2,\n", + " 219: 2,\n", + " 220: 2,\n", + " 221: 2,\n", + " 222: 2,\n", + " 223: 2,\n", + " 224: 2,\n", + " 225: 2}" + ] + }, + "execution_count": 48, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "w2.cardinalities" + ] + } + ], + 
"metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.7.6" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} From 523b07ff459112122ba4c03a0407d23f49c675c7 Mon Sep 17 00:00:00 2001 From: Serge Rey Date: Sun, 3 May 2020 11:41:22 -0700 Subject: [PATCH 6/6] remove pandas check --- libpysal/weights/distance.py | 40 ++++++++++++++++++++++++++---------- 1 file changed, 29 insertions(+), 11 deletions(-) diff --git a/libpysal/weights/distance.py b/libpysal/weights/distance.py index f15d01870..4ac0efb83 100644 --- a/libpysal/weights/distance.py +++ b/libpysal/weights/distance.py @@ -12,12 +12,6 @@ import scipy.sparse as sp import numpy as np -try: - import pandas - PANDAS=True -except ImportError: - PANDAS=False - def duplicated(array): """Identify duplicate rows in an array Parameters @@ -49,6 +43,7 @@ def duplicated(array): >>> duplicated(a)[:,0].any() True """ + array = np.asarray(array) n = array.shape[0] duplicate = np.zeros((n,3), dtype=int) unq, count = np.unique(array, axis=0, return_counts=True) @@ -108,11 +103,16 @@ class KNN(W): Ties between neighbors of equal distance are arbitrarily broken. - In the case of coincident points, the first record in a set of duplicates - (i.e., points with same coordinates) is defined as the coincident seed and - the remaining points in the set are coincident duplicates. Initial neighbors are identified using the the set of unique+coincident seed points (i.e., the - coincident duplicates are not included initially). Then, each coincident - duplicate has its neighbors set equal to that of its coincident seed. 
+ Coincident points can cause challenges for distance-based weights since the + distance separating a pair of coincident points is 0 by definition. We + handle this situation as follows. Define `P` as the set of indices for all + points in a data set. The first record in a set of duplicates (i.e., points + with same coordinates) is defined as the coincident seed and the remaining + points that are coincident with the seed are coincident duplicates. Define + `D` as the set of indices for the coincident duplicates. Initial neighbors + are identified using the set `S = P\\D` (i.e., the coincident duplicates are + not included initially). Then, each coincident duplicate has its neighbors + set equal to that of its coincident seed. Examples @@ -139,6 +139,24 @@ class KNN(W): {1: 1.0, 4: 1.0} >>> 0 in wnn2.neighbors False + + coincident points + >>> points = [(10, 10), (20, 10), (10,10), (20,10), (40, 10), + ... (15, 20), (30, 20), (30, 30)] + >>> wknn2 = KNN.from_array(points, 2) + >>> wknn2.neighbors + {0: [1, 5], + 1: [0, 5], + 4: [6, 1], + 5: [1, 0], + 6: [7, 1], + 7: [6, 5], + 2: [1, 5], + 3: [0, 5]} + + + + + """ def __init__(self, data, k=2, p=2, ids=None, radius=None, distance_metric='euclidean', **kwargs):