From 689493023e01ee1abf5cf93b1d4c36a8752df81b Mon Sep 17 00:00:00 2001 From: jnclelland Date: Wed, 12 Jul 2023 14:11:41 -0600 Subject: [PATCH 01/15] Lots of new stuff here! I'll post a long comment in the slack channel. --- maup/__init__.py | 28 +- maup/adjacencies.py | 43 +- maup/assign.py | 2 + maup/indexed_geometries.py | 34 +- maup/intersections.py | 26 +- maup/repair.py | 1125 ++++++++++++++++++++++++++++++------ maup/repair_old.py | 378 ++++++++++++ 7 files changed, 1429 insertions(+), 207 deletions(-) create mode 100644 maup/repair_old.py diff --git a/maup/__init__.py b/maup/__init__.py index ad21d21..31d9f78 100644 --- a/maup/__init__.py +++ b/maup/__init__.py @@ -2,10 +2,22 @@ from .assign import assign from .indexed_geometries import IndexedGeometries from .intersections import intersections, prorate -from .repair import close_gaps, resolve_overlaps, make_valid, autorepair, snap_to_grid, crop_to, expand_to, doctor +from .repair import close_gaps, autorepair, snap_to_grid, crop_to, doctor from .normalize import normalize from .progress_bar import progress +# Moved the source for the import of make_valid from .repair to shapely since it lives +# there now and the maup version should be removed. (Also removing make_valid from the +# __all__ list below.) +from shapely import make_valid + +# For the old autorepair functions, uncomment the following line, along with the +# corresponding line in the __all__ assignments below: + +# from .repair_old import close_gaps_old, resolve_overlaps_old, autorepair_old + + + import geopandas # warn about https://github.com/geopandas/geopandas/issues/2199 @@ -18,17 +30,17 @@ __version__ = "1.0.8" __all__ = [ + "adjacencies", "assign", + "IndexedGeometries", "intersections", "prorate", - "adjacencies", "close_gaps", - "resolve_overlaps", + "autorepair", "snap_to_grid", - "IndexedGeometries", + "crop_to", + "doctor", "normalize", - "progress", - "make_valid", - "autorepair", - "doctor" + "progress" ] +# + ["autorepair_old", "close_gaps_old", "resolve_overlaps_old"] diff --git a/maup/adjacencies.py b/maup/adjacencies.py index 517b19a..73aaf89 100644 --- a/maup/adjacencies.py +++ b/maup/adjacencies.py @@ -1,8 +1,10 @@ import warnings -from geopandas import GeoSeries +from geopandas import GeoSeries, GeoDataFrame -from .indexed_geometries import IndexedGeometries +from shapely import make_valid + +from .indexed_geometries import IndexedGeometries, get_geometries from .progress_bar import progress @@ -14,7 +16,7 @@ class IslandWarning(UserWarning): pass -def iter_adjacencies(geometries): +def iter_adjacencies(geometries): indexed = IndexedGeometries(geometries) for i, geometry in progress(indexed.geometries.items(), len(indexed.geometries)): possible = indexed.query(geometry) @@ -25,20 +27,41 @@ def iter_adjacencies(geometries): yield (i, j), inter +# Added option to output result as GeoDataFrame instead of GeoSeries def adjacencies( - geometries, adjacency_type="rook", *, warn_for_overlaps=True, warn_for_islands=True + geometries, + output_type="geoseries", + adjacency_type="rook", *, warn_for_overlaps=True, warn_for_islands=True ): - """Returns adjacencies between geometries. The return type is a + """Returns adjacencies between geometries. + The default return type is a `GeoSeries` with a `MultiIndex`, whose (i, j)th entry is the pairwise intersection between geometry `i` and geometry `j`. We ensure that `i < j` always holds, so that any adjacency is represented just once in the series. 
+    If output_type == "geodataframe", the return type is a range-indexed GeoDataFrame
+    with a "neighbors" column containing the pair (i, j) for the geometry given by
+    the intersection of geometry `i` and geometry `j`.
+
     """
     if adjacency_type not in ["rook", "queen"]:
         raise ValueError('adjacency_type must be "rook" or "queen"')
 
-    index, geoms = zip(*iter_adjacencies(geometries))
-    inters = GeoSeries(geoms, index=index, crs=geometries.crs)
+    geometries = get_geometries(geometries)
+    geometries = make_valid(geometries)
+
+# Change here suggested by Hazel to handle the case where there are no adjacencies:
+    adjs = list(iter_adjacencies(geometries))
+    if adjs:
+        index, geoms = zip(*adjs)
+    else:
+        index, geoms = [], []
+
+# Made a change here to add the GeoDataFrame option:
+    if output_type == "geodataframe":
+        inters = GeoDataFrame({"neighbors": index, "geometry": geoms}, crs=geometries.crs)
+    else:
+        inters = GeoSeries(geoms, index=index, crs=geometries.crs)
 
     if adjacency_type == "rook":
         inters = inters[inters.length > 0]
@@ -53,8 +76,12 @@ def adjacencies(
                 OverlapWarning,
             )
 
+# Made a change here to accommodate the GeoDataFrame option:
     if warn_for_islands:
-        islands = set(geometries.index) - set(i for pair in inters.index for i in pair)
+        if output_type == "geodataframe":
+            islands = set(geometries.index) - set(i for pair in inters["neighbors"] for i in pair)
+        else:
+            islands = set(geometries.index) - set(i for pair in inters.index for i in pair)
         if len(islands) > 0:
             warnings.warn(
                 "Found islands.\n" "Indices of islands: {}".format(islands),
diff --git a/maup/assign.py b/maup/assign.py
index d28d4ef..ce4a194 100644
--- a/maup/assign.py
+++ b/maup/assign.py
@@ -24,6 +24,8 @@ def assign(sources, targets):
         dtype="float"
     )
     assignment.update(assignments_by_area)
+
+# We should add a warning here if there are still unassigned source geometries!
     return assignment.astype(targets.index.dtype, errors="ignore")
 
 
diff --git a/maup/indexed_geometries.py b/maup/indexed_geometries.py
index fc3c74f..2ade161 100644
--- a/maup/indexed_geometries.py
+++ b/maup/indexed_geometries.py
@@ -1,5 +1,8 @@
 import pandas
 import geopandas
+# Added numpy import to handle output of STRtree query
+import numpy
+
 from shapely.prepared import prep
 from shapely.strtree import STRtree
 from .progress_bar import progress
@@ -12,19 +15,33 @@ def get_geometries(geometries):
 class IndexedGeometries:
     def __init__(self, geometries):
         self.geometries = get_geometries(geometries)
+# Lines commented out because this won't work in shapely 2; geometries are now immutable.
+# (But I don't really understand why it was ever here, because it isn't needed!)
+#        for i, geometry in self.geometries.items():
+#            geometry.index = i
         self.spatial_index = STRtree(self.geometries)
+        self.index = self.geometries.index
+
 
-    def query(self, geometry):
-        relevant_indices = [index for index in self.spatial_index.query(geometry)]
-        relevant_geometries = self.geometries.iloc[relevant_indices]
+    def query(self, geometry):
+# Edited because STRtree queries are handled differently in shapely 2; they return
+# a list of INTEGER INDICES (i.e., RangeIndex) of relevant geometries instead of the
+# geometries themselves.
+# IMPORTANT EDGE CASE: When "geometry" is multi-part, this query will return a
+# (2 x n) array instead of a (1 x n) array, so it's safest to flatten the query
+# output before proceeding.
+#    relevant_indices = [geom.index for geom in self.spatial_index.query(geometry)]
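+# A toy illustration of the flatten-and-deduplicate idiom used below (hypothetical
+# three-point tree, not part of this module):
+#     tree = STRtree([Point(0, 0), Point(1, 1), Point(2, 2)])
+#     arr = tree.query(Point(0, 0).buffer(1.5))     # array of integer indices, here 0 and 1
+#     [*set(numpy.ndarray.flatten(arr))]            # e.g. [0, 1], with duplicates removed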
+    If output_type == "geodataframe", the return type is a range-indexed GeoDataFrame
+    with "source" and "target" columns containing the indices i and j, respectively,
+    for the intersection of ``sources[i]`` and ``targets[j]``.
     :param sources: geometries
     :type sources: :class:`~geopandas.GeoSeries` or :class:`~geopandas.GeoDataFrame`
     :param targets: geometries
@@ -22,8 +26,10 @@ def intersections(sources, targets, area_cutoff=None):
         area greater than ``area_cutoff``
     :type area_cutoff: Number or None
     """
+
     reindexed_sources = get_geometries_with_range_index(sources)
     reindexed_targets = get_geometries_with_range_index(targets)
+
     spatially_indexed_sources = IndexedGeometries(reindexed_sources)
 
     records = [
@@ -33,15 +39,23 @@ def intersections(sources, targets, area_cutoff=None):
             reindexed_targets
         )
     ]
+
     df = GeoDataFrame.from_records(records, columns=["source", "target", "geometry"])
+
+    df = df.sort_values(by=["source", "target"]).reset_index(drop=True)
+
+    df.crs = sources.crs
+
     geometries = df.set_index(["source", "target"]).geometry
     geometries.sort_index(inplace=True)
     geometries.crs = sources.crs
 
     if area_cutoff is not None:
+
+        df = df[df.area > area_cutoff].reset_index(drop=True)
         geometries = geometries[geometries.area > area_cutoff]
 
-    return geometries
+    if output_type == "geodataframe":
+        return df
+    else:
+        return geometries
 
 
 def prorate(relationship, data, weights, aggregate_by="sum"):
diff --git a/maup/repair.py b/maup/repair.py
index 3491854..3dbb863 100644
--- a/maup/repair.py
+++ b/maup/repair.py
@@ -1,27 +1,896 @@
 import math
 import pandas
+import geopandas
+from geopandas import GeoSeries, GeoDataFrame
+
 import functools
-import warnings
-from geopandas import GeoSeries, GeoDataFrame
-from shapely.geometry import MultiPolygon, Polygon
-from shapely.ops import unary_union
-# from shapely.validation import make_valid # currently in alpha
+import numpy
+
+import shapely
+from shapely import make_valid
+from shapely.ops import unary_union, split, polygonize, linemerge
+from shapely.geometry import Polygon, MultiPolygon, Point, MultiPoint, LineString, MultiLineString
+from shapely.strtree import STRtree
 
 from .adjacencies import adjacencies
 from .assign import assign_to_max
 from .crs import require_same_crs
 from .indexed_geometries import get_geometries
 from .intersections import intersections
+from .progress_bar import progress
+
+from itertools import combinations
+from math import sqrt
+
+import warnings; warnings.filterwarnings('ignore', 'GeoSeries.isna', UserWarning)
+
+pandas.options.mode.chained_assignment = None
+
+
+
+"""
+Here we define a function "autorepair" that takes a GeoDataFrame and:
+
+(1) Resolves all overlaps.
+
+(2) (optionally) Closes all gaps; default is fill_gaps = True.
+
+(3) (optionally) Replaces all rook adjacencies with length below a
+    user-specified threshold with queen adjacencies. Default is
+    min_rook_length = None; this step will be skipped unless a
+    length threshold is provided.
+
+Note that (2) and (3) are optional but (1) is not! The procedure for (1)
+produces a clean tiling of the region that is necessary for the procedures
+in (2) and (3) to work properly. (If the region is already free of overlaps,
+Step (1) won't change anything.)
+
+The algorithm differs significantly from that of the original maup version and
+avoids the small floating point errors that tend to occur there.
+(HOW, I HEAR YOU ASK???)
+
+ + +Brief sketch of the new gap-closing algorithm: + +(1) For any hole whose boundary touches 3 or fewer polygons, attach it to the + polygon with which it shares the greatest perimeter. (Doesn't really matter; + in practice holes have very small area, and adjacency relationships won't be + affected regardless of where we attach it.) + +(2) For a hole whose boundary touches 4 or more polygons, find the pair of + non-adjacent polygons with the shortest distance between them. Cut out a + piece of the hole and use it to attach these two polygons with a boundary + segment of positive length. (This is a "best guess" for when two polygons + on opposite sides of a long, thin gap should really be adjacent.) + This leaves behind two new holes, each with strictly fewer boundary + polygons than the original. + +(3) Repeat until all holes are filled. + + +Brief sketch of the new rook-to-queen algorithm: + +Let L be a linestring with length below the specified threshold. + +(1) Construct a small disk D that contains the linestring within its interior. + Find all polygons P_1, ..., P_k that intersect D. + +(2) Take the unary_union of all the boundaries of P_1, ..., P_k, and D, and + polygonize this structure. Assign all pieces outside of D to the polygons + that they came from, and assign the pieces inside D to D. (This is + equivalent to taking the difference of each polygon minus D, except that + it works cleanly and reliably!) + +(3) For each of the polygons whose boundary (now) contains an arc of D's boundary + circle, create a "pie wedge" inside D subtending this arc, and join it to + the polygon that it shares a boundary with. + +""" + + + + + +# Some useful short functions: + +# Count the number of connected components of a shapely object: +def num_components(geom): + if geom.is_empty: + return 0 + elif (geom.geom_type == "Polygon") or (geom.geom_type == "Point") or (geom.geom_type == "LineString"): + return 1 + elif (geom.geom_type == "MultiPolygon") or (geom.geom_type == "MultiLineString") or (geom.geom_type == "GeometryCollection"): + return len(geom.geoms) + + """ -Some of these functions are based on the functions in Mary Barker's -check_shapefile_connectivity.py script in @gerrymandr/Preprocessing. +MAIN FUNCTION """ -class AreaCroppingWarning(UserWarning): - pass +def autorepair(geometries0_df, fill_gaps = True, min_rook_length = None, + holes_to_keep = None): + + if isinstance(geometries0_df, geopandas.GeoDataFrame) == False: + raise TypeError(f"Input must be a GeoDataFrame!") + + geometries_df = geometries0_df.copy() + + # Ensure that geometries are 2-D and not 3-D! + for i in geometries_df.index: + geometries_df["geometry"][i] = shapely.wkb.loads( + shapely.wkb.dumps(geometries_df["geometry"][i], output_dimension=2)) + + # Warn if crs is geographic: + if geometries_df.crs is not None: + if geometries_df.crs.is_geographic: + warnings.warn("Geometry is in a geographic CRS. Results from 'length' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.") + + # If holes_to_keep and/or overlaps_to_keep are present, require them to have the + # same CRS as the main shapefile. + if holes_to_keep is not None: + if holes_to_keep.crs != geometries_df.crs: + raise Exception("holes_to_keep must be in the same CRS as the shapefile being repaired.") + + + + # Construct data about overlaps of all orders, plus holes. 
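+To make the floating point issue described above concrete, here is a small
+hedged example (coordinates chosen so that the crossing point x = 2/3 has no
+exact binary representation):
+
+    a = LineString([(0, 0), (2, 1)])
+    b = LineString([(0, 1), (1, 0)])
+    p = a.intersection(b)
+    a.contains(p), b.contains(p)    # typically (False, False)!
+
+And here is a minimal sketch of steps (1)-(3) of the overlap-resolution
+algorithm (illustrative names only, assuming `geometries` is a GeoSeries
+of polygons):
+
+    boundaries_union = unary_union([g.boundary for g in geometries])
+    pieces = list(polygonize(boundaries_union))
+    owners = [
+        {i for i, g in geometries.items()
+         if piece.representative_point().distance(g) == 0}
+        for piece in pieces
+    ]    # len(owners[k]) is the overlap degree of pieces[k]; 0 means a hole
+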
+    overlap_tower, holes_df = building_blocks(geometries_df)
+
+    # Use data from the overlap tower to rebuild precincts with no overlaps.
+    print("Resolving overlaps...")
+    reconstructed_df = reconstruct_from_overlap_tower(geometries_df, overlap_tower)
+
+    # Use data about the holes to fill holes if desired.
+    if fill_gaps == True:
+        print("Closing gaps...")
+        reconstructed_df = close_gaps(reconstructed_df, holes_df)
+
+
+    # Check for precincts that have become (more) disconnected, generally with an extra
+    # component of negligible area. If any are found and the area is negligible,
+    # reassign to an adjacent precinct by shared perimeter.
+    # If the area is not negligible, leave it alone and report it so that a human
+    # can decide what to do about it!
+
+    disconnected_df = reconstructed_df[reconstructed_df["geometry"].apply(lambda x: x.geom_type != "Polygon")]
+    # This will include precincts that were disconnected in the original; need to
+    # filter by whether they got worse.
+
+    if len(disconnected_df) > 0:
+        disconnected_poly_indices = []
+        for ind in disconnected_df.index:
+            if num_components(reconstructed_df["geometry"][ind]) > num_components(geometries0_df["geometry"][ind]):
+                disconnected_poly_indices.append(ind)
+
+        if len(disconnected_poly_indices) > 0:  # These are the ones (if any) that got worse.
+            geometries = get_geometries(reconstructed_df)
+            spatial_index = STRtree(geometries)
+            index_by_iloc = dict((i, list(geometries.index)[i]) for i in range(len(geometries.index)))  # Build indexing dictionary
+
+            for g_ind in disconnected_poly_indices:
+                excess = num_components(reconstructed_df["geometry"][g_ind]) - num_components(geometries0_df["geometry"][g_ind])
+
+                component_num_list = [x for x in range(len(reconstructed_df["geometry"][g_ind].geoms))]
+                component_areas = []
+
+                for c_ind in range(len(reconstructed_df["geometry"][g_ind].geoms)):
+                    component_areas.append((c_ind, reconstructed_df["geometry"][g_ind].geoms[c_ind].area))
+
+                component_areas_sorted = sorted(component_areas, key=lambda tup: tup[1])
+
+                big_area = max([reconstructed_df["geometry"][g_ind].area, geometries0_df["geometry"][g_ind].area])
+
+                for i in range(excess):  # If the ith smallest component has small enough area, find a better polygon to add it to.
+                    c_ind = component_areas_sorted[i][0]
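+                    # (component_areas_sorted is ordered smallest area first, so
+                    # this c_ind picks out the (i+1)-th smallest component.)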
+# (Order zero = hole)
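+# (Hypothetical usage: overlap_tower, holes_df = building_blocks(precincts_df);
+# overlap_tower[0] then holds the degree-1 pieces and holes_df holds the gaps.)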
+ + geometries_df = geometries0_df.copy() + overlap_tower = [df.copy() for df in overlap_tower0] + + geometries_df["geometry"] = Polygon() + + max_overlap_level = len(overlap_tower) + + # Start by assigning all order-1 pieces to the polygon they came from: + + for ind in overlap_tower[0].index: + this_poly_ind = list(overlap_tower[0]["polygon indices"][ind])[0] + this_piece = overlap_tower[0]["geometry"][ind] + geometries_df["geometry"][this_poly_ind] = unary_union([geometries_df["geometry"][this_poly_ind], this_piece]) + + # IMPORTANT: We need to know which geometries were disconnected by removing + # overlaps! Add columns for numbers of components in the original and refined + # geometries to each dataframe for future use. + + geometries_df["num components orig"] = 0 + geometries_df["num components refined"] = 0 + + for ind in geometries_df.index: + geometries_df["num components orig"][ind] = num_components(geometries0_df["geometry"][ind]) + geometries_df["num components refined"][ind] = num_components(geometries_df["geometry"][ind]) + + # Now, start with the order-2 and gradually add overlaps at higher order until done. + + # IMPORTANT: First look for geometries at the top level that were disconnected + # by the refinement process, and give them first dibs at grabbing overlaps + # (regardless of perimeter!) until they are reconnected or run out of overlaps + # to grab. + # (This doesn't always completely work; in rare cases a single overlap + # can disconnect two polygons, and only one of them gets to grab it back. + # This will be addressed at the end, within the main function.) + + geometries_disconnected_df = geometries_df[geometries_df["num components refined"] > geometries_df["num components orig"]] + + for i in range(1, max_overlap_level): + overlaps_df = overlap_tower[i] + overlaps_df_unused_indices = overlaps_df.index.tolist() + # Need to make sure each overlap only gets used once! + + o_spatial_index = STRtree(overlaps_df["geometry"]) # Build STRtree for the overlaps + o_index_by_iloc = dict((i, list(overlaps_df.index)[i]) for i in range(len(overlaps_df))) # Build indexing dictionary + + for g_ind in geometries_disconnected_df.index: + possible_overlap_integer_indices = [*set(numpy.ndarray.flatten(o_spatial_index.query(geometries_disconnected_df["geometry"][g_ind])))] + possible_overlap_indices_0 = [o_index_by_iloc[k] for k in possible_overlap_integer_indices] + possible_overlap_indices = list(set(possible_overlap_indices_0) & set(overlaps_df_unused_indices)) + + + geom_finished = False # Only keep adding things until it gets connected again + + for o_ind in possible_overlap_indices: + # If the corresponding overlap intersects this geometry (and was + # contained in it originally!), grab it. 
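+    # (Work on copies: each geometry is reset to an empty Polygon below and then
+    # rebuilt piece by piece from the overlap tower.)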
+ + if (geom_finished == False) and (g_ind in list(overlaps_df["polygon indices"][o_ind])) and (not geometries_disconnected_df["geometry"][g_ind].intersection(overlaps_df["geometry"][o_ind]).is_empty): + + if (geometries_disconnected_df["geometry"][g_ind].intersection(overlaps_df["geometry"][o_ind])).length > 0: + geometries_disconnected_df["geometry"][g_ind] = unary_union([ + geometries_disconnected_df["geometry"][g_ind], overlaps_df["geometry"][o_ind] + ]) + overlaps_df_unused_indices.remove(o_ind) + if num_components(geometries_disconnected_df["geometry"][g_ind]) == geometries_df["num components orig"][g_ind]: + geom_finished = True + + geometries_df["geometry"][g_ind] = geometries_disconnected_df["geometry"][g_ind] + + if geom_finished == True: + geometries_disconnected_df = geometries_disconnected_df.drop(g_ind) + + # Okay, that's all we can do for the disconnected geometries at this level. + # Go on to filling in the rest of the overlaps by greatest perimeter. + + + g_spatial_index = STRtree(geometries_df["geometry"]) # Build STRtree for the main geometries + g_index_by_iloc = dict((i, list(geometries_df.index)[i]) for i in range(len(geometries_df))) # Build indexing dictionary + + print("Assigning order", i+1, "pieces...") + for o_ind in progress(overlaps_df_unused_indices, len(overlaps_df_unused_indices)): + this_overlap = overlaps_df["geometry"][o_ind] + + shared_perimeters = [] + + possible_geom_integer_indices = [*set(numpy.ndarray.flatten(g_spatial_index.query(overlaps_df["geometry"][o_ind])))] + possible_geom_indices = [g_index_by_iloc[k] for k in possible_geom_integer_indices] + + + for g_ind in possible_geom_indices: + if (g_ind in list(overlaps_df["polygon indices"][o_ind])) and not (this_overlap.boundary).intersection(geometries_df["geometry"][g_ind].boundary).is_empty: + shared_perimeters.append((g_ind, (this_overlap.boundary).intersection(geometries_df["geometry"][g_ind].boundary).length)) + + # This possibility came up in a previous version, but I hope it will be + # obsolete in this version! + if len(shared_perimeters) > 0: + max_shared_perim = sorted(shared_perimeters, key=lambda tup: tup[1])[-1] + poly_to_add_to = max_shared_perim[0] + geometries_df["geometry"][poly_to_add_to] = unary_union( + [geometries_df["geometry"][poly_to_add_to], this_overlap]) + else: + print("Couldn't find a polygon to glue a component of intersection ", o_ind, " to") + + + reconstructed_df = geometries_df.copy() + + del reconstructed_df["num components orig"] + del reconstructed_df["num components refined"] + + return reconstructed_df + + + + +# Close the gaps: + +def close_gaps(geometries0_df, holes0_df): + + geometries_df = geometries0_df.copy() + holes_df = holes0_df.copy() + + # Check whether there are any holes to fill: + if len(holes_df) == 0: + fill_complete = True + else: + fill_complete = False + + + + while fill_complete == False: + + hole_boundaries_df = intersections(holes_df, geometries_df, output_type = "geodataframe") + # Note: all geometries here will (stupidly!) be MultiLineStrings; + # convert to LineStrings before exploding. + + for ind in hole_boundaries_df.index: + if hole_boundaries_df["geometry"][ind].geom_type == "MultiLineString": + hole_boundaries_df["geometry"][ind] = linemerge(hole_boundaries_df["geometry"][ind]) + + hole_boundaries_df = hole_boundaries_df.explode(index_parts=False).reset_index(drop=True) + # All geometries here should be LineStrings, or occasionally Points. 
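+                # (Three conditions below: the geometry is still disconnected, the
+                # overlap originally came from this geometry, and the two actually
+                # intersect.)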
+ + activity_this_round = False # Keep track of whether there's still anything + # left to do + + new_holes = [] # Holes that will be created by partial fills + # to add before next round + + for h_ind in progress(holes_df.index, len(holes_df.index)): + + this_hole = holes_df["geometry"][h_ind] + this_hole_boundaries_df = hole_boundaries_df[hole_boundaries_df["source"] == h_ind] + + if len(set(this_hole_boundaries_df["target"])) <= 3: # Fill the hole! + + shared_perimeters = [] + + for b_ind in this_hole_boundaries_df.index: + g_ind = this_hole_boundaries_df["target"][b_ind] + shared_perimeters.append((b_ind, (this_hole.boundary).intersection(geometries_df["geometry"][g_ind].boundary).length)) + + max_shared_perim = sorted(shared_perimeters, key=lambda tup: tup[1])[-1] + poly_to_add_to = this_hole_boundaries_df["target"][max_shared_perim[0]] + + geometries_df["geometry"][poly_to_add_to] = unary_union( + [geometries_df["geometry"][poly_to_add_to], this_hole]) + holes_df = holes_df.drop(h_ind) + + activity_this_round = True + + else: # Partially fill the hole! + + poly_to_add_to, piece_to_connect, new_holes_this_time = partial_fill_data(this_hole, this_hole_boundaries_df) + + if poly_to_add_to is not None: + # If a relative threshold is set, check that the piece to connect is + # under the threshold relative to poly_to_add_to before adjoining it + # to the polygon. + geometries_df["geometry"][poly_to_add_to] = unary_union( + [geometries_df["geometry"][poly_to_add_to], piece_to_connect]) + holes_df = holes_df.drop(h_ind) + + + for k in range(len(new_holes_this_time)): + new_holes.append(new_holes_this_time[k]) + + activity_this_round = True + + if activity_this_round == True: # Add any new holes for the next round! + for k in range(len(new_holes)): + + hole_to_append_df = geopandas.GeoDataFrame(columns = ["polygon indices"], + geometry = geopandas.GeoSeries([new_holes[k]]), + crs = geometries_df.crs) + hole_to_append_df["polygon indices"][0] = set() + + holes_df = pandas.concat([holes_df, hole_to_append_df], ignore_index=True) + holes_df.reset_index(drop=True) + + + + else: # All done! + fill_complete = True + + + return geometries_df + + + +# Partially fill a hole with 4 or more boundary components by attaching two +# non-adjacent boundary components, leaving behind two new holes, each with strictly +# fewer boundary components than the original. + +def partial_fill_data(hole0, hole_boundaries_df0): + + # Compute all distances between pairs of non-adjacent polygons along the + # boundary of the hole. + + hole = hole0 + hole_boundaries_df = hole_boundaries_df0.copy() + + poly_distances = [] + + for i in hole_boundaries_df.index: + for j in hole_boundaries_df.index: + if j > i: + this_distance = hole_boundaries_df["geometry"][i].distance(hole_boundaries_df["geometry"][j]) + if this_distance != 0: #maybe use not is_close? + poly_distances.append((i, j, this_distance)) + + + # Choose the shortest-distance non-adjacent pair to connect by gluing a piece + # of the hole to one of them. + + + distance_data_sorted = sorted(poly_distances, key=lambda tup: tup[2]) + + poly_pair_found = False + + while poly_pair_found == False and len(distance_data_sorted) > 0: + + min_distance_data = distance_data_sorted[0] + polys_to_connect = (min_distance_data[0], min_distance_data[1]) + + # For each of the two polygons that we want to connect, identify the two + # vertices at the ends of their boundary segment. 
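+        # (linemerge above fuses touching segments: e.g. it turns
+        # MultiLineString([[(0, 0), (1, 0)], [(1, 0), (2, 0)]]) into the single
+        # LineString (0 0, 1 0, 2 0).)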
+ # For each vertex, find the nearest point in the OTHER of the two polygons, + # along with the corresponding distances. + + # Add a check to make sure that the geometry works as expected; if not, we'll + # drop this polygon pair and move on to the next-closest one. + + + boundary_point_coords = [] + + vertices = [[], []] + vertices_nearest_point_pairs = [[], []] + vertices_distances = [[], []] + + for i in range(2): + boundary_point_coords.append([x for x in hole_boundaries_df["geometry"][polys_to_connect[i]].coords]) + vertices[i].append(Point(boundary_point_coords[i][0][0], boundary_point_coords[i][0][1])) + vertices[i].append(Point(boundary_point_coords[i][-1][0], boundary_point_coords[i][-1][1])) + + for i in range(2): + for j in range(2): + vertices_nearest_point_pairs[i].append(shapely.ops.nearest_points(vertices[i][j], MultiPoint(boundary_point_coords[1-i]))) + vertices_distances[i].append(vertices_nearest_point_pairs[i][j][0].distance(vertices_nearest_point_pairs[i][j][1])) + + + # Identify the list positions of the two shortest of these four distances, and + # split the hole with the two corresponding line segments. (Unfortunately this + # requires two steps, because the split operation doesn't support splitting + # with MultiLineStrings.) + + # IT CAN HAPPEN THAT THE TWO SHORTEST LINES DON'T PROVIDE TWO TRUE SPLITS! + # If that happens, try the 3rd and 4th lines. And if THAT doesn't work, go + # back to the distance data and pick a different pair of polygons to connect! + + sorted_distances = sorted([(0,0,vertices_distances[0][0]), (0,1,vertices_distances[0][1]), + (1,0,vertices_distances[1][0]), (1,1,vertices_distances[1][1])], + key = lambda tup: tup[2]) + sorted_line_segments_0 = [] + for k in range(4): + i = sorted_distances[k][0] + j = sorted_distances[k][1] + sorted_line_segments_0.append(LineString(list(vertices_nearest_point_pairs[i][j]))) + + # Ignore any of the line segments that aren't contained within the hole (but + # contained within the boundary is okay - and happens!): + + sorted_line_segments = [] + for k in range(4): + if hole.contains(sorted_line_segments_0[k]) or hole.boundary.contains(sorted_line_segments_0[k]): + sorted_line_segments.append(sorted_line_segments_0[k]) + + + line_pair_found = False + + if len(sorted_line_segments) > 1: + for j in range(1, len(sorted_line_segments)): + for i in range(j): + if sorted_line_segments[i].disjoint(sorted_line_segments[j]) and line_pair_found == False: + line_pair = [i,j] + line_pair_found = True + + + if line_pair_found==True: + + new_holes = [] + + first_split = shapely.ops.split(hole, sorted_line_segments[line_pair[0]]) + + # First_split should contain two pieces, one of which also intersects the second line. + # Find that piece, and split it with the second line. + + for k in range(len(first_split.geoms)): + if first_split.geoms[k].intersects(sorted_line_segments[line_pair[1]]): + second_split = shapely.ops.split(first_split.geoms[k], sorted_line_segments[line_pair[1]]) + else: + new_holes.append(first_split.geoms[k]) + + # Find the piece in second_split that also touches the first line. + # THIS WILL BE OUR PIECE TO ADJOIN TO ONE OF THE TWO POLYGONS! + + + for k in range(len(second_split.geoms)): + if second_split.geoms[k].intersects(sorted_line_segments[line_pair[0]]): + piece_to_connect = second_split.geoms[k] + poly_pair_found = True # SUCCESS!! 
+ else: + new_holes.append(second_split.geoms[k]) + + if poly_pair_found == True: + if piece_to_connect.intersection(hole_boundaries_df["geometry"][polys_to_connect[0]]).length > piece_to_connect.intersection(hole_boundaries_df["geometry"][polys_to_connect[1]]).length: + poly_to_add_to = hole_boundaries_df["target"][polys_to_connect[0]] + elif piece_to_connect.intersection(hole_boundaries_df["geometry"][polys_to_connect[0]]).length < piece_to_connect.intersection(hole_boundaries_df["geometry"][polys_to_connect[1]]).length: + poly_to_add_to = hole_boundaries_df["target"][polys_to_connect[1]] + + else: #Try again with a different polygon pair + distance_data_sorted.remove(distance_data_sorted[0]) + + + + + if poly_pair_found == False: # Not much to do at this point except join by shared + # perimeter and hope for the best! (Usually these will + # be small enough that they'll get converted to queen + # adjacencies eventually anyway.) + shared_perimeters = [] + + for b_ind in hole_boundaries_df.index: + g_ind = hole_boundaries_df["target"][b_ind] + shared_perimeters.append((b_ind, hole_boundaries_df["geometry"][b_ind].length)) + + max_shared_perim = sorted(shared_perimeters, key=lambda tup: tup[1])[-1] + + poly_to_add_to = hole_boundaries_df["target"][max_shared_perim[0]] + piece_to_connect = hole + new_holes = [] + + return poly_to_add_to, piece_to_connect, new_holes + + +# Convert all rook adjacencies with boundary length less than min_rook_length to queen adjacencies + +def small_rook_to_queen(geometries0_df, min_rook_length): + + geometries_df = geometries0_df.copy() + + adj_df = adjacencies(geometries_df, output_type = "geodataframe") # We're assuming the input is clean, so these should all be 1-D or less + + + for ind in adj_df.index: + if adj_df["geometry"][ind].geom_type == "GeometryCollection": + adj_list = list(adj_df["geometry"][ind].geoms) + adj_list_no_point = [x for x in adj_list if x.geom_type != "Point"] + adj_df["geometry"][ind] = MultiLineString(adj_list_no_point) + + if adj_df["geometry"][ind].geom_type == "MultiLineString": + adj_df["geometry"][ind] = linemerge(adj_df["geometry"][ind]) + + adj_df = adj_df.explode(index_parts=False).reset_index(drop=True) + + for ind in adj_df.index: + if adj_df["geometry"][ind].geom_type == "Point": + adj_df = adj_df.drop(ind) + + + # Add column for boundary length and pick off the small ones: + + adj_df["boundary length"] = adj_df["geometry"].length + + small_adj_df = adj_df[adj_df["boundary length"] < min_rook_length] + + + for a_ind in small_adj_df.index: # Don't put a progress bar here; it gets out of control! + + # Make sure we haven't inadvertently killed off this adjacency while we were fixing others! + + geoms_for_this_adj = list(small_adj_df["neighbors"][a_ind]) + adj_len = geometries_df["geometry"][geoms_for_this_adj[0]].intersection(geometries_df["geometry"][geoms_for_this_adj[1]]).length + + if adj_len > 0 and adj_len < min_rook_length: + + # Rebuild the STRtree every time because the geometries will change every time. + g_spatial_index = STRtree(geometries_df["geometry"]) # Build STRtree for the main geometries + g_index_by_iloc = dict((i, list(geometries_df.index)[i]) for i in range(len(geometries_df))) # Build indexing dictionary + + # Build a disk enclosing this adjacency; the idea will be to cut it out and replace it + # with a "pie chart" so that all polygons touching this disk meet at a queen adjacency + # point at the center of the disk. 
+            adj_diam = small_adj_df["geometry"][a_ind].length
+            fat_point_radius = 0.6*adj_diam  # slightly more than the radius from the midpoint to the endpoints
+
+            adjacency_points = [x for x in small_adj_df["geometry"][a_ind].coords]
+            endpoint1 = adjacency_points[0]
+            endpoint2 = adjacency_points[-1]
+            midpoint = LineString([endpoint1, endpoint2]).centroid
+            midpoint_coords = midpoint.coords[0]
+
+            disk_to_remove = midpoint.buffer(fat_point_radius)
+
+            # Identify geometries that might intersect this disk.
+            possible_geom_integer_indices = [*set(numpy.ndarray.flatten(g_spatial_index.query(small_adj_df["geometry"][a_ind].buffer(2*fat_point_radius))))]
+            possible_geom_indices = [g_index_by_iloc[k] for k in possible_geom_integer_indices]
+
+            # Use the boundaries of these geometries together with the boundary of the disk to
+            # polygonize and divide geometries into pieces inside and outside the disk.
+            boundaries = [geometries_df["geometry"][i].boundary for i in possible_geom_indices]
+            boundaries.append(LineString(list(disk_to_remove.exterior.coords)))
+
+            boundaries_union = unary_union(boundaries)
+
+            pieces_df = geopandas.GeoDataFrame(columns = ["polygon indices"],
+                geometry = geopandas.GeoSeries([geom for geom in polygonize(boundaries_union)]),
+                crs = geometries_df.crs)
+
+            # Associate the pieces to the main geometries. (Note that if there are holes, some pieces may
+            # be unassigned.)
+            for i in pieces_df.index:
+                pieces_df["polygon indices"][i] = set()
+
+            for i in pieces_df.index:
+                temp_possible_geom_integer_indices = [*set(numpy.ndarray.flatten(g_spatial_index.query(pieces_df["geometry"][i])))]
+                temp_possible_geom_indices = [g_index_by_iloc[k] for k in temp_possible_geom_integer_indices]
+
+                for j in temp_possible_geom_indices:
+                    if pieces_df["geometry"][i].representative_point().distance(geometries_df["geometry"][j]) == 0:
+                        pieces_df["polygon indices"][i] = pieces_df["polygon indices"][i].union({j})
+
+
+            # Now rebuild the disk from the pieces that are inside the circle, and drop them from
+            # pieces_df. Then we'll give the pieces outside the circle back to the geometries that they came from.
+
+            disk_to_remove = Polygon()
+
+            for p_ind in pieces_df.index:
+                if pieces_df["geometry"][p_ind].representative_point().distance(midpoint) < fat_point_radius:
+                    disk_to_remove = unary_union([disk_to_remove, pieces_df["geometry"][p_ind]])
+                    pieces_df = pieces_df.drop(p_ind)
+
+            for g_ind in possible_geom_indices:
+                geometries_df["geometry"][g_ind] = Polygon()
+
+            for p_ind in pieces_df.index:
+                if len(pieces_df["polygon indices"][p_ind]) == 1:  # Note that it won't be >1 if the file is clean!
+                    this_poly_ind = list(pieces_df["polygon indices"][p_ind])[0]
+                    this_piece = pieces_df["geometry"][p_ind]
+                    if this_poly_ind in possible_geom_indices:
+                        # This check is needed because the geometries in possible_geom_indices can form a
+                        # non-simply-connected region, in which case the interior holes - which may consist
+                        # of multiple precincts each - may be assigned someplace they shouldn't be!
+                        geometries_df["geometry"][this_poly_ind] = unary_union([geometries_df["geometry"][this_poly_ind], this_piece])
-    Set `relative_threshold=None` to attempt to resolve all overlaps. Due
- """ - geometries = get_geometries(source).simplify(0).buffer(0) - - source_union = unary_union(geometries) - - leftover_geometries = get_geometries(target).apply(lambda x: x - source_union) - leftover_geometries = leftover_geometries[~leftover_geometries.is_empty].explode() - - geometries = absorb_by_shared_perimeter( - leftover_geometries, get_geometries(source), relative_threshold=None - ) - - return geometries +# Changing assertions to truth checks so doctor can give a complete report instead of +# stopping at the first problem. def doctor(source, target=None): """ Detects quality issues in a given set of source and target geometries. Quality @@ -197,33 +964,86 @@ def doctor(source, target=None): shapefiles = [source] source_union = unary_union(get_geometries(source)) +# Adding "health_check" variable to return instead of using assertions. + health_check = True + if target is not None: shapefiles.append(target) target_union = unary_union(get_geometries(target)) sym_area = target_union.symmetric_difference(source_union).area - assert sym_area == 0, "The unions of target and source differ!" + if sym_area != 0: + print("The unions of target and source differ!") + health_check = False +# assert sym_area == 0, "The unions of target and source differ!" for shp in shapefiles: - assert shp["geometry"].apply(lambda x: isinstance(x, (Polygon, MultiPolygon))).all(), "Some rows do not have geometries" +# Changing "shp["geometry"]" to "shp.geometry" so it will work for both GeoSeries and +# GeoDataFrames,and also changing the assertion behavior. + if not shp.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon))).all(): + print("Some rows do not have geometries.") + health_check = False +# assert shp["geometry"].apply(lambda x: isinstance(x, (Polygon, MultiPolygon))).all(), "Some rows do not have geometries" overlaps = count_overlaps(shp) holes = len(holes_of_union(shp)) - assert overlaps == 0, f"There are {overlaps} overlaps!" - assert holes == 0, f"There are {holes} holes!" - assert shp.is_valid.all(), "There are some invalid geometries!" + if overlaps != 0: + print("There are", overlaps, "overlaps.") + health_check = False + if holes != 0: + print("There are", holes, "holes.") + health_check = False + if not shp.is_valid.all(): + print("There are some invalid geometries.") + health_check = False + + +# assert overlaps == 0, f"There are {overlaps} overlaps!" +# assert holes == 0, f"There are {holes} holes!" +# assert shp.is_valid.all(), "There are some invalid geometries!" + + return health_check +# return True - return True def count_overlaps(shp): """ Counts overlaps. Code is taken directly from the resolve_overlaps function in maup. """ - inters = adjacencies(shp["geometry"], warn_for_islands=False, warn_for_overlaps=False) - overlaps = inters[inters.area > 0].buffer(0) +# Changing "shp["geometry"]" to "shp.geometry" so it will work for both GeoSeries and +# GeoDataFrames: + inters = adjacencies(shp.geometry, warn_for_islands=False, warn_for_overlaps=False) +# inters = adjacencies(shp["geometry"], warn_for_islands=False, warn_for_overlaps=False) +# Changing buffer(0) to make_valid: + overlaps = make_valid(inters[inters.area > 0]) +# overlaps = inters[inters.area > 0].buffer(0) return len(overlaps) + + +# Adding an analogous function "count_holes" that can be used for reporting in an +# expanded version of doctor. 
+
def count_holes(shp):
+    gaps = holes_of_union(shp.geometry)
+    return len(gaps)
+
+
+def snap_to_grid(geometries, n=-7):
+    """
+    Snap the geometries to a grid by rounding to the nearest 10^n. Helps to
+    resolve floating point precision issues in shapefiles.
+    """
+    func = functools.partial(snap_polygon_to_grid, n=n)
+# Changing "geometries["geometry"]" to "geometries.geometry" so it will work for both
+# GeoSeries and GeoDataFrames:
+    return geometries.geometry.apply(lambda x: apply_func_to_polygon_parts(x, func))
+# return geometries["geometry"].apply(lambda x: apply_func_to_polygon_parts(x, func))
+
+
+def snap_polygon_to_grid(polygon, n=-7):
+    return Polygon([(round(x, -n), round(y, -n)) for x, y in polygon.exterior.coords])
+
 
 def apply_func_to_polygon_parts(shape, func):
     if isinstance(shape, Polygon):
@@ -233,53 +1053,4 @@
     else:
         raise TypeError(f"Can only apply {func} to a Polygon or MultiPolygon (got {shape} with type {type(shape)})!")
 
-def dedup_vertices(polygon):
-    deduped_vertices = []
-    for c, p in enumerate(list(polygon.exterior.coords)):
-        if c == 0:
-            deduped_vertices.append(p)
-        elif p != deduped_vertices[-1]:
-            deduped_vertices.append(p)
-
-    return Polygon(deduped_vertices)
-
-def snap_polygon_to_grid(polygon, n=-7):
-    return Polygon([(round(x, -n), round(y, -n)) for x, y in polygon.exterior.coords])
-
-def split_by_level(series, multiindex):
-    return tuple(
-        multiindex.get_level_values(i).to_series(index=multiindex).map(series)
-        for i in range(multiindex.nlevels)
-    )
-
-
-@require_same_crs
-def absorb_by_shared_perimeter(sources, targets, relative_threshold=None):
-    if len(sources) == 0:
-        return targets
-
-    if len(targets) == 0:
-        raise IndexError("targets must be nonempty")
-
-    inters = intersections(sources, targets, area_cutoff=None).buffer(0)
-    assignment = assign_to_max(inters.length)
-
-    if relative_threshold is not None:
-        under_threshold = (
-            sources.area / assignment.map(targets.area)
-        ) < relative_threshold
-        assignment = assignment[under_threshold]
-
-    sources_to_absorb = GeoSeries(
-        sources.groupby(assignment).apply(unary_union), crs=sources.crs,
-    )
-
-    result = targets.union(sources_to_absorb)
-
-    # The .union call only returns the targets who had a corresponding
-    # source to absorb. Now we fill in all of the unchanged targets.
-    result = result.reindex(targets.index)
-    did_not_absorb = result.isna() | result.is_empty
-    result.loc[did_not_absorb] = targets[did_not_absorb]
-    return result
diff --git a/maup/repair_old.py b/maup/repair_old.py
new file mode 100644
index 0000000..5ff8131
--- /dev/null
+++ b/maup/repair_old.py
@@ -0,0 +1,378 @@
+import math
+import pandas
+import functools
+import warnings
+
+from geopandas import GeoSeries, GeoDataFrame
+from shapely.geometry import MultiPolygon, Polygon
+from shapely.ops import unary_union
+# from shapely.validation import make_valid # currently in alpha
+# UPDATE: make_valid exists in shapely 2.0.1 with minimum GEOS 3.8.0,
+# so import it here:
+from shapely import make_valid
+
+from .adjacencies import adjacencies
+from .assign import assign_to_max
+from .crs import require_same_crs
+from .indexed_geometries import get_geometries
+from .intersections import intersections
+
+
+"""
+Some of these functions are based on the functions in Mary Barker's
+check_shapefile_connectivity.py script in @gerrymandr/Preprocessing. 
+""" + +class AreaCroppingWarning(UserWarning): + pass + + +def holes_of_union(geometries): + """Returns any holes in the union of the given geometries.""" + geometries = get_geometries(geometries) + if not all( + isinstance(geometry, (Polygon, MultiPolygon)) for geometry in geometries + ): +# Changed "type(x) to "x.geom_type" + raise TypeError(f"Must be a Polygon or MultiPolygon (got types {set([x.geom_type for x in geometries])})!") + + union = unary_union(geometries) + series = holes(union) + series.crs = geometries.crs + return series + + +def holes(geometry): + if isinstance(geometry, MultiPolygon): + return GeoSeries( + [ + Polygon(list(hole.coords)) + for polygon in geometry.geoms + for hole in polygon.interiors + ] + ) + elif isinstance(geometry, Polygon): + return GeoSeries([Polygon(list(hole.coords)) for hole in geometry.interiors]) + else: + raise TypeError("geometry must be a Polygon or MultiPolygon to have holes") + + + + +def close_gaps_old(geometries, relative_threshold=0.1): + """Closes gaps between geometries by assigning the hole to the polygon + that shares the most perimeter with the hole. + + If the area of the gap is greater than `relative_threshold` times the + area of the polygon, then the gap is left alone. The default value + of `relative_threshold` is 0.1. This is intended to preserve intentional + gaps while closing the tiny gaps that can occur as artifacts of + geospatial operations. Set `relative_threshold=None` to attempt close all + gaps. Due to floating point precision issues, all gaps may not be closed. + """ + geometries = get_geometries(geometries) + gaps = holes_of_union(geometries) + return absorb_by_shared_perimeter( + gaps, geometries, relative_threshold=relative_threshold + ) + + +def resolve_overlaps_old(geometries, relative_threshold=0.1): + """For any pair of overlapping geometries, assigns the overlapping area to the + geometry that shares the most perimeter with the overlap. Returns the GeoSeries + of geometries, which will have no overlaps. + + If the ratio of the overlap's area to either of the overlapping geometries' + areas is greater than `relative_threshold`, then the overlap is ignored. + The default `relative_threshold` is `0.1`. This default is chosen to include + tiny overlaps that can be safely auto-fixed while preserving major overlaps + that might indicate deeper issues and should be handled on a case-by-case + basis. Set `relative_threshold=None` to attempt to resolve all overlaps. Due + to floating point precision issues, all overlaps may not be resolved. + """ + geometries = get_geometries(geometries) + inters = adjacencies(geometries, warn_for_islands=False, warn_for_overlaps=False) +# Changing buffer(0) to make_valid + overlaps = make_valid(inters[inters.area > 0]) + + if relative_threshold is not None: + left_areas, right_areas = split_by_level(geometries.area, overlaps.index) + under_threshold = ((overlaps.area / left_areas) < relative_threshold) & ( + (overlaps.area / right_areas) < relative_threshold + ) + overlaps = overlaps[under_threshold] + + if len(overlaps) == 0: + return geometries + + to_remove = GeoSeries( + pandas.concat([overlaps.droplevel(1), overlaps.droplevel(0)]), crs=overlaps.crs + ) + with_overlaps_removed = geometries.apply(lambda x: x.difference(unary_union(to_remove))) + + return absorb_by_shared_perimeter( + overlaps, with_overlaps_removed, relative_threshold=None + ) + +def autorepair_old(geometries, relative_threshold=0.1): + """ + Applies all the tricks in `maup.repair` with default args. 
Should work by default. + The default `relative_threshold` is `0.1`. This default is chosen to include + tiny overlaps that can be safely auto-fixed while preserving major overlaps + that might indicate deeper issues and should be handled on a case-by-case + basis. Set `relative_threshold=None` to attempt to resolve all overlaps. See + `resolve_overlaps()` and `close_gaps()` for more. + """ +# Edited to allow for the possibility that input is a GeoSeries instead of a +# GeoDataFrame; this relies on all the functions called here having been similarly +# edited. + geometries = get_geometries(geometries) + + geometries = remove_repeated_vertices(geometries) + geometries = make_valid(geometries) + geometries = resolve_overlaps(geometries, relative_threshold=relative_threshold) + geometries = make_valid(geometries) + geometries = close_gaps(geometries, relative_threshold=relative_threshold) + geometries = make_valid(geometries) + + return geometries + +# shp = geometries.copy() + +# shp["geometry"] = remove_repeated_vertices(shp) +# shp["geometry"] = make_valid(shp) +# shp["geometry"] = resolve_overlaps(shp, relative_threshold=relative_threshold) +# shp["geometry"] = make_valid(shp) +# shp["geometry"] = close_gaps(shp, relative_threshold=relative_threshold) +# shp["geometry"] = make_valid(shp) + +# return shp["geometry"] + + +# This function is no longer needed; shapely 2's make_valid function applies +# equally well to single geometries or GeoSeries, and keeping it while importing +# shapely's version creates a name conflict. +# When applied to a GeoDataFrame, shapely.make_valid +# returns the GeoSeries consisting of the validated geometries of the +# GeoData Frame, so the current behavior will be retained for GeoDataFrames +# and expanded to work as well on GeoSeries and single geometries. + +# def make_valid(geometries): +# """ +# Applies the shapely .buffer(0) and make_valid (once released) trick to all +# geometries. Should help resolve various topological issues in shapefiles. +# return geometries["geometry"].simplify(0).buffer(0) +# # return geometries["geometry"].apply(lambda x: make_valid(x)) + + +def remove_repeated_vertices(geometries): + """ + Removes repeated vertices. Vertices are considered to be repeated if they + appear consecutively, excluding the start and end points. + """ +# Changing "geometries["geometry"]" to "geometries.geometry" so it will work for both +# GeoSeries and GeoDataFrames: + return geometries.geometry.apply(lambda x: apply_func_to_polygon_parts(x, dedup_vertices)) +# return geometries["geometry"].apply(lambda x: apply_func_to_polygon_parts(x, dedup_vertices)) + +def snap_to_grid(geometries, n=-7): + """ + Snap the geometries to a grid by rounding to the nearest 10^n. Helps to + resolve floating point precision issues in shapefiles. + """ + func = functools.partial(snap_polygon_to_grid, n=n) +# Changing "geometries["geometry"]" to "geometries.geometry" so it will work for both +# GeoSeries and GeoDataFrames: + return geometries.geometry.apply(lambda x: apply_func_to_polygon_parts(x, func)) +# return geometries["geometry"].apply(lambda x: apply_func_to_polygon_parts(x, func)) + +@require_same_crs +def crop_to(source, target): + """ + Crops the source geometries to the target geometries. 
+ """ + target_union = unary_union(get_geometries(target)) + cropped_geometries = get_geometries(source).apply(lambda x: x.intersection(target_union)) + + if (cropped_geometries.area == 0).any(): + warnings.warn("Some cropped geometries have zero area, likely due to\n"+ + "large differences in the union of the geometries in your\n"+ + "source and target shapefiles. This may become an issue\n"+ + "when maupping.\n", + AreaCroppingWarning + ) + + return cropped_geometries + +@require_same_crs +def expand_to(source, target): + """ + Expands the source geometries to the target geometries. + """ +# Changing simplify(0).buffer(0) to make_valid: + geometries = make_valid(get_geometries(source)) +# geometries = get_geometries(source).simplify(0).buffer(0) + + source_union = unary_union(geometries) + + leftover_geometries = get_geometries(target).apply(lambda x: x - source_union) +# Added index_parts=False option to explode() to accommodate a FutureWarning in +# geopandas + leftover_geometries = leftover_geometries[~leftover_geometries.is_empty].explode(index_parts=False) + + geometries = absorb_by_shared_perimeter( + leftover_geometries, get_geometries(source), relative_threshold=None + ) + + return geometries + + + +# Changing assertions to truth checks so doctor can give a complete report instead of +# stopping at the first problem. +def doctor(source, target=None): + """ + Detects quality issues in a given set of source and target geometries. Quality + issues include overlaps, gaps, invalid geometries, repeated verticies, non-perfect + tiling, and not entirely overlapping source and targets. If `maup.doctor()` returns + `True`, votes should not be lost when prorating or assigning (beyond a few due to + rounding, etc.). Passing a target to doctor is optional. + """ + shapefiles = [source] + source_union = unary_union(get_geometries(source)) + +# Adding "health_check" variable to return instead of using assertions. + health_check = True + + if target is not None: + shapefiles.append(target) + + target_union = unary_union(get_geometries(target)) + sym_area = target_union.symmetric_difference(source_union).area + + if sym_area != 0: + print("The unions of target and source differ!") + health_check = False +# assert sym_area == 0, "The unions of target and source differ!" + + for shp in shapefiles: +# Changing "shp["geometry"]" to "shp.geometry" so it will work for both GeoSeries and +# GeoDataFrames,and also changing the assertion behavior. + if not shp.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon))).all(): + print("Some rows do not have geometries.") + health_check = False +# assert shp["geometry"].apply(lambda x: isinstance(x, (Polygon, MultiPolygon))).all(), "Some rows do not have geometries" + + overlaps = count_overlaps(shp) + holes = len(holes_of_union(shp)) + + if overlaps != 0: + print("There are", overlaps, "overlaps.") + health_check = False + if holes != 0: + print("There are", holes, "holes.") + health_check = False + if not shp.is_valid.all(): + print("There are some invalid geometries.") + health_check = False + + +# assert overlaps == 0, f"There are {overlaps} overlaps!" +# assert holes == 0, f"There are {holes} holes!" +# assert shp.is_valid.all(), "There are some invalid geometries!" + + return health_check +# return True + +def count_overlaps(shp): + """ + Counts overlaps. Code is taken directly from the resolve_overlaps function in maup. 
+ """ +# Changing "shp["geometry"]" to "shp.geometry" so it will work for both GeoSeries and +# GeoDataFrames: + inters = adjacencies(shp.geometry, warn_for_islands=False, warn_for_overlaps=False) +# inters = adjacencies(shp["geometry"], warn_for_islands=False, warn_for_overlaps=False) +# Changing buffer(0) to make_valid: + overlaps = make_valid(inters[inters.area > 0]) +# overlaps = inters[inters.area > 0].buffer(0) + return len(overlaps) + + +# Adding an analogous function "count_holes" that can be used for reporting in an +# expanded version of doctor. + +def count_holes(shp): + gaps = holes_of_union(shp.geometry) + return(len(gaps)) + + + + +def apply_func_to_polygon_parts(shape, func): + if isinstance(shape, Polygon): + return func(shape) + elif isinstance(shape, MultiPolygon): + return MultiPolygon([func(poly) for poly in shape.geoms]) + else: + raise TypeError(f"Can only apply {func} to a Polygon or MultiPolygon (got {shape} with type {type(shape)})!") + +def dedup_vertices(polygon): + deduped_vertices = [] + for c, p in enumerate(list(polygon.exterior.coords)): + if c == 0: + deduped_vertices.append(p) + elif p != deduped_vertices[-1]: + deduped_vertices.append(p) + + return Polygon(deduped_vertices) + +def snap_polygon_to_grid(polygon, n=-7): + return Polygon([(round(x, -n), round(y, -n)) for x, y in polygon.exterior.coords]) + +def split_by_level(series, multiindex): + return tuple( + multiindex.get_level_values(i).to_series(index=multiindex).map(series) + for i in range(multiindex.nlevels) + ) + + +@require_same_crs +def absorb_by_shared_perimeter(sources, targets, relative_threshold=None): + if len(sources) == 0: + return targets + + if len(targets) == 0: + raise IndexError("targets must be nonempty") + + +# Changing buffer(0) to make_valid + inters = make_valid(intersections(sources, targets, area_cutoff=None)) +# inters = intersections(sources, targets, area_cutoff=None).buffer(0) + assignment = assign_to_max(inters.length) + + if relative_threshold is not None: + under_threshold = ( + sources.area / assignment.map(targets.area) + ) < relative_threshold + assignment = assignment[under_threshold] + + sources_to_absorb = GeoSeries( + sources.groupby(assignment).apply(unary_union), crs=sources.crs, + ) + +# Note: this line produces a warning message when sources_to_absorb and targets +# have different indices: +# lib/python3.11/site-packages/geopandas/base.py:31: UserWarning: The indices of the two +# GeoSeries are different. +# warn("The indices of the two GeoSeries are different.") +# This difference in indices is expected since not all target geometries may have sources +# to absorb, so it would be nice to remove this warning. + result = targets.union(sources_to_absorb) + + # The .union call only returns the targets who had a corresponding + # source to absorb. Now we fill in all of the unchanged targets. + result = result.reindex(targets.index) + did_not_absorb = result.isna() | result.is_empty + result.loc[did_not_absorb] = targets[did_not_absorb] + + return result From adf52c6ab5b1ef2fb8d6b3f639d792a32491f8d0 Mon Sep 17 00:00:00 2001 From: "Parker J. 
Rule" Date: Mon, 17 Jul 2023 23:58:55 -0400 Subject: [PATCH 02/15] Use old repair logic --- maup/repair.py | 1120 +++++++++----------------------------------- maup/repair_old.py | 378 --------------- 2 files changed, 221 insertions(+), 1277 deletions(-) delete mode 100644 maup/repair_old.py diff --git a/maup/repair.py b/maup/repair.py index 3dbb863..5ff8131 100644 --- a/maup/repair.py +++ b/maup/repair.py @@ -1,896 +1,30 @@ import math import pandas -import geopandas -from geopandas import GeoSeries, GeoDataFrame - import functools +import warnings -import numpy - -import shapely +from geopandas import GeoSeries, GeoDataFrame +from shapely.geometry import MultiPolygon, Polygon +from shapely.ops import unary_union +# from shapely.validation import make_valid # currently in alpha +# UPDATE: make_valid exists in shapely 2.0.1 with minimum GEOS 3.8.0, +# so import it here: from shapely import make_valid -from shapely.ops import unary_union, split, polygonize, linemerge -from shapely.geometry import Polygon, MultiPolygon, Point, MultiPoint, LineString, MultiLineString -from shapely.strtree import STRtree from .adjacencies import adjacencies from .assign import assign_to_max from .crs import require_same_crs from .indexed_geometries import get_geometries from .intersections import intersections -from .progress_bar import progress - -from itertools import combinations -from math import sqrt - -import warnings; warnings.filterwarnings('ignore', 'GeoSeries.isna', UserWarning) - -pandas.options.mode.chained_assignment = None - """ -Here we define a function "autorepair" that takes a GeoDataFrame and: - -(1) Resolves all overlaps. - -(2) (optionally) Closes all gaps; default is fill_holes = True. - -(3) (optionally) Replaces all rook adjacencies with length below a - user-specified threshold with queen adjacencies. Default is - min_rook_length = None; this step will be skipped unless a - length threshold is provided. - -Note that (2) and (3) are optional but (1) is not! The procedure for (1) -produces a clean tiling of the region that is necessary for the procedures -in (2) and (3) to work properly. (If the region is already free of overlaps, -Step (1) won't change anything.) - -The algorithm differs significantly from that the original maup version and -avoids the small floating point errors that tend to occur there. -(HOW, I HEAR YOU ASK???) - -Here's what happens in the original maup version (or really in shapely): -When two line segments L1, L2 intersect, their point of intersection P is a -rational function of the coordinates of their endpoints. In general, these -coordinates don't have an exact floating point expression and so must be -approximated. (In this case, if you ask shapely whether P is contained in -either L1 or L2, it will tell you no!) So P becomes a new vertex on the -polygon that defines the overlap/hole, but it is not (quite) contained in the -boundaries of the polygons whose intersection created it. This discrepancy -is what ends up leaving behind small overlaps even when they have all -theoretically been removed. - -Additionally, shapely has a rare (but not rare enough!) bug where computing -the difference between a large polygon and a very small one completely (or -almost completely) obliterates the large one. THIS IS NOT OKAY! -(Edit: I don't know whether this bug has been resolved in shapely 2.) 
- -In order to avoid both of these issues, these procedures have been completely -redesigned so that no intersections or differences between polygons are ever -used during the construction process. (They are, however, used to identify -adjacencies; I don't think there's any way around that.) -1-dimensional intersections between polygon *boundaries* ARE used, but not -transverse intersections, and not in any way that looks for any interior -intersection points between the endpoints of line segments, so -the floating point approximation issue doesn't come up. - -The result is a completely clean GeoDataFrame with NO overlaps, gaps, or rook -adjacencies below the length threshold! If unusual topology interferes with -any of this, warning messages will alert the user to specific places that -should be looked at with human eyes. - -Brief sketch of the new overlap-resolution algorithm: - -(1) Take the unary union of all the polygon boundaries. This produces a - huge MultiLineString, and the unary_union function does the VERY slick - trick of adding those (approximate) points of intersection to the - LineStrings that should contain them. (THIS IS HOW THE MAGIC HAPPENS!) - -(2) Polygonize this structure. This will produce a set of polygons ("pieces") - that give a clean geometric partition of the region, with no gaps or - overlaps between them. Every polygon in this set is the intersection of - zero or more polygons from the original GeoDataFrame, with very small - adjustments resulting from the addition of intersection points to the - polygon boundaries. (Pieces where zero polygons "intersect" correspond - to holes in the original geometry.) - -(3) Choose a representative point in the interior of each piece and identify - all the original polygons that contain it. (Using a representative point - rather than checking for full containment avoids errors that would result - from the polygons having changed slightly!) This will allow us to identify - exactly which polygons from the original geometry intersected to produce - that piece. - -(4) Re-organize the info in (3) into separate GeoDataFrames according to the - degree of the overlap. This will produce a list of GeoDataFrames, one for - intersections of each degree that occurs (including degree 1), plus another - GeoDataFrame consisting of all the holes. - -(5) Rebuild the polygons from the pieces, using the data in (4). Overlaps are - assigned to parent polygons, first to resolve any connectivity issues, and - then by largest shared perimeter. - - -Brief sketch of the new gap-closing algorithm: - -(1) For any hole whose boundary touches 3 or fewer polygons, attach it to the - polygon with which it shares the greatest perimeter. (Doesn't really matter; - in practice holes have very small area, and adjacency relationships won't be - affected regardless of where we attach it.) - -(2) For a hole whose boundary touches 4 or more polygons, find the pair of - non-adjacent polygons with the shortest distance between them. Cut out a - piece of the hole and use it to attach these two polygons with a boundary - segment of positive length. (This is a "best guess" for when two polygons - on opposite sides of a long, thin gap should really be adjacent.) - This leaves behind two new holes, each with strictly fewer boundary - polygons than the original. - -(3) Repeat until all holes are filled. - - -Brief sketch of the new rook-to-queen algorithm: - -Let L be a linestring with length below the specified threshold. 
- -(1) Construct a small disk D that contains the linestring within its interior. - Find all polygons P_1, ..., P_k that intersect D. - -(2) Take the unary_union of all the boundaries of P_1, ..., P_k, and D, and - polygonize this structure. Assign all pieces outside of D to the polygons - that they came from, and assign the pieces inside D to D. (This is - equivalent to taking the difference of each polygon minus D, except that - it works cleanly and reliably!) - -(3) For each of the polygons whose boundary (now) contains an arc of D's boundary - circle, create a "pie wedge" inside D subtending this arc, and join it to - the polygon that it shares a boundary with. - +Some of these functions are based on the functions in Mary Barker's +check_shapefile_connectivity.py script in @gerrymandr/Preprocessing. """ - - - - -# Some useful short functions: - -# Count the number of connected components of a shapely object: -def num_components(geom): - if geom.is_empty: - return 0 - elif (geom.geom_type == "Polygon") or (geom.geom_type == "Point") or (geom.geom_type == "LineString"): - return 1 - elif (geom.geom_type == "MultiPolygon") or (geom.geom_type == "MultiLineString") or (geom.geom_type == "GeometryCollection"): - return len(geom.geoms) - - - - -""" -MAIN FUNCTION -""" - -def autorepair(geometries0_df, fill_gaps = True, min_rook_length = None, - holes_to_keep = None): - - if isinstance(geometries0_df, geopandas.GeoDataFrame) == False: - raise TypeError(f"Input must be a GeoDataFrame!") - - geometries_df = geometries0_df.copy() - - # Ensure that geometries are 2-D and not 3-D! - for i in geometries_df.index: - geometries_df["geometry"][i] = shapely.wkb.loads( - shapely.wkb.dumps(geometries_df["geometry"][i], output_dimension=2)) - - # Warn if crs is geographic: - if geometries_df.crs is not None: - if geometries_df.crs.is_geographic: - warnings.warn("Geometry is in a geographic CRS. Results from 'length' are likely incorrect. Use 'GeoSeries.to_crs()' to re-project geometries to a projected CRS before this operation.") - - # If holes_to_keep and/or overlaps_to_keep are present, require them to have the - # same CRS as the main shapefile. - if holes_to_keep is not None: - if holes_to_keep.crs != geometries_df.crs: - raise Exception("holes_to_keep must be in the same CRS as the shapefile being repaired.") - - - - # Construct data about overlaps of all orders, plus holes. - overlap_tower, holes_df = building_blocks(geometries_df) - - # Use data from the overlap tower to rebuild precincts with no overlaps. - print("Resolving overlaps...") - reconstructed_df = reconstruct_from_overlap_tower(geometries_df, overlap_tower) - - # Use data about the holes to fill holes if desired. - if fill_gaps == True: - print("Closing gaps...") - reconstructed_df = close_gaps(reconstructed_df, holes_df) - - - # Check for precincts that have become (more) disconnected, generally with an extra - # component of negligible area. If any are found and the area is negligible, - # reassign to an adjacent precinct by shared perimeter. - # If the area is not negligible, leave it alone and report it so that a human - # can decide what to do about it! - - disconnected_df = reconstructed_df[reconstructed_df["geometry"].apply(lambda x: x.geom_type != "Polygon")] - # This will include precincts that were disconnected in the original; need to - # filter by whether they got worse. 
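A toy illustration (hypothetical geometries, not from any shapefile) of the
num_components comparison that drives this disconnection check:

    from shapely.geometry import Polygon, MultiPolygon

    before = Polygon([(0, 0), (2, 0), (2, 1), (0, 1)])   # one connected piece
    after = MultiPolygon([                               # repair left two pieces behind
        Polygon([(0, 0), (0.9, 0), (0.9, 1), (0, 1)]),
        Polygon([(1.1, 0), (2, 0), (2, 1), (1.1, 1)]),
    ])

    print(num_components(before), num_components(after))    # 1 2
    print(num_components(after) > num_components(before))   # True, i.e. "got worse"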
-
-    if len(disconnected_df) > 0:
-        disconnected_poly_indices = []
-        for ind in disconnected_df.index:
-            if num_components(reconstructed_df["geometry"][ind]) > num_components(geometries0_df["geometry"][ind]):
-                disconnected_poly_indices.append(ind)
-
-        if len(disconnected_poly_indices) > 0: # These are the ones (if any) that got worse.
-            geometries = get_geometries(reconstructed_df)
-            spatial_index = STRtree(geometries)
-            index_by_iloc = dict((i, list(geometries.index)[i]) for i in range(len(geometries.index))) # Build indexing dictionary
-
-            for g_ind in disconnected_poly_indices:
-                excess = num_components(reconstructed_df["geometry"][g_ind]) - num_components(geometries0_df["geometry"][g_ind])
-
-                component_num_list = [x for x in range(len(reconstructed_df["geometry"][g_ind].geoms))]
-                component_areas = []
-
-                for c_ind in range(len(reconstructed_df["geometry"][g_ind].geoms)):
-                    component_areas.append((c_ind, reconstructed_df["geometry"][g_ind].geoms[c_ind].area))
-
-                component_areas_sorted = sorted(component_areas, key=lambda tup: tup[1])
-
-                big_area = max([reconstructed_df["geometry"][g_ind].area, geometries0_df["geometry"][g_ind].area])
-
-                for i in range(excess): # Check that the ith smallest component has small enough area, and find a better polygon to add it to.
-                    c_ind = component_areas_sorted[i][0]
-                    this_fragment = reconstructed_df["geometry"][g_ind].geoms[c_ind]
-                    if component_areas_sorted[i][1] < 0.0001*big_area: # Less than 0.01%
-                        component_num_list.remove(c_ind) # Tells us to take out this component later
-                        possible_intersect_integer_indices = [*set(numpy.ndarray.flatten(spatial_index.query(this_fragment)))]
-                        possible_intersect_indices = [(index_by_iloc[k]) for k in possible_intersect_integer_indices]
-
-                        shared_perimeters = []
-                        for g_ind2 in possible_intersect_indices:
-                            if g_ind2 != g_ind and not (this_fragment.boundary).intersection(reconstructed_df["geometry"][g_ind2].boundary).is_empty:
-                                shared_perimeters.append((g_ind2, (this_fragment.boundary).intersection(reconstructed_df["geometry"][g_ind2].boundary).length))
-
-                        max_shared_perim = sorted(shared_perimeters, key=lambda tup: tup[1])[-1]
-                        poly_to_add_to = max_shared_perim[0]
-                        reconstructed_df["geometry"][poly_to_add_to] = unary_union(
-                            [reconstructed_df["geometry"][poly_to_add_to], this_fragment])
-
-            if len(component_num_list) == 1:
-                reconstructed_df["geometry"][g_ind] = reconstructed_df["geometry"][g_ind].geoms[component_num_list[0]]
-            elif len(component_num_list) > 1:
-                reconstructed_df["geometry"][g_ind] = MultiPolygon(
-                    [reconstructed_df["geometry"][g_ind].geoms[c_ind] for c_ind in component_num_list])
-            else:
-                print("WARNING: A component of the geometry at index ", g_ind, " was badly disconnected and redistributed to other polygons!")
-
-    # We SHOULD be back to the correct number of components everywhere, but check again just to make sure!
-
-    disconnected_df_2 = reconstructed_df[reconstructed_df["geometry"].apply(lambda x: x.geom_type != "Polygon")]
-
-    if len(disconnected_df_2) > 0:
-        for ind in disconnected_df_2.index:
-            if num_components(reconstructed_df["geometry"][ind]) > num_components(geometries0_df["geometry"][ind]):
-                print("WARNING: A component of the geometry at index ", ind, " may have been disconnected!")
-
-    if min_rook_length is not None:
-        # Find all inter-polygon boundaries shorter than min_rook_length and replace them
-        # with queen adjacencies by manipulating coordinates of all surrounding polygons. 
- print("Converting small rook adjacencies to queen...") - reconstructed_df = small_rook_to_queen(reconstructed_df, min_rook_length) - - if holes_to_keep is not None: - holes_to_keep = get_geometries(holes_to_keep) - - reconstructed_spatial_index = STRtree(reconstructed_df["geometry"]) # Build STRtree for all the reconstructed geometries - reconstructed_index_by_iloc = dict((i, list(reconstructed_df.index)[i]) for i in range(len(reconstructed_df))) # Build indexing dictionary - - for keeper in holes_to_keep: - possible_geom_integer_indices = [*set(numpy.ndarray.flatten(reconstructed_spatial_index.query(keeper)))] - possible_geom_indices = [reconstructed_index_by_iloc[k] for k in possible_geom_integer_indices] - for g_ind in possible_geom_indices: - reconstructed_df['geometry'][g_ind] = reconstructed_df['geometry'][g_ind].difference(keeper) - - return reconstructed_df - - -# SUPPORTING FUNCTIONS: - -# Partition the region via ALL precinct boundaries, identify each polygon in the -# partition according to which polygons in the original intersected to create it, -# and organize this data according to order of the overlaps. (Order zero = hole) - -def building_blocks(geometries0_df): - if isinstance(geometries0_df, geopandas.GeoDataFrame) == False: - raise TypeError(f"Input must be a GeoDataFrame!") - - geometries_df = geometries0_df.copy() - - # Make a list of all the boundaries of all the polygons. - # This won't work properly with multi-polygons, so explode first: - - boundaries = [] - geometries_exploded_df = geometries_df.explode(index_parts=False).reset_index(drop=True) - for i in geometries_exploded_df.index: - - boundaries.append(shapely.boundary(geometries_exploded_df["geometry"][i])) - - boundaries_union = unary_union(boundaries) - - # Create geodataframe with all the pieces created by overlaps of all orders, - # together with a set for each piece consisting of the polygons that created the overlap. - - pieces_df = geopandas.GeoDataFrame(columns = ["polygon indices"], - geometry = geopandas.GeoSeries([geom for geom in polygonize(boundaries_union)]), - crs = geometries_df.crs) - - for i in pieces_df.index: - pieces_df["polygon indices"][i] = set() - - - g_spatial_index = STRtree(geometries_df["geometry"]) # Build STRtree for the main geometries - g_index_by_iloc = dict((i, list(geometries_df.index)[i]) for i in range(len(geometries_df))) # Build indexing dictionary - - print("Identifying overlaps...") - for i in progress(pieces_df.index, len(pieces_df.index)): - possible_geom_integer_indices = [*set(numpy.ndarray.flatten(g_spatial_index.query(pieces_df["geometry"][i])))] - possible_geom_indices = [g_index_by_iloc[k] for k in possible_geom_integer_indices] - - for j in possible_geom_indices: - if pieces_df["geometry"][i].representative_point().distance(geometries_df["geometry"][j]) == 0: - pieces_df["polygon indices"][i] = pieces_df["polygon indices"][i].union({j}) - - - - # Organize this info into separate gdf's for overlaps of all orders - including - # order zero, which corresponds to holes. - # This will be easier if we temporarily add a column for overlap degree. 
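In miniature, the polygonize-and-identify step just described looks like this
(two hypothetical overlapping squares; assuming shapely 2.x):

    from shapely.geometry import Polygon
    from shapely.ops import polygonize, unary_union

    A = Polygon([(0, 0), (2, 0), (2, 2), (0, 2)])
    B = Polygon([(1, 1), (3, 1), (3, 3), (1, 3)])

    # unary_union nodes the boundaries at their crossing points, and polygonize
    # then yields three clean pieces: A-only, the A/B overlap, and B-only.
    for piece in polygonize(unary_union([A.boundary, B.boundary])):
        point = piece.representative_point()
        parents = {name for name, geom in (("A", A), ("B", B))
                   if point.distance(geom) == 0}
        print(len(parents), sorted(parents))   # overlap degrees 1, 2, 1

The representative point stands in for a full containment test, exactly as in
the loop above.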
- - overlap_degree_list = [len(x) for x in pieces_df["polygon indices"]] - pieces_df["overlap degree"] = overlap_degree_list - - # Here are the holes: - - holes_df = (pieces_df[pieces_df["overlap degree"] == 0]).reset_index(drop=True) - - # And here is a list of GeoDataFrames, one consisting of all overlaps of each order: - - overlap_tower = [] - - for i in range(max(pieces_df["overlap degree"])): - overlap_tower.append(pieces_df[pieces_df["overlap degree"] == i+1]) - - - # Get rid of unnecessary column and reindex each GeoDataFrame: - - for i in range(len(overlap_tower)): - del overlap_tower[i]["overlap degree"] - overlap_tower[i] = overlap_tower[i].reset_index(drop=True) - - del holes_df["overlap degree"] - - return overlap_tower, holes_df - - -# Rebuild the polygons with overlaps removed: - -def reconstruct_from_overlap_tower(geometries0_df, overlap_tower0): - - # We want to preserve all the shapefile columns from the original, but - # we're going to completely rebuild the geometry from scratch. - - geometries_df = geometries0_df.copy() - overlap_tower = [df.copy() for df in overlap_tower0] - - geometries_df["geometry"] = Polygon() - - max_overlap_level = len(overlap_tower) - - # Start by assigning all order-1 pieces to the polygon they came from: - - for ind in overlap_tower[0].index: - this_poly_ind = list(overlap_tower[0]["polygon indices"][ind])[0] - this_piece = overlap_tower[0]["geometry"][ind] - geometries_df["geometry"][this_poly_ind] = unary_union([geometries_df["geometry"][this_poly_ind], this_piece]) - - # IMPORTANT: We need to know which geometries were disconnected by removing - # overlaps! Add columns for numbers of components in the original and refined - # geometries to each dataframe for future use. - - geometries_df["num components orig"] = 0 - geometries_df["num components refined"] = 0 - - for ind in geometries_df.index: - geometries_df["num components orig"][ind] = num_components(geometries0_df["geometry"][ind]) - geometries_df["num components refined"][ind] = num_components(geometries_df["geometry"][ind]) - - # Now, start with the order-2 and gradually add overlaps at higher order until done. - - # IMPORTANT: First look for geometries at the top level that were disconnected - # by the refinement process, and give them first dibs at grabbing overlaps - # (regardless of perimeter!) until they are reconnected or run out of overlaps - # to grab. - # (This doesn't always completely work; in rare cases a single overlap - # can disconnect two polygons, and only one of them gets to grab it back. - # This will be addressed at the end, within the main function.) - - geometries_disconnected_df = geometries_df[geometries_df["num components refined"] > geometries_df["num components orig"]] - - for i in range(1, max_overlap_level): - overlaps_df = overlap_tower[i] - overlaps_df_unused_indices = overlaps_df.index.tolist() - # Need to make sure each overlap only gets used once! 
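The "largest shared perimeter" rule used in the assignment steps below, in a
toy setting (hypothetical rectangles):

    from shapely.geometry import Polygon

    overlap = Polygon([(0, 0), (2, 0), (2, 1), (0, 1)])
    parents = {
        "P1": Polygon([(0, -3), (2, -3), (2, 0), (0, 0)]),  # shares the long bottom edge
        "P2": Polygon([(2, 0), (3, 0), (3, 1), (2, 1)]),    # shares only a short side edge
    }
    shared = {name: overlap.boundary.intersection(poly.boundary).length
              for name, poly in parents.items()}
    print(max(shared, key=shared.get))   # P1 (shared length 2.0 vs 1.0)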
- - o_spatial_index = STRtree(overlaps_df["geometry"]) # Build STRtree for the overlaps - o_index_by_iloc = dict((i, list(overlaps_df.index)[i]) for i in range(len(overlaps_df))) # Build indexing dictionary - - for g_ind in geometries_disconnected_df.index: - possible_overlap_integer_indices = [*set(numpy.ndarray.flatten(o_spatial_index.query(geometries_disconnected_df["geometry"][g_ind])))] - possible_overlap_indices_0 = [o_index_by_iloc[k] for k in possible_overlap_integer_indices] - possible_overlap_indices = list(set(possible_overlap_indices_0) & set(overlaps_df_unused_indices)) - - - geom_finished = False # Only keep adding things until it gets connected again - - for o_ind in possible_overlap_indices: - # If the corresponding overlap intersects this geometry (and was - # contained in it originally!), grab it. - - if (geom_finished == False) and (g_ind in list(overlaps_df["polygon indices"][o_ind])) and (not geometries_disconnected_df["geometry"][g_ind].intersection(overlaps_df["geometry"][o_ind]).is_empty): - - if (geometries_disconnected_df["geometry"][g_ind].intersection(overlaps_df["geometry"][o_ind])).length > 0: - geometries_disconnected_df["geometry"][g_ind] = unary_union([ - geometries_disconnected_df["geometry"][g_ind], overlaps_df["geometry"][o_ind] - ]) - overlaps_df_unused_indices.remove(o_ind) - if num_components(geometries_disconnected_df["geometry"][g_ind]) == geometries_df["num components orig"][g_ind]: - geom_finished = True - - geometries_df["geometry"][g_ind] = geometries_disconnected_df["geometry"][g_ind] - - if geom_finished == True: - geometries_disconnected_df = geometries_disconnected_df.drop(g_ind) - - # Okay, that's all we can do for the disconnected geometries at this level. - # Go on to filling in the rest of the overlaps by greatest perimeter. - - - g_spatial_index = STRtree(geometries_df["geometry"]) # Build STRtree for the main geometries - g_index_by_iloc = dict((i, list(geometries_df.index)[i]) for i in range(len(geometries_df))) # Build indexing dictionary - - print("Assigning order", i+1, "pieces...") - for o_ind in progress(overlaps_df_unused_indices, len(overlaps_df_unused_indices)): - this_overlap = overlaps_df["geometry"][o_ind] - - shared_perimeters = [] - - possible_geom_integer_indices = [*set(numpy.ndarray.flatten(g_spatial_index.query(overlaps_df["geometry"][o_ind])))] - possible_geom_indices = [g_index_by_iloc[k] for k in possible_geom_integer_indices] - - - for g_ind in possible_geom_indices: - if (g_ind in list(overlaps_df["polygon indices"][o_ind])) and not (this_overlap.boundary).intersection(geometries_df["geometry"][g_ind].boundary).is_empty: - shared_perimeters.append((g_ind, (this_overlap.boundary).intersection(geometries_df["geometry"][g_ind].boundary).length)) - - # This possibility came up in a previous version, but I hope it will be - # obsolete in this version! 
- if len(shared_perimeters) > 0: - max_shared_perim = sorted(shared_perimeters, key=lambda tup: tup[1])[-1] - poly_to_add_to = max_shared_perim[0] - geometries_df["geometry"][poly_to_add_to] = unary_union( - [geometries_df["geometry"][poly_to_add_to], this_overlap]) - else: - print("Couldn't find a polygon to glue a component of intersection ", o_ind, " to") - - - reconstructed_df = geometries_df.copy() - - del reconstructed_df["num components orig"] - del reconstructed_df["num components refined"] - - return reconstructed_df - - - - -# Close the gaps: - -def close_gaps(geometries0_df, holes0_df): - - geometries_df = geometries0_df.copy() - holes_df = holes0_df.copy() - - # Check whether there are any holes to fill: - if len(holes_df) == 0: - fill_complete = True - else: - fill_complete = False - - - - while fill_complete == False: - - hole_boundaries_df = intersections(holes_df, geometries_df, output_type = "geodataframe") - # Note: all geometries here will (stupidly!) be MultiLineStrings; - # convert to LineStrings before exploding. - - for ind in hole_boundaries_df.index: - if hole_boundaries_df["geometry"][ind].geom_type == "MultiLineString": - hole_boundaries_df["geometry"][ind] = linemerge(hole_boundaries_df["geometry"][ind]) - - hole_boundaries_df = hole_boundaries_df.explode(index_parts=False).reset_index(drop=True) - # All geometries here should be LineStrings, or occasionally Points. - - activity_this_round = False # Keep track of whether there's still anything - # left to do - - new_holes = [] # Holes that will be created by partial fills - # to add before next round - - for h_ind in progress(holes_df.index, len(holes_df.index)): - - this_hole = holes_df["geometry"][h_ind] - this_hole_boundaries_df = hole_boundaries_df[hole_boundaries_df["source"] == h_ind] - - if len(set(this_hole_boundaries_df["target"])) <= 3: # Fill the hole! - - shared_perimeters = [] - - for b_ind in this_hole_boundaries_df.index: - g_ind = this_hole_boundaries_df["target"][b_ind] - shared_perimeters.append((b_ind, (this_hole.boundary).intersection(geometries_df["geometry"][g_ind].boundary).length)) - - max_shared_perim = sorted(shared_perimeters, key=lambda tup: tup[1])[-1] - poly_to_add_to = this_hole_boundaries_df["target"][max_shared_perim[0]] - - geometries_df["geometry"][poly_to_add_to] = unary_union( - [geometries_df["geometry"][poly_to_add_to], this_hole]) - holes_df = holes_df.drop(h_ind) - - activity_this_round = True - - else: # Partially fill the hole! - - poly_to_add_to, piece_to_connect, new_holes_this_time = partial_fill_data(this_hole, this_hole_boundaries_df) - - if poly_to_add_to is not None: - # If a relative threshold is set, check that the piece to connect is - # under the threshold relative to poly_to_add_to before adjoining it - # to the polygon. - geometries_df["geometry"][poly_to_add_to] = unary_union( - [geometries_df["geometry"][poly_to_add_to], piece_to_connect]) - holes_df = holes_df.drop(h_ind) - - - for k in range(len(new_holes_this_time)): - new_holes.append(new_holes_this_time[k]) - - activity_this_round = True - - if activity_this_round == True: # Add any new holes for the next round! - for k in range(len(new_holes)): - - hole_to_append_df = geopandas.GeoDataFrame(columns = ["polygon indices"], - geometry = geopandas.GeoSeries([new_holes[k]]), - crs = geometries_df.crs) - hole_to_append_df["polygon indices"][0] = set() - - holes_df = pandas.concat([holes_df, hole_to_append_df], ignore_index=True) - holes_df.reset_index(drop=True) - - - - else: # All done! 
- fill_complete = True - - - return geometries_df - - - -# Partially fill a hole with 4 or more boundary components by attaching two -# non-adjacent boundary components, leaving behind two new holes, each with strictly -# fewer boundary components than the original. - -def partial_fill_data(hole0, hole_boundaries_df0): - - # Compute all distances between pairs of non-adjacent polygons along the - # boundary of the hole. - - hole = hole0 - hole_boundaries_df = hole_boundaries_df0.copy() - - poly_distances = [] - - for i in hole_boundaries_df.index: - for j in hole_boundaries_df.index: - if j > i: - this_distance = hole_boundaries_df["geometry"][i].distance(hole_boundaries_df["geometry"][j]) - if this_distance != 0: #maybe use not is_close? - poly_distances.append((i, j, this_distance)) - - - # Choose the shortest-distance non-adjacent pair to connect by gluing a piece - # of the hole to one of them. - - - distance_data_sorted = sorted(poly_distances, key=lambda tup: tup[2]) - - poly_pair_found = False - - while poly_pair_found == False and len(distance_data_sorted) > 0: - - min_distance_data = distance_data_sorted[0] - polys_to_connect = (min_distance_data[0], min_distance_data[1]) - - # For each of the two polygons that we want to connect, identify the two - # vertices at the ends of their boundary segment. - # For each vertex, find the nearest point in the OTHER of the two polygons, - # along with the corresponding distances. - - # Add a check to make sure that the geometry works as expected; if not, we'll - # drop this polygon pair and move on to the next-closest one. - - - boundary_point_coords = [] - - vertices = [[], []] - vertices_nearest_point_pairs = [[], []] - vertices_distances = [[], []] - - for i in range(2): - boundary_point_coords.append([x for x in hole_boundaries_df["geometry"][polys_to_connect[i]].coords]) - vertices[i].append(Point(boundary_point_coords[i][0][0], boundary_point_coords[i][0][1])) - vertices[i].append(Point(boundary_point_coords[i][-1][0], boundary_point_coords[i][-1][1])) - - for i in range(2): - for j in range(2): - vertices_nearest_point_pairs[i].append(shapely.ops.nearest_points(vertices[i][j], MultiPoint(boundary_point_coords[1-i]))) - vertices_distances[i].append(vertices_nearest_point_pairs[i][j][0].distance(vertices_nearest_point_pairs[i][j][1])) - - - # Identify the list positions of the two shortest of these four distances, and - # split the hole with the two corresponding line segments. (Unfortunately this - # requires two steps, because the split operation doesn't support splitting - # with MultiLineStrings.) - - # IT CAN HAPPEN THAT THE TWO SHORTEST LINES DON'T PROVIDE TWO TRUE SPLITS! - # If that happens, try the 3rd and 4th lines. And if THAT doesn't work, go - # back to the distance data and pick a different pair of polygons to connect! 
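For reference, the split() behavior this strategy leans on, in miniature (toy
geometry, assuming shapely 2.x):

    from shapely.geometry import LineString, Polygon
    from shapely.ops import split

    hole = Polygon([(0, 0), (4, 0), (4, 1), (0, 1)])   # a long, thin gap
    cut = LineString([(2, 0), (2, 1)])                 # a segment spanning the gap

    pieces = split(hole, cut)                          # GeometryCollection of 2 parts
    print([round(piece.area, 2) for piece in pieces.geoms])   # [2.0, 2.0]

A true split yields two parts; if the cutting segment merely grazes the
boundary, split() hands back the hole unchanged, which is why further candidate
segments are tried above.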
- - sorted_distances = sorted([(0,0,vertices_distances[0][0]), (0,1,vertices_distances[0][1]), - (1,0,vertices_distances[1][0]), (1,1,vertices_distances[1][1])], - key = lambda tup: tup[2]) - sorted_line_segments_0 = [] - for k in range(4): - i = sorted_distances[k][0] - j = sorted_distances[k][1] - sorted_line_segments_0.append(LineString(list(vertices_nearest_point_pairs[i][j]))) - - # Ignore any of the line segments that aren't contained within the hole (but - # contained within the boundary is okay - and happens!): - - sorted_line_segments = [] - for k in range(4): - if hole.contains(sorted_line_segments_0[k]) or hole.boundary.contains(sorted_line_segments_0[k]): - sorted_line_segments.append(sorted_line_segments_0[k]) - - - line_pair_found = False - - if len(sorted_line_segments) > 1: - for j in range(1, len(sorted_line_segments)): - for i in range(j): - if sorted_line_segments[i].disjoint(sorted_line_segments[j]) and line_pair_found == False: - line_pair = [i,j] - line_pair_found = True - - - if line_pair_found==True: - - new_holes = [] - - first_split = shapely.ops.split(hole, sorted_line_segments[line_pair[0]]) - - # First_split should contain two pieces, one of which also intersects the second line. - # Find that piece, and split it with the second line. - - for k in range(len(first_split.geoms)): - if first_split.geoms[k].intersects(sorted_line_segments[line_pair[1]]): - second_split = shapely.ops.split(first_split.geoms[k], sorted_line_segments[line_pair[1]]) - else: - new_holes.append(first_split.geoms[k]) - - # Find the piece in second_split that also touches the first line. - # THIS WILL BE OUR PIECE TO ADJOIN TO ONE OF THE TWO POLYGONS! - - - for k in range(len(second_split.geoms)): - if second_split.geoms[k].intersects(sorted_line_segments[line_pair[0]]): - piece_to_connect = second_split.geoms[k] - poly_pair_found = True # SUCCESS!! - else: - new_holes.append(second_split.geoms[k]) - - if poly_pair_found == True: - if piece_to_connect.intersection(hole_boundaries_df["geometry"][polys_to_connect[0]]).length > piece_to_connect.intersection(hole_boundaries_df["geometry"][polys_to_connect[1]]).length: - poly_to_add_to = hole_boundaries_df["target"][polys_to_connect[0]] - elif piece_to_connect.intersection(hole_boundaries_df["geometry"][polys_to_connect[0]]).length < piece_to_connect.intersection(hole_boundaries_df["geometry"][polys_to_connect[1]]).length: - poly_to_add_to = hole_boundaries_df["target"][polys_to_connect[1]] - - else: #Try again with a different polygon pair - distance_data_sorted.remove(distance_data_sorted[0]) - - - - - if poly_pair_found == False: # Not much to do at this point except join by shared - # perimeter and hope for the best! (Usually these will - # be small enough that they'll get converted to queen - # adjacencies eventually anyway.) 
- shared_perimeters = [] - - for b_ind in hole_boundaries_df.index: - g_ind = hole_boundaries_df["target"][b_ind] - shared_perimeters.append((b_ind, hole_boundaries_df["geometry"][b_ind].length)) - - max_shared_perim = sorted(shared_perimeters, key=lambda tup: tup[1])[-1] - - poly_to_add_to = hole_boundaries_df["target"][max_shared_perim[0]] - piece_to_connect = hole - new_holes = [] - - return poly_to_add_to, piece_to_connect, new_holes - - -# Convert all rook adjacencies with boundary length less than min_rook_length to queen adjacencies - -def small_rook_to_queen(geometries0_df, min_rook_length): - - geometries_df = geometries0_df.copy() - - adj_df = adjacencies(geometries_df, output_type = "geodataframe") # We're assuming the input is clean, so these should all be 1-D or less - - - for ind in adj_df.index: - if adj_df["geometry"][ind].geom_type == "GeometryCollection": - adj_list = list(adj_df["geometry"][ind].geoms) - adj_list_no_point = [x for x in adj_list if x.geom_type != "Point"] - adj_df["geometry"][ind] = MultiLineString(adj_list_no_point) - - if adj_df["geometry"][ind].geom_type == "MultiLineString": - adj_df["geometry"][ind] = linemerge(adj_df["geometry"][ind]) - - adj_df = adj_df.explode(index_parts=False).reset_index(drop=True) - - for ind in adj_df.index: - if adj_df["geometry"][ind].geom_type == "Point": - adj_df = adj_df.drop(ind) - - - # Add column for boundary length and pick off the small ones: - - adj_df["boundary length"] = adj_df["geometry"].length - - small_adj_df = adj_df[adj_df["boundary length"] < min_rook_length] - - - for a_ind in small_adj_df.index: # Don't put a progress bar here; it gets out of control! - - # Make sure we haven't inadvertently killed off this adjacency while we were fixing others! - - geoms_for_this_adj = list(small_adj_df["neighbors"][a_ind]) - adj_len = geometries_df["geometry"][geoms_for_this_adj[0]].intersection(geometries_df["geometry"][geoms_for_this_adj[1]]).length - - if adj_len > 0 and adj_len < min_rook_length: - - # Rebuild the STRtree every time because the geometries will change every time. - g_spatial_index = STRtree(geometries_df["geometry"]) # Build STRtree for the main geometries - g_index_by_iloc = dict((i, list(geometries_df.index)[i]) for i in range(len(geometries_df))) # Build indexing dictionary - - # Build a disk enclosing this adjacency; the idea will be to cut it out and replace it - # with a "pie chart" so that all polygons touching this disk meet at a queen adjacency - # point at the center of the disk. - adj_diam = small_adj_df["geometry"][a_ind].length - fat_point_radius = 0.6*adj_diam # slightly more than the radius from the midpoint to the endpoints - - adjacency_points = [x for x in small_adj_df["geometry"][a_ind].coords] - endpoint1 = adjacency_points[0] - endpoint2 = adjacency_points[-1] - midpoint = LineString([endpoint1, endpoint2]).centroid - midpoint_coords = midpoint.coords[0] - - disk_to_remove = midpoint.buffer(fat_point_radius) - - # Identify geometries that might intersect this disk. 
-            possible_geom_integer_indices = [*set(numpy.ndarray.flatten(g_spatial_index.query(small_adj_df["geometry"][a_ind].buffer(2*fat_point_radius))))]
-            possible_geom_indices = [g_index_by_iloc[k] for k in possible_geom_integer_indices]
-
-            # Use the boundaries of these geometries together with the boundary of the disk to
-            # polygonize and divide geometries into pieces inside and outside the disk
-            boundaries = [geometries_df["geometry"][i].boundary for i in possible_geom_indices]
-            boundaries.append(LineString(list(disk_to_remove.exterior.coords)))
-
-            boundaries_union = unary_union(boundaries)
-
-            pieces_df = geopandas.GeoDataFrame(columns = ["polygon indices"],
-                geometry = geopandas.GeoSeries([geom for geom in polygonize(boundaries_union)]),
-                crs = geometries_df.crs)
-
-            # Associate the pieces with the main geometries. (Note that if there are holes, some pieces may
-            # be unassigned.)
-            for i in pieces_df.index:
-                pieces_df["polygon indices"][i] = set()
-
-            for i in pieces_df.index:
-                temp_possible_geom_integer_indices = [*set(numpy.ndarray.flatten(g_spatial_index.query(pieces_df["geometry"][i])))]
-                temp_possible_geom_indices = [g_index_by_iloc[k] for k in temp_possible_geom_integer_indices]
-
-                for j in temp_possible_geom_indices:
-                    if pieces_df["geometry"][i].representative_point().distance(geometries_df["geometry"][j]) == 0:
-                        pieces_df["polygon indices"][i] = pieces_df["polygon indices"][i].union({j})
-
-
-            # Now rebuild the disk from the pieces that are inside the circle, and drop them from
-            # pieces_df. Then we'll give the pieces outside the circle back to the geometries that they came from.
-
-            disk_to_remove = Polygon()
-
-            for p_ind in pieces_df.index:
-                if pieces_df["geometry"][p_ind].representative_point().distance(midpoint) < fat_point_radius:
-                    disk_to_remove = unary_union([disk_to_remove, pieces_df["geometry"][p_ind]])
-                    pieces_df = pieces_df.drop(p_ind)
-
-            for g_ind in possible_geom_indices:
-                geometries_df["geometry"][g_ind] = Polygon()
-
-            for p_ind in pieces_df.index:
-                if len(pieces_df["polygon indices"][p_ind]) == 1: #Note that it won't be >1 if the file is clean!
-                    this_poly_ind = list(pieces_df["polygon indices"][p_ind])[0]
-                    this_piece = pieces_df["geometry"][p_ind]
-                    if this_poly_ind in possible_geom_indices:
-                        # This check is needed because the geometries in possible_geom_indices can form a
-                        # non-simply-connected region, in which case the interior holes - which may consist
-                        # of multiple precincts each - may be assigned someplace they shouldn't be! 
-                        geometries_df["geometry"][this_poly_ind] = unary_union([geometries_df["geometry"][this_poly_ind], this_piece])
-
-            # Find the boundary arcs between geometries and the disk (and make sure each arc is a connected piece):
-            possible_geoms = geometries_df.loc[possible_geom_indices]
-            circle_boundaries_df = intersections(geopandas.GeoDataFrame(geometry = geopandas.GeoSeries([disk_to_remove], crs = geometries_df.crs)), possible_geoms, output_type = "geodataframe")
-
-            for b_ind in circle_boundaries_df.index:
-                if circle_boundaries_df["geometry"][b_ind].geom_type == "MultiLineString":
-                    circle_boundaries_df["geometry"][b_ind] = linemerge(circle_boundaries_df["geometry"][b_ind])
-
-            circle_boundaries_df = circle_boundaries_df.explode(index_parts=False).reset_index(drop=True)
-
-            # For each boundary arc, create a "pie wedge" from the center of the disk subtending this arc:
-
-            for b_ind in circle_boundaries_df.index:
-                boundary_arc_coords = [x for x in circle_boundaries_df["geometry"][b_ind].coords]
-                boundary_wedge_coords = boundary_arc_coords + [midpoint_coords]
-
-                g_ind = circle_boundaries_df["target"][b_ind]
-
-                geometries_df["geometry"][g_ind] = unary_union([geometries_df["geometry"][g_ind], Polygon(boundary_wedge_coords)])
-
-
-    return geometries_df
-
-
-
-
-
-
-"""
-Functions that are unchanged from repair_old
-"""
 
 
 def holes_of_union(geometries):
@@ -906,7 +40,8 @@ def holes_of_union(geometries):
     series = holes(union)
     series.crs = geometries.crs
     return series
-
+
+
 def holes(geometry):
     if isinstance(geometry, MultiPolygon):
         return GeoSeries(
@@ -924,13 +59,132 @@ def holes(geometry):
 
 
 
+def close_gaps_old(geometries, relative_threshold=0.1):
+    """Closes gaps between geometries by assigning the hole to the polygon
+    that shares the most perimeter with the hole.
 
-def split_by_level(series, multiindex):
-    return tuple(
-        multiindex.get_level_values(i).to_series(index=multiindex).map(series)
-        for i in range(multiindex.nlevels)
+    If the area of the gap is greater than `relative_threshold` times the
+    area of the polygon, then the gap is left alone. The default value
+    of `relative_threshold` is 0.1. This is intended to preserve intentional
+    gaps while closing the tiny gaps that can occur as artifacts of
+    geospatial operations. Set `relative_threshold=None` to attempt to close all
+    gaps. Due to floating point precision issues, all gaps may not be closed.
+    """
+    geometries = get_geometries(geometries)
+    gaps = holes_of_union(geometries)
+    return absorb_by_shared_perimeter(
+        gaps, geometries, relative_threshold=relative_threshold
    )
+
+
+def resolve_overlaps_old(geometries, relative_threshold=0.1):
+    """For any pair of overlapping geometries, assigns the overlapping area to the
+    geometry that shares the most perimeter with the overlap. Returns the GeoSeries
+    of geometries, which will have no overlaps.
+
+    If the ratio of the overlap's area to either of the overlapping geometries'
+    areas is greater than `relative_threshold`, then the overlap is ignored.
+    The default `relative_threshold` is `0.1`. This default is chosen to include
+    tiny overlaps that can be safely auto-fixed while preserving major overlaps
+    that might indicate deeper issues and should be handled on a case-by-case
+    basis. Set `relative_threshold=None` to attempt to resolve all overlaps. Due
+    to floating point precision issues, all overlaps may not be resolved. 
+ """ + geometries = get_geometries(geometries) + inters = adjacencies(geometries, warn_for_islands=False, warn_for_overlaps=False) +# Changing buffer(0) to make_valid + overlaps = make_valid(inters[inters.area > 0]) + + if relative_threshold is not None: + left_areas, right_areas = split_by_level(geometries.area, overlaps.index) + under_threshold = ((overlaps.area / left_areas) < relative_threshold) & ( + (overlaps.area / right_areas) < relative_threshold + ) + overlaps = overlaps[under_threshold] + + if len(overlaps) == 0: + return geometries + + to_remove = GeoSeries( + pandas.concat([overlaps.droplevel(1), overlaps.droplevel(0)]), crs=overlaps.crs ) + with_overlaps_removed = geometries.apply(lambda x: x.difference(unary_union(to_remove))) + + return absorb_by_shared_perimeter( + overlaps, with_overlaps_removed, relative_threshold=None + ) + +def autorepair_old(geometries, relative_threshold=0.1): + """ + Applies all the tricks in `maup.repair` with default args. Should work by default. + The default `relative_threshold` is `0.1`. This default is chosen to include + tiny overlaps that can be safely auto-fixed while preserving major overlaps + that might indicate deeper issues and should be handled on a case-by-case + basis. Set `relative_threshold=None` to attempt to resolve all overlaps. See + `resolve_overlaps()` and `close_gaps()` for more. + """ +# Edited to allow for the possibility that input is a GeoSeries instead of a +# GeoDataFrame; this relies on all the functions called here having been similarly +# edited. + geometries = get_geometries(geometries) + + geometries = remove_repeated_vertices(geometries) + geometries = make_valid(geometries) + geometries = resolve_overlaps(geometries, relative_threshold=relative_threshold) + geometries = make_valid(geometries) + geometries = close_gaps(geometries, relative_threshold=relative_threshold) + geometries = make_valid(geometries) + return geometries + +# shp = geometries.copy() + +# shp["geometry"] = remove_repeated_vertices(shp) +# shp["geometry"] = make_valid(shp) +# shp["geometry"] = resolve_overlaps(shp, relative_threshold=relative_threshold) +# shp["geometry"] = make_valid(shp) +# shp["geometry"] = close_gaps(shp, relative_threshold=relative_threshold) +# shp["geometry"] = make_valid(shp) + +# return shp["geometry"] + + +# This function is no longer needed; shapely 2's make_valid function applies +# equally well to single geometries or GeoSeries, and keeping it while importing +# shapely's version creates a name conflict. +# When applied to a GeoDataFrame, shapely.make_valid +# returns the GeoSeries consisting of the validated geometries of the +# GeoData Frame, so the current behavior will be retained for GeoDataFrames +# and expanded to work as well on GeoSeries and single geometries. + +# def make_valid(geometries): +# """ +# Applies the shapely .buffer(0) and make_valid (once released) trick to all +# geometries. Should help resolve various topological issues in shapefiles. +# return geometries["geometry"].simplify(0).buffer(0) +# # return geometries["geometry"].apply(lambda x: make_valid(x)) + + +def remove_repeated_vertices(geometries): + """ + Removes repeated vertices. Vertices are considered to be repeated if they + appear consecutively, excluding the start and end points. 
+ """ +# Changing "geometries["geometry"]" to "geometries.geometry" so it will work for both +# GeoSeries and GeoDataFrames: + return geometries.geometry.apply(lambda x: apply_func_to_polygon_parts(x, dedup_vertices)) +# return geometries["geometry"].apply(lambda x: apply_func_to_polygon_parts(x, dedup_vertices)) + +def snap_to_grid(geometries, n=-7): + """ + Snap the geometries to a grid by rounding to the nearest 10^n. Helps to + resolve floating point precision issues in shapefiles. + """ + func = functools.partial(snap_polygon_to_grid, n=n) +# Changing "geometries["geometry"]" to "geometries.geometry" so it will work for both +# GeoSeries and GeoDataFrames: + return geometries.geometry.apply(lambda x: apply_func_to_polygon_parts(x, func)) +# return geometries["geometry"].apply(lambda x: apply_func_to_polygon_parts(x, func)) @require_same_crs def crop_to(source, target): @@ -950,6 +204,29 @@ def crop_to(source, target): return cropped_geometries +@require_same_crs +def expand_to(source, target): + """ + Expands the source geometries to the target geometries. + """ +# Changing simplify(0).buffer(0) to make_valid: + geometries = make_valid(get_geometries(source)) +# geometries = get_geometries(source).simplify(0).buffer(0) + + source_union = unary_union(geometries) + + leftover_geometries = get_geometries(target).apply(lambda x: x - source_union) +# Added index_parts=False option to explode() to accommodate a FutureWarning in +# geopandas + leftover_geometries = leftover_geometries[~leftover_geometries.is_empty].explode(index_parts=False) + + geometries = absorb_by_shared_perimeter( + leftover_geometries, get_geometries(source), relative_threshold=None + ) + + return geometries + + # Changing assertions to truth checks so doctor can give a complete report instead of # stopping at the first problem. @@ -1007,7 +284,6 @@ def doctor(source, target=None): return health_check # return True - def count_overlaps(shp): """ Counts overlaps. Code is taken directly from the resolve_overlaps function in maup. @@ -1024,26 +300,13 @@ def count_overlaps(shp): # Adding an analogous function "count_holes" that can be used for reporting in an # expanded version of doctor. + def count_holes(shp): gaps = holes_of_union(shp.geometry) return(len(gaps)) - -def snap_to_grid(geometries, n=-7): - """ - Snap the geometries to a grid by rounding to the nearest 10^n. Helps to - resolve floating point precision issues in shapefiles. 
- """ - func = functools.partial(snap_polygon_to_grid, n=n) -# Changing "geometries["geometry"]" to "geometries.geometry" so it will work for both -# GeoSeries and GeoDataFrames: - return geometries.geometry.apply(lambda x: apply_func_to_polygon_parts(x, func)) -# return geometries["geometry"].apply(lambda x: apply_func_to_polygon_parts(x, func)) - - -def snap_polygon_to_grid(polygon, n=-7): - return Polygon([(round(x, -n), round(y, -n)) for x, y in polygon.exterior.coords]) - + + def apply_func_to_polygon_parts(shape, func): if isinstance(shape, Polygon): @@ -1053,4 +316,63 @@ def apply_func_to_polygon_parts(shape, func): else: raise TypeError(f"Can only apply {func} to a Polygon or MultiPolygon (got {shape} with type {type(shape)})!") +def dedup_vertices(polygon): + deduped_vertices = [] + for c, p in enumerate(list(polygon.exterior.coords)): + if c == 0: + deduped_vertices.append(p) + elif p != deduped_vertices[-1]: + deduped_vertices.append(p) + + return Polygon(deduped_vertices) + +def snap_polygon_to_grid(polygon, n=-7): + return Polygon([(round(x, -n), round(y, -n)) for x, y in polygon.exterior.coords]) + +def split_by_level(series, multiindex): + return tuple( + multiindex.get_level_values(i).to_series(index=multiindex).map(series) + for i in range(multiindex.nlevels) + ) + + +@require_same_crs +def absorb_by_shared_perimeter(sources, targets, relative_threshold=None): + if len(sources) == 0: + return targets + + if len(targets) == 0: + raise IndexError("targets must be nonempty") + + +# Changing buffer(0) to make_valid + inters = make_valid(intersections(sources, targets, area_cutoff=None)) +# inters = intersections(sources, targets, area_cutoff=None).buffer(0) + assignment = assign_to_max(inters.length) + + if relative_threshold is not None: + under_threshold = ( + sources.area / assignment.map(targets.area) + ) < relative_threshold + assignment = assignment[under_threshold] + + sources_to_absorb = GeoSeries( + sources.groupby(assignment).apply(unary_union), crs=sources.crs, + ) +# Note: this line produces a warning message when sources_to_absorb and targets +# have different indices: +# lib/python3.11/site-packages/geopandas/base.py:31: UserWarning: The indices of the two +# GeoSeries are different. +# warn("The indices of the two GeoSeries are different.") +# This difference in indices is expected since not all target geometries may have sources +# to absorb, so it would be nice to remove this warning. + result = targets.union(sources_to_absorb) + + # The .union call only returns the targets who had a corresponding + # source to absorb. Now we fill in all of the unchanged targets. 
+ result = result.reindex(targets.index) + did_not_absorb = result.isna() | result.is_empty + result.loc[did_not_absorb] = targets[did_not_absorb] + + return result diff --git a/maup/repair_old.py b/maup/repair_old.py deleted file mode 100644 index 5ff8131..0000000 --- a/maup/repair_old.py +++ /dev/null @@ -1,378 +0,0 @@ -import math -import pandas -import functools -import warnings - -from geopandas import GeoSeries, GeoDataFrame -from shapely.geometry import MultiPolygon, Polygon -from shapely.ops import unary_union -# from shapely.validation import make_valid # currently in alpha -# UPDATE: make_valid exists in shapely 2.0.1 with minimum GEOS 3.8.0, -# so import it here: -from shapely import make_valid - -from .adjacencies import adjacencies -from .assign import assign_to_max -from .crs import require_same_crs -from .indexed_geometries import get_geometries -from .intersections import intersections - - -""" -Some of these functions are based on the functions in Mary Barker's -check_shapefile_connectivity.py script in @gerrymandr/Preprocessing. -""" - -class AreaCroppingWarning(UserWarning): - pass - - -def holes_of_union(geometries): - """Returns any holes in the union of the given geometries.""" - geometries = get_geometries(geometries) - if not all( - isinstance(geometry, (Polygon, MultiPolygon)) for geometry in geometries - ): -# Changed "type(x) to "x.geom_type" - raise TypeError(f"Must be a Polygon or MultiPolygon (got types {set([x.geom_type for x in geometries])})!") - - union = unary_union(geometries) - series = holes(union) - series.crs = geometries.crs - return series - - -def holes(geometry): - if isinstance(geometry, MultiPolygon): - return GeoSeries( - [ - Polygon(list(hole.coords)) - for polygon in geometry.geoms - for hole in polygon.interiors - ] - ) - elif isinstance(geometry, Polygon): - return GeoSeries([Polygon(list(hole.coords)) for hole in geometry.interiors]) - else: - raise TypeError("geometry must be a Polygon or MultiPolygon to have holes") - - - - -def close_gaps_old(geometries, relative_threshold=0.1): - """Closes gaps between geometries by assigning the hole to the polygon - that shares the most perimeter with the hole. - - If the area of the gap is greater than `relative_threshold` times the - area of the polygon, then the gap is left alone. The default value - of `relative_threshold` is 0.1. This is intended to preserve intentional - gaps while closing the tiny gaps that can occur as artifacts of - geospatial operations. Set `relative_threshold=None` to attempt close all - gaps. Due to floating point precision issues, all gaps may not be closed. - """ - geometries = get_geometries(geometries) - gaps = holes_of_union(geometries) - return absorb_by_shared_perimeter( - gaps, geometries, relative_threshold=relative_threshold - ) - - -def resolve_overlaps_old(geometries, relative_threshold=0.1): - """For any pair of overlapping geometries, assigns the overlapping area to the - geometry that shares the most perimeter with the overlap. Returns the GeoSeries - of geometries, which will have no overlaps. - - If the ratio of the overlap's area to either of the overlapping geometries' - areas is greater than `relative_threshold`, then the overlap is ignored. - The default `relative_threshold` is `0.1`. This default is chosen to include - tiny overlaps that can be safely auto-fixed while preserving major overlaps - that might indicate deeper issues and should be handled on a case-by-case - basis. Set `relative_threshold=None` to attempt to resolve all overlaps. 
Due - to floating point precision issues, all overlaps may not be resolved. - """ - geometries = get_geometries(geometries) - inters = adjacencies(geometries, warn_for_islands=False, warn_for_overlaps=False) -# Changing buffer(0) to make_valid - overlaps = make_valid(inters[inters.area > 0]) - - if relative_threshold is not None: - left_areas, right_areas = split_by_level(geometries.area, overlaps.index) - under_threshold = ((overlaps.area / left_areas) < relative_threshold) & ( - (overlaps.area / right_areas) < relative_threshold - ) - overlaps = overlaps[under_threshold] - - if len(overlaps) == 0: - return geometries - - to_remove = GeoSeries( - pandas.concat([overlaps.droplevel(1), overlaps.droplevel(0)]), crs=overlaps.crs - ) - with_overlaps_removed = geometries.apply(lambda x: x.difference(unary_union(to_remove))) - - return absorb_by_shared_perimeter( - overlaps, with_overlaps_removed, relative_threshold=None - ) - -def autorepair_old(geometries, relative_threshold=0.1): - """ - Applies all the tricks in `maup.repair` with default args. Should work by default. - The default `relative_threshold` is `0.1`. This default is chosen to include - tiny overlaps that can be safely auto-fixed while preserving major overlaps - that might indicate deeper issues and should be handled on a case-by-case - basis. Set `relative_threshold=None` to attempt to resolve all overlaps. See - `resolve_overlaps()` and `close_gaps()` for more. - """ -# Edited to allow for the possibility that input is a GeoSeries instead of a -# GeoDataFrame; this relies on all the functions called here having been similarly -# edited. - geometries = get_geometries(geometries) - - geometries = remove_repeated_vertices(geometries) - geometries = make_valid(geometries) - geometries = resolve_overlaps(geometries, relative_threshold=relative_threshold) - geometries = make_valid(geometries) - geometries = close_gaps(geometries, relative_threshold=relative_threshold) - geometries = make_valid(geometries) - - return geometries - -# shp = geometries.copy() - -# shp["geometry"] = remove_repeated_vertices(shp) -# shp["geometry"] = make_valid(shp) -# shp["geometry"] = resolve_overlaps(shp, relative_threshold=relative_threshold) -# shp["geometry"] = make_valid(shp) -# shp["geometry"] = close_gaps(shp, relative_threshold=relative_threshold) -# shp["geometry"] = make_valid(shp) - -# return shp["geometry"] - - -# This function is no longer needed; shapely 2's make_valid function applies -# equally well to single geometries or GeoSeries, and keeping it while importing -# shapely's version creates a name conflict. -# When applied to a GeoDataFrame, shapely.make_valid -# returns the GeoSeries consisting of the validated geometries of the -# GeoData Frame, so the current behavior will be retained for GeoDataFrames -# and expanded to work as well on GeoSeries and single geometries. - -# def make_valid(geometries): -# """ -# Applies the shapely .buffer(0) and make_valid (once released) trick to all -# geometries. Should help resolve various topological issues in shapefiles. -# return geometries["geometry"].simplify(0).buffer(0) -# # return geometries["geometry"].apply(lambda x: make_valid(x)) - - -def remove_repeated_vertices(geometries): - """ - Removes repeated vertices. Vertices are considered to be repeated if they - appear consecutively, excluding the start and end points. 
- """ -# Changing "geometries["geometry"]" to "geometries.geometry" so it will work for both -# GeoSeries and GeoDataFrames: - return geometries.geometry.apply(lambda x: apply_func_to_polygon_parts(x, dedup_vertices)) -# return geometries["geometry"].apply(lambda x: apply_func_to_polygon_parts(x, dedup_vertices)) - -def snap_to_grid(geometries, n=-7): - """ - Snap the geometries to a grid by rounding to the nearest 10^n. Helps to - resolve floating point precision issues in shapefiles. - """ - func = functools.partial(snap_polygon_to_grid, n=n) -# Changing "geometries["geometry"]" to "geometries.geometry" so it will work for both -# GeoSeries and GeoDataFrames: - return geometries.geometry.apply(lambda x: apply_func_to_polygon_parts(x, func)) -# return geometries["geometry"].apply(lambda x: apply_func_to_polygon_parts(x, func)) - -@require_same_crs -def crop_to(source, target): - """ - Crops the source geometries to the target geometries. - """ - target_union = unary_union(get_geometries(target)) - cropped_geometries = get_geometries(source).apply(lambda x: x.intersection(target_union)) - - if (cropped_geometries.area == 0).any(): - warnings.warn("Some cropped geometries have zero area, likely due to\n"+ - "large differences in the union of the geometries in your\n"+ - "source and target shapefiles. This may become an issue\n"+ - "when maupping.\n", - AreaCroppingWarning - ) - - return cropped_geometries - -@require_same_crs -def expand_to(source, target): - """ - Expands the source geometries to the target geometries. - """ -# Changing simplify(0).buffer(0) to make_valid: - geometries = make_valid(get_geometries(source)) -# geometries = get_geometries(source).simplify(0).buffer(0) - - source_union = unary_union(geometries) - - leftover_geometries = get_geometries(target).apply(lambda x: x - source_union) -# Added index_parts=False option to explode() to accommodate a FutureWarning in -# geopandas - leftover_geometries = leftover_geometries[~leftover_geometries.is_empty].explode(index_parts=False) - - geometries = absorb_by_shared_perimeter( - leftover_geometries, get_geometries(source), relative_threshold=None - ) - - return geometries - - - -# Changing assertions to truth checks so doctor can give a complete report instead of -# stopping at the first problem. -def doctor(source, target=None): - """ - Detects quality issues in a given set of source and target geometries. Quality - issues include overlaps, gaps, invalid geometries, repeated verticies, non-perfect - tiling, and not entirely overlapping source and targets. If `maup.doctor()` returns - `True`, votes should not be lost when prorating or assigning (beyond a few due to - rounding, etc.). Passing a target to doctor is optional. - """ - shapefiles = [source] - source_union = unary_union(get_geometries(source)) - -# Adding "health_check" variable to return instead of using assertions. - health_check = True - - if target is not None: - shapefiles.append(target) - - target_union = unary_union(get_geometries(target)) - sym_area = target_union.symmetric_difference(source_union).area - - if sym_area != 0: - print("The unions of target and source differ!") - health_check = False -# assert sym_area == 0, "The unions of target and source differ!" - - for shp in shapefiles: -# Changing "shp["geometry"]" to "shp.geometry" so it will work for both GeoSeries and -# GeoDataFrames,and also changing the assertion behavior. 
- if not shp.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon))).all(): - print("Some rows do not have geometries.") - health_check = False -# assert shp["geometry"].apply(lambda x: isinstance(x, (Polygon, MultiPolygon))).all(), "Some rows do not have geometries" - - overlaps = count_overlaps(shp) - holes = len(holes_of_union(shp)) - - if overlaps != 0: - print("There are", overlaps, "overlaps.") - health_check = False - if holes != 0: - print("There are", holes, "holes.") - health_check = False - if not shp.is_valid.all(): - print("There are some invalid geometries.") - health_check = False - - -# assert overlaps == 0, f"There are {overlaps} overlaps!" -# assert holes == 0, f"There are {holes} holes!" -# assert shp.is_valid.all(), "There are some invalid geometries!" - - return health_check -# return True - -def count_overlaps(shp): - """ - Counts overlaps. Code is taken directly from the resolve_overlaps function in maup. - """ -# Changing "shp["geometry"]" to "shp.geometry" so it will work for both GeoSeries and -# GeoDataFrames: - inters = adjacencies(shp.geometry, warn_for_islands=False, warn_for_overlaps=False) -# inters = adjacencies(shp["geometry"], warn_for_islands=False, warn_for_overlaps=False) -# Changing buffer(0) to make_valid: - overlaps = make_valid(inters[inters.area > 0]) -# overlaps = inters[inters.area > 0].buffer(0) - return len(overlaps) - - -# Adding an analogous function "count_holes" that can be used for reporting in an -# expanded version of doctor. - -def count_holes(shp): - gaps = holes_of_union(shp.geometry) - return(len(gaps)) - - - - -def apply_func_to_polygon_parts(shape, func): - if isinstance(shape, Polygon): - return func(shape) - elif isinstance(shape, MultiPolygon): - return MultiPolygon([func(poly) for poly in shape.geoms]) - else: - raise TypeError(f"Can only apply {func} to a Polygon or MultiPolygon (got {shape} with type {type(shape)})!") - -def dedup_vertices(polygon): - deduped_vertices = [] - for c, p in enumerate(list(polygon.exterior.coords)): - if c == 0: - deduped_vertices.append(p) - elif p != deduped_vertices[-1]: - deduped_vertices.append(p) - - return Polygon(deduped_vertices) - -def snap_polygon_to_grid(polygon, n=-7): - return Polygon([(round(x, -n), round(y, -n)) for x, y in polygon.exterior.coords]) - -def split_by_level(series, multiindex): - return tuple( - multiindex.get_level_values(i).to_series(index=multiindex).map(series) - for i in range(multiindex.nlevels) - ) - - -@require_same_crs -def absorb_by_shared_perimeter(sources, targets, relative_threshold=None): - if len(sources) == 0: - return targets - - if len(targets) == 0: - raise IndexError("targets must be nonempty") - - -# Changing buffer(0) to make_valid - inters = make_valid(intersections(sources, targets, area_cutoff=None)) -# inters = intersections(sources, targets, area_cutoff=None).buffer(0) - assignment = assign_to_max(inters.length) - - if relative_threshold is not None: - under_threshold = ( - sources.area / assignment.map(targets.area) - ) < relative_threshold - assignment = assignment[under_threshold] - - sources_to_absorb = GeoSeries( - sources.groupby(assignment).apply(unary_union), crs=sources.crs, - ) - -# Note: this line produces a warning message when sources_to_absorb and targets -# have different indices: -# lib/python3.11/site-packages/geopandas/base.py:31: UserWarning: The indices of the two -# GeoSeries are different. 
-# warn("The indices of the two GeoSeries are different.") -# This difference in indices is expected since not all target geometries may have sources -# to absorb, so it would be nice to remove this warning. - result = targets.union(sources_to_absorb) - - # The .union call only returns the targets who had a corresponding - # source to absorb. Now we fill in all of the unchanged targets. - result = result.reindex(targets.index) - did_not_absorb = result.isna() | result.is_empty - result.loc[did_not_absorb] = targets[did_not_absorb] - - return result From 6e788a5aac4f4d89bbaab2c8ca8d21ab63ecd1a7 Mon Sep 17 00:00:00 2001 From: "Parker J. Rule" Date: Tue, 18 Jul 2023 00:06:46 -0400 Subject: [PATCH 03/15] Clean up inline comments --- maup/__init__.py | 4 ++-- maup/adjacencies.py | 5 ----- maup/indexed_geometries.py | 17 +++------------ maup/repair.py | 43 +++----------------------------------- 4 files changed, 8 insertions(+), 61 deletions(-) diff --git a/maup/__init__.py b/maup/__init__.py index 31d9f78..5644be0 100644 --- a/maup/__init__.py +++ b/maup/__init__.py @@ -14,7 +14,7 @@ # For the old autorepair functions, uncomment the following line, along with the # corresponding line in the __all__ assignments below: -# from .repair_old import close_gaps_old, resolve_overlaps_old, autorepair_old +# from .repair_old import close_gaps, resolve_overlaps, autorepair @@ -43,4 +43,4 @@ "normalize", "progress" ] -# + ["autorepair_old", "close_gaps_old", "resolve_overlaps_old"] +# + ["autorepair", "close_gaps", "resolve_overlaps"] diff --git a/maup/adjacencies.py b/maup/adjacencies.py index 73aaf89..14129da 100644 --- a/maup/adjacencies.py +++ b/maup/adjacencies.py @@ -26,8 +26,6 @@ def iter_adjacencies(geometries): for j, inter in inters.items(): yield (i, j), inter - -# Added option to output result as GeoDataFrame instead of GeoSeries def adjacencies( geometries, output_type="geoseries", @@ -50,14 +48,12 @@ def adjacencies( geometries = get_geometries(geometries) geometries = make_valid(geometries) -# Change here suggested by Hazel to handle the case where there are no adjacencies: adjs = list(iter_adjacencies(geometries)) if adjs: index, geoms = zip(*adjs) else: index, geoms = [[],[]] -# Made a change here to add the GeoDataFrame option: if output_type == "geodataframe": inters = GeoDataFrame({"neighbors" : index, "geometry" : geoms}, crs = geometries.crs) else: @@ -76,7 +72,6 @@ def adjacencies( OverlapWarning, ) -# Made a change here to accomodate the GeoDataFrame option: if warn_for_islands: if output_type == "geodataframe": islands = set(geometries.index) - set(i for pair in inters["neighbors"] for i in pair) diff --git a/maup/indexed_geometries.py b/maup/indexed_geometries.py index 2ade161..6bb3dd0 100644 --- a/maup/indexed_geometries.py +++ b/maup/indexed_geometries.py @@ -15,25 +15,14 @@ def get_geometries(geometries): class IndexedGeometries: def __init__(self, geometries): self.geometries = get_geometries(geometries) -# Lines commented out because this won't work in shapely 2; geometries are now immutable. -# (But I don't really understand why it was ever here because it isn't needed!) -# for i, geometry in self.geometries.items(): -# geometry.index = i self.spatial_index = STRtree(self.geometries) self.index = self.geometries.index def query(self, geometry): -# Edited because STRTree queries are handled differently in shapely 2; they return -# a list of INTEGER INDICES (i.e., RangeIndex) of relevant geometries instead of the -# geometries themselves. 
-# IMPORTANT EDGE CASE: When "geometry" is multi-part, this query will return a -# (2 x n) array instead of a (1 x n) array, so it's safest to flatten the query -# output before proceeding. -# relevant_indices = [geom.index for geom in self.spatial_index.query(geometry)] - relevant_integer_index_array = self.spatial_index.query(geometry) - relevant_integer_indices = [*set(numpy.ndarray.flatten(relevant_integer_index_array))] - relevant_geometries = self.geometries.iloc[relevant_integer_indices] + relevant_index_array = self.spatial_index.query(geometry) + relevant_indices = [*set(numpy.ndarray.flatten(relevant_index_array))] + relevant_geometries = self.geometries.iloc[relevant_indices] return relevant_geometries def intersections(self, geometry): diff --git a/maup/repair.py b/maup/repair.py index 5ff8131..0b85a0a 100644 --- a/maup/repair.py +++ b/maup/repair.py @@ -59,7 +59,7 @@ def holes(geometry): -def close_gaps_old(geometries, relative_threshold=0.1): +def close_gaps(geometries, relative_threshold=0.1): """Closes gaps between geometries by assigning the hole to the polygon that shares the most perimeter with the hole. @@ -77,7 +77,7 @@ def close_gaps_old(geometries, relative_threshold=0.1): ) -def resolve_overlaps_old(geometries, relative_threshold=0.1): +def resolve_overlaps(geometries, relative_threshold=0.1): """For any pair of overlapping geometries, assigns the overlapping area to the geometry that shares the most perimeter with the overlap. Returns the GeoSeries of geometries, which will have no overlaps. @@ -114,7 +114,7 @@ def resolve_overlaps_old(geometries, relative_threshold=0.1): overlaps, with_overlaps_removed, relative_threshold=None ) -def autorepair_old(geometries, relative_threshold=0.1): +def autorepair(geometries, relative_threshold=0.1): """ Applies all the tricks in `maup.repair` with default args. Should work by default. The default `relative_threshold` is `0.1`. This default is chosen to include @@ -123,9 +123,6 @@ def autorepair_old(geometries, relative_threshold=0.1): basis. Set `relative_threshold=None` to attempt to resolve all overlaps. See `resolve_overlaps()` and `close_gaps()` for more. """ -# Edited to allow for the possibility that input is a GeoSeries instead of a -# GeoDataFrame; this relies on all the functions called here having been similarly -# edited. geometries = get_geometries(geometries) geometries = remove_repeated_vertices(geometries) @@ -137,43 +134,12 @@ def autorepair_old(geometries, relative_threshold=0.1): return geometries -# shp = geometries.copy() - -# shp["geometry"] = remove_repeated_vertices(shp) -# shp["geometry"] = make_valid(shp) -# shp["geometry"] = resolve_overlaps(shp, relative_threshold=relative_threshold) -# shp["geometry"] = make_valid(shp) -# shp["geometry"] = close_gaps(shp, relative_threshold=relative_threshold) -# shp["geometry"] = make_valid(shp) - -# return shp["geometry"] - - -# This function is no longer needed; shapely 2's make_valid function applies -# equally well to single geometries or GeoSeries, and keeping it while importing -# shapely's version creates a name conflict. -# When applied to a GeoDataFrame, shapely.make_valid -# returns the GeoSeries consisting of the validated geometries of the -# GeoData Frame, so the current behavior will be retained for GeoDataFrames -# and expanded to work as well on GeoSeries and single geometries. - -# def make_valid(geometries): -# """ -# Applies the shapely .buffer(0) and make_valid (once released) trick to all -# geometries. 
Should help resolve various topological issues in shapefiles. -# return geometries["geometry"].simplify(0).buffer(0) -# # return geometries["geometry"].apply(lambda x: make_valid(x)) - - def remove_repeated_vertices(geometries): """ Removes repeated vertices. Vertices are considered to be repeated if they appear consecutively, excluding the start and end points. """ -# Changing "geometries["geometry"]" to "geometries.geometry" so it will work for both -# GeoSeries and GeoDataFrames: return geometries.geometry.apply(lambda x: apply_func_to_polygon_parts(x, dedup_vertices)) -# return geometries["geometry"].apply(lambda x: apply_func_to_polygon_parts(x, dedup_vertices)) def snap_to_grid(geometries, n=-7): """ @@ -181,10 +147,7 @@ def snap_to_grid(geometries, n=-7): resolve floating point precision issues in shapefiles. """ func = functools.partial(snap_polygon_to_grid, n=n) -# Changing "geometries["geometry"]" to "geometries.geometry" so it will work for both -# GeoSeries and GeoDataFrames: return geometries.geometry.apply(lambda x: apply_func_to_polygon_parts(x, func)) -# return geometries["geometry"].apply(lambda x: apply_func_to_polygon_parts(x, func)) @require_same_crs def crop_to(source, target): From d5af48e99e0e36f71f56e164c36d6f0a94b63823 Mon Sep 17 00:00:00 2001 From: "Parker J. Rule" Date: Tue, 18 Jul 2023 21:47:55 -0400 Subject: [PATCH 04/15] Use old repair logic (with minor enhancements) --- maup/repair.py | 75 ++++++-------------------------------------------- 1 file changed, 9 insertions(+), 66 deletions(-) diff --git a/maup/repair.py b/maup/repair.py index 0b85a0a..b7ecdf3 100644 --- a/maup/repair.py +++ b/maup/repair.py @@ -1,14 +1,10 @@ -import math import pandas import functools import warnings -from geopandas import GeoSeries, GeoDataFrame +from geopandas import GeoSeries from shapely.geometry import MultiPolygon, Polygon from shapely.ops import unary_union -# from shapely.validation import make_valid # currently in alpha -# UPDATE: make_valid exists in shapely 2.0.1 with minimum GEOS 3.8.0, -# so import it here: from shapely import make_valid from .adjacencies import adjacencies @@ -33,7 +29,6 @@ def holes_of_union(geometries): if not all( isinstance(geometry, (Polygon, MultiPolygon)) for geometry in geometries ): -# Changed "type(x) to "x.geom_type" raise TypeError(f"Must be a Polygon or MultiPolygon (got types {set([x.geom_type for x in geometries])})!") union = unary_union(geometries) @@ -190,9 +185,6 @@ def expand_to(source, target): return geometries - -# Changing assertions to truth checks so doctor can give a complete report instead of -# stopping at the first problem. def doctor(source, target=None): """ Detects quality issues in a given set of source and target geometries. Quality @@ -204,72 +196,34 @@ def doctor(source, target=None): shapefiles = [source] source_union = unary_union(get_geometries(source)) -# Adding "health_check" variable to return instead of using assertions. - health_check = True - if target is not None: shapefiles.append(target) target_union = unary_union(get_geometries(target)) sym_area = target_union.symmetric_difference(source_union).area - if sym_area != 0: - print("The unions of target and source differ!") - health_check = False -# assert sym_area == 0, "The unions of target and source differ!" + assert sym_area == 0, "The unions of target and source differ!" for shp in shapefiles: -# Changing "shp["geometry"]" to "shp.geometry" so it will work for both GeoSeries and -# GeoDataFrames,and also changing the assertion behavior. 
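# ---------------------------------------------------------------------------
# A sketch (not part of this patch; "precincts.shp" is a placeholder path) of
# the workflow these docstrings describe: run autorepair() with the default
# relative_threshold of 0.1, write the repaired geometries back, and re-check
# the shapefile with doctor().
import geopandas
import maup

precincts = geopandas.read_file("precincts.shp")
precincts["geometry"] = maup.autorepair(precincts, relative_threshold=0.1)
maup.doctor(precincts)   # flags any remaining overlaps, gaps, or invalid geometries
# ---------------------------------------------------------------------------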
- if not shp.geometry.apply(lambda x: isinstance(x, (Polygon, MultiPolygon))).all(): - print("Some rows do not have geometries.") - health_check = False -# assert shp["geometry"].apply(lambda x: isinstance(x, (Polygon, MultiPolygon))).all(), "Some rows do not have geometries" + assert shp["geometry"].apply(lambda x: isinstance(x, (Polygon, MultiPolygon))).all(), "Some rows do not have geometries" overlaps = count_overlaps(shp) holes = len(holes_of_union(shp)) - if overlaps != 0: - print("There are", overlaps, "overlaps.") - health_check = False - if holes != 0: - print("There are", holes, "holes.") - health_check = False - if not shp.is_valid.all(): - print("There are some invalid geometries.") - health_check = False - - -# assert overlaps == 0, f"There are {overlaps} overlaps!" -# assert holes == 0, f"There are {holes} holes!" -# assert shp.is_valid.all(), "There are some invalid geometries!" + assert overlaps == 0, f"There are {overlaps} overlaps!" + assert holes == 0, f"There are {holes} holes!" + assert shp.is_valid.all(), "There are some invalid geometries!" - return health_check -# return True + return True def count_overlaps(shp): """ Counts overlaps. Code is taken directly from the resolve_overlaps function in maup. """ -# Changing "shp["geometry"]" to "shp.geometry" so it will work for both GeoSeries and -# GeoDataFrames: inters = adjacencies(shp.geometry, warn_for_islands=False, warn_for_overlaps=False) -# inters = adjacencies(shp["geometry"], warn_for_islands=False, warn_for_overlaps=False) -# Changing buffer(0) to make_valid: - overlaps = make_valid(inters[inters.area > 0]) -# overlaps = inters[inters.area > 0].buffer(0) + overlaps = (inters[inters.area > 0]).buffer(0) return len(overlaps) - -# Adding an analogous function "count_holes" that can be used for reporting in an -# expanded version of doctor. - -def count_holes(shp): - gaps = holes_of_union(shp.geometry) - return(len(gaps)) - - - def apply_func_to_polygon_parts(shape, func): if isinstance(shape, Polygon): @@ -298,7 +252,6 @@ def split_by_level(series, multiindex): for i in range(multiindex.nlevels) ) - @require_same_crs def absorb_by_shared_perimeter(sources, targets, relative_threshold=None): if len(sources) == 0: @@ -307,10 +260,7 @@ def absorb_by_shared_perimeter(sources, targets, relative_threshold=None): if len(targets) == 0: raise IndexError("targets must be nonempty") - -# Changing buffer(0) to make_valid - inters = make_valid(intersections(sources, targets, area_cutoff=None)) -# inters = intersections(sources, targets, area_cutoff=None).buffer(0) + inters = intersections(sources, targets, area_cutoff=None).buffer(0) assignment = assign_to_max(inters.length) if relative_threshold is not None: @@ -323,13 +273,6 @@ def absorb_by_shared_perimeter(sources, targets, relative_threshold=None): sources.groupby(assignment).apply(unary_union), crs=sources.crs, ) -# Note: this line produces a warning message when sources_to_absorb and targets -# have different indices: -# lib/python3.11/site-packages/geopandas/base.py:31: UserWarning: The indices of the two -# GeoSeries are different. -# warn("The indices of the two GeoSeries are different.") -# This difference in indices is expected since not all target geometries may have sources -# to absorb, so it would be nice to remove this warning. result = targets.union(sources_to_absorb) # The .union call only returns the targets who had a corresponding From fbfde94bc63cab89d69441718e5a317cd31b7e69 Mon Sep 17 00:00:00 2001 From: "Parker J. 
Rule" Date: Tue, 18 Jul 2023 21:58:44 -0400 Subject: [PATCH 05/15] Restore legacy make_valid for release --- maup/adjacencies.py | 2 +- maup/repair.py | 7 +++++++ 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/maup/adjacencies.py b/maup/adjacencies.py index 14129da..b2c782c 100644 --- a/maup/adjacencies.py +++ b/maup/adjacencies.py @@ -16,7 +16,7 @@ class IslandWarning(UserWarning): pass -def iter_adjacencies(geometries): +def iter_adjacencies(geometries): indexed = IndexedGeometries(geometries) for i, geometry in progress(indexed.geometries.items(), len(indexed.geometries)): possible = indexed.query(geometry) diff --git a/maup/repair.py b/maup/repair.py index b7ecdf3..1c5e6ac 100644 --- a/maup/repair.py +++ b/maup/repair.py @@ -128,6 +128,13 @@ def autorepair(geometries, relative_threshold=0.1): geometries = make_valid(geometries) return geometries + +def make_valid(geometries): + """ + Applies the shapely .buffer(0) and make_valid (once released) trick to all + geometries. Should help resolve various topological issues in shapefiles. + """ + return geometries["geometry"].simplify(0).buffer(0) def remove_repeated_vertices(geometries): """ From de40c97932ab56e7ab1d78522d8fab40858aa7ff Mon Sep 17 00:00:00 2001 From: "Parker J. Rule" Date: Tue, 18 Jul 2023 21:59:48 -0400 Subject: [PATCH 06/15] Reformat TODO --- maup/assign.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/maup/assign.py b/maup/assign.py index ce4a194..4069c21 100644 --- a/maup/assign.py +++ b/maup/assign.py @@ -25,8 +25,7 @@ def assign(sources, targets): ) assignment.update(assignments_by_area) -# We should add a warning here if there are still unassigned source geometries! - + # TODO: add a warning here if there are still unassigned source geometries/ return assignment.astype(targets.index.dtype, errors="ignore") From c0765e9539b807d7d17f690e6a1ab0bcbb33011f Mon Sep 17 00:00:00 2001 From: "Parker J. Rule" Date: Fri, 21 Jul 2023 00:31:42 -0400 Subject: [PATCH 07/15] Clean up indexed_geometries --- maup/indexed_geometries.py | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/maup/indexed_geometries.py b/maup/indexed_geometries.py index 6bb3dd0..6892a36 100644 --- a/maup/indexed_geometries.py +++ b/maup/indexed_geometries.py @@ -1,6 +1,5 @@ import pandas import geopandas -# Added numpy import to handle output of STRtree query import numpy from shapely.prepared import prep @@ -25,12 +24,12 @@ def query(self, geometry): relevant_geometries = self.geometries.iloc[relevant_indices] return relevant_geometries - def intersections(self, geometry): + def intersections(self, geometry): relevant_geometries = self.query(geometry) intersections = relevant_geometries.intersection(geometry) return intersections[-(intersections.is_empty | intersections.isna())] - def covered_by(self, container): + def covered_by(self, container): relevant_geometries = self.query(container) prepared_container = prep(container) @@ -40,7 +39,7 @@ def covered_by(self, container): selected_geometries = relevant_geometries.apply(prepared_container.covers) return relevant_geometries[selected_geometries] - def assign(self, targets): + def assign(self, targets): target_geometries = get_geometries(targets) groups = [ self.covered_by(container).apply(lambda x: container_index) From 9e0d2a92f000f7365631e1b1e3a1ce3e5b2db5fa Mon Sep 17 00:00:00 2001 From: "Parker J. 
Rule" Date: Fri, 21 Jul 2023 00:34:31 -0400 Subject: [PATCH 08/15] Remove import from 2.x --- maup/__init__.py | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/maup/__init__.py b/maup/__init__.py index 5644be0..f1bbd26 100644 --- a/maup/__init__.py +++ b/maup/__init__.py @@ -6,19 +6,6 @@ from .normalize import normalize from .progress_bar import progress -# Moved the source for the import of make_valid from .repair to shapely since it lives -# there now and the maup version should be removed. (Also removing make_valid from the -# __all__ list below.) -from shapely import make_valid - -# For the old autorepair functions, uncomment the following line, along with the -# corresponding line in the __all__ assignments below: - -# from .repair_old import close_gaps, resolve_overlaps, autorepair - - - -import geopandas # warn about https://github.com/geopandas/geopandas/issues/2199 if geopandas.options.use_pygeos: From 7c2324088c27fdc8f3e500349ed83574994325ad Mon Sep 17 00:00:00 2001 From: "Parker J. Rule" Date: Fri, 21 Jul 2023 00:36:48 -0400 Subject: [PATCH 09/15] Fix comment typo --- maup/assign.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/maup/assign.py b/maup/assign.py index 4069c21..3a63c76 100644 --- a/maup/assign.py +++ b/maup/assign.py @@ -25,7 +25,7 @@ def assign(sources, targets): ) assignment.update(assignments_by_area) - # TODO: add a warning here if there are still unassigned source geometries/ + # TODO: add a warning here if there are still unassigned source geometries. return assignment.astype(targets.index.dtype, errors="ignore") From c79a2f20d56e09eb518f947df0c26f232a3f6f13 Mon Sep 17 00:00:00 2001 From: "Parker J. Rule" Date: Fri, 21 Jul 2023 00:40:20 -0400 Subject: [PATCH 10/15] Clean up inline maup.repair comments --- maup/repair.py | 5 ----- 1 file changed, 5 deletions(-) diff --git a/maup/repair.py b/maup/repair.py index 1c5e6ac..33e21ad 100644 --- a/maup/repair.py +++ b/maup/repair.py @@ -87,7 +87,6 @@ def resolve_overlaps(geometries, relative_threshold=0.1): """ geometries = get_geometries(geometries) inters = adjacencies(geometries, warn_for_islands=False, warn_for_overlaps=False) -# Changing buffer(0) to make_valid overlaps = make_valid(inters[inters.area > 0]) if relative_threshold is not None: @@ -174,15 +173,11 @@ def expand_to(source, target): """ Expands the source geometries to the target geometries. """ -# Changing simplify(0).buffer(0) to make_valid: geometries = make_valid(get_geometries(source)) -# geometries = get_geometries(source).simplify(0).buffer(0) source_union = unary_union(geometries) leftover_geometries = get_geometries(target).apply(lambda x: x - source_union) -# Added index_parts=False option to explode() to accommodate a FutureWarning in -# geopandas leftover_geometries = leftover_geometries[~leftover_geometries.is_empty].explode(index_parts=False) geometries = absorb_by_shared_perimeter( From 1777bd24716e5d40d596441abebb10593e5ac496 Mon Sep 17 00:00:00 2001 From: "Parker J. 
Rule" Date: Fri, 21 Jul 2023 00:43:39 -0400 Subject: [PATCH 11/15] Clean up more inline comments --- maup/__init__.py | 3 +-- maup/intersections.py | 1 - 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/maup/__init__.py b/maup/__init__.py index f1bbd26..447409b 100644 --- a/maup/__init__.py +++ b/maup/__init__.py @@ -29,5 +29,4 @@ "doctor", "normalize", "progress" -] -# + ["autorepair", "close_gaps", "resolve_overlaps"] +] \ No newline at end of file diff --git a/maup/intersections.py b/maup/intersections.py index d915ddb..9eca7e7 100644 --- a/maup/intersections.py +++ b/maup/intersections.py @@ -8,7 +8,6 @@ @require_same_crs -# Added option to output result as GeoDataFrame instead of GeoSeries def intersections(sources, targets, output_type= "geoseries", area_cutoff=None): """Computes all of the nonempty intersections between two sets of geometries. By default, the returned `~geopandas.GeoSeries` will have a MultiIndex, where the From e986584ef48ee483237f66acaf57798fa1547170 Mon Sep 17 00:00:00 2001 From: "Parker J. Rule" Date: Sun, 8 Oct 2023 22:32:09 -0400 Subject: [PATCH 12/15] Fix typos and missing imports --- maup/__init__.py | 4 +++- maup/intersections.py | 5 +---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/maup/__init__.py b/maup/__init__.py index 447409b..aabc84e 100644 --- a/maup/__init__.py +++ b/maup/__init__.py @@ -1,8 +1,9 @@ +import geopandas from .adjacencies import adjacencies from .assign import assign from .indexed_geometries import IndexedGeometries from .intersections import intersections, prorate -from .repair import close_gaps, autorepair, snap_to_grid, crop_to, doctor +from .repair import close_gaps, autorepair, snap_to_grid, crop_to, doctor, resolve_overlaps from .normalize import normalize from .progress_bar import progress @@ -24,6 +25,7 @@ "prorate", "close_gaps", "autorepair", + "resolve_overlaps", "snap_to_grid", "crop_to", "doctor", diff --git a/maup/intersections.py b/maup/intersections.py index 9eca7e7..0251549 100644 --- a/maup/intersections.py +++ b/maup/intersections.py @@ -51,10 +51,7 @@ def intersections(sources, targets, output_type= "geoseries", area_cutoff=None): df = df[df.area > area_cutoff].reset_index(drop=True) geometries = geometries[geometries.area > area_cutoff] - if output_type == "geodataframe": - return df - else: - return geometries + return df if output_type == "geodataframe" else geometries def prorate(relationship, data, weights, aggregate_by="sum"): From f455f3e461be1bacbf90976711dbbda67c550de0 Mon Sep 17 00:00:00 2001 From: "Parker J. Rule" Date: Sun, 8 Oct 2023 22:45:16 -0400 Subject: [PATCH 13/15] Fix failing adjacency tests --- maup/adjacencies.py | 14 ++++++++------ 1 file changed, 8 insertions(+), 6 deletions(-) diff --git a/maup/adjacencies.py b/maup/adjacencies.py index b2c782c..3836eb0 100644 --- a/maup/adjacencies.py +++ b/maup/adjacencies.py @@ -27,9 +27,9 @@ def iter_adjacencies(geometries): yield (i, j), inter def adjacencies( - geometries, - output_type="geoseries", - adjacency_type="rook", *, warn_for_overlaps=True, warn_for_islands=True + geometries, + adjacency_type="rook", + output_type="geoseries", *, warn_for_overlaps=True, warn_for_islands=True ): """Returns adjacencies between geometries. 
The default return type is a @@ -45,6 +45,7 @@ def adjacencies( if adjacency_type not in ["rook", "queen"]: raise ValueError('adjacency_type must be "rook" or "queen"') + orig_crs = geometries.crs geometries = get_geometries(geometries) geometries = make_valid(geometries) @@ -55,12 +56,12 @@ def adjacencies( index, geoms = [[],[]] if output_type == "geodataframe": - inters = GeoDataFrame({"neighbors" : index, "geometry" : geoms}, crs = geometries.crs) + inters = GeoDataFrame({"neighbors" : index, "geometry" : geoms}) else: - inters = GeoSeries(geoms, index=index, crs=geometries.crs) + inters = GeoSeries(geoms, index=index) if adjacency_type == "rook": - inters = inters[inters.length > 0] + inters = inters[inters.length > 0].copy() if warn_for_overlaps: overlaps = inters[inters.area > 0] @@ -83,4 +84,5 @@ def adjacencies( IslandWarning, ) + inters.crs = orig_crs return inters From 2c700302464ea6fb9f5ea8d2e54e27346d67ee1c Mon Sep 17 00:00:00 2001 From: "Parker J. Rule" Date: Sun, 8 Oct 2023 23:04:32 -0400 Subject: [PATCH 14/15] Fix failing overlaps tests --- maup/indexed_geometries.py | 4 +++- maup/repair.py | 6 +++--- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/maup/indexed_geometries.py b/maup/indexed_geometries.py index 6892a36..956eba2 100644 --- a/maup/indexed_geometries.py +++ b/maup/indexed_geometries.py @@ -8,7 +8,9 @@ def get_geometries(geometries): - return getattr(geometries, "geometry", geometries) + if isinstance(geometries, geopandas.GeoDataFrame): + return getattr(geometries, "geometry", geometries) + return geometries class IndexedGeometries: diff --git a/maup/repair.py b/maup/repair.py index 33e21ad..6f42444 100644 --- a/maup/repair.py +++ b/maup/repair.py @@ -133,14 +133,14 @@ def make_valid(geometries): Applies the shapely .buffer(0) and make_valid (once released) trick to all geometries. Should help resolve various topological issues in shapefiles. """ - return geometries["geometry"].simplify(0).buffer(0) + return get_geometries(geometries).simplify(0).buffer(0) def remove_repeated_vertices(geometries): """ Removes repeated vertices. Vertices are considered to be repeated if they appear consecutively, excluding the start and end points. """ - return geometries.geometry.apply(lambda x: apply_func_to_polygon_parts(x, dedup_vertices)) + return get_geometries(geometries).apply(lambda x: apply_func_to_polygon_parts(x, dedup_vertices)) def snap_to_grid(geometries, n=-7): """ @@ -148,7 +148,7 @@ def snap_to_grid(geometries, n=-7): resolve floating point precision issues in shapefiles. """ func = functools.partial(snap_polygon_to_grid, n=n) - return geometries.geometry.apply(lambda x: apply_func_to_polygon_parts(x, func)) + return get_geometries(geometries).apply(lambda x: apply_func_to_polygon_parts(x, func)) @require_same_crs def crop_to(source, target): From c9679a7a1a2c698219a7795597c07febac40f76e Mon Sep 17 00:00:00 2001 From: "Parker J. 
Rule" Date: Wed, 11 Oct 2023 08:52:14 -0400 Subject: [PATCH 15/15] Fix more tests --- tests/test_crs.py | 4 ++-- tests/test_indexed_geometries.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/tests/test_crs.py b/tests/test_crs.py index 018ee94..f617870 100644 --- a/tests/test_crs.py +++ b/tests/test_crs.py @@ -5,8 +5,8 @@ def test_require_same_crs(square, four_square_grid): square_gdf = gpd.GeoDataFrame([{"geometry": square}]) - square_gdf.crs = "foo" - four_square_grid.crs = "bar" + square_gdf.crs = "epsg:4269" + four_square_grid.crs = "epsg:4326" @require_same_crs def f(sources, targets): diff --git a/tests/test_indexed_geometries.py b/tests/test_indexed_geometries.py index 2c128a1..9f20598 100644 --- a/tests/test_indexed_geometries.py +++ b/tests/test_indexed_geometries.py @@ -1,3 +1,4 @@ +import numpy as np from unittest.mock import patch from geopandas import GeoSeries @@ -25,10 +26,10 @@ def test_indexed_has_a_spatial_index(four_square_grid): def test_indexed_queries_its_spatial_index_when_intersections_is_called( four_square_grid, square ): - with patch("maup.indexed_geometries.STRtree") as SpatialIndex: - spatial_index = SpatialIndex.return_value + with patch("maup.indexed_geometries.STRtree.query",) as query_fn: + query_fn.return_value = np.array([]) IndexedGeometries(four_square_grid).intersections(square) - spatial_index.query.assert_called() + query_fn.assert_called() def test_intersections_correct_when_all_overlapping(four_square_grid, square):