diff --git a/.github/workflows/tests.yaml b/.github/workflows/tests.yaml
index c7203aa..2813674 100644
--- a/.github/workflows/tests.yaml
+++ b/.github/workflows/tests.yaml
@@ -7,7 +7,7 @@ jobs:
     runs-on: ${{ matrix.os }}
     strategy:
       matrix:
-        python-version: [3.9, '3.10', 3.11]
+        python-version: [3.9, '3.10', 3.11, 3.12]
         os: [ubuntu-latest, macOS-latest]
     steps:
diff --git a/docs/conf.py b/docs/conf.py
index 12e33d8..243a85a 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -19,11 +19,11 @@
 # -- Project information -----------------------------------------------------
 
 project = 'maup'
-copyright = '2021, MGGG'
-author = 'Max Hully, Max Fan'
+copyright = '2023, MGGG'
+author = 'Jeanne Clelland, Max Fan, Max Hully'
 
 # The full version, including alpha/beta/rc tags
-release = '2.0.0'
+release = '2.0.1'
 
 # -- General configuration ---------------------------------------------------
diff --git a/maup/repair.py b/maup/repair.py
index 712514e..74595b9 100644
--- a/maup/repair.py
+++ b/maup/repair.py
@@ -4,7 +4,7 @@
 import pandas
 
 from geopandas import GeoSeries
-from shapely.geometry import Polygon, MultiPolygon, LineString, MultiLineString
+from shapely.geometry import Polygon, MultiPolygon, LineString, MultiLineString, GeometryCollection
 from shapely.ops import unary_union
 
 from .adjacencies import adjacencies
@@ -38,6 +38,32 @@ class AreaCroppingWarning(UserWarning):
     pass
 
 
+def make_valid_polygons(geometries, force_polygons=True):
+    """Extended make_valid function with an optional filter that removes
+    non-polygonal data from the output."""
+    geometries = geometries.make_valid()
+    if force_polygons:
+        # The map function causes CRS information to be dropped.
+        crs = geometries.crs
+        geometries = geometries.map(trim_valid)
+        if crs is not None:
+            geometries = geometries.set_crs(crs)
+    return geometries
+
+
+def trim_valid(value):
+    """Ensures that results from "make_valid_polygons" won't be geometry
+    collections with mixed types by filtering out non-polygons.
+    """
+    if isinstance(value, GeometryCollection):
+        # List comprehension excluding non-Polygons
+        value = [item for item in value.geoms
+                 if isinstance(item, (Polygon, MultiPolygon))]
+        # Re-aggregating multiple Polygons into a single MultiPolygon object.
+        value = value[0] if len(value) == 1 else MultiPolygon(value)
+    return value
+
+
 def holes_of_union(geometries):
     """Returns any holes in the union of the given geometries."""
     geometries = get_geometries(geometries)
@@ -68,7 +94,7 @@
         raise TypeError("geometry must be a Polygon or MultiPolygon to have holes")
 
 
-def close_gaps(geometries, relative_threshold=0.1):
+def close_gaps(geometries, relative_threshold=0.1, force_polygons=False):
     """Closes gaps between geometries by assigning the hole to the polygon
     that shares the greatest perimeter with the hole.
 
@@ -78,15 +104,18 @@
     gaps while closing the tiny gaps that can occur as artifacts of
    geospatial operations. Set `relative_threshold=None` to attempt to close
    all gaps. Due to floating point precision issues, all gaps may not be closed.
+
+    The optional `force_polygons` argument applies an automatic filter that
+    removes non-polygonal fragments during the operation.
     """
     geometries = get_geometries(geometries)
     gaps = holes_of_union(geometries)
     return absorb_by_shared_perimeter(
-        gaps, geometries, relative_threshold=relative_threshold
+        gaps, geometries, relative_threshold=relative_threshold, force_polygons=force_polygons
     )
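A minimal sketch of the new helper in action. The bowtie input is hypothetical test data; the sketch assumes shapely 2.x and a geopandas version that provides `GeoSeries.make_valid`, as the patched module itself does:

```python
from geopandas import GeoSeries
from shapely.geometry import Polygon

from maup.repair import make_valid_polygons

# Hypothetical test data: a self-intersecting "bowtie" quadrilateral.
# make_valid() may return a GeometryCollection containing line fragments;
# make_valid_polygons(..., force_polygons=True) filters those out.
bowtie = Polygon([(0, 0), (2, 2), (2, 0), (0, 2)])
geometries = GeoSeries([bowtie], crs="EPSG:3857")

cleaned = make_valid_polygons(geometries)
assert all(g.geom_type in ("Polygon", "MultiPolygon") for g in cleaned)
# The CRS is restored after the .map() call drops it.
assert cleaned.crs == geometries.crs
```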
""" geometries = get_geometries(geometries) gaps = holes_of_union(geometries) return absorb_by_shared_perimeter( - gaps, geometries, relative_threshold=relative_threshold + gaps, geometries, relative_threshold=relative_threshold, force_polygons=force_polygons ) -def resolve_overlaps(geometries, relative_threshold=0.1): +def resolve_overlaps(geometries, relative_threshold=0.1, force_polygons=False): """For any pair of overlapping geometries, assigns the overlapping area to the geometry that shares the greatest perimeter with the overlap. Returns the GeoSeries of geometries, which will have no overlaps. @@ -98,10 +127,16 @@ def resolve_overlaps(geometries, relative_threshold=0.1): that might indicate deeper issues and should be handled on a case-by-case basis. Set `relative_threshold=None` to attempt to resolve all overlaps. Due to floating point precision issues, all overlaps may not be resolved. + + Optional "force_polygons" argument included to apply an automatic filter + non-polygonal fragments during operation. """ geometries = get_geometries(geometries) inters = adjacencies(geometries, warn_for_islands=False, warn_for_overlaps=False) - overlaps = inters[inters.area > 0].make_valid() + if force_polygons: + overlaps = make_valid_polygons(inters[inters.area > 0]) + else: + overlaps = inters[inters.area > 0].make_valid() if relative_threshold is not None: left_areas, right_areas = split_by_level(geometries.area, overlaps.index) @@ -119,11 +154,11 @@ def resolve_overlaps(geometries, relative_threshold=0.1): with_overlaps_removed = geometries.apply(lambda x: x.difference(unary_union(to_remove))) return absorb_by_shared_perimeter( - overlaps, with_overlaps_removed, relative_threshold=None + overlaps, with_overlaps_removed, relative_threshold=None, force_polygons=force_polygons ) -def quick_repair(geometries, relative_threshold=0.1): +def quick_repair(geometries, relative_threshold=0.1, force_polygons=False): """ New name for autorepair function from Maup 1.x. Uses simplistic algorithms to repair most gaps and overlaps. @@ -137,12 +172,14 @@ def quick_repair(geometries, relative_threshold=0.1): For a more careful repair that takes adjacencies and higher-order overlaps between geometries into account, consider using smart_repair instead. """ - return autorepair(geometries, relative_threshold=relative_threshold) + return autorepair(geometries, relative_threshold=relative_threshold, force_polygons=force_polygons) -def autorepair(geometries, relative_threshold=0.1): +def autorepair(geometries, relative_threshold=0.1, force_polygons=False): """ - Uses simplistic algorithms to repair most gaps and overlaps. + Uses simplistic algorithms to repair most gaps and overlaps. Additional + optional argument provided that can drop any fragmented line segments that + may occur while using shapely's make valid function. Should work by default. The default `relative_threshold` is `0.1`. 
 
 
-def quick_repair(geometries, relative_threshold=0.1):
+def quick_repair(geometries, relative_threshold=0.1, force_polygons=False):
     """
     New name for autorepair function from Maup 1.x.
     Uses simplistic algorithms to repair most gaps and overlaps.
@@ -137,12 +172,14 @@
     For a more careful repair that takes adjacencies and higher-order overlaps
     between geometries into account, consider using smart_repair instead.
     """
-    return autorepair(geometries, relative_threshold=relative_threshold)
+    return autorepair(geometries, relative_threshold=relative_threshold, force_polygons=force_polygons)
 
 
-def autorepair(geometries, relative_threshold=0.1):
+def autorepair(geometries, relative_threshold=0.1, force_polygons=False):
     """
-    Uses simplistic algorithms to repair most gaps and overlaps.
+    Uses simplistic algorithms to repair most gaps and overlaps. The optional
+    `force_polygons` argument drops any fragmented line segments that may occur
+    while using shapely's make_valid function; the default (False) preserves
+    the previous behavior.
 
     The default `relative_threshold` is `0.1`.
     This default is chosen to include tiny overlaps that can be
     safely auto-fixed while preserving major overlaps
@@ -155,9 +192,18 @@
     """
     geometries = get_geometries(geometries)
 
-    geometries = remove_repeated_vertices(geometries).make_valid()
-    geometries = resolve_overlaps(geometries, relative_threshold=relative_threshold).make_valid()
-    geometries = close_gaps(geometries, relative_threshold=relative_threshold).make_valid()
+    if force_polygons:
+        geometries = make_valid_polygons(remove_repeated_vertices(geometries))
+        geometries = make_valid_polygons(resolve_overlaps(geometries,
+                                         relative_threshold=relative_threshold,
+                                         force_polygons=force_polygons))
+        geometries = make_valid_polygons(close_gaps(geometries,
+                                         relative_threshold=relative_threshold,
+                                         force_polygons=force_polygons))
+    else:
+        geometries = remove_repeated_vertices(geometries).make_valid()
+        geometries = resolve_overlaps(geometries, relative_threshold=relative_threshold).make_valid()
+        geometries = close_gaps(geometries, relative_threshold=relative_threshold).make_valid()
 
     return geometries
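A usage sketch for the repair entry point with the new flag (the input geometries are hypothetical; `quick_repair` is imported from `maup.repair` as defined above):

```python
from geopandas import GeoSeries
from shapely.geometry import Polygon, box

from maup.repair import quick_repair

# Hypothetical units: an invalid self-intersecting quad beside a square
# that overlaps it slightly.
units = GeoSeries(
    [Polygon([(0, 0), (1, 1), (1, 0), (0, 1)]), box(0.99, 0, 2, 1)],
    crs="EPSG:3857",
)

# With force_polygons=True, each intermediate make_valid step is routed
# through make_valid_polygons, so no LineString fragments survive.
repaired = quick_repair(units, force_polygons=True)
assert all(g.geom_type in ("Polygon", "MultiPolygon") for g in repaired)
```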
 
 
@@ -199,11 +245,15 @@ def crop_to(source, target):
 
 
 @require_same_crs
-def expand_to(source, target):
+def expand_to(source, target, force_polygons=False):
     """
     Expands the source geometries to the target geometries.
     """
-    geometries = get_geometries(source).make_valid()
+
+    if force_polygons:
+        geometries = make_valid_polygons(get_geometries(source))
+    else:
+        geometries = get_geometries(source).make_valid()
 
     source_union = unary_union(geometries)
 
@@ -211,7 +261,7 @@
     leftover_geometries = leftover_geometries[~leftover_geometries.is_empty].explode(index_parts=False)
 
     geometries = absorb_by_shared_perimeter(
-        leftover_geometries, get_geometries(source), relative_threshold=None
+        leftover_geometries, get_geometries(source), relative_threshold=None, force_polygons=force_polygons
     )
 
     return geometries
@@ -351,14 +401,17 @@ def split_by_level(series, multiindex):
 
 
 @require_same_crs
-def absorb_by_shared_perimeter(sources, targets, relative_threshold=None):
+def absorb_by_shared_perimeter(sources, targets, relative_threshold=None, force_polygons=False):
     if len(sources) == 0:
         return targets
 
     if len(targets) == 0:
         raise IndexError("targets must be nonempty")
 
-    inters = intersections(sources, targets, area_cutoff=None).make_valid()
+    if force_polygons:
+        inters = make_valid_polygons(intersections(sources, targets, area_cutoff=None))
+    else:
+        inters = intersections(sources, targets, area_cutoff=None).make_valid()
 
     assignment = assign_to_max(inters.length)
diff --git a/maup/smart_repair.py b/maup/smart_repair.py
index 6d06900..7d4e0d2 100644
--- a/maup/smart_repair.py
+++ b/maup/smart_repair.py
@@ -39,9 +39,9 @@
 #########
 
 
-def smart_repair(geometries_df, snapped=True, fill_gaps=True, fill_gaps_threshold=0.1,
-                 disconnection_threshold=0.0001, nest_within_regions=None,
-                 min_rook_length=None):
+def smart_repair(geometries_df, snapped=True, snap_precision=10, fill_gaps=True,
+                 fill_gaps_threshold=0.1, disconnection_threshold=0.0001,
+                 nest_within_regions=None, min_rook_length=None):
     """
     Repairs topology issues (overlaps, gaps, invalid polygons) in a geopandas
     GeoDataFrame or GeoSeries, with an emphasis on preserving intended adjacency
@@ -50,22 +50,25 @@ def smart_repair(geometries_df, snapped=True, fill_gaps=True, fill_gaps_threshol
     Specifically, the algorithm
     (1) Applies shapely.make_valid to all polygon geometries.
     (2) If snapped = True (default), snaps all polygon vertices to a grid of size no
-        more than 10^(-10) times the max of width/height of the entire shapefile extent.
-        HIGHLY RECOMMENDED to avoid topological exceptions due to rounding errors.
+        more than 10^(-snap_precision) times the max of width/height of the entire
+        extent of the input. HIGHLY RECOMMENDED to avoid topological exceptions due to
+        rounding errors. Default value for snap_precision is 10; if topological
+        exceptions still occur, try reducing snap_precision (which must be integer-
+        valued) to 9 or 8.
     (3) Resolves all overlaps.
     (4) If fill_gaps = True (default), closes all simply connected gaps with area
         less than fill_gaps_threshold times the largest area of all geometries
         adjoining the gap. Default threshold is 10%; if fill_gaps_threshold = None
         then all simply connected gaps will be filled.
-    (5) If nest_within_regions is a secondary shapefile of region boundaries (e.g.,
-        counties in a state) then all of the above will be performed so that repaired
-        geometries nest cleanly into the region boundaries; each repaired geometry
-        will be contained in the region with which the original geometry has the largest
-        area of intersection. Default value is None.
+    (5) If nest_within_regions is a secondary GeoDataFrame/GeoSeries of region boundaries
+        (e.g., counties in a state) then all of the above will be performed so that
+        repaired geometries nest cleanly into the region boundaries; each repaired
+        geometry will be contained in the region with which the original geometry has
+        the largest area of intersection. Default value is None.
     (6) If min_rook_length is given a numerical value, replaces all rook adjacencies
         with length below this value with queen adjacencies. Note that this is an
         absolute value and not a relative value, so make sure that the value provided
-        is in the correct units with respect to the shapefile's CRS.
+        is in the correct units with respect to the input's CRS.
         Default value is None.
     (7) Sometimes the repair process creates tiny fragments that are disconnected from
         the district that they are assigned to. A final cleanup step assigns any such
@@ -115,15 +118,18 @@ def smart_repair(geometries_df, snapped=True, fill_gaps=True, fill_gaps_threshol
         if doctor(nest_within_regions, silent=True, accept_holes=True) is False:
             raise Exception("nest_within_regions must be topologically clean---i.e., all geometries must be valid and there must be no overlaps between geometries. Generally the best source for region shapefiles is the U.S. Census Bureau.")
 
-    # Before doing anything else, make sure all polygons are valid and remove any
-    # LineStrings and MultiLineStrings.
+    # Before doing anything else, make sure all polygons are valid, convert any empty
+    # geometries to empty Polygons to avoid type errors, and remove any LineStrings and
+    # MultiLineStrings.
     for i in geometries_df.index:
         geometries_df["geometry"][i] = make_valid(geometries_df["geometry"][i])
+        if geometries_df["geometry"][i] is None:
+            geometries_df["geometry"][i] = Polygon()
         if geometries_df["geometry"][i].geom_type == "GeometryCollection":
             geometries_df["geometry"][i] = unary_union([x for x in geometries_df["geometry"][i].geoms if x.geom_type in ("Polygon", "MultiPolygon")])
 
     # If snapped is True, snap all polygon vertices to a grid of size no more than
-    # 10^(-10) times the max of width/height of the entire shapefile extent.
+    # 10^(-snap_precision) times the max of width/height of the entire extent of the input.
     # (For instance, in Texas this would be less than 1/100th of an inch.)
     # This avoids a rare "non-noded intersection" error due to a GEOS bug and leaves
     # several orders of magnitude for additional intersection operations before hitting
@@ -132,7 +138,7 @@
     # These bounds are in the form (xmin, ymin, xmax, ymax)
     geometries_total_bounds = geometries_df.total_bounds
     largest_bound = max(geometries_total_bounds[2] - geometries_total_bounds[0], geometries_total_bounds[3] - geometries_total_bounds[1])
-    snap_magnitude = int(math.log10(largest_bound)) - 10
+    snap_magnitude = int(math.log10(largest_bound)) - snap_precision
     geometries_df["geometry"] = snap_to_grid(geometries_df["geometry"], n=snap_magnitude)
     if nest_within_regions is not None:
         regions_df["geometry"] = snap_to_grid(regions_df["geometry"], n=snap_magnitude)
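A usage sketch for the new `snap_precision` knob (the shapefile path is hypothetical; the arithmetic mirrors the `snap_magnitude` line above):

```python
import math

import geopandas

from maup import smart_repair

# Hypothetical input path; a projected CRS (not lat/lon) is assumed.
units = geopandas.read_file("./precincts.shp").to_crs("EPSG:3857")

# Default grid: for an extent ~10^6 m wide, int(log10(1e6)) - 10 = -4,
# i.e. vertices snap to a 10^-4 m grid.
assert int(math.log10(1_000_000)) - 10 == -4
repaired = smart_repair(units)

# If "non-noded intersection" errors still occur, coarsen the grid:
repaired = smart_repair(units, snap_precision=9)
```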
 
@@ -320,10 +326,10 @@ def segments(curve):
 
 def building_blocks(geometries_df, nest_within_regions=None):
     """
     Partitions the extent of the input via all boundaries of all geometries
-    (and regions, if nest_within_regions is a shapefile of region boundaries);
-    associates to each polygon in the partition the set of polygons in the original
-    shapefile whose intersection created it, and organizes this data according to
-    order of the overlaps. (Order zero = hole)
+    (and regions, if nest_within_regions is a GeoDataFrame/GeoSeries of region
+    boundaries); associates to each polygon in the partition the set of polygons in the
+    original input whose intersection created it, and organizes this data according
+    to order of the overlaps. (Order zero = hole)
     """
     if isinstance(geometries_df, GeoDataFrame) is False:
         raise TypeError("Primary input to building_blocks must be a GeoDataFrame.")
@@ -383,7 +389,7 @@
     for i in progress(pieces_df.index, len(pieces_df.index)):
         # If region boundaries are included, identify the region for each piece.
         # Note that "None" is a possibility, and that each piece will belong to a unique
-        # region because the regions shapefile MUST be clean.
+        # region because the regions GeoDataFrame/GeoSeries MUST be clean.
         if nest_within_regions is not None:
             possible_region_integer_indices = [*set(numpy.ndarray.flatten(r_spatial_index.query(pieces_df["geometry"][i])))]
             possible_region_indices = [r_index_by_iloc[k] for k in possible_region_integer_indices]
@@ -462,7 +468,7 @@
 def reconstruct_from_overlap_tower(geometries_df, overlap_tower, nested=False):
     """
-    Rebuild the shapefile polygons with overlaps removed.
+    Rebuild the polygons in geometries_df with overlaps removed.
     """
     # Keep a copy of the original input for comparisons later!
     geometries0_df = geometries_df.copy()
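Since "order of the overlaps" carries the key idea here, a toy illustration in plain shapely (the squares are hypothetical data):

```python
from shapely.geometry import box

# Two overlapping squares partition their union into three pieces; the
# "order" of a piece is how many parent geometries created it.
a, b = box(0, 0, 2, 2), box(1, 0, 3, 2)
order_2 = a.intersection(b)   # created by both parents
order_1a = a.difference(b)    # belongs only to a
order_1b = b.difference(a)    # belongs only to b
assert order_2.area == order_1a.area == order_1b.area == 2.0
# A hole in the union would be an order-zero piece: no parents at all.
```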
@@ -1551,6 +1557,22 @@
             # boundary with each geometry by the shortest path within the hole
             # between its endpoints and "filling in" the geometry up to the
             # new boundary. (Exterior boundaries, if any, should be left alone.)
+
+            # In cases where there are 2 or more boundaries with the same target
+            # geometry, we'll need to check to see whether they end up adjacent
+            # after convexifying. If they do, their UNION isn't guaranteed to be
+            # convexified at the end, so we'll put that hole back in the queue for
+            # another round of processing.
+            if len(set(this_hole_boundaries_df["target"])) == len(this_hole_boundaries_df):
+                target_repetition = False
+            else:
+                target_repetition = True
+                repeated_targets = []
+                for target in set(this_hole_boundaries_df["target"]):
+                    boundaries_this_target = this_hole_boundaries_df[this_hole_boundaries_df["target"] == target]
+                    if len(boundaries_this_target) > 1:
+                        repeated_targets.append((target, len(boundaries_this_target)))
+
             new_hole_in_progress = this_hole
 
             this_hole_triangulation = triangulate_polygon(new_hole_in_progress)
@@ -1582,7 +1604,27 @@
                 new_hole = orient(new_hole)
                 new_hole_df = GeoDataFrame(geometry=GeoSeries([new_hole]), crs=holes_df.crs)
                 new_hole_df.insert(0, "region", this_region)
-                completed_holes_df = pandas.concat([completed_holes_df, new_hole_df]).reset_index(drop=True)
+
+                if target_repetition:
+                    # Check to see whether the target that was repeated in the original
+                    # hole boundaries has fewer (but not zero!) hole boundaries
+                    # associated to it in this hole. If so, some distinct boundaries
+                    # may have been concatenated after convexifying, resulting in a
+                    # non-convex boundary - so put the hole back in the queue for
+                    # another round of processing.
+                    new_hole_boundaries_df = construct_hole_boundaries(geometries_df, new_hole_df)
+                    reprocess_hole = False
+                    for target in repeated_targets:
+                        new_boundaries_this_target = new_hole_boundaries_df[new_hole_boundaries_df["target"] == target[0]]
+                        if len(new_boundaries_this_target) > 0 and len(new_boundaries_this_target) < target[1]:
+                            reprocess_hole = True
+                            break
+                    if reprocess_hole:
+                        holes_to_process.append(new_hole)
+                    else:
+                        completed_holes_df = pandas.concat([completed_holes_df, new_hole_df]).reset_index(drop=True)
+                else:
+                    completed_holes_df = pandas.concat([completed_holes_df, new_hole_df]).reset_index(drop=True)
 
                 pbar.update(pbar_increment)
diff --git a/pyproject.toml b/pyproject.toml
index 175fad7..a05bc7e 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -1,9 +1,9 @@
 [tool.poetry]
 name = "maup"
-version = "2.0.0"
+version = "2.0.1"
 description = "The geospatial toolkit for redistricting data"
 authors = [
-    "Metric Geometry and Gerrymandering Group ",
+    "Metric Geometry and Gerrymandering Group ",
     "Max Hully ",
     "Max Fan "
 ]
@@ -13,6 +13,7 @@ classifiers=[
     "Programming Language :: Python :: 3.9",
     "Programming Language :: Python :: 3.10",
     "Programming Language :: Python :: 3.11",
+    "Programming Language :: Python :: 3.12",
     "Operating System :: OS Independent",
     "License :: OSI Approved :: MIT License",
 ]
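To make the repeated-target bookkeeping above concrete, a standalone toy version of the check (the `target` column mirrors the diff; the data is hypothetical, and `value_counts` stands in for the explicit loop):

```python
import pandas

# Three boundary arcs around one hole; target 7 contributes two of them.
this_hole_boundaries_df = pandas.DataFrame({"target": [3, 7, 7]})

repeated_targets = [
    (target, count)
    for target, count in this_hole_boundaries_df["target"].value_counts().items()
    if count > 1
]
assert repeated_targets == [(7, 2)]
# After convexifying, if target 7 ends up with fewer (but nonzero) arcs,
# two arcs were merged and the merged arc may be non-convex, so the hole
# goes back into the processing queue.
```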