Skip to content

Commit

Permalink
Merge pull request #94 from mggg/2.0.1
Browse files Browse the repository at this point in the history
Update smart_repair.py
  • Loading branch information
peterrrock2 authored Dec 22, 2023
2 parents 0508cc5 + c3fea3a commit d71150f
Show file tree
Hide file tree
Showing 5 changed files with 142 additions and 46 deletions.
2 changes: 1 addition & 1 deletion .github/workflows/tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@ jobs:
runs-on: ${{ matrix.os }}
strategy:
matrix:
python-version: [3.9, '3.10', 3.11]
python-version: [3.9, '3.10', 3.11, 3.12]
os: [ubuntu-latest, macOS-latest]

steps:
Expand Down
6 changes: 3 additions & 3 deletions docs/conf.py
Original file line number Diff line number Diff line change
Expand Up @@ -19,11 +19,11 @@
# -- Project information -----------------------------------------------------

project = 'maup'
copyright = '2021, MGGG'
author = 'Max Hully, Max Fan'
copyright = '2023, MGGG'
author = 'Jeanne Clelland, Max Fan, Max Hully '

# The full version, including alpha/beta/rc tags
release = '2.0.0'
release = '2.0.1'


# -- General configuration ---------------------------------------------------
Expand Down
89 changes: 71 additions & 18 deletions maup/repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,7 +4,7 @@
import pandas

from geopandas import GeoSeries
from shapely.geometry import Polygon, MultiPolygon, LineString, MultiLineString
from shapely.geometry import Polygon, MultiPolygon, LineString, MultiLineString, GeometryCollection
from shapely.ops import unary_union

from .adjacencies import adjacencies
Expand Down Expand Up @@ -38,6 +38,32 @@ class AreaCroppingWarning(UserWarning):
pass


def make_valid_polygons(geometries, force_polygons=True):
    """Extended make valid function with optional filter for non polygonal
    data from the output.

    Runs GeoSeries.make_valid over the input; when ``force_polygons`` is
    True, additionally strips any non-polygonal pieces from the repaired
    geometries via ``trim_valid``, restoring the CRS afterwards (the
    element-wise ``map`` call drops CRS information).
    """
    repaired = geometries.make_valid()
    if not force_polygons:
        return repaired
    # Remember the CRS before mapping: GeoSeries.map returns a series
    # without CRS metadata, so we must reattach it afterwards.
    original_crs = repaired.crs
    repaired = repaired.map(trim_valid)
    if original_crs is not None:
        repaired = repaired.set_crs(original_crs)
    return repaired


def trim_valid(value):
    """Ensures that the results from the "make_valid_polygons" method won't be
    geometry collections with mixed types by filtering out non-polygons.

    Any MultiPolygon members of the collection are flattened into their
    component Polygons first: shapely's MultiPolygon constructor accepts
    only Polygon instances, so passing a nested MultiPolygon through would
    raise a ValueError.
    """
    if isinstance(value, GeometryCollection):
        # Collect polygonal parts only, flattening MultiPolygons so the
        # result is a flat list of Polygon objects.
        polygons = []
        for item in value.geoms:
            if isinstance(item, Polygon):
                polygons.append(item)
            elif isinstance(item, MultiPolygon):
                polygons.extend(item.geoms)
        # Re-aggregating multiple Polygons into a single MultiPolygon object.
        # (An empty list yields an empty MultiPolygon.)
        value = polygons[0] if len(polygons) == 1 else MultiPolygon(polygons)
    return value


def holes_of_union(geometries):
"""Returns any holes in the union of the given geometries."""
geometries = get_geometries(geometries)
Expand Down Expand Up @@ -68,7 +94,7 @@ def holes(geometry):
raise TypeError("geometry must be a Polygon or MultiPolygon to have holes")


def close_gaps(geometries, relative_threshold=0.1):
def close_gaps(geometries, relative_threshold=0.1, force_polygons=False):
"""Closes gaps between geometries by assigning the hole to the polygon
that shares the greatest perimeter with the hole.
Expand All @@ -78,15 +104,18 @@ def close_gaps(geometries, relative_threshold=0.1):
gaps while closing the tiny gaps that can occur as artifacts of
geospatial operations. Set `relative_threshold=None` to attempt close all
gaps. Due to floating point precision issues, all gaps may not be closed.
Optional "force_polygons" argument included to apply an automatic filter for
non-polygonal fragments during operation.
"""
geometries = get_geometries(geometries)
gaps = holes_of_union(geometries)
return absorb_by_shared_perimeter(
gaps, geometries, relative_threshold=relative_threshold
gaps, geometries, relative_threshold=relative_threshold, force_polygons=force_polygons
)


def resolve_overlaps(geometries, relative_threshold=0.1):
def resolve_overlaps(geometries, relative_threshold=0.1, force_polygons=False):
"""For any pair of overlapping geometries, assigns the overlapping area to the
geometry that shares the greatest perimeter with the overlap. Returns the GeoSeries
of geometries, which will have no overlaps.
Expand All @@ -98,10 +127,16 @@ def resolve_overlaps(geometries, relative_threshold=0.1):
that might indicate deeper issues and should be handled on a case-by-case
basis. Set `relative_threshold=None` to attempt to resolve all overlaps. Due
to floating point precision issues, all overlaps may not be resolved.
Optional "force_polygons" argument included to apply an automatic filter for
non-polygonal fragments during operation.
"""
geometries = get_geometries(geometries)
inters = adjacencies(geometries, warn_for_islands=False, warn_for_overlaps=False)
overlaps = inters[inters.area > 0].make_valid()
if force_polygons:
overlaps = make_valid_polygons(inters[inters.area > 0])
else:
overlaps = inters[inters.area > 0].make_valid()

if relative_threshold is not None:
left_areas, right_areas = split_by_level(geometries.area, overlaps.index)
Expand All @@ -119,11 +154,11 @@ def resolve_overlaps(geometries, relative_threshold=0.1):
with_overlaps_removed = geometries.apply(lambda x: x.difference(unary_union(to_remove)))

return absorb_by_shared_perimeter(
overlaps, with_overlaps_removed, relative_threshold=None
overlaps, with_overlaps_removed, relative_threshold=None, force_polygons=force_polygons
)


def quick_repair(geometries, relative_threshold=0.1):
def quick_repair(geometries, relative_threshold=0.1, force_polygons=False):
"""
New name for autorepair function from Maup 1.x.
Uses simplistic algorithms to repair most gaps and overlaps.
Expand All @@ -137,12 +172,14 @@ def quick_repair(geometries, relative_threshold=0.1):
For a more careful repair that takes adjacencies and higher-order overlaps
between geometries into account, consider using smart_repair instead.
"""
return autorepair(geometries, relative_threshold=relative_threshold)
return autorepair(geometries, relative_threshold=relative_threshold, force_polygons=force_polygons)


def autorepair(geometries, relative_threshold=0.1):
def autorepair(geometries, relative_threshold=0.1, force_polygons=False):
"""
Uses simplistic algorithms to repair most gaps and overlaps.
Uses simplistic algorithms to repair most gaps and overlaps. Additional
optional argument provided that can drop any fragmented line segments that
may occur while using shapely's make valid function. Should work by default.
The default `relative_threshold` is `0.1`. This default is chosen to include
tiny overlaps that can be safely auto-fixed while preserving major overlaps
Expand All @@ -155,9 +192,18 @@ def autorepair(geometries, relative_threshold=0.1):
"""
geometries = get_geometries(geometries)

geometries = remove_repeated_vertices(geometries).make_valid()
geometries = resolve_overlaps(geometries, relative_threshold=relative_threshold).make_valid()
geometries = close_gaps(geometries, relative_threshold=relative_threshold).make_valid()
if force_polygons:
geometries = make_valid_polygons(remove_repeated_vertices(geometries))
geometries = make_valid_polygons(resolve_overlaps(geometries,
relative_threshold=relative_threshold,
force_polygons=force_polygons))
geometries = make_valid_polygons(close_gaps(geometries,
relative_threshold=relative_threshold,
force_polygons=force_polygons))
else:
geometries = remove_repeated_vertices(geometries).make_valid()
geometries = resolve_overlaps(geometries, relative_threshold=relative_threshold).make_valid()
geometries = close_gaps(geometries, relative_threshold=relative_threshold).make_valid()

return geometries

Expand Down Expand Up @@ -199,19 +245,23 @@ def crop_to(source, target):


@require_same_crs
def expand_to(source, target):
def expand_to(source, target, force_polygons=False):
"""
Expands the source geometries to the target geometries.
"""
geometries = get_geometries(source).make_valid()

if force_polygons:
geometries = make_valid_polygons(get_geometries(source))
else:
geometries = get_geometries(source).make_valid()

source_union = unary_union(geometries)

leftover_geometries = get_geometries(target).apply(lambda x: x - source_union)
leftover_geometries = leftover_geometries[~leftover_geometries.is_empty].explode(index_parts=False)

geometries = absorb_by_shared_perimeter(
leftover_geometries, get_geometries(source), relative_threshold=None
leftover_geometries, get_geometries(source), relative_threshold=None, force_polygons=force_polygons
)

return geometries
Expand Down Expand Up @@ -351,14 +401,17 @@ def split_by_level(series, multiindex):


@require_same_crs
def absorb_by_shared_perimeter(sources, targets, relative_threshold=None):
def absorb_by_shared_perimeter(sources, targets, relative_threshold=None, force_polygons=False):
if len(sources) == 0:
return targets

if len(targets) == 0:
raise IndexError("targets must be nonempty")

inters = intersections(sources, targets, area_cutoff=None).make_valid()
if force_polygons:
inters = make_valid_polygons(intersections(sources, targets, area_cutoff=None))
else:
inters = intersections(sources, targets, area_cutoff=None).make_valid()

assignment = assign_to_max(inters.length)

Expand Down
86 changes: 64 additions & 22 deletions maup/smart_repair.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@
#########


def smart_repair(geometries_df, snapped=True, fill_gaps=True, fill_gaps_threshold=0.1,
disconnection_threshold=0.0001, nest_within_regions=None,
min_rook_length=None):
def smart_repair(geometries_df, snapped=True, snap_precision=10, fill_gaps=True,
fill_gaps_threshold=0.1, disconnection_threshold=0.0001,
nest_within_regions=None, min_rook_length=None):
"""
Repairs topology issues (overlaps, gaps, invalid polygons) in a geopandas
GeoDataFrame or GeoSeries, with an emphasis on preserving intended adjacency
Expand All @@ -50,22 +50,25 @@ def smart_repair(geometries_df, snapped=True, fill_gaps=True, fill_gaps_threshol
Specifically, the algorithm
(1) Applies shapely.make_valid to all polygon geometries.
(2) If snapped = True (default), snaps all polygon vertices to a grid of size no
more than 10^(-10) times the max of width/height of the entire shapefile extent.
HIGHLY RECOMMENDED to avoid topological exceptions due to rounding errors.
more than 10^(-snap_precision) times the max of width/height of the entire
extent of the input. HIGHLY RECOMMENDED to avoid topological exceptions due to
rounding errors. Default value for snap_precision is 10; if topological
exceptions still occur, try reducing snap_precision (which must be integer-
valued) to 9 or 8.
(3) Resolves all overlaps.
(4) If fill_gaps = True (default), closes all simply connected gaps with area
less than fill_gaps_threshold times the largest area of all geometries adjoining
the gap. Default threshold is 10%; if fill_gaps_threshold = None then all
simply connected gaps will be filled.
(5) If nest_within_regions is a secondary shapefile of region boundaries (e.g.,
counties in a state) then all of the above will be performed so that repaired
geometries nest cleanly into the region boundaries; each repaired geometry
will be contained in the region with which the original geometry has the largest
area of intersection. Default value is None.
(5) If nest_within_regions is a secondary GeoDataFrame/GeoSeries of region boundaries
(e.g., counties in a state) then all of the above will be performed so that
repaired geometries nest cleanly into the region boundaries; each repaired
geometry will be contained in the region with which the original geometry has the
largest area of intersection. Default value is None.
(6) If min_rook_length is given a numerical value, replaces all rook adjacencies
with length below this value with queen adjacencies. Note that this is an
absolute value and not a relative value, so make sure that the value provided
is in the correct units with respect to the shapefile's CRS.
is in the correct units with respect to the input's CRS.
Default value is None.
(7) Sometimes the repair process creates tiny fragments that are disconnected from
the district that they are assigned to. A final cleanup step assigns any such
Expand Down Expand Up @@ -115,15 +118,18 @@ def smart_repair(geometries_df, snapped=True, fill_gaps=True, fill_gaps_threshol
if doctor(nest_within_regions, silent=True, accept_holes=True) is False:
raise Exception("nest_within_regions must be topologically clean---i.e., all geometries must be valid and there must be no overlaps between geometries. Generally the best source for region shapefiles is the U.S. Census Burueau.")

# Before doing anything else, make sure all polygons are valid and remove any
# LineStrings and MultiLineStrings.
# Before doing anything else, make sure all polygons are valid, convert any empty
# geometries to empty Polygons to avoid type errors, and remove any LineStrings and
# MultiLineStrings.
for i in geometries_df.index:
geometries_df["geometry"][i] = make_valid(geometries_df["geometry"][i])
if geometries_df["geometry"][i] is None:
geometries_df["geometry"][i] = Polygon()
if geometries_df["geometry"][i].geom_type == "GeometryCollection":
geometries_df["geometry"][i] = unary_union([x for x in geometries_df["geometry"][i].geoms if x.geom_type in ("Polygon", "MultiPolygon")])

# If snapped is True, snap all polygon vertices to a grid of size no more than
# 10^(-10) times the max of width/height of the entire shapefile extent.
# 10^(-10) times the max of width/height of the entire extent of the input.
# (For instance, in Texas this would be less than 1/100th of an inch.)
# This avoids a rare "non-noded intersection" error due to a GEOS bug and leaves
# several orders of magnitude for additional intersection operations before hitting
Expand All @@ -132,7 +138,7 @@ def smart_repair(geometries_df, snapped=True, fill_gaps=True, fill_gaps_threshol
# These bounds are in the form (xmin, ymin, xmax, ymax)
geometries_total_bounds = geometries_df.total_bounds
largest_bound = max(geometries_total_bounds[2] - geometries_total_bounds[0], geometries_total_bounds[3] - geometries_total_bounds[1])
snap_magnitude = int(math.log10(largest_bound)) - 10
snap_magnitude = int(math.log10(largest_bound)) - snap_precision
geometries_df["geometry"] = snap_to_grid(geometries_df["geometry"], n=snap_magnitude)
if nest_within_regions is not None:
regions_df["geometry"] = snap_to_grid(regions_df["geometry"], n=snap_magnitude)
Expand Down Expand Up @@ -320,10 +326,10 @@ def segments(curve):
def building_blocks(geometries_df, nest_within_regions=None):
"""
Partitions the extent of the input via all boundaries of all geometries
(and regions, if nest_within_regions is a shapefile of region boundaries);
associates to each polygon in the partition the set of polygons in the original
shapefile whose intersection created it, and organizes this data according to
order of the overlaps. (Order zero = hole)
(and regions, if nest_within_regions is a GeoDataFrame/GeoSeries of region
boundaries); associates to each polygon in the partition the set of polygons in the
original shapefile whose intersection created it, and organizes this data according
to order of the overlaps. (Order zero = hole)
"""
if isinstance(geometries_df, GeoDataFrame) is False:
raise TypeError("Primary input to building_blocks must be a GeoDataFrame.")
Expand Down Expand Up @@ -383,7 +389,7 @@ def building_blocks(geometries_df, nest_within_regions=None):
for i in progress(pieces_df.index, len(pieces_df.index)):
# If region boundaries are included, identify the region for each piece.
# Note that "None" is a possibility, and that each piece will belong to a unique
# region because the regions shapefile MUST be clean.
# region because the regions GeoDataFrame/GeoSeries MUST be clean.
if nest_within_regions is not None:
possible_region_integer_indices = [*set(numpy.ndarray.flatten(r_spatial_index.query(pieces_df["geometry"][i])))]
possible_region_indices = [r_index_by_iloc[k] for k in possible_region_integer_indices]
Expand Down Expand Up @@ -462,7 +468,7 @@ def building_blocks(geometries_df, nest_within_regions=None):

def reconstruct_from_overlap_tower(geometries_df, overlap_tower, nested=False):
"""
Rebuild the shapefile polygons with overlaps removed.
Rebuild the polygons in geometries_df with overlaps removed.
"""
# Keep a copy of the original input for comparisons later!
geometries0_df = geometries_df.copy()
Expand Down Expand Up @@ -1551,6 +1557,22 @@ def convexify_hole_boundaries(geometries_df, holes_df):
# boundary with each geometry by the shortest path within the hole
# between its endpoints and "filling in" the geometry up to the
# new boundary. (Exterior boundaries, if any, should be left alone.)

# In cases where there are 2 or more boundaries with the same target
# geometry, we'll need to check to see whether they end up adjacent
# after convexifying. If they do, their UNION isn't guaranteed to be
# convexified at the end, so we'll put that hole back in the queue for
# another round of processing.
if len(set(this_hole_boundaries_df["target"])) == len(this_hole_boundaries_df):
target_repetition = False
else:
target_repetition = True
repeated_targets = []
for target in set(this_hole_boundaries_df["target"]):
boundaries_this_target = this_hole_boundaries_df[this_hole_boundaries_df["target"] == target]
if len(boundaries_this_target) > 1:
repeated_targets.append((target, len(boundaries_this_target)))

new_hole_in_progress = this_hole
this_hole_triangulation = triangulate_polygon(new_hole_in_progress)

Expand Down Expand Up @@ -1582,7 +1604,27 @@ def convexify_hole_boundaries(geometries_df, holes_df):
new_hole = orient(new_hole)
new_hole_df = GeoDataFrame(geometry=GeoSeries([new_hole]), crs=holes_df.crs)
new_hole_df.insert(0, "region", this_region)
completed_holes_df = pandas.concat([completed_holes_df, new_hole_df]).reset_index(drop=True)

if target_repetition:
# Check to see whether the target that was repeated in the original
# hole boundaries has fewer (but not zero!) hole boundaries
# associated to it in this hole. If so, some distinct boundaries
# may have been concatenated after convexifying, resulting in a
# non-convex boundary - so put the hole back in the queue for
# another round of processing.
new_hole_boundaries_df = construct_hole_boundaries(geometries_df, new_hole_df)
reprocess_hole = False
for target in repeated_targets:
new_boundaries_this_target = new_hole_boundaries_df[new_hole_boundaries_df["target"] == target[0]]
if len(new_boundaries_this_target) > 0 and len(new_boundaries_this_target) < target[1]:
reprocess_hole = True
break
if reprocess_hole:
holes_to_process.append(new_hole)
else:
completed_holes_df = pandas.concat([completed_holes_df, new_hole_df]).reset_index(drop=True)
else:
completed_holes_df = pandas.concat([completed_holes_df, new_hole_df]).reset_index(drop=True)

pbar.update(pbar_increment)

Expand Down
Loading

0 comments on commit d71150f

Please sign in to comment.