Skip to content

Commit

Permalink
add ability to visualize districts in community map
Browse files Browse the repository at this point in the history
  • Loading branch information
pbnjam-es committed Feb 22, 2023
1 parent 07c76d8 commit 3081be9
Show file tree
Hide file tree
Showing 5 changed files with 116 additions and 62 deletions.
1 change: 1 addition & 0 deletions rba/__main__.py
Original file line number Diff line number Diff line change
Expand Up @@ -48,6 +48,7 @@
draw_parser.add_argument("--graph_file", type=str, default=os.path.join(package_dir, "data/2010/new_hampshire_geodata_merged.json"))
draw_parser.add_argument("--edge_lifetime_file", type=str, default=None)
draw_parser.add_argument("--num_frames", type=int, default=50)
draw_parser.add_argument("--partition_file", type=str, default=None)
draw_parser.set_defaults(func=rba.visualization.visualize)

optimize_parser = subparsers.add_parser("optimize")
Expand Down
54 changes: 11 additions & 43 deletions rba/district_quantification.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,51 +10,16 @@

from pyproj import CRS

from .util import load_districts

def load_districts(graph_file, district_file, verbose=False):
def quantify_gerrymandering(state_graph, districts, community_lifespan, verbose=False):
"""
Given a path to the district boundaries of a state, creates a list of districts and their composition.
"""
district_boundaries = gpd.read_file(district_file)
cc = CRS('esri:102008')
district_boundaries = district_boundaries.to_crs(cc)
if "GEOID10" in district_boundaries.columns:
district_boundaries["GEOID10"].type = str
district_boundaries.set_index("GEOID10", inplace=True)
else:
district_boundaries["GEOID20"].type = str
district_boundaries.set_index("GEOID20", inplace=True)
with open(graph_file, "r") as f:
graph_json = json.load(f)
graph = nx.readwrite.json_graph.adjacency_graph(graph_json)
geodata_dict = {}
for node, data in graph.nodes(data=True):
data["geoid"] = node
data["geometry"] = shapely.geometry.shape(data["geometry"])
geodata_dict[node] = data
geodata_dataframe = pd.DataFrame.from_dict(geodata_dict, orient='index')
geodata = gpd.GeoDataFrame(geodata_dataframe, geometry=geodata_dataframe.geometry, crs='esri:102008')
district_assignment = maup.assign(geodata, district_boundaries)
district_assignment = district_assignment.astype(str)
district_assignment = district_assignment.str.split('.').str[0]
district_assignment.to_csv("district_assignment.csv")
districts = {}
for i, district in district_assignment.iteritems():
if district in districts:
districts[district].append(i)
else:
districts[district] = [i]
district_graphs = {district : graph.subgraph(districts[district]).copy() for district in districts}
return graph, district_graphs

def quantify_gerrymandering(state_graph, district_graphs, community_lifespan, verbose=False):
"""
Given a dictionary of district graphs/a state graph as well as dictionary of community boundary lifespan, calculates
Given a dictionary of districts to node lists/a state graph as well as dictionary of community boundary lifespan, calculates
gerrymandering scores for each district and the state.
"""
# Two lookups
crossdistrict_edges = {district : [] for district in district_graphs}
for district, graph in district_graphs.items():
crossdistrict_edges = {district : [] for district in districts}
for district, graph in districts.items():
# state_graph.remove_edge(edge[0], edge[1])
# state_graph.remove_edges_from(graph.edges)
for node in graph:
Expand All @@ -68,7 +33,7 @@ def quantify_gerrymandering(state_graph, district_graphs, community_lifespan, ve
state_gerrymandering = 0
district_gerrymanderings = {}
num_crossedges = sum([len(edge_list) for edge_list in crossdistrict_edges.values()])
for district, district_graph in district_graphs.items():
for district, node_list in districts.items():
district_gerrymandering = 0
# for edge in district_graph.edges():
# try:
Expand Down Expand Up @@ -96,7 +61,10 @@ def quantify_districts(graph_file, district_file, community_file, verbose=False)
"""
Wraps both functions into a single function for direct use from main.py
"""
state_graph, district_graphs = load_districts(graph_file, district_file)
with open(graph_file, "r") as f:
graph_json = json.load(f)
graph = nx.readwrite.json_graph.adjacency_graph(graph_json)
districts = load_districts(graph, district_file)

with open(community_file, "r") as f:
supercommunity_output = json.load(f) # Contains strings as keys.
Expand All @@ -107,6 +75,6 @@ def quantify_districts(graph_file, district_file, community_file, verbose=False)
v = edge.split(",")[1][2:-2]
community_lifespan[(u, v)] = lifetime

district_gerrymanderings, state_gerrymandering = quantify_gerrymandering(state_graph, district_graphs, community_lifespan)
district_gerrymanderings, state_gerrymandering = quantify_gerrymandering(graph, districts, community_lifespan)
print(district_gerrymanderings, state_gerrymandering)
return district_gerrymanderings, state_gerrymandering
27 changes: 21 additions & 6 deletions rba/scripts/serialize.py
Original file line number Diff line number Diff line change
Expand Up @@ -71,10 +71,12 @@ def merge_graphs():
"""
for year in ["2010", "2020"]:
for file in os.listdir(final_dir+"/"+year):
if "merged" in file or "simplified" in file:
if "merged" in file or "simplified" in file or ".7z" in file:
continue
elif os.path.isfile(os.path.join(final_dir+"/"+year, file[:file.find(".")] + "_merged.json")):
continue
if file == "california_block_geodata.json" and year == "2010":
continue
print(year, file)
corresponding_file = file[:file.find(".")] + ".json"
full_json_path = os.path.join(final_dir+"/"+year, corresponding_file)
Expand All @@ -86,6 +88,9 @@ def merge_graphs():
graph = nx.readwrite.adjacency_graph(data)
print(f"Total number of nodes: {len(graph.nodes)}")
merged_graph = merge_empty(graph)
if merged_graph == None:
print(file, year, "FAILED!")
continue
merged_data = nx.readwrite.adjacency_data(merged_graph)
# for id, data in merged_graph.nodes(data=True):
# print(data)
Expand All @@ -98,8 +103,14 @@ def merge_graphs():
json.dump(merged_data, f)
# data_str = json.dumps(merged_data["nodes"][0])
# f.write(data_str)
# if os.path.isfile(os.path.join(final_dir+"/"+year, file[:file.find(".")] + ".7z")):
if os.path.isfile(os.path.join(final_dir+"/"+year, file[:file.find(".")] + ".7z")):
# os.path.join(final_dir+"/"+year, file[:file.find(".")] + ".7z")
subprocess.call(["7z", "a", os.path.join(final_dir+"/"+year, file[:file.find(".")] + ".7z"), full_json_path])
# os.remove(full_json_path)
del data
del graph
del merged_data
del merged_graph

def split_multipolygons(geodata, assignment=None, block_data=None):
"""
Expand Down Expand Up @@ -558,7 +569,11 @@ def merge_empty(graph):
for other_node in graph.neighbors(node):
bordering.add(other_node)
bordering = bordering.difference(set(group))
substituted_node = list(bordering)[0]
# print(bordering, len(group))
try:
substituted_node = list(bordering)[0]
except IndexError:
return None
geometry = [shapely.geometry.shape(graph.nodes[node]["geometry"]) for node in group]
geometry.append(shapely.geometry.shape(graph.nodes[substituted_node]["geometry"]))
geometry_union = shapely.ops.unary_union(geometry)
Expand Down Expand Up @@ -987,7 +1002,7 @@ def serialize(year, state, checkpoint="beginning"):

with open(final_dir + f"/{year}/{state}_geodata.json", "w") as f:
json.dump(data, f)
with open(final_dir + f"/{year}/{state}_block_geodata_help.json", "w") as f:
with open(final_dir + f"/{year}/{state}_block_geodata.json", "w") as f:
json.dump(block_data, f)

# Create a version with merged precincts/blocks under a certain threshold
Expand Down Expand Up @@ -1040,8 +1055,8 @@ def serialize_all():

if __name__ == "__main__":
# compress_all_data("final")
# merge_graphs()
merge_graphs()
# serialize_all()
serialize(2010, "north_carolina", checkpoint="beginning")
# serialize(2010, "north_carolina", checkpoint="graph")
# serialize(2010, "north_dakota", checkpoint="geometry")
# serialize(2020, "missouri", checkpoint="graph")
46 changes: 44 additions & 2 deletions rba/util.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,10 +3,17 @@

import random

import json
import networkx as nx
import geopandas as gpd
import shapely
import pandas as pd
import maup
from pyproj import CRS


from . import constants
from .district_quantification import quantify_gerrymandering
# from .district_quantification import quantify_gerrymandering


def copy_adjacency(graph):
Expand Down Expand Up @@ -61,4 +68,39 @@ def get_county_weighted_random_spanning_tree(graph):
spanning_tree = nx.tree.maximum_spanning_tree(
graph, algorithm="kruskal", weight="random_weight"
)
return spanning_tree
return spanning_tree

def load_districts(graph, district_file, verbose=False):
    """
    Assign each node of `graph` to a congressional district by spatial overlap.

    Parameters
    ----------
    graph : networkx.Graph
        Adjacency graph whose nodes carry a GeoJSON-style "geometry" entry
        (assumed to be in ESRI:102008 coordinates — TODO confirm with callers).
    district_file : str
        Path to a boundary file readable by geopandas, with a "GEOID10" or
        "GEOID20" identifier column.
    verbose : bool, optional
        Currently unused; kept for interface compatibility.

    Returns
    -------
    dict
        Maps district id (str) to the list of graph node ids assigned to it.

    Side effects
    ------------
    Mutates each node's data dict (adds "geoid", converts "geometry" to a
    shapely object) and writes "district_assignment.csv" to the working
    directory.
    """
    district_boundaries = gpd.read_file(district_file)
    # Reproject the boundaries into the same CRS as the node geometry.
    crs = CRS('esri:102008')
    district_boundaries = district_boundaries.to_crs(crs)
    # BUGFIX: the previous `series.type = str` assignment did not convert the
    # column dtype (it only set a stray attribute); astype() is the real cast.
    geoid_column = "GEOID10" if "GEOID10" in district_boundaries.columns else "GEOID20"
    district_boundaries[geoid_column] = district_boundaries[geoid_column].astype(str)
    district_boundaries.set_index(geoid_column, inplace=True)

    # Build a GeoDataFrame from the graph's node geometries so maup can
    # spatially assign each node to a district polygon.
    geodata_dict = {}
    for node, data in graph.nodes(data=True):
        data["geoid"] = node
        data["geometry"] = shapely.geometry.shape(data["geometry"])
        geodata_dict[node] = data
    geodata_dataframe = pd.DataFrame.from_dict(geodata_dict, orient='index')
    geodata = gpd.GeoDataFrame(geodata_dataframe, geometry=geodata_dataframe.geometry, crs='esri:102008')
    district_assignment = maup.assign(geodata, district_boundaries)
    # maup can produce float-like labels (e.g. "12.0"); keep the integer part.
    district_assignment = district_assignment.astype(str)
    district_assignment = district_assignment.str.split('.').str[0]
    district_assignment.to_csv("district_assignment.csv")
    # Invert node -> district into district -> [nodes].
    # NOTE: Series.iteritems() was removed in pandas 2.0; items() is the
    # supported equivalent.
    districts = {}
    for node, district in district_assignment.items():
        districts.setdefault(district, []).append(node)
    return districts
50 changes: 39 additions & 11 deletions rba/visualization.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import shapely.ops
import numpy as np
import random
import gerrychain

from . import community_generation
from . import util
Expand Down Expand Up @@ -119,7 +120,7 @@ def visualize_partition_geopandas(partition, *args, **kwargs):


def visualize_map(graph, output_fpath, node_coords, edge_coords, node_colors=None, edge_colors=None,
edge_widths=None, node_list=None, additional_polygons=None, text=None, show=False):
edge_widths=None, partition=None, node_list=None, additional_polygons=None, text=None, show=False):
"""Creates an image of a map and saves it to a file.
Parameters
Expand Down Expand Up @@ -178,17 +179,21 @@ def visualize_map(graph, output_fpath, node_coords, edge_coords, node_colors=Non
edge_width_values.append(edge_widths((u, v)))
edge_color_values.append(edge_colors((u, v)))
except KeyError:
# print("THINGS BEING DROPPED", u, v)
continue
num_edge_line_strings = len(edge_color_values) - 1
split_nodes_num = 0
for u in node_list:
u_coords = node_coords(u)
# print(shapely.geometry.mapping(u_coords))
# Could be multiline string if the coordinates are a multipolygon
if isinstance(u_coords.boundary, shapely.geometry.MultiLineString):
for ls in u_coords.boundary.geoms:
all_flattened_coords += ls.coords
node_end_indices.append(len(all_flattened_coords) - 1)
node_color_values.append(node_colors(u))
node_end_indices.append(len(all_flattened_coords) - 1)
node_color_values.append(node_colors(u))
split_nodes_num += 1
split_nodes_num -= 1
else:
all_flattened_coords += list(u_coords.boundary.coords)
node_end_indices.append(len(all_flattened_coords) - 1)
Expand All @@ -211,7 +216,7 @@ def visualize_map(graph, output_fpath, node_coords, edge_coords, node_colors=Non
all_flattened_coords = modify_coords(all_flattened_coords, IMAGE_DIMS)

# Fill in colors (in a lower layer than borders)
for i in range(len(node_list)):
for i in range(len(node_list)+split_nodes_num):
if i == 0:
# Because each edge line string corresponds to two points in the all flattened coords
start_index = num_edge_line_strings*2 + 2
Expand Down Expand Up @@ -240,7 +245,7 @@ def visualize_map(graph, output_fpath, node_coords, edge_coords, node_colors=Non
map_image.show()


def visualize_community_generation(edge_lifetime_fpath, output_fpath, graph, num_frames):
def visualize_community_generation(edge_lifetime_fpath, output_fpath, graph, num_frames, partition=None):
"""Writes frames for an animated visual of the community generation algorithm to a folder.
The animation depicts borders between communities as black and borders between precints as gray.
It also uses edge width as an indicator of similarity, and color as an indicator of
Expand Down Expand Up @@ -307,22 +312,27 @@ def visualize_community_generation(edge_lifetime_fpath, output_fpath, graph, num
print("Getting edge coordinates... ", end="")
sys.stdout.flush()
edge_coords = {}
i = 0
for u, v in graph.edges:
intersection = node_coords[u].intersection(node_coords[v])
if not intersection.is_empty:
if isinstance(intersection, shapely.geometry.LineString):
edge_coords[frozenset((u, v))] = [intersection.coords]
# if i == 0:
# i += 1
elif isinstance(intersection, shapely.geometry.MultiLineString):
edge_coords[frozenset((u, v))] = [ls.coords for ls in intersection.geoms]
# print([ls.coords for ls in intersection.geoms], "intersection coords")
elif isinstance(intersection, shapely.geometry.collection.GeometryCollection):
edge_coords[frozenset((u,v))] = [ls.coords for ls in intersection.geoms if isinstance(ls, shapely.geometry.LineString)]
else:
raise ValueError(f"Intersection between {u} and {v} is not a LineString or"
+ f" MultiLineString. It is of type {type(intersection)}")
# else:
# This is an edge between an island and another precinct, so just connect their centers
# print([node_coords[u].centroid.coords, node_coords[v].centroid.coords])
# edge_coords[frozenset((u,v))] = [node_coords[u].centroid.coords, node_coords[v].centroid.coords]
# print([shapely.geometry.mapping(node_coords[u].centroid), shapely.geometry.mapping(node_coords[v].centroid)])
# unwrapped = [shapely.geometry.mapping(node_coords[u].centroid)["coordinates"], shapely.geometry.mapping(node_coords[v].centroid)["coordinates"]]
# edge_coords[frozenset((u,v))] = [shapely.geometry.LineString(unwrapped).coords]
# raise ValueError(f"Overlap not found between {u} and {v}")
print("Done!")

Expand All @@ -336,6 +346,7 @@ def visualize_community_generation(edge_lifetime_fpath, output_fpath, graph, num
overall_border = shapely.ops.unary_union([node_coords[u] for u in graph.nodes])
print("Done!")


living_edges = set(frozenset(e) for e in graph.edges)
unrendered_contractions = [tuple(c) for c in supercommunity_output["contractions"]] # Not a set because order must be preserved.
community_graph = util.copy_adjacency(graph)
Expand All @@ -344,6 +355,17 @@ def visualize_community_generation(edge_lifetime_fpath, output_fpath, graph, num
for node in community_graph.nodes:
community_graph.nodes[node]["constituent_nodes"] = {node}

if isinstance(partition, gerrychain.Partition):
assignment = partition.assignment
for node in graph.nodes:
graph.nodes[node]["partition"] = assignment[node]
elif isinstance(partition, str):
districts = util.load_districts(graph, partition)
for district, node_list in districts.items():
for node in node_list:
graph.nodes[node]["partition"] = district
print("Partition data integrated!")

# Replay supercommunity algorithm
for f in range(1, num_frames + 1):
print(f"\rOn frame {f}/{num_frames}", end="")
Expand All @@ -352,9 +374,15 @@ def visualize_community_generation(edge_lifetime_fpath, output_fpath, graph, num
edge_colors = {}
for u, v in living_edges:
if edge_lifetimes[frozenset((u, v))] < t:
edge_colors[frozenset((u, v))] = (106, 106, 106)
if graph.nodes[u]["partition"] != graph.nodes[v]["partition"]:
edge_colors[frozenset((u, v))] = (156, 156, 255)
else:
edge_colors[frozenset((u, v))] = (106, 106, 106)
else:
edge_colors[frozenset((u, v))] = (0, 0, 0)
if graph.nodes[u]["partition"] != graph.nodes[v]["partition"]:
edge_colors[frozenset((u, v))] = (50, 50, 150)
else:
edge_colors[frozenset((u, v))] = (0, 0, 0)

this_iter_contractions = set()
for c1, c2, time in unrendered_contractions:
Expand Down Expand Up @@ -498,7 +526,7 @@ def visualize_graph(graph, output_path, coords, colors=None, edge_colors=None, n
graph_image.show()


def visualize(output_file, graph_file, edge_lifetime_file, num_frames, verbose):
def visualize(output_file, graph_file, edge_lifetime_file, num_frames, partition_file, verbose):
"""General visualization function (figures out what to do based on inputs).
TODO: right now this only works for supercommunity animations.
Expand All @@ -511,7 +539,7 @@ def visualize(output_file, graph_file, edge_lifetime_file, num_frames, verbose):
geodata = nx.readwrite.json_graph.adjacency_graph(data)
community_generation.compute_precinct_similarities(geodata)

visualize_community_generation(edge_lifetime_file, output_file, geodata, num_frames)
visualize_community_generation(edge_lifetime_file, output_file, geodata, num_frames, partition_file)

# def graph_edge_lifetime_distribution(edge_lifetime_path):
# with open(edge_lifetime_path, "r") as f:
Expand Down

0 comments on commit 3081be9

Please sign in to comment.