forked from mhouser42/Invasive_Species_Propogation
-
Notifications
You must be signed in to change notification settings - Fork 0
/
illinois_network.py
294 lines (243 loc) · 11.7 KB
/
illinois_network.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
# Illinois_network.py
"""
### Authors:
##### Justin Tung: 'https://github.com/JayTongue'
##### Matt Adam-Houser: 'https://github.com/mhouser42'
This file is for the creation of county network
It constructs a network with nodes and edges
outputs three binary files: the NX network, a county handler, and a neighbor handler.
TODO: doctests for get_neighbor_handler, calc_toh_density_coef, add_tree_density?
"""
import time
from tqdm import tqdm
import pickle
import pandas as pd
import numpy as np
import networkx as nx
import osmnx as ox
from collections import Counter
from my_classes import County
def get_lower_and_upper_bounds(df: pd.DataFrame, col_name: str) -> tuple:
    """
    Compute the 1.5 * IQR outlier fences of one dataframe column.

    :param df: dataframe holding the column
    :param col_name: name of the column whose fences are wanted
    :return: (lower, upper), i.e. (Q1 - 1.5 * IQR, Q3 + 1.5 * IQR)

    >>> frame = pd.DataFrame({'A': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]})
    >>> low, high = get_lower_and_upper_bounds(frame, 'A')
    >>> float(low), float(high)
    (-3.5, 14.5)
    >>> get_lower_and_upper_bounds(pd.DataFrame(), 'A')  # a missing column raises
    Traceback (most recent call last):
    ...
    KeyError: 'A'
    """
    column = df[col_name]
    # both quartiles in a single percentile call
    first_quartile, third_quartile = np.percentile(column, [25, 75])
    whisker = 1.5 * (third_quartile - first_quartile)
    return first_quartile - whisker, third_quartile + whisker
def construct_nodes(CG: nx.Graph, df: pd.DataFrame) -> dict:
    """
    Build one County node per dataframe row and add it to the graph.
    Every row is geocoded through OSMnx using the query '<name> County, IL'; the 'lat', 'lon' and
    'geometry' fields of the first geocoding result are stored on the node together with the
    geometry's centroid and the row's 'pop' and 'PD_sqmi' values.

    :param CG: graph that receives the nodes
    :param df: county table with 'name', 'pop' and 'PD_sqmi' columns
    :return: handler dictionary mapping each county name to its County node

    >>> CG = nx.Graph()
    >>> data = {'name': ['Cook', 'Pope'], 'pop': [10000, 20000], 'PD_sqmi': [50, 60]}
    >>> counties_df = pd.DataFrame(data)
    >>> construct_nodes(CG, counties_df) # doctest: +ELLIPSIS
    {'Cook': <my_classes.County object at ...>, 'Pope': <my_classes.County object at ...>}
    >>> data = {'name': ['Cook', 'Pope'], 'pop': [10000, 20000], 'PD_sqmi': [50]}
    >>> counties_df = pd.DataFrame(data)
    Traceback (most recent call last):
    ...
    ValueError: All arrays must be of the same length
    """
    suffix = ' County'
    n_rows = len(df)
    handler = {}
    pbar = tqdm(df.iterrows(), desc='Assembling Nodes')
    for processed, (_, row) in enumerate(pbar, start=1):
        name, pop, pop_dense = row['name'], row['pop'], row['PD_sqmi']
        pbar.set_postfix_str(f'Working on {name}{suffix}, IL')
        # only the first geocoding match is used
        place = ox.geocode_to_gdf(f'{name}{suffix}, IL').iloc[0]
        shape = place['geometry']
        node = County(name, lat=place['lat'], lon=place['lon'], geometry=shape,
                      pop=pop, popdense_sqmi=pop_dense, centroid=shape.centroid)
        CG.add_node(node)
        handler[name] = node
        if processed >= n_rows:
            # last row: flag completion, then pause briefly (presumably so the message stays visible)
            pbar.set_postfix_str('Done!')
            time.sleep(.5)
    return handler
def construct_edges(CG: nx.Graph, edge_df: pd.DataFrame, handler: dict):
    """
    Add the edges listed in a dataframe to a graph whose nodes already exist.
    Columns are read by position: source name, target name, relation, weight. Each edge stores the
    attributes 'weight' and 'rel'. A row whose source or target is not a key of the handler is
    reported with a printed message and skipped; the remaining rows are still added.

    :param CG: graph that receives the edges
    :param edge_df: edge table whose first four columns are source, target, relation and weight
    :param handler: dictionary mapping node names to node objects

    >>> CG = nx.Graph()
    >>> handler = {'A': object(), 'B': object(), 'C': object()}
    >>> data = {'Source': ['A', 'B'], 'Target': ['B', 'C'], 'Relation': ['adjacent', 'interstate'], 'Weight': [5, 8]}
    >>> edges_df = pd.DataFrame(data)
    >>> construct_edges(CG, edges_df, handler)  # every endpoint is known, so nothing is printed
    >>> CG = nx.Graph()
    >>> handler = {'A': object(), 'B': object()}
    >>> construct_edges(CG, edges_df, handler)
    Key error: 'C': This node doesn't exist in the handler
    """
    collected = []
    for _, record in edge_df.iterrows():
        source, target, relation, weight = (record.iloc[position] for position in range(4))
        try:
            endpoints = handler[source], handler[target]
        except KeyError as missing:
            print(f'Key error: {missing}: This node doesn\'t exist in the handler')
            continue
        collected.append((*endpoints, {'weight': weight, 'rel': relation}))
    CG.add_edges_from(collected)
def get_neighbor_handler(CG: nx.Graph, handler: dict) -> dict:
    """
    Build a lookup from each county name to the handler entries of its neighbors.
    Neighbors come from each county's get_neighbor_objects(CG) (presumably the nodes adjacent to it
    in the graph -- defined in my_classes); every neighbor is resolved back through the handler by
    its name, so the lists hold the handler's own objects.

    :param CG: the graph of the county network
    :param handler: the graph handler, with county names for keys and the counties themselves for values
    :return: dictionary with county names for keys and a list of neighboring county nodes for values
    """
    return {
        county_name: [handler[adjacent.name] for adjacent in county.get_neighbor_objects(CG)]
        for county_name, county in handler.items()
    }
def get_toh_totals_by_county(df: pd.DataFrame, handler: dict) -> (dict, Counter):
    """
    Sum the infestation index and count the sightings of tree of heaven for every county.
    Both results start with a 0 entry for each key of the handler, so counties without any
    sighting are still present.

    :param df: sightings table, one row per sighting, with 'county' and 'infest_index' columns
    :param handler: graph handler with county names for keys and the counties themselves as values
    :return: (county_tots, county_counts) -- a dict of summed 'infest_index' per county and a
        Counter holding the number of sighting rows per county
    :raises KeyError: if a row names a county that is not a key of the handler

    >>> handler = {'Cole': object(), 'LaSalle': object()}
    >>> data = {'county': ['Cole', 'LaSalle', 'Cole'], 'infest_index': [10, 5, 8]}
    >>> df = pd.DataFrame(data)
    >>> county_tots, county_counts = get_toh_totals_by_county(df, handler)
    >>> county_tots['Cole']  # 10 + 8
    18
    >>> county_counts['LaSalle']  # a single sighting row
    1
    >>> county_counts['Wabash']  # a Counter answers 0 for names it has never seen
    0
    """
    county_tots = dict.fromkeys(handler, 0)
    county_counts = Counter(county_tots)
    if df.empty:
        # nothing to tally; also avoids column lookups on a frame without columns
        return county_tots, county_counts
    # walk the two needed columns directly instead of materialising a Series per row
    for county, infest_index in zip(df['county'], df['infest_index']):
        county_tots[county] += infest_index  # unknown county -> KeyError
        county_counts[county] += 1
    return county_tots, county_counts
def calc_toh_density_coef(df: pd.DataFrame, handler: dict, county_tots: dict, county_counts: dict):
    """
    Set the relative tree of heaven density (node.toh_density) on every county in the handler.
    A county's average infestation (total / sightings) is clamped to the range of the
    outlier-clipped 'infest_index' values and divided by the clipped maximum, then rounded to two
    decimals. Outliers are clipped to the 1.5 * IQR fences, so counties above the upper fence
    cap at 1.0.

    The clipping is applied to the dataframe passed in. Earlier code clipped a separate frame
    re-read from 'data/tree/Il_toh.csv' and then took max/min from the unclipped df, so the
    clipping had no effect.

    :param df: dataframe of tree of heaven data with an 'infest_index' column (not modified)
    :param handler: graph handler with county names as keys and the counties themselves as values
    :param county_tots: total saturation index by county
    :param county_counts: total sightings by county
    """
    infest = df['infest_index']
    if infest.empty:
        # no sightings at all: nothing to scale against, so every county gets density 0
        for node in handler.values():
            node.toh_density = 0
        return

    lower, upper = get_lower_and_upper_bounds(df, 'infest_index')
    clipped = infest.clip(lower, upper)  # returns a new Series; the caller's frame is untouched
    max_infest = float(clipped.max())
    min_infest = float(clipped.min())

    for name, node in handler.items():
        count = county_counts[name]
        avg_infest = county_tots[name] / count if count != 0 else 0
        # NOTE(review): a county with no sightings is lifted to min_infest by this clamp, so its
        # density is min/max rather than 0 -- unchanged from the earlier behavior; confirm intent.
        avg_infest = max(min(avg_infest, max_infest), min_infest)
        node.toh_density = round(avg_infest / max_infest, 2) if max_infest > 0 else 0
def add_tree_density(handler: dict):
    """
    Assign a regular tree density (0.6, 0.4 or 0.2) to every county from its centroid position.
    The centroids of Peoria, Hardin and Clark counties act as the dividing lines:
      * 0.6 -- centroid.y at or below Hardin's, or at or below Peoria's in both y and x
      * 0.4 -- otherwise, centroid.y at or below Clark's
      * 0.2 -- everything else
    REFERENCE: https://www.fs.usda.gov/nrs/pubs/rb/rb_nrs113.pdf, pages 5-6

    :param handler: graph handler with county names for keys and the counties themselves as values;
        must contain the keys 'Peoria', 'Hardin' and 'Clark'
    """
    peoria_pt = handler['Peoria'].centroid
    hardin_pt = handler['Hardin'].centroid
    clark_pt = handler['Clark'].centroid

    for county in handler.values():
        spot = county.centroid
        if spot.y <= hardin_pt.y or (spot.y <= peoria_pt.y and spot.x <= peoria_pt.x):
            density = 0.6
        elif spot.y <= clark_pt.y:
            density = 0.4
        else:
            density = 0.2
        county.tree_density = density
def set_up(path: str) -> (pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame):
    """
    Read the preprocessed csv files into pandas dataframes.
    Three files come from the given folder: counties.csv (nodes), county_edges.csv (edges) and
    fast_county_edges.csv (edges with lower weights on highways, which increases spread).
    The tree of heaven table is always read from 'data/tree/Il_toh.csv', independent of path.

    :param path: folder the county and edge csvs are stored in.
    :return: county, edge, fast-edge and tree of heaven dataframes, in that order.
    """
    location_files = ('counties.csv', 'county_edges.csv', 'fast_county_edges.csv')
    county_df, edge_df, f_edge_df = (pd.read_csv(f'{path}/{file_name}') for file_name in location_files)
    toh_df = pd.read_csv('data/tree/Il_toh.csv')
    return county_df, edge_df, f_edge_df, toh_df
def construct_graph_and_handlers(CG: nx.Graph, county_df: pd.DataFrame, edge_df: pd.DataFrame, toh_df: pd.DataFrame) \
        -> (nx.Graph, dict, dict):
    """
    Populate a NetworkX graph with Illinois counties and build the two lookup handlers.
    Steps, in order: create the county nodes, connect them with the given edges, set the Tree of
    Heaven density and the regular tree density on every node, then collect each county's neighbors.

    :param CG: NetworkX Graph to be added to
    :param county_df: dataframe containing counties and geographic/population information
    :param edge_df: dataframe containing adjacency and interstate connections between counties
    :param toh_df: dataframe of up-to-date ToH sightings by county.
    :return: the populated graph, the handler for counties, the handler for neighboring counties of each county
    """
    # nodes first: the edge builder looks counties up in the handler returned here
    counties = construct_nodes(CG, county_df)
    construct_edges(CG, edge_df, counties)

    # per-county Tree of Heaven totals feed the relative density stored on each node
    totals, sightings = get_toh_totals_by_county(toh_df, counties)
    calc_toh_density_coef(toh_df, counties, totals, sightings)
    add_tree_density(counties)

    # the neighbor lookup is built last, once every edge is in the graph
    return CG, counties, get_neighbor_handler(CG, counties)
def dump_graph_and_handler(CG: 'nx.Graph', county_handler: dict, neighbor_handler: dict, prefix=None, path=None):
    """
    Pickles Illinois network graph, county handler, and neighbor handler.
    Writes '<path>/<prefix>IL_graph.dat', '<path>/<prefix>graph_handler_counties.dat' and
    '<path>/<prefix>graph_handler_neighbors.dat'. Each file is closed as soon as it is written.

    :param CG: NetworkX graph
    :param county_handler: handler for nodes of NetworkX graph
    :param neighbor_handler: handler for nodes and nodes connected to them.
    :param prefix: specify to pickle different versions of graphs/handlers
    :param path: output folder. When omitted, the module-level `path` variable is used if it exists
        (it is set when this file runs as a script); otherwise 'data/location'.
    """
    prefix = '' if prefix is None else prefix
    if path is None:
        # The folder used to be read from a module global that only exists under __main__, which
        # raised NameError when this function was imported. Keep honouring that global when present.
        path = globals().get('path', 'data/location')

    outputs = (
        ('IL_graph.dat', CG),
        ('graph_handler_counties.dat', county_handler),
        ('graph_handler_neighbors.dat', neighbor_handler),
    )
    for file_name, payload in outputs:
        with open(f'{path}/{prefix}{file_name}', 'wb') as out_file:
            pickle.dump(payload, out_file)
if __name__ == '__main__':
    path = 'data/location'  # module-level on purpose: dump_graph_and_handler reads this global
    county_df, edge_df, f_edge_df, toh_df = set_up(path)

    # (edge table, pickle prefix) for the regular network and the fast-highway network
    variants = [(edge_df, None), (f_edge_df, 'fast_')]

    # build both networks before anything is written, each on its own fresh graph
    built = [construct_graph_and_handlers(nx.Graph(), county_df, edges, toh_df) for edges, _ in variants]

    # pickling
    for (_, prefix), (graph, counties, neighbors) in zip(variants, built):
        dump_graph_and_handler(graph, counties, neighbors, prefix=prefix)