This repository has been archived by the owner on Apr 1, 2024. It is now read-only.
forked from NREL/celavi
-
Notifications
You must be signed in to change notification settings - Fork 0
/
celavi_input_checks.py
432 lines (353 loc) · 19.6 KB
/
celavi_input_checks.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
import pandas as pd
import numpy as np
import os.path
import argparse
class FileChecks:
"""
FileChecks checks the integrity of files used as input to CELAVI.
See the docstrings for each individual method for explanations.
If all the input checks pass, these methods run to completion and
nothing is returned. However, if there are errors found, each
respective method will raise an exception describing the error.
These exceptions shouldn't be caught; rather, they should term-
inate the program and show the exception. The exceptions are
created with descriptive messages before they are raised.
"""
def __init__(self, locations, step_costs, fac_edges, routes, transpo_edges):
"""
Initializes the file integrity checks by copying the filenames
to instance variables.
Parameters
----------
locations: str
Path to the locations file
step_costs: str
Path to the step_costs file
fac_edges: str
Path to the facility edges file
routes: str
Path to the routes file
transpo_edges: str
Path to the transportation edges file
"""
self.locations_filename = locations
self.step_costs_filename = step_costs
self.fac_edges_filename = fac_edges
self.routes_filename = routes
self.transpo_edges_filename = transpo_edges
self.locations: pd.DataFrame = None
self.step_costs: pd.DataFrame = None
self.fac_edges: pd.DataFrame = None
self.routes: pd.DataFrame = None
self.transpo: pd.DataFrame = None
def check_files_exist(self) -> bool:
"""
This method checks to see if files exist before trying
to open them.
Returns
-------
bool
True if all the files exist
Raises
------
Exception
Raises an exception if one of the files does not exist
"""
if not os.path.isfile(self.locations_filename):
raise Exception(f'Locations file {self.locations_filename} does not exist.')
if not os.path.isfile(self.step_costs_filename):
raise Exception(f'Step costs file {self.step_costs_filename} does not exist.')
if not os.path.isfile(self.fac_edges_filename):
raise Exception(f'Facility edges file {self.fac_edges_filename} does not exist.')
if not os.path.isfile(self.routes_filename):
raise Exception(f'Routes file {self.routes_filename} does not exist.')
if not os.path.isfile(self.transpo_edges_filename):
raise Exception(f'Transportation edges file {self.transpo_edges_filename} does not exist.')
return True
def open_files(self) -> None:
"""
This method attempts to open each file as a .csv and load it
into pandas.
Raises
------
Exception
Raises an exception if any of the files fail to open as .csv
files.
"""
try:
self.locations = pd.read_csv(self.locations_filename)
except (pd.EmptyDataError, FileNotFoundError):
raise Exception(f'{self.locations_filename} failed to read as a .csv')
try:
self.step_costs = pd.read_csv(self.step_costs_filename)
except (pd.EmptyDataError, FileNotFoundError):
raise Exception(f'{self.step_costs_filename} failed to read as a .csv')
try:
self.fac_edges = pd.read_csv(self.fac_edges_filename)
except (pd.EmptyDataError, FileNotFoundError):
raise Exception(f'{self.fac_edges_filename} failed to read as a .csv')
try:
self.routes = pd.read_csv(self.routes_filename)
except (pd.EmptyDataError, FileNotFoundError):
raise Exception(f'{self.routes_filename} failed to read as a .csv')
try:
self.transpo_edges = pd.read_csv(self.transpo_edges_filename)
except (pd.EmptyDataError, FileNotFoundError):
raise Exception(f'{self.transpo_edges_filename} failed to read as a .csv')
def check_facility_id_nulls(self):
"""
This method checks the facility_id columns across the tables
Raises
------
Exception
Raises an exception if any null values are found.
"""
if self.locations['facility_id'].isnull().values.any():
raise Exception(f'facility_id in locations has an empty value.')
if self.step_costs['facility_id'].isnull().values.any():
raise Exception(f'destination_facility_id in step costs has an empty value.')
if self.routes['source_facility_id'].isnull().values.any():
raise Exception(f'source_facility_id in routes has an empty value.')
if self.routes['destination_facility_id'].isnull().values.any():
raise Exception(f'destination_facility_id in routes has an empty value.')
def check_facility_type_nulls(self):
"""
This tests for any null values in facility_type columns
Raises
------
Exception
Raises an exception if any values are null
"""
if self.locations['facility_type'].isnull().values.any():
raise Exception('facility_type in locations has an empty value.')
if self.fac_edges['facility_type'].isnull().values.any():
raise Exception('facility_type in fac_edges has an empty value.')
if self.routes['source_facility_type'].isnull().values.any():
raise Exception('source_facility_type in routes has an empty value.')
if self.routes['destination_facility_type'].isnull().values.any():
raise Exception('destination_facility_type in routes has an empty value.')
def check_lat_long_nulls(self):
"""
The tests for any null latitude values
Raises
------
Exception
Raises an exception if any values are found to be missing
"""
if self.locations['lat'].isnull().values.any():
raise Exception('lat in locations has an empty value.')
if self.locations['long'].isnull().values.any():
raise Exception('long in locations has an empty value.')
def check_step_nulls(self):
"""
Checks to see if there are any nulls in the u_step or v_step
columns. Also check transpo_cost_method
Raises
------
Exception
Raises an exception if the u_step, v_step, step, or
step_cost_method are empty.
"""
if self.transpo_edges['u_step'].isnull().values.any():
raise Exception('u_step in transpo_edges has a null value.')
if self.transpo_edges['v_step'].isnull().values.any():
raise Exception('v_step in transpo_edges has a null value.')
if self.step_costs['step_cost_method'].isnull().values.any():
raise Exception('step_cost_method in step_costs has a null value')
if self.transpo_edges['transpo_cost_method'].isnull().values.any():
raise Exception('transpo_cost_method in transpo_edges has a null value')
if self.fac_edges['step'].isnull().values.any():
raise Exception('step in fac_edges has a null value')
def check_joins_on_facility_id(self):
"""
Checks the joins on facility_id among the tables.
Raises
------
Exception
Raises an exception if the joins do not work.
"""
# An outer join is used here to include all rows on both sides of the join
# Check for null values on the left/right side of the join, and use ids on the
# opposite right/left side of the join to generate error messages about
# unmatched rows on the other side of the join.
join1 = self.locations.merge(self.step_costs, on='facility_id', how='outer')
if join1['facility_type'].isna().values.any():
step_cost_facility_id = join1[join1['facility_type'].isna()]['facility_id'].values
raise Exception(f'There is a step_costs facility_id of {step_cost_facility_id} that does not exist in locations.facility_id')
if join1['step'].isna().values.any():
location_facility_id = join1[join1['step'].isna()]['facility_id'].values
raise Exception(f'There is a locations.facility_id of {location_facility_id} that does not exit in step_costs.facility_id')
# An outer join is used here to include all rows on both sides of the join.
# Use the left side of the join to check for facility ids referenced by routes that
# do not exist in locations.
# Use the right side of the join to create error messages about route facility ids
# that do not exist in location.
join2 = self.locations.merge(self.routes, left_on='facility_id', right_on='source_facility_id', how='outer')
if join2['facility_id'].isna().values.any():
source_facility_id = join2[join2['facility_id'].isna().values]['source_facility_id'].values
raise Exception(f'There is a routes.source_facility_id of {source_facility_id} that does not exist in locations.facility_id')
# An outer join is used here to include all rows on both sides of the join.
# Use the left side of the join to check for facility ids referenced by routes that
# do not exist in locations.
# Use the right side of the join to create error messages about route destination
# facility ids that do not exist in location.
join3 = self.locations.merge(self.routes, left_on='facility_id', right_on='destination_facility_id', how='outer')
if join3['facility_id'].isna().values.any():
destination_facility_id = join3[join3['facility_id'].isna().values]['destination_facility_id'].values
raise Exception(
f'There is a routes.destination_facility_id of {destination_facility_id} that does not exist in locations.facility_id')
# An outer join is used here to include all rows on both sides of the join.
# Use the left side of the join to check for facility ids referenced by routes that
# do not exist in step_costs.
# Use the right side of the join to create error messages about route source
# facility ids that do not exist step_costs.
join4 = self.step_costs.merge(self.routes, left_on='facility_id', right_on='source_facility_id', how='outer')
if join4['facility_id'].isna().values.any():
source_facility_id = join4[join4['facility_id'].isna().values]['source_facility_id']
raise Exception(f'There is a routes.source_facility_id {source_facility_id} that is not in step_costs.facility_id')
# An outer join is used here to include all rows on both sides of the join.
# Use the left side of the join to check for facility ids referenced by routes that
# do not exist in step_costs.
# Use the right side of the join to create error messages about route destination
# facility ids that do not exist step_costs.
join5 = self.step_costs.merge(self.routes, left_on='facility_id', right_on='destination_facility_id',
how='outer')
if join5['facility_id'].isna().values.any():
destination_facility_id = join5[join5['facility_id'].isna().values]['destination_facility_id']
raise Exception(
f'There is a routes.destination_facility_id {destination_facility_id} that is not in step_costs.facility_id')
def check_joins_on_facility_type(self):
"""
Check the joins on the facility_type among the tables
Raises
------
Exception
Raises an exception if there are any problems with the
facility_types.
"""
# An outer join is used here to include all rows on both sides of the join
# Check for null values on the left/right side of the join, and use ids on the
# opposite right/left side of the join to generate error messages about
# unmatched rows on the other side of the join.
join1 = self.locations.merge(self.fac_edges, on='facility_type', how='outer')
if join1['facility_id'].isna().values.any():
facility_type = join1[join1['facility_id'].isna().values]['facility_type'].values
raise Exception(
f'There is a fac_edges.facility_type {facility_type} that does not exist in locations.facility_type')
if join1['step'].isna().values.any():
facility_type = join1[join1['step'].isna().values]['facility_type'].values
raise Exception(
f'There is a locations.facility_type {facility_type} that does not exist in fac_edges.facility_type')
# An outer join is used here to include all rows on both sides of the join.
# Use the left side of the join to check for facility types referenced by routes that
# do not exist in locations.
# Use the right side of the join to create error messages about route source
# facility types that do not exist locations.
join2 = self.locations.merge(self.routes, left_on='facility_type', right_on='source_facility_type', how='outer')
if join2['facility_type'].isna().values.any():
source_facility_type = join2[join2['facility_type'].isna().values]['source_facility_type'].values
raise Exception(
f'There is a routes.source_facility_type {source_facility_type} that does not exist in locations.facility_type.'
)
# An outer join is used here to include all rows on both sides of the join.
# Use the left side of the join to check for facility types referenced by routes that
# do not exist in locations.
# Use the right side of the join to create error messages about route destination
# facility types that do not exist fac_edges.
join4 = self.locations.merge(self.routes, left_on='facility_type', right_on='destination_facility_type',
how='outer')
if join4['facility_type'].isna().values.any():
destination_facility_type = join4[join4['facility_type'].isna().values][
'destination_facility_type'].values
raise Exception(
f'There is a routes.destination_facility_type {destination_facility_type} that does not exist in locations.facility_type.'
)
# An outer join is used here to include all rows on both sides of the join.
# Use the left side of the join to check for facility types referenced by routes that
# do not exist in face_edges.
# Use the right side of the join to create error messages about route source
# facility types that do not exist fac_edges.
join3 = self.fac_edges.merge(self.routes, left_on='facility_type', right_on='source_facility_type', how='outer')
if join3['facility_type'].isna().values.any():
source_facility_type = join3[join3['facility_type'].isna().values]['source_facility_type'].values
raise Exception(
f'There is a routes.source_facility_type {source_facility_type} that does not exist in fac_edges.facility_type'
)
def check_step_joins(self):
"""
Checks all joins that involve steps.
Raises
------
Exception
Raises an exception if there are any problems with the step ids.
"""
# An outer join is used here to include all rows on both sides of the join
# Check for null values on the left/right side of the join, and use ids on the
# opposite right/left side of the join to generate error messages about
# unmatched rows on the other side of the join.
join1 = self.step_costs.merge(self.fac_edges, on='step', how='outer')
if join1['facility_type'].isna().values.any():
step = join1[join1['facility_type'].isna().values]['step'].values
raise Exception(f'There is a step_costs.step of {step} that does not exist in fac_edges.step.')
if join1['step_cost_method'].isna().values.any():
step = join1[join1['step_cost_method'].isna().values]['step'].values
raise Exception(f'There is a fac_edges.step {step} that does not exist in step_cost_method.step.')
# An outer join is used here to include all rows on both sides of the join.
# Use the left side of the join to check for u_steps referenced by transpo_edges
# that do not exist step_costs.
# Use the right side of the join to create error messages about transpo_edges
# u_steps that do not exist step_costs.
join2 = self.step_costs.merge(self.transpo_edges, left_on='step', right_on='u_step', how='outer')
if join2['step'].isna().values.any():
u_step = join2[join2['step'].isna().values]['u_step'].values
raise Exception(f'There is a transpo_edges.u_step {u_step} that does not exist in step_costs.step.')
# An outer join is used here to include all rows on both sides of the join.
# Use the left side of the join to check for v_steps referenced by transpo_edges
# that do not exist step_costs.
# Use the right side of the join to create error messages about transpo_edges
# v_steps that do not exist step_costs.
join3 = self.step_costs.merge(self.transpo_edges, left_on='step', right_on='v_step', how='outer')
if join3['step'].isna().values.any():
v_step = join3[join3['step'].isna().values]['v_step'].values
raise Exception(f'There is a transpo_edges.v_step {v_step} that does not exist in step_costs.step.')
# An outer join is used here to include all rows on both sides of the join.
# Use the left side of the join to check for next_steps referenced by fac_edges
# that do not exist step_costs.
# Use the right side of the join to create error messages about next_steps
# that do not exist step_costs.
# Note that since next_step is optional, do not generate an error messages
# for next_steps that are null.
join4 = self.step_costs.merge(self.fac_edges, left_on='step', right_on='next_step', how='outer')
if join4['step_x'].isna().values.any():
next_step = join4[join4['step_x'].isna().values]['next_step'].values
next_step_not_all_na = join4[join4['step_x'].isna().values]['next_step'].isna().values.all()
if not next_step_not_all_na: # next_step is optional, so nan should not throw an error
raise Exception(f'There is a fac_edges.next_step {next_step} that does not exist in step_costs.step.')
def main():
# Filenames
# locations, step_costs, fac_edges, routes, transpo_edges
parser = argparse.ArgumentParser(description='Check CELAVI input data')
parser.add_argument('--locations', help='Path to locations file')
parser.add_argument('--step_costs', help='Path to step_costs file')
parser.add_argument('--fac_edges', help='Facility edges file')
parser.add_argument('--routes', help='Routes file')
parser.add_argument('--transpo_edges', help='Transportation edges file')
args = parser.parse_args()
file_checks = FileChecks(
locations=args.locations,
step_costs=args.step_costs,
fac_edges=args.fac_edges,
routes=args.routes,
transpo_edges=args.transpo_edges
)
file_checks.check_files_exist()
file_checks.open_files()
file_checks.check_facility_id_nulls()
file_checks.check_facility_type_nulls()
file_checks.check_step_nulls()
file_checks.check_joins_on_facility_id()
file_checks.check_joins_on_facility_type()
file_checks.check_step_joins()
print('File check OK.')
if __name__ == '__main__':
main()