# script.py (forked from dpendleton22/GTRI_ICL_Datasets)
import pandas as pd
import numpy as np
import datetime
cancellationFiles = ["data/Delay&Cancellation/" + x for x in ["2009.csv",
                                                              "2010.csv",
                                                              "2011.csv",
                                                              "2012.csv",
                                                              "2013.csv",
                                                              "2014.csv",
                                                              "2015.csv",
                                                              "2016.csv",
                                                              "2017.csv",
                                                              "2018.csv"]]
weatherFile = "data/Weather/weather_description.csv"
# cancellationFiles = ["data/Delay&Cancellation/" + x for x in ["2017.csv"]]
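# Columns the functions below expect in the Delay&Cancellation CSVs: OP_CARRIER,
# ARR_DELAY, ORIGIN, FL_DATE, DEP_DELAY, and CANCELLED. The weather CSV is expected
# to have a 'datetime' column plus one column per city.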
def topDelayAirlines(verbose=False):
    '''
    Provide the top 3 airlines by average difference between expected and actual arrival time, along with the standard deviation
    Input:
        None
    Output:
        DataFrame: top 3 airlines with the mean and stddev of their arrival delay
    Approach:
        Track arrival delays per airline in a dict; convert to a DataFrame and compute the metrics
    '''
    airline_dict = {}
    for filename in cancellationFiles:
        if verbose: print("Reading data from " + filename)
        chunk_read = pd.read_csv(filename, chunksize=1000000)
        for chunk in chunk_read:
            # For each airline: get a dataframe, append its delays to the dict
            for airline in chunk['OP_CARRIER'].unique():
                if airline not in airline_dict:
                    airline_dict[airline] = []
                # Extract arrival delay for the respective airline
                airline_dict[airline].extend(chunk[chunk['OP_CARRIER'] == airline]['ARR_DELAY'].dropna().values)
    # Convert lists to pandas Series so unequal lengths don't cause casting issues
    for airline in airline_dict:
        airline_dict[airline] = pd.Series(airline_dict[airline])
    delay_df = pd.DataFrame(airline_dict)
    # Get mean & stddev per airline
    results_df = pd.DataFrame()
    results_df['mean'] = delay_df.mean(axis=0)
    results_df['stddev'] = delay_df.std(axis=0)
    # Return the top 3 based on mean delay
    return results_df.sort_values(by="mean", ascending=False).head(3)
'''
         mean     stddev
VX  11.149536  42.773900
B6  10.977184  51.785661
EV   8.027348  62.579603
'''
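# A minimal, memory-friendlier sketch of the same computation: rather than holding every
# delay value in memory, accumulate per-airline count, sum, and sum of squares per chunk
# and derive the mean and sample standard deviation at the end. It assumes the same
# OP_CARRIER / ARR_DELAY columns as above; topDelayAirlinesStreaming is an illustrative
# name, not part of the original script.
def topDelayAirlinesStreaming(verbose=False):
    stats = {}  # airline -> [count, sum of delays, sum of squared delays]
    for filename in cancellationFiles:
        if verbose: print("Reading data from " + filename)
        for chunk in pd.read_csv(filename, chunksize=1000000):
            delays = chunk[['OP_CARRIER', 'ARR_DELAY']].dropna()
            delays['ARR_DELAY_SQ'] = delays['ARR_DELAY'] ** 2
            agg = delays.groupby('OP_CARRIER').agg(n=('ARR_DELAY', 'count'),
                                                   s=('ARR_DELAY', 'sum'),
                                                   ss=('ARR_DELAY_SQ', 'sum'))
            for airline, row in agg.iterrows():
                totals = stats.setdefault(airline, [0, 0.0, 0.0])
                totals[0] += row['n']
                totals[1] += row['s']
                totals[2] += row['ss']
    results_df = pd.DataFrame(stats, index=['n', 's', 'ss']).T
    results_df['mean'] = results_df['s'] / results_df['n']
    # Sample standard deviation recovered from running sums: var = (ss - n*mean^2) / (n - 1)
    results_df['stddev'] = np.sqrt((results_df['ss'] - results_df['n'] * results_df['mean'] ** 2)
                                   / (results_df['n'] - 1))
    return results_df.sort_values(by='mean', ascending=False)[['mean', 'stddev']].head(3)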
def averageRainDelay(city, airport, verbose=False):
    '''
    Given a city and airport code, provide the average departure delay time on days with any type of rain
    Input:
        city: str
        airport: str
    Output:
        averageRainDelay: float
    Approach:
        Get the days it rained in the city, filter flight data by those days/airport, extract the delay data
    '''
    # Load our data, in particular the weather for our city
    weather_df = pd.read_csv(weatherFile)
    try:
        city_df = weather_df[['datetime', city]]
    except KeyError:
        print("No city found!")
        return
    # Clean our data somewhat
    city_df = city_df.dropna()
    # Limit to just rainy observations
    city_df = city_df[city_df[city].str.contains("rain")]
    # Get the unique days where it rained
    city_df['datetime'] = pd.to_datetime(city_df['datetime'])
    city_df['datetime'] = city_df['datetime'].dt.date
    rainy_days = city_df['datetime'].unique()
    # Go through our data and keep the relevant delays
    rainy_delays = pd.Series(dtype=float)
    for filename in cancellationFiles:
        if verbose: print("Reading data from " + filename)
        chunk_read = pd.read_csv(filename, chunksize=1000000)
        for chunk in chunk_read:
            # Filter based on airport code (copy to avoid SettingWithCopy warnings)
            data_df = chunk[chunk['ORIGIN'] == airport].copy()
            # Format the flight date as a plain date so it matches rainy_days
            data_df['FL_DATE'] = pd.to_datetime(data_df['FL_DATE']).dt.date
            # Filter based on the rainy days
            data_df = data_df[data_df['FL_DATE'].isin(rainy_days)]
            # Collect the relevant delay data
            rainy_delays = pd.concat([rainy_delays, data_df['DEP_DELAY'].dropna()], ignore_index=True)
    return rainy_delays.mean()
# Atlanta, ATL: 12.51768244458
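# averageRainDelay and delayProbability (below) repeat the same "which dates had this
# weather" lookup. A small sketch of how that step could be factored out; weatherDates
# is a hypothetical helper name, not something the original script defines.
def weatherDates(city, pattern):
    '''Return the unique dates on which `city` reported weather matching `pattern`.'''
    weather_df = pd.read_csv(weatherFile)
    if city not in weather_df.columns:
        raise KeyError("No weather data for city: " + city)
    city_df = weather_df[['datetime', city]].dropna()
    # Keep only the observations whose description matches the pattern
    city_df = city_df[city_df[city].str.contains(pattern)]
    return pd.to_datetime(city_df['datetime']).dt.date.unique()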
def worstTravelDays(city, airport, verbose=False):
    '''
    Given a city and airport code, determine that airport's worst days to travel
    Note: Interpreting 'worst days to travel' as largest average delay time
          Interpreting 'days' as days of the week
    Input:
        city: str
        airport: str
    Output:
        Index of the worst day of the week (0 = Monday) and the list of average delays per weekday
    Approach:
        Track the delays for each day of the week, then extract the metrics once all data has been read
    '''
    # Create storage for delay times for each day of the week (0 = Monday ... 6 = Sunday)
    day_series = [pd.Series(dtype=float) for _ in range(7)]
    # Load our data
    for filename in cancellationFiles:
        if verbose: print("Reading data from " + filename)
        chunk_read = pd.read_csv(filename, chunksize=1000000)
        for chunk in chunk_read:
            # Filter based on airport code (copy to avoid SettingWithCopy warnings)
            data_df = chunk[chunk['ORIGIN'] == airport].copy()
            # Format datetime as days of the week
            data_df['FL_DATE'] = pd.to_datetime(data_df['FL_DATE'])
            data_df['FL_DATE'] = data_df['FL_DATE'].dt.weekday
            # Add the relevant info for each weekday
            for day in range(7):
                day_df = data_df[data_df['FL_DATE'] == day]['DEP_DELAY']
                day_series[day] = pd.concat([day_series[day], day_df.dropna()], ignore_index=True)
    day_delays = [x.mean() for x in day_series]
    return np.argmax(day_delays), day_delays
'''
Atlanta, ATL:
0 (Monday)
[11.012215, 8.355434, 8.745895, 10.206612, 10.935639, 7.176773, 9.189768]
'''
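# A sketch of turning the raw weekday means above into a readable ranking; the weekday
# names follow pandas' convention (0 = Monday) and rankTravelDays is an illustrative
# name layered on top of worstTravelDays, not part of the original script.
def rankTravelDays(city, airport, verbose=False):
    names = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
    _, day_delays = worstTravelDays(city, airport, verbose=verbose)
    # Pair each weekday name with its average delay and sort from worst to best
    ranked = sorted(zip(names, [float(d) for d in day_delays]),
                    key=lambda pair: pair[1], reverse=True)
    return ranked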
def cancellationProbability(city, airport, verbose=False):
    '''
    Given a city and airport code, provide the probability of a flight from that airport getting cancelled
    Input:
        city: str
        airport: str
    Output:
        float: probability of a flight getting cancelled
    Approach:
        Filter flight data on the airport, then calculate the ratio of cancelled to total flights
    '''
    # Calculate the ratio of cancelled to total flights for the given airport
    cancelled_flights, total_flights = 0, 0
    for filename in cancellationFiles:
        if verbose: print("Reading data from " + filename)
        chunk_read = pd.read_csv(filename, chunksize=1000000)
        for chunk in chunk_read:
            # Filter based on airport code
            data_df = chunk[chunk['ORIGIN'] == airport]
            # Drop rows that we shouldn't be counting
            data_df = data_df.dropna(subset=['CANCELLED'])
            # Count cancelled and total flights
            cancelled_flights += len(data_df[data_df['CANCELLED'] == 1.0])
            total_flights += len(data_df)
    return float(cancelled_flights / total_flights)
# Atlanta, ATL: 0.011344666128994242
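# Because CANCELLED is stored as a 0/1 flag, each chunk's contribution can also be taken
# with a plain sum/count, which avoids the intermediate filtered frame. A minimal sketch
# with a divide-by-zero guard; cancellationRate is an illustrative name only.
def cancellationRate(airport, verbose=False):
    cancelled, total = 0, 0
    for filename in cancellationFiles:
        if verbose: print("Reading data from " + filename)
        for chunk in pd.read_csv(filename, chunksize=1000000):
            flags = chunk.loc[chunk['ORIGIN'] == airport, 'CANCELLED'].dropna()
            cancelled += flags.sum()  # sum of a 0/1 flag == number of cancellations
            total += len(flags)
    return cancelled / total if total else float('nan')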
def delayProbability(city, airport, weather_desc, verbose=False):
    '''
    Given a city, airport code, and weather description, predict the probability of a departure
    flight delay from the airport under that weather
    Input:
        city: str
        airport: str
        weather_desc: str
    Output:
        float: probability of a flight getting delayed
    Approach:
        Get all days that experienced the weather, filter flight data by those dates/airport, count the delayed flights
    '''
    delayed_flights, total_flights = 0, 0
    # Load our data
    weather_df = pd.read_csv(weatherFile)
    try:
        city_df = weather_df[['datetime', city]]
    except KeyError:
        print("No city found!")
        return
    # Basic data cleaning
    city_df = city_df.dropna()
    # Get all days where the city experienced weather_desc
    city_df = city_df[city_df[city].str.contains(weather_desc)]
    city_df['datetime'] = pd.to_datetime(city_df['datetime'])
    city_df['datetime'] = city_df['datetime'].dt.date
    weather_days = city_df['datetime'].unique()
    # Go through our data, filter by weather_days, and keep count of delayed vs. total flights
    for filename in cancellationFiles:
        if verbose: print("Reading data from " + filename)
        chunk_read = pd.read_csv(filename, chunksize=1000000)
        for chunk in chunk_read:
            # Filter based on airport code (copy to avoid SettingWithCopy warnings)
            data_df = chunk[chunk['ORIGIN'] == airport].copy()
            # Format the flight date as a plain date so it matches weather_days
            data_df['FL_DATE'] = pd.to_datetime(data_df['FL_DATE']).dt.date
            # Filter based on the weather days
            data_df = data_df[data_df['FL_DATE'].isin(weather_days)]
            # Count delayed flights
            delayed_flights += len(data_df[data_df['DEP_DELAY'] > 0.0])
            total_flights += len(data_df)
    return delayed_flights / total_flights
#Atlanta, ATL, "sky is clear": 0.35335226279868004
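# To judge whether a weather condition actually matters, the conditional probability above
# can be compared against the airport's overall delay rate. A hedged sketch that assumes
# the city exists in the weather data; delayLift is an illustrative name, not part of the
# original script.
def delayLift(city, airport, weather_desc, verbose=False):
    conditional = delayProbability(city, airport, weather_desc, verbose=verbose)
    delayed, total = 0, 0
    for filename in cancellationFiles:
        if verbose: print("Reading data from " + filename)
        for chunk in pd.read_csv(filename, chunksize=1000000):
            delays = chunk.loc[chunk['ORIGIN'] == airport, 'DEP_DELAY'].dropna()
            delayed += (delays > 0.0).sum()
            total += len(delays)
    overall = delayed / total if total else float('nan')
    # Positive lift means the weather condition is associated with more delays than usual
    return conditional, overall, conditional - overall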
if __name__ == "__main__":
    # print(averageRainDelay("Atlanta", "ATL", verbose=True))
    # print(worstTravelDays("Atlanta", "ATL", verbose=True))
    # print(cancellationProbability("Atlanta", "ATL", verbose=True))
    # print(delayProbability("Atlanta", "ATL", "sky is clear", verbose=True))
    # print(topDelayAirlines(verbose=True))
    pass