#kaggle kernel link: https://www.kaggle.com/pradyu99914/data-feature-engineering?scriptVersionId=21783669
#please note that there are commits after this one, where we have added more features.
#the code is spread over a number of commits, as feature engineering took ~7 hours per new feature;
#the later commits do not include the data visualizations.
#this is the kernel we have used to perform feature engineering.
#not all of the visualizations that are required for feature engineering are here. Please refer to the file named "visualization.py"
#some imports and kaggle default code
#!pip install modin[ray] #parallelized pandas..
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import dask.dataframe as dd
import os
from tqdm import tqdm
import gc
import holidays
import matplotlib.pyplot as plt
import datetime as dt
#check the available input data
for dirname, _, filenames in os.walk("/kaggle"):
    for filename in filenames:
        print(os.path.join(dirname, filename))
#datatypes of the columns. using float32 to reduce memory needed
traintypes = {'fare_amount': 'float32',
              'pickup_datetime': 'str',
              'pickup_longitude': 'float32',
              'pickup_latitude': 'float32',
              'dropoff_longitude': 'float32',
              'dropoff_latitude': 'float32',
              'passenger_count': 'uint8'}
TRAIN_PATH = '../input/new-york-city-taxi-fare-prediction/train.csv'
cols = list(traintypes.keys()) #columns to read from the csv
df_list = [] # list to hold the batch dataframes
chunksize = 1000000 #read 10**6 rows at a time
#haversine distance to calculate the distance between two points on earth
def haversine_distance(lat1, long1, lat2, long2):
    R = 6371 #radius of the earth in kilometers
    phi1 = np.radians(lat1)
    phi2 = np.radians(lat2)
    delta_phi = np.radians(lat2 - lat1)
    delta_lambda = np.radians(long2 - long1)
    #a = sin²((φB - φA)/2) + cos φA · cos φB · sin²((λB - λA)/2)
    a = np.sin(delta_phi / 2.0) ** 2 + np.cos(phi1) * np.cos(phi2) * np.sin(delta_lambda / 2.0) ** 2
    #c = 2 * atan2( √a, √(1−a) )
    c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
    #d = R*c
    d = R * c #in kilometers
    return d
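#Quick sanity check (an illustrative addition, not in the original kernel):
#midtown Manhattan (40.7831, -73.9712) to JFK (40.6413, -73.7781) should come
#out to roughly 21-23 km as the crow flies.
print("Manhattan -> JFK, km:", haversine_distance(40.7831, -73.9712, 40.6413, -73.7781))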
#help taken from https://www.kaggle.com/szelee/how-to-import-a-csv-file-of-55-million-rows
#only the method to read the chunks has been taken from here
for df_chunk in tqdm(pd.read_csv(TRAIN_PATH, usecols=cols, dtype=traintypes, chunksize=chunksize)):
    # Neat trick from https://www.kaggle.com/btyuhas/bayesian-optimization-with-xgboost
    # Using parse_dates would be much slower!
    df_chunk['pickup_datetime'] = df_chunk['pickup_datetime'].str.slice(0, 16)
    df_chunk['pickup_datetime'] = pd.to_datetime(df_chunk['pickup_datetime'], utc=True, format='%Y-%m-%d %H:%M')
    # Each chunk can be processed here:
    # clean_data(), feature_engineer(), fit()
    #print("Before: ", len(df_chunk))
    df_chunk = df_chunk.dropna() #dropna returns a new frame, so the result must be assigned back
    #remove rows with coordinates outside the bounding box of the city and its nearby areas
    mask = df_chunk['pickup_longitude'].between(-75, -73)
    mask &= df_chunk['dropoff_longitude'].between(-75, -73)
    mask &= df_chunk['pickup_latitude'].between(40, 42)
    mask &= df_chunk['dropoff_latitude'].between(40, 42)
    #remove rows with an invalid number of passengers (negative or more than 8 passengers)
    mask &= df_chunk['passenger_count'].between(0, 8)
    #remove rows with invalid fares (negative, zero, or greater than 250 USD)
    mask &= df_chunk['fare_amount'].between(0, 250)
    mask &= df_chunk['fare_amount'].gt(0)
    #apply the mask, which drops all the inconsistent rows
    df_chunk = df_chunk[mask]
    #print("After: ", len(df_chunk))
    df_chunk = df_chunk.reset_index(drop=True) #masking leaves gaps in the index; resetting it keeps the frame featherable
    mask = 0 #release the mask
    #recover memory!
    gc.collect()
    #add a new attribute: the haversine distance between the pickup and dropoff coordinates
    df_chunk["distance"] = pd.to_numeric(haversine_distance(df_chunk['pickup_latitude'], df_chunk['pickup_longitude'], df_chunk['dropoff_latitude'], df_chunk['dropoff_longitude']), downcast='float')
    gc.collect()
    #add a new column called time: the number of minutes since 12:00 am that day
    df_chunk["time"] = pd.to_numeric(df_chunk['pickup_datetime'].dt.hour * 60 + df_chunk['pickup_datetime'].dt.minute, downcast="unsigned")
    gc.collect()
    #print("time")
    #get a list of all US holidays
    us_holidays = holidays.US()
    #add a new column which is 1 if the pickup falls on a US holiday, else 0
    df_chunk["holiday"] = pd.to_numeric(df_chunk.apply(lambda x: 1 if x.pickup_datetime.date() in us_holidays else 0, axis=1), downcast="unsigned")
    gc.collect()
    #add a column named year to the dataset
    df_chunk["year"] = df_chunk["pickup_datetime"].dt.year
    #add the weekday attribute to the dataset
    df_chunk["weekday"] = pd.to_numeric(df_chunk["pickup_datetime"].dt.weekday, downcast="unsigned")
    #important places, reversed into (latitude, longitude) order
    Manhattan = (-73.9712, 40.7831)[::-1]
    JFK_airport = (-73.7781, 40.6413)[::-1]
    Laguardia_airport = (-73.8740, 40.7769)[::-1]
    #add distances to each of these landmarks (pickup features use the pickup coordinates, dropoff features the dropoff coordinates)
    df_chunk['pickup_distance_Mtn'] = pd.to_numeric(haversine_distance(Manhattan[0], Manhattan[1], df_chunk['pickup_latitude'], df_chunk['pickup_longitude']), downcast='float')
    df_chunk['dropoff_distance_Mtn'] = pd.to_numeric(haversine_distance(Manhattan[0], Manhattan[1], df_chunk['dropoff_latitude'], df_chunk['dropoff_longitude']), downcast='float')
    df_chunk['pickup_distance_jfk'] = pd.to_numeric(haversine_distance(JFK_airport[0], JFK_airport[1], df_chunk['pickup_latitude'], df_chunk['pickup_longitude']), downcast='float')
    df_chunk['dropoff_distance_jfk'] = pd.to_numeric(haversine_distance(JFK_airport[0], JFK_airport[1], df_chunk['dropoff_latitude'], df_chunk['dropoff_longitude']), downcast='float')
    df_chunk['pickup_distance_lg'] = pd.to_numeric(haversine_distance(Laguardia_airport[0], Laguardia_airport[1], df_chunk['pickup_latitude'], df_chunk['pickup_longitude']), downcast='float')
    df_chunk['dropoff_distance_lg'] = pd.to_numeric(haversine_distance(Laguardia_airport[0], Laguardia_airport[1], df_chunk['dropoff_latitude'], df_chunk['dropoff_longitude']), downcast='float')
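    #The six landmark-distance columns above could equivalently be built in a
    #loop (an illustrative refactor, not the original code; POIS is a
    #hypothetical helper dict):
    #POIS = {"Mtn": Manhattan, "jfk": JFK_airport, "lg": Laguardia_airport}
    #for name, (lat, lon) in POIS.items():
    #    for leg in ("pickup", "dropoff"):
    #        df_chunk[f"{leg}_distance_{name}"] = pd.to_numeric(
    #            haversine_distance(lat, lon, df_chunk[f"{leg}_latitude"], df_chunk[f"{leg}_longitude"]),
    #            downcast="float")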
    gc.collect()
    # append the processed chunk to the list; all chunks are merged below
    df_list.append(df_chunk)
#concatenate all the chunks to turn the whole dataset into a single data frame
train_df = pd.concat(df_list)
#remove rows with distance = 0
mask = train_df['distance'].gt(0)
train_df = train_df[mask]
# Delete the dataframe list to release memory
del df_list
gc.collect()
# See what we have loaded
train_df.info()
#reset the index, as masking leaves gaps that prevent the frame from being serialized to feather
train_df = train_df.reset_index(drop=True)
#save to output.
train_df.to_feather('nyc_taxi_data_raw.feather')
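#Quick round-trip check (an illustrative addition, not from the original
#kernel): feather preserves dtypes, so reading the file back should reproduce
#the downcast column types exactly.
#print(pd.read_feather('nyc_taxi_data_raw.feather').dtypes)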
#repeat the same thing for the test set.
df_chunk = pd.read_csv('/kaggle/input/new-york-city-taxi-fare-prediction/test.csv')
df_chunk['pickup_datetime'] = df_chunk['pickup_datetime'].str.slice(0, 16)
df_chunk['pickup_datetime'] = pd.to_datetime(df_chunk['pickup_datetime'], utc=True, format='%Y-%m-%d %H:%M')
gc.collect()
df_chunk["time"] = pd.to_numeric(df_chunk.apply(lambda r: r.pickup_datetime.hour*60 + r.pickup_datetime.minute, axis = 1), downcast = "unsigned")
gc.collect()
#print("time")
us_holidays = holidays.US()
df_chunk["holiday"] = pd.to_numeric(df_chunk.apply(lambda x: 1 if x.pickup_datetime.strftime('%d-%m-%y')in us_holidays else 0, axis =1), downcast = "unsigned")
gc.collect()
Manhattan = (-73.9712,40.7831)[::-1]
JFK_airport = (-73.7781,40.6413)[::-1]
Laguardia_airport = (-73.8740,40.7769)[::-1]
df_chunk["distance"] = pd.to_numeric(haversine_distance(df_chunk['pickup_latitude'], df_chunk['pickup_longitude'], df_chunk['dropoff_latitude'], df_chunk['dropoff_longitude']), downcast = 'float')
df_chunk["year"] = df_chunk["pickup_datetime"].dt.year
df_chunk["weekday"] = pd.to_numeric(df_chunk["pickup_datetime"].dt.weekday, downcast= "unsigned")
df_chunk['pickup_distance_Mtn'] = pd.to_numeric(haversine_distance(Manhattan[0],Manhattan[1],df_chunk['pickup_latitude'],df_chunk['pickup_longitude']), downcast = 'float')
df_chunk['dropoff_distance_Mtn'] = pd.to_numeric(haversine_distance(Manhattan[0],Manhattan[1],df_chunk['dropoff_latitude'],df_chunk['dropoff_longitude']), downcast = 'float')
df_chunk['dropoff_distance_jfk'] = pd.to_numeric(haversine_distance(JFK_airport[0],JFK_airport[1],df_chunk['dropoff_latitude'],df_chunk['dropoff_longitude']), downcast = 'float')
df_chunk['pickup_distance_jfk'] = pd.to_numeric(haversine_distance(JFK_airport[0],JFK_airport[1],df_chunk['pickup_latitude'],df_chunk['pickup_longitude']), downcast = 'float')
df_chunk['pickup_distance_lg'] = pd.to_numeric(haversine_distance(Laguardia_airport[0], Laguardia_airport[1], df_chunk['pickup_latitude'], df_chunk['pickup_longitude']), downcast='float')
df_chunk['dropoff_distance_lg'] = pd.to_numeric(haversine_distance(Laguardia_airport[0], Laguardia_airport[1], df_chunk['dropoff_latitude'], df_chunk['dropoff_longitude']), downcast='float')
#check that no rows were dropped (kaggle needs the original number of test datapoints for evaluation)
print("before", len(df_chunk))
print("after", len(df_chunk))
print(df_chunk.head())
df_chunk.index = range(len(df_chunk.pickup_datetime)) #normalize the index so the frame can be written to feather
print("after", len(df_chunk))
df_chunk.to_feather('test_feature.feather')
#verify that all the NaN values have been removed
train_df.isna().sum()
#read only the 2 required columns from the dataset
train_df = pd.read_feather('../input/kernel318ff03a29/nyc_taxi_data_raw.feather', columns = ['holiday', 'fare_amount'])
#select all the rows on public holidays in one dataframe
holiday_df = train_df[train_df['holiday']==1]
#select all the rows from normal days in another dataframe
not_holiday_df = train_df[train_df['holiday']==0]
train_df = 0 #release the full frame
gc.collect()
#find the mean fare amount for both of these sets
holiday_average = np.mean(holiday_df['fare_amount'])
not_holiday_average = np.mean(not_holiday_df['fare_amount'])
print(holiday_average, not_holiday_average)
#as we can see, the fare amounts are noticeably higher on public holidays than on regular days, so the holiday attribute is useful
label = ["Holiday", "Not Holiday"]
x = [holiday_average, not_holiday_average]
def plot_bar_x():
    # bar chart of the two average fares
    index = np.arange(len(label))
    plt.figure(figsize = (10,10))
    plt.bar(index, x)
    plt.xlabel('Holiday')
    plt.ylabel('Average Fare Amount')
    plt.xticks(index, label, rotation=0)
    plt.title('Average fare amounts on holidays and normal days')
    plt.show()
plot_bar_x()
#as we can see, the distance between the coordinates is a very important feature, as it is strongly related to the target variable
train_df = pd.read_feather('../input/kernel318ff03a29/nyc_taxi_data_raw.feather', columns = ['distance', 'fare_amount'])
sample_df = train_df.sample(n = 5000)
plt.figure(figsize = (10,10))
plt.scatter(sample_df["distance"], sample_df["fare_amount"])
plt.title('Scatter plot of distance vs fare_amount')
plt.xlabel('distance')
plt.ylabel('fare_amount')
plt.show()
print(len(train_df[train_df['distance']==0]))
train_df = 0
gc.collect()