# pipeline_jmidkiff.py
import datetime

import numpy as np
import pandas as pd
from pandas.api.types import is_numeric_dtype
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler


##### Read
def read_csv(file, date_cols=[]):
    '''
    Reads a CSV file, interpreting the specified columns as datetimes.
    Inputs:
        file (str)
        date_cols (list of str)
    Outputs:
        Pandas DataFrame
    '''
    return pd.read_csv(file, parse_dates=date_cols,
                       infer_datetime_format=True)
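
# A minimal usage sketch (the file name and 'Date' column are
# hypothetical; substitute your own):
#     df = read_csv('crimes.csv', date_cols=['Date'])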


##### Explore
def show(df, head=5):
    '''
    Prints the shape and datatypes of a Pandas DataFrame and returns the
    dataframe head.
    Inputs:
        df (Pandas DataFrame)
        head (int) - number of rows of the head to return
    Outputs:
        Pandas DataFrame
    '''
    print()
    print('Shape:')
    print(df.shape)
    print()
    print('Data Types:')
    print(df.dtypes)
    return df.head(head)


def describe(df):
    '''
    Runs pandas.DataFrame.describe(), returning summary statistics for
    all numeric columns.
    Inputs:
        df (Pandas DataFrame)
    Outputs:
        Pandas DataFrame
    '''
    return df.describe()


def group_count(df, groupby, new_name='Count'):
    '''
    Produces a Pandas Series of observation counts grouped by 'groupby',
    renamed to new_name.
    Inputs:
        df (Pandas DataFrame)
        groupby (str) - name of column to group by
        new_name (str) - name of the resulting Pandas Series
    Outputs:
        Pandas Series
    '''
    return (df.groupby(groupby, dropna=False)[groupby]
            .count().rename(new_name))
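
# Example (hypothetical 'Primary Type' column): count observations per
# category, keeping NaN as its own group:
#     counts = group_count(df, 'Primary Type', new_name='Crimes')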


##### Training/Testing
def split(df, test_size=0.2, random_state=14, print_size=True):
    '''
    Splits data into training and testing sets according to test_size and
    random_state.
    Inputs:
        df (Pandas DataFrame)
        test_size (float) - proportion of data held out for testing [0.0, 1.0]
        random_state (int) - seed, for replication purposes
        print_size (bool) - whether to print training and testing shapes
    Outputs:
        training, testing (Pandas DataFrames)
    '''
    training, testing = train_test_split(
        df, test_size=test_size, random_state=random_state)
    if print_size:
        print('Training Size:')
        print(training.shape)
        print()
        print('Testing Size:')
        print(testing.shape)
    return training, testing
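
# Example: an 80/20 split with the default seed for reproducibility:
#     training, testing = split(df, test_size=0.2)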


##### Pre-Processing
def impute_missing(df):
    '''
    Imputes missing values with the column median for columns of numeric
    dtype (int, float, bool), and drops any columns that are all NaN.
    Inputs:
        df (Pandas DataFrame)
    Outputs:
        new df (Pandas DataFrame)
    '''
    numeric_mask = df.dtypes.apply(is_numeric_dtype)
    na_mask = df.isna().any()
    print(f'Contains NA Values:\n{na_mask}')
    # Columns that are numeric AND contain at least one NA value
    col_mask = numeric_mask & na_mask
    medians = df.loc[:, col_mask].median().to_dict()
    return df.fillna(value=medians).dropna(axis=1, how='all')
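
# Example: impute each split separately, so the testing set's medians
# never leak into the training data:
#     training = impute_missing(training)
#     testing = impute_missing(testing)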


def normalized_values(df, ignore=[], quiet=False):
    '''
    Fits a sklearn.preprocessing.StandardScaler() on the numeric columns
    of df (excluding those in ignore).
    Inputs:
        df (Pandas DataFrame)
        ignore (list of str) - numeric columns to skip when normalizing;
            usually includes at least the target column.
        quiet (bool) - whether to suppress printing the fitted statistics
    Outputs:
        scaler (fitted StandardScaler) for use in normalize()
    '''
    df_working = df.drop(columns=ignore)
    numeric_mask = df_working.dtypes.apply(is_numeric_dtype)
    scaler = StandardScaler()
    scaler.fit(df_working.loc[:, numeric_mask])
    if not quiet:
        print('Normalization Results:')
        print(list(df_working.loc[:, numeric_mask].columns))
        print('Column Means:')
        print(scaler.mean_)
        print('Column Variances:')
        print(scaler.var_)
    return scaler


def normalize(df, scaler, ignore=[], inplace=False):
    '''
    Normalizes all numeric columns not listed in ignore using scaler.
    Inputs:
        df (Pandas DataFrame)
        scaler (fitted sklearn.preprocessing.StandardScaler)
        ignore (list of str) - columns to ignore
        inplace (bool) - whether to overwrite the numeric columns or to
            append new columns holding the normalized values
    Outputs:
        df (Pandas DataFrame) - either overwritten or with normalized
            columns appended
    '''
    df_working = df.drop(columns=ignore)
    numeric_mask = df_working.dtypes.apply(is_numeric_dtype)
    if not inplace:
        # Append normalized copies of the numeric columns, suffixed '_norm'
        col_list = [col + '_norm' for col in numeric_mask[numeric_mask].index]
        df_norm = pd.DataFrame(
            data=scaler.transform(df_working.loc[:, numeric_mask]),
            columns=col_list,
            index=df_working.loc[:, numeric_mask].index)
        return df.join(df_norm)
    # Overwrite the numeric columns with their normalized values
    df_working.loc[:, numeric_mask] = scaler.transform(
        df_working.loc[:, numeric_mask])
    return df.loc[:, ignore].join(df_working)
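
# A typical workflow sketch (the 'label' target column is hypothetical):
# fit the scaler on training data only, then apply it to both splits so
# testing-set statistics never influence the scaling:
#     scaler = normalized_values(training, ignore=['label'])
#     training = normalize(training, scaler, ignore=['label'], inplace=True)
#     testing = normalize(testing, scaler, ignore=['label'], inplace=True)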


##### Generate Features
def get_dummies(df, df_original):
    '''
    Creates dummy variables for all non-numeric features.
    Inputs:
        df (Pandas DataFrame) - dataframe containing only the columns to
            dummify.
        df_original (Pandas DataFrame) - full dataframe to append the
            dummified columns to.
    Outputs:
        df (Pandas DataFrame)
    '''
    return df_original.join(pd.get_dummies(df))


def cut(df, bins, right=True, labels=None):
    '''
    Bins a numeric Series (e.g., a single dataframe column).
    Inputs:
        df (Pandas Series) - numeric values to bin
        bins (int) - number of bins
        right (bool) - whether to include the right boundary in each bin
        labels (list of str) - bin label names
    Outputs:
        Pandas Series of binned values
    '''
    return pd.cut(df, bins=bins, right=right, labels=labels)
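
# Example (hypothetical 'Age' column): discretize into four labeled bins:
#     df['Age_bin'] = cut(df['Age'], bins=4,
#                         labels=['low', 'mid-low', 'mid-high', 'high'])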


##### Build Classifiers
def learn(target, training, testing, models, grid):
    '''
    Learns a series of models over a grid of parameters, evaluates each
    model, and reports the total time elapsed.
    Inputs:
        target (str) - name of target column
        training (Pandas DataFrame) - training data
        testing (Pandas DataFrame) - testing data
        models (dict) - dictionary of sklearn models keyed by name
        grid (dict of lists) - parameter dicts to try for each model key
    Outputs:
        results (Pandas DataFrame) - one row per model/parameter
            combination, with its evaluation metrics.
    '''
    # Begin timer
    start = datetime.datetime.now()
    # Collect one result row per model/parameter combination
    rows = []
    # Loop over models
    for model_key in models:
        # Loop over parameter combinations
        for params in grid[model_key]:
            model_start = datetime.datetime.now()
            print("Training model:", model_key, "|", params)
            # Create model
            model = models[model_key]
            model.set_params(**params)
            # Fit model on the training set
            model.fit(X=training.drop(columns=target),
                      y=training.loc[:, target])
            # Predict on the testing set
            predictions = model.predict(testing.drop(columns=target))
            # Evaluate predictions
            y_true = testing.loc[:, target]
            accuracy = metrics.accuracy_score(y_true, predictions)
            recall = metrics.recall_score(y_true, predictions)
            precision = metrics.precision_score(y_true, predictions)
            classification_report = metrics.classification_report(
                y_true, predictions)
            # Store results
            rows.append({'Model': model_key,
                         'Parameters': str(params),
                         'Accuracy': accuracy,
                         'Precision': precision,
                         'Recall': recall,
                         'Classification Report': classification_report})
            model_end = datetime.datetime.now()
            print('    Model Run Time:', model_end - model_start)
    # End timer
    stop = datetime.datetime.now()
    print("Total Time Elapsed:", stop - start)
    return pd.DataFrame(rows, columns=['Model', 'Parameters', 'Accuracy',
                                       'Precision', 'Recall',
                                       'Classification Report'])
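
# A hedged usage sketch (the model key, grid values, and 'label' target
# are illustrative, not part of this module):
#     from sklearn.linear_model import LogisticRegression
#     models = {'logistic': LogisticRegression(max_iter=1000)}
#     grid = {'logistic': [{'C': 0.1}, {'C': 1.0}]}
#     results = learn('label', training, testing, models, grid)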


def print_coefs(model, df, target, n=10):
    '''
    Prints the target name, intercept, and top/bottom n feature names and
    coefficients from a fitted linear model. Assumes that all non-target
    columns in df are features of the model.
    Inputs:
        model - fitted scikit-learn linear model instance
        df (Pandas DataFrame) - used only to extract column names
        target (str) - name of target column
        n (int) - number of top/bottom features to print
    Outputs:
        None, prints to screen.
    '''
    if n > pd.get_option('display.max_rows'):
        pd.set_option('display.max_rows', n)
    series = (pd.Series(
                  data=model.coef_.reshape(-1,),
                  index=df.drop(columns=target).columns)
              .sort_values(ascending=False))
    print(f'Target:\n{target}\n\nIntercept:\n{model.intercept_}\n')
    if n >= model.coef_.shape[1]:
        print(f'Features and Coefficients:\n{series}\n')
    else:
        print(f'Top {n} Features and Coefficients:\n{series.head(n)}\n\n' +
              f'Bottom {n} Features and Coefficients:\n{series.tail(n)}\n\n')
    pd.reset_option('display.max_rows')
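
# Example (assumes 'model' is a fitted linear model such as the
# LogisticRegression above, and that training holds the hypothetical
# 'label' target):
#     print_coefs(model, training, 'label', n=10)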