-
Notifications
You must be signed in to change notification settings - Fork 0
/
lag_on_mean.py
211 lines (178 loc) · 10.9 KB
/
lag_on_mean.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
import os
import gzip
import csv
from dataset import DataSet
import xgboost as xgb
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold, GridSearchCV
import numpy as np
import logging
import pandas as pd
import time
start_time = time.time()
from tqdm import tqdm
from sklearn.model_selection import KFold
def _log_stage(message):
    """Print ``message`` prefixed with the current local time in brackets."""
    print("[%s] %s" % (time.ctime(), message))


def _log_elapsed(message):
    """Print ``message`` prefixed with the minutes elapsed since ``start_time``."""
    print('%0.2f min: %s' % ((time.time() - start_time) / 60, message))


def _add_kfold_mean_encoding(train_df, test_df, columns, target, n_splits=5):
    """Append one ``<column>_cnt_month_mean_Kfold`` column per entry of ``columns``.

    For the training frame each row receives the mean of ``target`` for its
    category computed on the *other* folds only; categories absent from those
    folds fall back to the global target mean. For the test frame the
    category means are computed on the whole training frame, and every NaN
    left in the test frame afterwards is replaced by 0.

    Returns the extended ``(train_df, test_df)`` pair; the frames passed in
    are not modified.
    """
    global_mean = train_df[target].mean()
    # random_state is deliberately omitted: it has no effect without
    # shuffling, and scikit-learn >= 0.24 raises ValueError when it is
    # combined with shuffle=False.
    folds = KFold(n_splits=n_splits, shuffle=False)

    for column in tqdm(columns):
        encoded_name = column + '_cnt_month_mean_Kfold'

        # Work on an explicit copy so the assignments below never write
        # through a slice of train_df.
        fold_frame = train_df[[column, target]].copy()
        fold_frame[encoded_name] = global_mean
        for fit_idx, holdout_idx in folds.split(fold_frame):
            fit_part = fold_frame.iloc[fit_idx]
            holdout_part = fold_frame.iloc[holdout_idx]
            fold_means = fit_part.groupby(column)[target].mean()
            fold_frame.loc[fold_frame.index[holdout_idx], encoded_name] = \
                holdout_part[column].map(fold_means)
        # Plain re-assignment instead of a chained inplace fillna, which does
        # not update the frame under pandas copy-on-write.
        encoded = fold_frame[encoded_name].fillna(global_mean)
        train_df = pd.concat([train_df, encoded], axis=1)

        # Test rows use the category means of the complete training frame.
        # NOTE(review): unseen categories end up as 0 here, whereas the
        # training side falls back to the global mean -- confirm intended.
        full_means = train_df.groupby(column)[target].mean()
        test_df = test_df.assign(
            **{encoded_name: test_df[column].map(full_means)}
        ).fillna(0)

    return train_df, test_df


def _add_lag_features(frame, group_keys, lags, suffix):
    """Append lagged group means of ``item_cnt_month`` to ``frame``.

    ``item_cnt_month`` is averaged per ``group_keys`` (which must contain
    ``'date_block_num'``). For every ``lag`` the averages are shifted forward
    by ``lag`` blocks and left-merged back, producing a column named
    ``'<lag>_month_ago_item_cnt_month<suffix>'``. Rows without a match get 0.

    Returns the merged frame (a new object; row order follows ``frame``).
    """
    merge_keys = list(group_keys)
    # The aggregate does not depend on the lag, so compute it once.
    monthly_mean = (
        frame.groupby(by=merge_keys)
        .agg({'item_cnt_month': 'mean'})
        .reset_index()
    )
    for lag in tqdm(lags):
        feature_name = str(lag) + '_month_ago_item_cnt_month' + suffix
        shifted = monthly_mean.rename(columns={'item_cnt_month': feature_name})
        # A value observed in block b becomes the "lag blocks ago" value of
        # block b + lag.
        shifted['date_block_num'] = shifted['date_block_num'] + lag
        frame = frame.merge(shifted, on=merge_keys, how='left')
        frame[feature_name] = frame[feature_name].fillna(0)
    return frame


def meanencoding_lagfeature():
    """Build mean-encoded and lag-based feature matrices for train and test.

    Steps, in order:

    1. Load train/test data through ``DataSet`` and wrap them in DataFrames
       (the ``price`` column is dropped).
    2. Add 5-fold target mean encodings of ``item_cnt_month`` for shop, item,
       category, city, type and sub-type.
    3. Concatenate train and test and add eight families of lag features.
    4. Split back, drop the id columns, the target and the shop/item/category
       encodings, and convert the remainder to NumPy arrays.

    Side effects: prints progress to stdout and writes ``train_df_head.csv``
    and ``test_df_head.csv`` (first rows of the final frames) to the current
    working directory.

    Returns:
        tuple: ``(train_features, trainY, test_features)`` where the feature
        entries are 2-D NumPy arrays and ``trainY`` is passed through
        unchanged from ``DataSet.loadTrainData``.
    """
    _log_stage("Initialize the dataset.")
    dataset = DataSet()
    _log_stage("Loading Training Data ...")
    # NOTE(review): the meaning of the ``True`` flag is defined by DataSet and
    # is not visible from this file.
    trainX, trainY = dataset.loadTrainData(True)
    _log_stage("Loading Testing Data ...")
    testX = dataset.loadTestData(True)

    train_df = pd.DataFrame(
        trainX,
        columns=['shop_id', 'item_id', 'cat_id', 'date_block_num', 'year',
                 'month', 'city_code', 'type_id', 'sub_type_id', 'price'],
    )
    train_df = train_df.drop(['price'], axis=1)
    train_df['item_cnt_month'] = np.array(trainY)

    test_df = pd.DataFrame(
        testX,
        columns=['shop_id', 'item_id', 'cat_id', 'date_block_num', 'year',
                 'month', 'city_code', 'type_id', 'sub_type_id', 'price'],
    )
    test_df = test_df.drop(['price'], axis=1)

    # Remembered so the combined frame can be split back after the merges.
    train_row = int(train_df.shape[0])
    print(train_row)

    _log_stage("Mean Encoding and Feature Engineering ...")

    # K fold Target Encoding
    _log_elapsed('Start adding mean-encoding for item_cnt_month')
    train_df, test_df = _add_kfold_mean_encoding(
        train_df,
        test_df,
        columns=['shop_id', 'item_id', 'cat_id', 'city_code', 'type_id',
                 'sub_type_id'],
        target='item_cnt_month',
    )
    _log_elapsed('Finish adding mean-encoding')

    # Feature Engineering -- Creating lag based feature
    _log_elapsed('Start adding lag based feature')
    # The test frame needs a target column before it can be stacked under the
    # training frame; -1 is only a placeholder.
    test_df['item_cnt_month'] = -1
    train_test_df = pd.concat([train_df, test_df], axis=0)

    # (progress message, group keys, lags, feature-name suffix)
    # NOTE(review): the groups are keyed on the mean-encoded columns rather
    # than on the raw ids. Training encodings differ between folds and test
    # encodings come from the full training set, so equal ids do not
    # necessarily share a key -- confirm this is intended.
    lag_specs = [
        ('Adding first lag feature -- x:1,2,3,6,12 month ago item_cnt_month with same shop_id&item_id',
         ['item_id_cnt_month_mean_Kfold', 'shop_id_cnt_month_mean_Kfold', 'date_block_num'],
         [1, 2, 3, 6, 12],
         '_same_shop_item'),
        ('Adding second lag feature -- x:1 month ago average item_cnt_month in all',
         ['date_block_num'],
         [1],
         '_in_all'),
        ('Adding third lag feature -- x:1,2,3,6,12 month ago average item_cnt_month with same item_id',
         ['item_id_cnt_month_mean_Kfold', 'date_block_num'],
         [1, 2, 3, 6, 12],
         '_item'),
        ('Adding fourth lag feature -- x:1,2,3,6,12 month ago average item_cnt_month with same shop_id',
         ['shop_id_cnt_month_mean_Kfold', 'date_block_num'],
         [1, 2, 3, 6, 12],
         '_shop'),
        ('Adding fifth lag feature -- x:1 month ago average item_cnt_month with same cat_id',
         ['cat_id_cnt_month_mean_Kfold', 'date_block_num'],
         [1],
         '_cat'),
        ('Adding sixth lag feature -- x:1 month ago average item_cnt_month with same cat_id&shop_id',
         ['cat_id_cnt_month_mean_Kfold', 'shop_id_cnt_month_mean_Kfold', 'date_block_num'],
         [1],
         '_cat_shop'),
        ('Adding seventh lag feature -- x:1 month ago average item_cnt_month with same city_code',
         ['city_code_cnt_month_mean_Kfold', 'date_block_num'],
         [1],
         '_city'),
        ('Adding eighth lag feature -- x:1 month ago average item_cnt_month with same city_code&shop_id',
         ['shop_id_cnt_month_mean_Kfold', 'city_code_cnt_month_mean_Kfold', 'date_block_num'],
         [1],
         '_shop_city'),
    ]
    for message, group_keys, lags, suffix in lag_specs:
        _log_elapsed(message)
        train_test_df = _add_lag_features(train_test_df, group_keys, lags, suffix)
    _log_elapsed('Finish adding lag based feature')

    _log_elapsed('Start generating training and testing data')
    # Left merges keep the row order, so the first train_row rows are still
    # the training rows and the rest are the test rows.
    dropped_columns = ['shop_id', 'item_id', 'cat_id', 'date_block_num',
                       'item_cnt_month', 'shop_id_cnt_month_mean_Kfold',
                       'item_id_cnt_month_mean_Kfold',
                       'cat_id_cnt_month_mean_Kfold']
    new_train_df = train_test_df.iloc[:train_row].drop(dropped_columns, axis=1)
    new_test_df = train_test_df.iloc[train_row:].drop(dropped_columns, axis=1)

    # Small samples written out for manual inspection of the final columns.
    new_train_df.head().to_csv('train_df_head.csv')
    new_test_df.head().to_csv('test_df_head.csv')

    # .values yields the ndarray directly, without a detour through nested
    # Python lists.
    mean_encoded_lag_feature_trainX = new_train_df.values
    mean_encoded_lag_feature_testX = new_test_df.values
    print(np.shape(mean_encoded_lag_feature_trainX))
    print(np.shape(mean_encoded_lag_feature_testX))
    _log_elapsed('Finish generating training and testing data')

    return mean_encoded_lag_feature_trainX, trainY, mean_encoded_lag_feature_testX
if __name__ == "__main__":
    # For test and debug only: run the whole feature pipeline once when the
    # module is executed as a script.
    (mean_encoded_trainX,
     trainY,
     mean_encoded_testX) = meanencoding_lagfeature()