-
Notifications
You must be signed in to change notification settings - Fork 324
/
prepare_features.py
70 lines (57 loc) · 1.73 KB
/
prepare_features.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import pickle
from datetime import datetime
from sklearn import preprocessing
import numpy as np
import random
# Seed the standard-library RNG with a fixed value.
random.seed(42)


def _load_pickle(path):
    """Return the object unpickled from the binary file at *path*."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Raw training records, and how many of them there are.
train_data = _load_pickle('train_data.pickle')
num_records = len(train_data)

# Per-store metadata; feature_list() looks entries up by store number.
store_data = _load_pickle('store_data.pickle')
def feature_list(record, stores=None):
    """Turn one raw sales record into a flat list of feature values.

    Args:
        record: Mapping with the keys 'Date' (formatted ``%Y-%m-%d``),
            'Store', 'DayOfWeek', 'Open' and 'Promo'.
        stores: Optional sequence of per-store mappings, indexed by store
            number minus one, each providing a 'State' entry.  When
            omitted, the module-level ``store_data`` is used.

    Returns:
        A list ``[open, store, day_of_week, promo, year, month, day, state]``.
        ``open`` falls back to 1 when the record's 'Open' field is missing
        or cannot be parsed as an integer.

    Raises:
        ValueError: If 'Date' does not match ``%Y-%m-%d`` or 'Store',
            'DayOfWeek' or 'Promo' is not an integer literal.
    """
    if stores is None:
        stores = store_data

    dt = datetime.strptime(record['Date'], '%Y-%m-%d')
    store_index = int(record['Store'])
    day_of_week = int(record['DayOfWeek'])

    # Only a missing or unparsable 'Open' value triggers the default; a bare
    # ``except`` here would also hide KeyboardInterrupt and real bugs.
    try:
        store_open = int(record['Open'])
    except (KeyError, ValueError, TypeError):
        store_open = 1

    promo = int(record['Promo'])

    return [
        store_open,
        store_index,
        day_of_week,
        promo,
        dt.year,
        dt.month,
        dt.day,
        # Store numbers are used as 1-based positions into the store table.
        stores[store_index - 1]['State'],
    ]
# Collect features and targets, keeping only records whose 'Sales' field is
# not the string '0' and whose 'Open' field is not blank.
train_data_X = []
train_data_y = []
for record in train_data:
    if record['Sales'] != '0' and record['Open'] != '':
        train_data_X.append(feature_list(record))
        train_data_y.append(int(record['Sales']))

print("Number of train datapoints: ", len(train_data_y))
if not train_data_y:
    # min()/max() and the per-column loop below cannot run on an empty set;
    # fail with an explicit message instead of an obscure one.
    raise ValueError("no usable training records found in train_data")
print(min(train_data_y), max(train_data_y))

# NOTE(review): each row mixes ints with the store's 'State' value, so numpy
# presumably settles on a string dtype here; the encoded labels are cast to
# int further down.
train_data_X = np.array(train_data_X)

# One LabelEncoder per feature column, kept in column order so the saved
# list can be used to encode other data the same way.
les = []
for i in range(train_data_X.shape[1]):
    le = preprocessing.LabelEncoder()
    # Fitting and encoding on the same column gives the same labels as
    # fitting on a separate identical copy, without holding the whole
    # feature matrix in memory twice.
    train_data_X[:, i] = le.fit_transform(train_data_X[:, i])
    les.append(le)

with open('les.pickle', 'wb') as f:
    pickle.dump(les, f, pickle.HIGHEST_PROTOCOL)

train_data_X = train_data_X.astype(int)
train_data_y = np.array(train_data_y)

with open('feature_train_data.pickle', 'wb') as f:
    pickle.dump((train_data_X, train_data_y), f, pickle.HIGHEST_PROTOCOL)

# Show the first encoded row and its target as a quick sanity check.
print(train_data_X[0], train_data_y[0])