-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathwalmart.py
113 lines (77 loc) · 3.17 KB
/
walmart.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
#importing required libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.pipeline import make_pipeline,Pipeline
from sklearn.model_selection import train_test_split,GridSearchCV
from sklearn.linear_model import LinearRegression,Ridge,Lasso
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error as mse
import datetime
from sklearn.base import BaseEstimator
import pdpipe as pdp
# class definition for joining multiple model throuh pipes
class ClfSwitcher(BaseEstimator):
def __init__(
self,
estimator = LinearRegression(),
):
"""
A Custom BaseEstimator that can switch between classifiers.
:param estimator: sklearn object - The classifier
"""
self.estimator = estimator
def fit(self, X, y = None, **kwargs):
self.estimator.fit(X, y)
return self
def predict(self, X, y = None):
return self.estimator.predict(X)
def score(self, X, y):
return self.estimator.score(X, y)
def preprocessing_walmart(dataset):# 2nd function definition processing
dataset['Day'] = pd.to_datetime(dataset['Date'])
#using pandas pipeline
panda_pipe = pdp.ApplyByCols('Day',lambda x: (x.day//7)+1,'Week_no',drop = False)
#converting given day into week of the month
panda_pipe += pdp.ApplyByCols('Day',lambda x: x.month,'month',drop = False)
#getting month from the date
panda_pipe += pdp.ColDrop(['Date','Day'])
dataset = panda_pipe(dataset)
dataset['Lag2'] = dataset['Weekly_Sales'].shift(2)
dataset['Lag3'] = dataset['Weekly_Sales'].shift(3)
dataset['Lag4'] = dataset['Weekly_Sales'].shift(4)
dataset['Lag5'] = dataset['Weekly_Sales'].shift(5)
dataset['Lag6'] = dataset['Weekly_Sales'].shift(6)
to_be_predicted = dataset['Weekly_Sales']
dataset = dataset.drop(columns = ['Weekly_Sales'])
X_train,X_test,Y_train,Y_test = train_test_split(dataset, to_be_predicted, random_state = 42, test_size = 0.3)
return (X_train,Y_train,X_test)
def model_pipeline():#3rd function definition
#referenced class ClfSwitcher for optimal parameter
pipe = Pipeline([('standard',StandardScaler()),('reg',ClfSwitcher())])
'''
class reference for selecting the best models
'''
params = [{'reg__estimator':[LinearRegression()]},
{
'reg__estimator':[Ridge()],
'reg__estimator__alpha':[0.1,1,10]},
{
'reg__estimator':[Lasso()],
'reg__estimator__alpha':[.001,.01,.1,1,10]
}]
#given evaluation states to pipe(ClfSwitcher(BaseEstimator=LinearRegression))
#using gridsearchCV for best model selection
gscv = GridSearchCV(pipe, params, cv=5)
return gscv
def results(X_train, Y_train, X_test, model):
model.fit(X_train,Y_train)
model.predict(X_test)
def main():# First Function definition main
dataset = pd.read_csv("/home/sundaram/walmart-recruiting-store-sales-forecasting/train.csv")
X_train,Y_train,X_test = preprocessing_walmart(dataset)# 2nd function call for processing
model=model_pipeline()
results(X_train, Y_train, X_test, model)
if __name__ == "__main__":
main()