-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathWhyRegularize.py
107 lines (86 loc) · 3.6 KB
/
WhyRegularize.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
from collections import OrderedDict
import copy
import functools

import numpy as np
import pandas as pd
from pandas import DataFrame, Series
import scipy
import sklearn as sk
import sklearn.model_selection as model_selection
from sklearn.model_selection import ShuffleSplit
import sklearn.feature_selection as feature_selection
import sklearn.linear_model as linear_model
import sklearn.pipeline as pipeline

import MaclearnUtilities
from MaclearnUtilities import bhfdr, colcor
def pandaize(f):
    """Decorator: coerce the X argument to a numpy array before calling f.

    Lets scoring helpers written for plain ndarrays accept pandas
    DataFrames transparently.
    """
    # functools.wraps preserves f's __name__/__doc__ on the wrapper
    # (the original wrapper reported itself as "pandaized").
    @functools.wraps(f)
    def pandaized(estimator, X, y, **kwargs):
        # np.array(X) strips pandas indexing so downstream code sees a
        # plain ndarray.
        return f(estimator, np.array(X), y, **kwargs)
    return pandaized
def cross_val_score_pd(estimator, X, y, **kwargs):
    """Run sklearn's cross_val_score, accepting a pandas DataFrame for X."""
    return model_selection.cross_val_score(estimator, X, y, **kwargs)
## explicit wrapping instead of decorator syntax; same resulting callable
cross_val_score_pd = pandaize(cross_val_score_pd)
## -----------------------------------------------------------------
## linear regression simulated example
## -----------------------------------------------------------------
## Simulate n_obs observations of 4 features where feature 1 is nearly
## collinear with feature 0; only feature 3 actually drives y.
## (Sample count was previously hard-coded as 15 both in randn and in
## the Ridge alpha; name it once so the two stay in sync.)
n_obs = 15
x = np.random.randn(n_obs, 4)
x[:, 1] = x[:, 0] + 0.01 * x[:, 1]   # near-duplicate column -> unstable OLS
y = x[:, 3] + np.random.randn(n_obs)
## Ordinary least squares: collinear columns 0/1 tend to get large,
## offsetting coefficients.
linmod = linear_model.LinearRegression().fit(x, y)
linmod.coef_
## Ridge (L2): alpha scaled by sample count, keeping the effective
## per-observation penalty strength fixed (n_obs*0.1 == 15*0.1).
l2mod = linear_model.Ridge(alpha=n_obs*0.1).fit(x, y)
l2mod.coef_
## Lasso (L1): tends to zero out redundant/uninformative coefficients.
l1mod = linear_model.Lasso(alpha=0.1).fit(x, y)
l1mod.coef_
## -----------------------------------------------------------------
## load Hess data
## -----------------------------------------------------------------
def readTab(file):
    """Read a tab-delimited file into a DataFrame (first row = column
    names, first column = row index)."""
    readOpts = {"sep": "\t", "header": 0, "index_col": 0}
    return pd.read_csv(file, **readOpts)
## Expression matrix comes samples-in-columns; transpose to samples-in-rows.
hessDir = "microarray/Hess/"
x = readTab(hessDir + "HessTrainingData.tsv.gz").transpose()
annot = readTab(hessDir + "HessTrainingAnnotation.tsv")
## Encode the pCR/RD text labels as numeric class values.
y = MaclearnUtilities.safeFactorize(annot.pCRtxt)
## Effectively-unregularized logistic regression (huge C) on the top 4
## features ranked by univariate F-score.
featselStep = ('featsel', feature_selection.SelectKBest(
        feature_selection.f_regression, k=4))
unregStep = ('classifier', linear_model.LogisticRegression(C=1e15))
logisticFitter = pipeline.Pipeline([featselStep, unregStep])
## fit a deep copy so logisticFitter itself stays unfit for later CV use
logisticFit = copy.deepcopy(logisticFitter).fit(x, y)
logisticCoef = logisticFit.named_steps['classifier'].coef_
## -----------------------------------------------------------------
## regularized models
## -----------------------------------------------------------------
def _penalizedFitter(penalty):
    """Pipeline: top-4 F-score features -> penalized logistic regression
    (C scaled by sample count; penalty is "l1" or "l2")."""
    return pipeline.Pipeline([
        ('featsel', feature_selection.SelectKBest(
            feature_selection.f_regression, k=4)),
        ('classifier', linear_model.LogisticRegression(
            C=20.0/len(y), penalty=penalty, intercept_scaling=100))
    ])
l2Fitter = _penalizedFitter("l2")
l2Fit = copy.deepcopy(l2Fitter).fit(x, y)
l2Coef = l2Fit.get_params()['classifier'].coef_
l1Fitter = _penalizedFitter("l1")
l1Fit = copy.deepcopy(l1Fitter).fit(x, y)
l1Coef = l1Fit.get_params()['classifier'].coef_
## -----------------------------------------------------------------
## cross-validated accuracy comparison
## -----------------------------------------------------------------
cvSchedule = ShuffleSplit(n_splits=5, test_size=0.2, random_state=123)
def _cvAccuracy(fitter):
    """Mean CV accuracy of fitter over the shared shuffle-split schedule
    (fixed random_state makes each .split(x) call deterministic)."""
    scores = cross_val_score_pd(estimator=fitter, X=x, y=y,
                                cv=cvSchedule.split(x))
    return np.mean(scores)
cvLogisticAcc = _cvAccuracy(logisticFitter)
cvLogisticAcc
cvL1Acc = _cvAccuracy(l1Fitter)
cvL1Acc
cvL2Acc = _cvAccuracy(l2Fitter)
cvL2Acc