evaluation.py
# Project-specific model and helper definitions used below
# (get_wide_deep, get_deep, get_wide_deep_single, ensemble)
from helpers import *
from models import *
import numpy as np
import pandas as pd
import keras.backend as K
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
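# NOTE: the commented-out ensemble() calls later in this script refer to a helper defined
# elsewhere in this repository and not reproduced here. As a rough, hypothetical sketch of
# the Monte Carlo dropout pattern implied by the K.function(..., K.learning_phase()) objects
# built below, one could average several stochastic forward passes with dropout kept active
# (learning_phase = 1). The name mc_dropout_predict, the argument order, and the sample
# count are assumptions for illustration, not the authors' implementation.
def mc_dropout_predict(X_batch, mc_dropout_fn, n_samples=10):
    # Each call with learning_phase=1 keeps dropout on, giving one stochastic sample
    samples = [mc_dropout_fn([X_batch, 1])[0] for _ in range(n_samples)]
    # Average the sampled probability matrices to obtain the MC dropout prediction
    return np.mean(samples, axis=0)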
# Definitions
repeats = 5
cv_splits = 10
num_drugs = 11
drugs = ['rif', 'inh', 'pza', 'emb', 'str', 'cip', 'cap', 'amk', 'moxi', 'oflx', 'kan']
data_dir = '/mnt/raid1/TB_data/tb_data_050818/'
# Data
X = np.loadtxt(data_dir + 'X_features.csv', delimiter=',')
alpha_matrix = np.loadtxt(data_dir + 'alpha_matrix.csv', delimiter=',')
y = np.loadtxt(data_dir + 'labels.csv', delimiter=',')
# Mutation calls unavailable for the subset of isolates that underwent targeted sequencing
# are encoded as -1; recode them as 0.5
X[X == -1] = 0.5
# Get mutations that appear in at least 30 isolates
sufficient_inds = np.squeeze(np.where((X == 1).sum(axis=0) >= 30))
X = X[:,sufficient_inds]
column_names = ['Algorithm','Drug','AUC','AUC_PR']
results = pd.DataFrame(columns=column_names)
results_index = 0
for r in range(repeats):
    cross_val_split = KFold(n_splits=cv_splits, shuffle=True)
    for train, val in cross_val_split.split(X):
        print('Repeat %d' % r)
        X_train = X[train]
        X_val = X[val]
        y_train = y[train]
        y_val = y[val]
        #------- Train the multitask wide and deep neural network ------#
        wdnn = get_wide_deep()
        wdnn.fit(X_train, alpha_matrix[train], epochs=100, verbose=False,
                 validation_data=(X_val, alpha_matrix[val]))
        mc_dropout = K.function(wdnn.inputs + [K.learning_phase()], wdnn.outputs)
        #wdnn_probs = ensemble(X_val, y_val, mc_dropout)
        wdnn_probs = wdnn.predict(X_val)
        # Append this fold's validation predictions and labels
        with open('/mnt/raid1/TB_data/preds/val_probs.csv', 'a') as f:
            df = pd.DataFrame(wdnn_probs)
            df.to_csv(f, header=False, index=False)
        with open('/mnt/raid1/TB_data/preds/y_val.csv', 'a') as f:
            df = pd.DataFrame(y_val)
            df.to_csv(f, header=False, index=False)
        for i, drug in enumerate(drugs):
            # Evaluate only on isolates with a phenotype for this drug
            non_missing_val = np.where(y_val[:, i] != -1)[0]
            auc_y = np.reshape(y_val[non_missing_val, i], (len(non_missing_val), 1))
            auc_preds = np.reshape(wdnn_probs[non_missing_val, i], (len(non_missing_val), 1))
            val_auc = roc_auc_score(auc_y, auc_preds)
            val_auc_pr = average_precision_score(1 - y_val[non_missing_val, i],
                                                 1 - wdnn_probs[non_missing_val, i])
            results.loc[results_index] = ['WDNN', drug, val_auc, val_auc_pr]
            #print(drug + '\t' + str(val_auc) + '\t' + str(val_auc_pr))
            results_index += 1
        #------- Train a deep multilayer perceptron ------#
        deep = get_deep()
        deep.fit(X_train, alpha_matrix[train], epochs=100, verbose=False,
                 validation_data=(X_val, alpha_matrix[val]))
        #deep_probs = ensemble(X_val, y_val, mc_dropout)
        deep_probs = deep.predict(X_val)
        for i, drug in enumerate(drugs):
            non_missing_val = np.where(y_val[:, i] != -1)[0]
            auc_y = np.reshape(y_val[non_missing_val, i], (len(non_missing_val), 1))
            auc_preds = np.reshape(deep_probs[non_missing_val, i], (len(non_missing_val), 1))
            val_auc = roc_auc_score(auc_y, auc_preds)
            val_auc_pr = average_precision_score(1 - y_val[non_missing_val, i],
                                                 1 - deep_probs[non_missing_val, i])
            results.loc[results_index] = ['Deep MLP', drug, val_auc, val_auc_pr]
            #print(drug + '\t' + str(val_auc) + '\t' + str(val_auc_pr))
            results_index += 1
for i, drug in enumerate(drugs):
    y_drug = y[:, i]
    # Disregard rows for which no resistance data exists
    y_non_missing = y_drug[y_drug != -1]
    X_non_missing = X[y_drug != -1]
    cross_val_split = KFold(n_splits=cv_splits, shuffle=True)
    for train, val in cross_val_split.split(X_non_missing):
        X_train = X_non_missing[train]
        X_val = X_non_missing[val]
        y_train = y_non_missing[train]
        y_val = y_non_missing[val]
        # Train and predict on random forest classifier
        random_forest = RandomForestClassifier(n_estimators=1000, max_features='auto', min_samples_leaf=0.002)
        random_forest.fit(X_train, y_train)
        pred_rf = random_forest.predict_proba(X_val)
        # Get AUC and AUC-PR of drug for RF
        rf_auc = roc_auc_score(y_val, pred_rf[:, 1])
        rf_auc_pr = average_precision_score(1 - y_val, 1 - pred_rf[:, 1])
        results.loc[results_index] = ['Random Forest', drug, rf_auc, rf_auc_pr]
        results_index += 1
        # Train and predict on regularized logistic regression model
        log_reg = LogisticRegression(penalty='l2', solver='liblinear')
        Cs = np.logspace(-5, 5, 10)
        estimator = GridSearchCV(estimator=log_reg, param_grid={'C': Cs}, cv=5, scoring='roc_auc')
        estimator.fit(X_train, y_train)
        pred_lm = estimator.predict_proba(X_val)
        lm_auc = roc_auc_score(y_val, pred_lm[:, 1])
        lm_auc_pr = average_precision_score(1 - y_val, 1 - pred_lm[:, 1])
        results.loc[results_index] = ['Logistic Regression', drug, lm_auc, lm_auc_pr]
        results_index += 1
# Train the single task WDNN
for i, drug in enumerate(drugs):
    y_drug = y[:, i]
    # Disregard rows for which no resistance data exists
    y_non_missing = y_drug[y_drug != -1]
    X_non_missing = X[y_drug != -1]
    cross_val_split = KFold(n_splits=cv_splits, shuffle=True)
    for train, val in cross_val_split.split(X_non_missing):
        print(drug)
        # Training and validation data
        X_train = X_non_missing[train]
        X_val = X_non_missing[val]
        y_train = y_non_missing[train]
        y_val = y_non_missing[val]
        # Train and predict on the single task wide and deep neural network
        wdnn_single = get_wide_deep_single()
        wdnn_single.fit(X_train, y_train, epochs=100, validation_data=(X_val, y_val), verbose=False)
        # Backend function for predicting with dropout active at test time (MC dropout)
        wdnn_single_mc_dropout = K.function(wdnn_single.inputs + [K.learning_phase()], wdnn_single.outputs)
        #wdnn_single_preds = ensemble(X_val, np.expand_dims(y_val, axis=1), wdnn_single_mc_dropout)
        wdnn_single_preds = wdnn_single.predict(X_val)
        # Get AUC and AUC-PR of drug for the single task WDNN
        wdnn_single_auc = roc_auc_score(y_val.reshape(len(y_val), 1),
                                        wdnn_single_preds.reshape((len(wdnn_single_preds), 1)))
        wdnn_single_auc_pr = average_precision_score(1 - y_val.reshape(len(y_val), 1),
                                                     1 - wdnn_single_preds.reshape((len(wdnn_single_preds), 1)))
        results.loc[results_index] = ['WDNN Single Task', drug, wdnn_single_auc, wdnn_single_auc_pr]
        results_index += 1
        #K.clear_session()
results.to_csv('/mnt/raid1/TB_data/results.csv', index=False)
results.to_csv('results_020719/results_pr.csv', index=False)