Commit aadd822 (1 parent: 5d22ee7)
Your Name committed on Feb 19, 2019
Showing 19 changed files with 9,865 additions and 0 deletions.
(Two binary files and two large diffs in this commit are not shown below.)
@@ -0,0 +1,98 @@ (new file: repeated 10-fold CV comparison of the multitask WDNN, random forest, and logistic regression on raw features)
# helpers and models supply the network builders used below (e.g. get_wide_deep_raw_features)
from helpers import *
from models import *
import numpy as np
import pandas as pd
import keras.backend as K
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Definitions
repeats = 5
cv_splits = 10
num_drugs = 11
drugs = ['rif', 'inh', 'pza', 'emb', 'str', 'cip', 'cap', 'amk', 'moxi', 'oflx', 'kan']

data_dir = 'training_data_102618/'
# Data
X = np.loadtxt(data_dir + 'X_features.csv', delimiter=',')
alpha_matrix = np.loadtxt(data_dir + 'alpha_matrix.csv', delimiter=',')
y = np.loadtxt(data_dir + 'labels.csv', delimiter=',')

# Mutations unavailable for the subset of isolates that underwent targeted
# sequencing are coded -1; recode them as 0.5, midway between absent and present
X[X == -1] = 0.5

# Keep only mutations that appear in at least 30 isolates
sufficient_inds = np.squeeze(np.where((X == 1).sum(axis=0) >= 30))
X = X[:, sufficient_inds]

# Derived features are the last 56 columns; keep the raw features only
raw_features_end = X.shape[1] - 56
X = X[:, :raw_features_end]
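The two preprocessing steps above are easy to sanity-check on a toy matrix. This snippet is purely illustrative (the 4x3 matrix and the >= 2 carrier threshold are made up, not from the data):

import numpy as np

# 4 isolates x 3 mutations; -1 marks genotypes unavailable for the
# targeted-sequencing isolates
toy = np.array([[1, -1, 0],
                [1,  0, 1],
                [0,  1, 0],
                [1, -1, 1]], dtype=float)
toy[toy == -1] = 0.5                                       # missing -> 0.5, midway between 0 and 1
keep = np.squeeze(np.where((toy == 1).sum(axis=0) >= 2))   # >= 2 carriers here vs. >= 30 above
print(toy[:, keep])                                        # columns 0 and 2 survive the filter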
column_names = ['Algorithm', 'Drug', 'AUC', 'AUC_PR']
results = pd.DataFrame(columns=column_names)
results_index = 0
for r in range(repeats):
    cross_val_split = KFold(n_splits=cv_splits, shuffle=True)
    for train, val in cross_val_split.split(X):
        X_train = X[train]
        X_val = X[val]
        y_train = y[train]
        y_val = y[val]
        # ------- Train the wide and deep neural network -------
        wdnn = get_wide_deep_raw_features()
        wdnn.fit(X_train, alpha_matrix[train], epochs=100, verbose=False,
                 validation_data=(X_val, alpha_matrix[val]))
        mc_dropout = K.Function(wdnn.inputs + [K.learning_phase()], wdnn.outputs)
        # wdnn_probs = ensemble(X_val, y_val, mc_dropout)
        wdnn_probs = wdnn.predict(X_val)
        for i, drug in enumerate(drugs):
            # Score only isolates with a phenotype for this drug
            non_missing_val = np.where(y_val[:, i] != -1)[0]
            auc_y = np.reshape(y_val[non_missing_val, i], (len(non_missing_val), 1))
            auc_preds = np.reshape(wdnn_probs[non_missing_val, i], (len(non_missing_val), 1))
            val_auc = roc_auc_score(auc_y, auc_preds)
            # Labels and probabilities are flipped so average precision treats
            # "sensitive" as the positive class
            val_auc_pr = average_precision_score(1 - y_val[non_missing_val, i],
                                                 1 - wdnn_probs[non_missing_val, i])
            results.loc[results_index] = ['WDNN Raw Features', drug, val_auc, val_auc_pr]
            print(drug + '\t' + str(val_auc) + '\t' + str(val_auc_pr))
            results_index += 1
for r in range(repeats):
    for i, drug in enumerate(drugs):
        y_drug = y[:, i]
        # Disregard rows for which no resistance data exists
        y_non_missing = y_drug[y_drug != -1]
        X_non_missing = X[y_drug != -1]
        cross_val_split = KFold(n_splits=cv_splits, shuffle=True)
        for train, val in cross_val_split.split(X_non_missing):
            X_train = X_non_missing[train]
            X_val = X_non_missing[val]
            y_train = y_non_missing[train]
            y_val = y_non_missing[val]
            # Train and predict with a random forest classifier
            random_forest = RandomForestClassifier(n_estimators=1000, max_features='auto',
                                                   min_samples_leaf=0.002)
            random_forest.fit(X_train, y_train)
            pred_rf = random_forest.predict_proba(X_val)
            # AUC and AUC-PR of the drug for the random forest
            rf_auc = roc_auc_score(y_val, pred_rf[:, 1])
            rf_auc_pr = average_precision_score(1 - y_val, 1 - pred_rf[:, 1])
            results.loc[results_index] = ['Random Forest Raw Features', drug, rf_auc, rf_auc_pr]
            results_index += 1
            # Train and predict with an L2-regularized logistic regression model
            log_reg = LogisticRegression(penalty='l2', solver='liblinear')
            Cs = np.logspace(-5, 5, 10)
            estimator = GridSearchCV(estimator=log_reg, param_grid={'C': Cs}, cv=5, scoring='roc_auc')
            estimator.fit(X_train, y_train)
            pred_lm = estimator.predict_proba(X_val)
            lm_auc = roc_auc_score(y_val, pred_lm[:, 1])
            lm_auc_pr = average_precision_score(1 - y_val, 1 - pred_lm[:, 1])
            results.loc[results_index] = ['Logistic Regression Raw Features', drug, lm_auc, lm_auc_pr]
            results_index += 1

results.to_csv('results_raw_features_rf_lm.csv', index=False)
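get_wide_deep_raw_features comes from the helpers/models modules, which this commit does not include. As a rough guide only, here is a minimal sketch of what such a builder could look like; the layer sizes, the masked loss, and the assumption that alpha_matrix carries per-drug labels with -1 for missing phenotypes are all guesses for illustration, not the authors' implementation:

from keras.layers import Dense, Dropout, Input, concatenate
from keras.models import Model
import keras.backend as K

def masked_multitask_loss(y_true, y_pred):
    # Hypothetical masked loss: entries of -1 in the target matrix mark
    # missing phenotypes and contribute nothing to the cross-entropy
    mask = K.cast(K.not_equal(y_true, -1), K.floatx())
    bce = K.binary_crossentropy(K.clip(y_true, 0, 1), y_pred)
    return K.sum(mask * bce) / K.maximum(K.sum(mask), 1.0)

def get_wide_deep_raw_features(input_dim, num_drugs=11):
    # Wide path: raw mutation features feed the output layer directly;
    # deep path: two ReLU layers with dropout (sizes are illustrative)
    inputs = Input(shape=(input_dim,))
    h = Dense(256, activation='relu')(inputs)
    h = Dropout(0.5)(h)
    h = Dense(256, activation='relu')(h)
    h = Dropout(0.5)(h)
    wide_and_deep = concatenate([inputs, h])
    outputs = Dense(num_drugs, activation='sigmoid')(wide_and_deep)
    model = Model(inputs=inputs, outputs=outputs)
    model.compile(optimizer='adam', loss=masked_multitask_loss)
    return model

Note that the script calls the builder with no arguments, so the real implementation presumably fixes the input width internally.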
@@ -0,0 +1,138 @@ (new file: cross-validated benchmark of the multitask WDNN, deep MLP, single-task WDNN, random forest, and logistic regression)
# helpers and models supply the network builders used below
# (get_wide_deep, get_deep, get_wide_deep_single)
from helpers import *
from models import *
import numpy as np
import pandas as pd
import keras.backend as K
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
# Definitions
repeats = 5
cv_splits = 10
num_drugs = 11
drugs = ['rif', 'inh', 'pza', 'emb', 'str', 'cip', 'cap', 'amk', 'moxi', 'oflx', 'kan']

data_dir = '/mnt/raid1/TB_data/tb_data_050818/'
# Data
X = np.loadtxt(data_dir + 'X_features.csv', delimiter=',')
alpha_matrix = np.loadtxt(data_dir + 'alpha_matrix.csv', delimiter=',')
y = np.loadtxt(data_dir + 'labels.csv', delimiter=',')

# Mutations unavailable for the subset of isolates that underwent targeted
# sequencing are coded -1; recode them as 0.5, midway between absent and present
X[X == -1] = 0.5

# Keep only mutations that appear in at least 30 isolates
sufficient_inds = np.squeeze(np.where((X == 1).sum(axis=0) >= 30))
X = X[:, sufficient_inds]
column_names = ['Algorithm', 'Drug', 'AUC', 'AUC_PR']
results = pd.DataFrame(columns=column_names)
results_index = 0
for r in range(repeats):
    cross_val_split = KFold(n_splits=cv_splits, shuffle=True)
    for train, val in cross_val_split.split(X):
        print(r)  # progress: current repeat
        X_train = X[train]
        X_val = X[val]
        y_train = y[train]
        y_val = y[val]
        # ------- Train the multitask wide and deep neural network -------
        wdnn = get_wide_deep()
        wdnn.fit(X_train, alpha_matrix[train], epochs=100, verbose=False,
                 validation_data=(X_val, alpha_matrix[val]))
        mc_dropout = K.Function(wdnn.inputs + [K.learning_phase()], wdnn.outputs)
        # wdnn_probs = ensemble(X_val, y_val, mc_dropout)
        wdnn_probs = wdnn.predict(X_val)
        # Append fold-level predictions and labels for later analysis
        with open('/mnt/raid1/TB_data/preds/val_probs.csv', 'a') as f:
            pd.DataFrame(wdnn_probs).to_csv(f, header=False, index=False)
        with open('/mnt/raid1/TB_data/preds/y_val.csv', 'a') as f:
            pd.DataFrame(y_val).to_csv(f, header=False, index=False)
        for i, drug in enumerate(drugs):
            # Score only isolates with a phenotype for this drug
            non_missing_val = np.where(y_val[:, i] != -1)[0]
            auc_y = np.reshape(y_val[non_missing_val, i], (len(non_missing_val), 1))
            auc_preds = np.reshape(wdnn_probs[non_missing_val, i], (len(non_missing_val), 1))
            val_auc = roc_auc_score(auc_y, auc_preds)
            # Labels and probabilities are flipped so average precision treats
            # "sensitive" as the positive class
            val_auc_pr = average_precision_score(1 - y_val[non_missing_val, i],
                                                 1 - wdnn_probs[non_missing_val, i])
            results.loc[results_index] = ['WDNN', drug, val_auc, val_auc_pr]
            results_index += 1
        # ------- Train a deep feed-forward neural network -------
        deep = get_deep()
        deep.fit(X_train, alpha_matrix[train], epochs=100, verbose=False,
                 validation_data=(X_val, alpha_matrix[val]))
        deep_probs = deep.predict(X_val)
        for i, drug in enumerate(drugs):
            non_missing_val = np.where(y_val[:, i] != -1)[0]
            auc_y = np.reshape(y_val[non_missing_val, i], (len(non_missing_val), 1))
            auc_preds = np.reshape(deep_probs[non_missing_val, i], (len(non_missing_val), 1))
            val_auc = roc_auc_score(auc_y, auc_preds)
            val_auc_pr = average_precision_score(1 - y_val[non_missing_val, i],
                                                 1 - deep_probs[non_missing_val, i])
            results.loc[results_index] = ['Deep MLP', drug, val_auc, val_auc_pr]
            results_index += 1
for i, drug in enumerate(drugs):
    y_drug = y[:, i]
    # Disregard rows for which no resistance data exists
    y_non_missing = y_drug[y_drug != -1]
    X_non_missing = X[y_drug != -1]
    cross_val_split = KFold(n_splits=cv_splits, shuffle=True)
    for train, val in cross_val_split.split(X_non_missing):
        X_train = X_non_missing[train]
        X_val = X_non_missing[val]
        y_train = y_non_missing[train]
        y_val = y_non_missing[val]
        # Train and predict with a random forest classifier
        random_forest = RandomForestClassifier(n_estimators=1000, max_features='auto',
                                               min_samples_leaf=0.002)
        random_forest.fit(X_train, y_train)
        pred_rf = random_forest.predict_proba(X_val)
        # AUC and AUC-PR of the drug for the random forest
        rf_auc = roc_auc_score(y_val, pred_rf[:, 1])
        rf_auc_pr = average_precision_score(1 - y_val, 1 - pred_rf[:, 1])
        results.loc[results_index] = ['Random Forest', drug, rf_auc, rf_auc_pr]
        results_index += 1
        # Train and predict with an L2-regularized logistic regression model
        log_reg = LogisticRegression(penalty='l2', solver='liblinear')
        Cs = np.logspace(-5, 5, 10)
        estimator = GridSearchCV(estimator=log_reg, param_grid={'C': Cs}, cv=5, scoring='roc_auc')
        estimator.fit(X_train, y_train)
        pred_lm = estimator.predict_proba(X_val)
        lm_auc = roc_auc_score(y_val, pred_lm[:, 1])
        lm_auc_pr = average_precision_score(1 - y_val, 1 - pred_lm[:, 1])
        results.loc[results_index] = ['Logistic Regression', drug, lm_auc, lm_auc_pr]
        results_index += 1
# Train the single-task WDNN
for i, drug in enumerate(drugs):
    y_drug = y[:, i]
    # Disregard rows for which no resistance data exists
    y_non_missing = y_drug[y_drug != -1]
    X_non_missing = X[y_drug != -1]
    cross_val_split = KFold(n_splits=cv_splits, shuffle=True)
    for train, val in cross_val_split.split(X_non_missing):
        print(drug)  # progress: current drug
        # Training and validation data
        X_train = X_non_missing[train]
        X_val = X_non_missing[val]
        y_train = y_non_missing[train]
        y_val = y_non_missing[val]
        # Train and predict with the single-task wide and deep network
        wdnn_single = get_wide_deep_single()
        wdnn_single.fit(X_train, y_train, epochs=100, validation_data=(X_val, y_val), verbose=False)
        # Dropout is kept active at test time for Monte Carlo ensembling
        wdnn_single_mc_dropout = K.Function(wdnn_single.inputs + [K.learning_phase()],
                                            wdnn_single.outputs)
        # wdnn_single_preds = ensemble(X_val, np.expand_dims(y_val, axis=1), wdnn_single_mc_dropout)
        wdnn_single_preds = wdnn_single.predict(X_val)
        # AUC and AUC-PR of the drug for the single-task WDNN
        wdnn_single_auc = roc_auc_score(y_val.reshape(len(y_val), 1),
                                        wdnn_single_preds.reshape((len(wdnn_single_preds), 1)))
        wdnn_single_auc_pr = average_precision_score(1 - y_val.reshape(len(y_val), 1),
                                                     1 - wdnn_single_preds.reshape((len(wdnn_single_preds), 1)))
        results.loc[results_index] = ['WDNN Single Task', drug, wdnn_single_auc, wdnn_single_auc_pr]
        results_index += 1
        # K.clear_session()

results.to_csv('/mnt/raid1/TB_data/results.csv', index=False)
results.to_csv('results_020719/results_pr.csv', index=False)
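Each block builds mc_dropout = K.Function(model.inputs + [K.learning_phase()], model.outputs) but then calls model.predict directly; the commented-out ensemble(...) calls suggest predictions were at some point averaged over stochastic forward passes. Below is a minimal sketch of such a Monte Carlo dropout ensemble, matching the commented call signature; the num_samples default and the unused y_val argument are assumptions:

import numpy as np

def ensemble(X_val, y_val, mc_dropout, num_samples=100):
    # Passing learning_phase=1 keeps dropout active at prediction time,
    # so each forward pass samples a different thinned network
    preds = np.stack([mc_dropout([X_val, 1])[0] for _ in range(num_samples)])
    # Average the stochastic predictions (y_val is unused in this sketch;
    # it is kept only to match the call sites above)
    return preds.mean(axis=0)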
@@ -0,0 +1,34 @@ (new file: UpSet plot of significant mutations shared across drugs)
library("UpSetR") | ||
require(cowplot) | ||
library(readr) | ||
|
||
# Drugs | ||
drugs = c('RIF', 'INH', 'PZA', 'EMB', 'STR', 'CIP', 'CAP', 'AMK', 'MOXI', 'OFLX', 'KAN') | ||
|
||
# Get feature category names | ||
feature_list <- read.csv("feature_names020318.csv", header=F, stringsAsFactors=FALSE)$V1 | ||
|
||
# Create dataframe of significant mutations | ||
feature_res = as.data.frame(matrix(0, nrow=length(feature_list), ncol=11), row.names=feature_list) | ||
colnames(feature_res) <- drugs | ||
for (drug in drugs) { | ||
file <- paste("snpsF", drug, "020318.csv", sep="") | ||
df_drug <- as.data.frame(read_csv(file, col_names = TRUE)) | ||
resistant <- df_drug[1][df_drug['S/R'] == 'resistant'] | ||
feature_res[drug][resistant,] = 1 | ||
} | ||
|
||
# Plot | ||
feature_res_reorder <- feature_res[,c(2,1,4,3,5,8,11,7,10,6,9)] | ||
feature_res_reorder$CIP <- NULL | ||
pdf("feature_importance_021218.pdf", paper="a4r", width=10, height=7, onefile = FALSE) | ||
upset(feature_res_reorder, sets = colnames(feature_res_reorder), | ||
sets.bar.color = "#56B4E9", | ||
matrix.color = "#7A7775", | ||
mainbar.y.label = "Number of Feature Intersections", | ||
sets.x.label = "Features per Drug", | ||
order.by = c("freq"), | ||
#nintersects=12, | ||
keep.order = TRUE) | ||
dev.off() | ||
|
@@ -0,0 +1,89 @@ (new file: permutation test for mutations that drive the WDNN's predictions)
# helpers supplies the network builder (get_wide_deep)
from helpers import *
import pandas as pd
import numpy as np

# Definitions
crit_value = 0.05
num_permute = 100000
num_drugs = 11
# Uppercase drug codes, matching the snpsF<drug>020318.csv file names
# read by the companion R script
drugs = ['RIF', 'INH', 'PZA', 'EMB', 'STR', 'CIP', 'CAP', 'AMK', 'MOXI', 'OFLX', 'KAN']

# Gene names
genes = ['ahpC', 'alr', 'ddl', 'embA', 'embB', 'embC', 'ethA', 'gid', 'gyrA', 'gyrB', 'inhA', 'iniA',
         'iniB', 'iniC', 'kasA', 'katG', 'murA-rrs', 'fabG1', 'ndh', 'pncA', 'rpoB', 'rpsL', 'rrl',
         'rrs', 'thyA', 'tlyA', 'gyrB-gyrA', 'gyrA-Rv0007', 'iniB-iniA-iniC', 'iniB-iniA', 'iniC-lpqJ',
         'rpoB-rpoC', 'fabG1-inhA', 'rrs-rrl', 'rrl-rrf', 'inhA-hemZ', 'katG-furA', 'kasA-kasB',
         'ahpC-ahpD', 'dfrA-thyA', 'alr-Rv3792', 'embA-embB', 'embB-Rv3796', 'menG-ethA', 'rpsA', 'eis',
         "oxyR'", 'acpM']
# Get training and phenotypic data; the full feature matrix is bound to
# big_X because later code indexes it by original column position
df_X = pd.read_csv("X_features_with_names.csv", index_col=0)
big_X = np.loadtxt("X_features.csv", delimiter=',')
alpha_matrix = np.loadtxt("alpha_matrix.csv", delimiter=',')
y_true = np.loadtxt("labels.csv", delimiter=',')

# Keep only mutations that appear in at least 30 isolates
valid_snp_inds_all = np.squeeze(np.where((big_X == 1).sum(axis=0) >= 30))
X = big_X[:, valid_snp_inds_all]
# Train and get predictions on the training isolates
clf = get_wide_deep()
clf.fit(X, alpha_matrix, epochs=100)
# clf_dom = K.Function(clf.inputs + [K.learning_phase()], clf.outputs)
y_pred_dom = clf.predict(X)
# Permutation null distribution: shuffle the genotype column and recompute,
# for each drug, the difference in mean predicted probability between
# isolates carrying and not carrying the (shuffled) mutation
def distribute(snp_column, y_pred):
    diff = np.zeros((num_permute, num_drugs), dtype=np.float64)
    for index in range(num_permute):
        shuffled = np.random.permutation(snp_column)
        prob_1_given_1_shuffle = y_pred[np.where(shuffled == 1)]
        prob_1_given_0_shuffle = y_pred[np.where(shuffled == 0)]
        diff[index] = (np.mean(prob_1_given_1_shuffle, axis=0)
                       - np.mean(prob_1_given_0_shuffle, axis=0))
    return diff
# Storage: per drug and mutation, (index, S/R call, p-value, exact difference)
snp_data = np.zeros((num_drugs, len(valid_snp_inds_all), 4), dtype=np.object)
final_sig_snps = np.zeros((num_drugs, 4), dtype=np.object)
s = 0
# For each mutation: exact difference in mean predicted probability and a
# two-sided permutation p-value per drug
for snp in valid_snp_inds_all:
    # Exclude isolates whose genotype at this site is unknown (coded 0.5)
    X_curr = big_X[:, snp][big_X[:, snp] != 0.5]
    y_curr = y_pred_dom[big_X[:, snp] != 0.5]
    permute_data = distribute(X_curr, y_curr)
    for drug in range(num_drugs):
        # Note the sign convention: diff_exact is mean(pred | snp absent) minus
        # mean(pred | snp present), the opposite of distribute(); the test below
        # is two-sided, so the p-value is unaffected
        prob_given_absent = y_curr[:, drug][np.where(X_curr == 0)]
        prob_given_present = y_curr[:, drug][np.where(X_curr == 1)]
        diff_exact = np.mean(prob_given_absent) - np.mean(prob_given_present)
        permute_data_drug = permute_data[:, drug]
        sig = ((permute_data_drug > np.abs(diff_exact)) |
               (permute_data_drug < -np.abs(diff_exact))).sum()
        p_value = float(sig) / len(permute_data)
        snp_data[drug, s, 0] = snp
        snp_data[drug, s, 1] = 'sensitive' if diff_exact < 0 else 'resistant'
        snp_data[drug, s, 2] = p_value
        snp_data[drug, s, 3] = diff_exact
    s += 1
# Keep mutations that pass a Bonferroni-corrected critical value
swapped = np.swapaxes(snp_data, 1, 2)
for drug in range(num_drugs):
    snp_inds = np.where(swapped[drug][2] < crit_value / len(valid_snp_inds_all))
    final_sig_snps[drug, 0] = list(df_X.columns[list(swapped[drug][0][snp_inds])])
    final_sig_snps[drug, 1] = list(swapped[drug][1][snp_inds])
    final_sig_snps[drug, 2] = list(swapped[drug][2][snp_inds])
    final_sig_snps[drug, 3] = list(swapped[drug][3][snp_inds])
# Save significant features per drug
outarr = np.zeros(num_drugs, dtype=np.object)
for i in range(num_drugs):
    outarr[i] = np.vstack(final_sig_snps[i])
    np.savetxt('snpsF{drug}020318.csv'.format(drug=drugs[i]), np.transpose(outarr[i]),
               fmt='%s', delimiter=',', header='SNPs, S/R, p-value, (y=S|s=1) - (y=S|s=0)')

# Save valid feature names
derived_names_orig = list(df_X.columns.values[valid_snp_inds_all])
np.savetxt('feature_names020318.csv', derived_names_orig, fmt='%s', delimiter=',')
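The two-sided permutation test above can be exercised end-to-end on synthetic data. Everything in this snippet (sample sizes, the planted +0.2 effect, the reduced permutation count) is made up for illustration:

import numpy as np

rng = np.random.RandomState(0)

# 200 isolates, one binary mutation, planted effect of +0.2 on the prediction
snp = rng.binomial(1, 0.3, size=200)
y_pred = 0.4 + 0.2 * snp + rng.normal(0, 0.05, size=200)
diff_exact = y_pred[snp == 1].mean() - y_pred[snp == 0].mean()

# Null distribution: shuffle the genotype column and recompute the difference
num_permute = 10000
null_diffs = np.empty(num_permute)
for i in range(num_permute):
    shuffled = rng.permutation(snp)
    null_diffs[i] = y_pred[shuffled == 1].mean() - y_pred[shuffled == 0].mean()

# Two-sided p-value, as in the script: fraction of null diffs more extreme
p_value = float(np.sum(np.abs(null_diffs) > np.abs(diff_exact))) / num_permute
print(diff_exact, p_value)  # the planted effect is recovered with p ~ 0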