Skip to content

Commit

Permalink
021819 commit
Browse files Browse the repository at this point in the history
  • Loading branch information
Your Name committed Feb 19, 2019
1 parent 5d22ee7 commit aadd822
Show file tree
Hide file tree
Showing 19 changed files with 9,865 additions and 0 deletions.
Binary file added data/X_features.zip
Binary file not shown.
Binary file added data/X_features_with_names.zip
Binary file not shown.
3,601 changes: 3,601 additions & 0 deletions data/alpha_matrix.csv

Large diffs are not rendered by default.

3,601 changes: 3,601 additions & 0 deletions data/labels.csv

Large diffs are not rendered by default.

98 changes: 98 additions & 0 deletions evaluate_derived_features.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
#from helpers import *
#from models import *
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import KFold, StratifiedKFold
import keras.backend as K
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestClassifier


# Definitions
repeats = 5
cv_splits = 10
num_drugs = 11
drugs = ['rif', 'inh', 'pza', 'emb', 'str', 'cip', 'cap', 'amk', 'moxi', 'oflx', 'kan']


data_dir = 'training_data_102618/'
# Data
X = np.loadtxt(data_dir + 'X_features.csv', delimiter=',')

alpha_matrix = np.loadtxt(data_dir + 'alpha_matrix.csv', delimiter=',')
y = np.loadtxt(data_dir + 'labels.csv', delimiter=',')

# Mutations unavailable through subset of isolates that underwent targeted sequencing
X[X == -1] = 0.5

# Get mutations that appear in at least 30 isolates
sufficient_inds = np.squeeze(np.where((X == 1).sum(axis=0) >= 30))
X = X[:,sufficient_inds]

## Derived features are last 56 columns
raw_features_end = X.shape[1] - 56
X = X[:,:raw_features_end]

column_names = ['Algorithm','Drug','AUC','AUC_PR']
results = pd.DataFrame(columns=column_names)
results_index = 0
for r in range(repeats):
cross_val_split = KFold(n_splits=cv_splits, shuffle=True)
for train, val in cross_val_split.split(X):
X_train = X[train]
X_val = X[val]
y_train = y[train]
y_val = y[val]
#------- Train the wide and deep neural network ------#
wdnn = get_wide_deep_raw_features()
wdnn.fit(X_train, alpha_matrix[train], epochs=100, verbose=False, validation_data=[X_val,alpha_matrix[val]])
mc_dropout = K.Function(wdnn.inputs + [K.learning_phase()], wdnn.outputs)
#wdnn_probs = ensemble(X_val, y_val, mc_dropout)
wdnn_probs = wdnn.predict(X_val)
for i, drug in enumerate(drugs):
non_missing_val = np.where(y_val[:,i] != -1)[0]
auc_y = np.reshape(y_val[non_missing_val,i],(len(non_missing_val), 1))
auc_preds = np.reshape(wdnn_probs[non_missing_val,i],(len(non_missing_val), 1))
val_auc = roc_auc_score(auc_y, auc_preds)
val_auc_pr = average_precision_score(1-y_val[non_missing_val,i], 1-wdnn_probs[non_missing_val,i])
results.loc[results_index] = ['WDNN Raw Features',drug,val_auc,val_auc_pr]
print drug + '\t' + str(val_auc) + '\t' + str(val_auc_pr)
results_index += 1

for r in range(repeats):
for i, drug in enumerate(drugs):
y_drug = y[:, i]
# Disregard rows for which no resistance data exists
y_non_missing = y_drug[y_drug != -1]
X_non_missing = X[y_drug != -1]
cross_val_split = KFold(n_splits=cv_splits, shuffle=True)
for train, val in cross_val_split.split(X_non_missing):
X_train = X_non_missing[train]
X_val = X_non_missing[val]
y_train = y_non_missing[train]
y_val = y_non_missing[val]
# Train and predict on random forest classifier
random_forest = RandomForestClassifier(n_estimators=1000, max_features='auto', min_samples_leaf=0.002)
random_forest.fit(X_train, y_train)
pred_rf = random_forest.predict_proba(X_val)
# Get AUC of drug for RF
rf_auc = roc_auc_score(y_val, pred_rf[:,1])
rf_auc_pr = average_precision_score(1-y_val, 1-pred_rf[:,1])
results.loc[results_index] = ['Random Forest Raw Features', drug, rf_auc, rf_auc_pr]
results_index += 1
# Train and predict on regularized logistic regression model
log_reg = LogisticRegression(penalty='l2')
Cs = np.logspace(-5, 5, 10)
estimator = GridSearchCV(estimator=log_reg, param_grid={'C': Cs}, cv=5, scoring='roc_auc')
estimator.fit(X_train, y_train)
pred_lm = estimator.predict_proba(X_val)
lm_auc = roc_auc_score(y_val, pred_lm[:,1])
lm_auc_pr = average_precision_score(1-y_val, 1-pred_lm[:, 1])
results.loc[results_index] = ['Logistic Regression Raw Features', drug, lm_auc, lm_auc_pr]
results_index += 1


results.to_csv('results_raw_features_rf_lm.csv',index=False)

138 changes: 138 additions & 0 deletions evaluation.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,138 @@
#from helpers import *
#from models import *
import numpy as np
from sklearn.metrics import roc_auc_score, average_precision_score
from sklearn.model_selection import KFold, StratifiedKFold
import keras.backend as K
import pandas as pd


# Definitions
repeats = 5
cv_splits = 10
num_drugs = 11
drugs = ['rif', 'inh', 'pza', 'emb', 'str', 'cip', 'cap', 'amk', 'moxi', 'oflx', 'kan']


data_dir = '/mnt/raid1/TB_data/tb_data_050818/'
# Data
X = np.loadtxt(data_dir + 'X_features.csv', delimiter=',')

alpha_matrix = np.loadtxt(data_dir + 'alpha_matrix.csv', delimiter=',')
y = np.loadtxt(data_dir + 'labels.csv', delimiter=',')

# Mutations unavailable through subset of isolates that underwent targeted sequencing
X[X == -1] = 0.5

# Get mutations that appear in at least 30 isolates
sufficient_inds = np.squeeze(np.where((X == 1).sum(axis=0) >= 30))
X = X[:,sufficient_inds]

column_names = ['Algorithm','Drug','AUC','AUC_PR']
results = pd.DataFrame(columns=column_names)
results_index = 0
for r in range(repeats):
cross_val_split = KFold(n_splits=cv_splits, shuffle=True)
for train, val in cross_val_split.split(X):
print(r)
X_train = X[train]
X_val = X[val]
y_train = y[train]
y_val = y[val]
#------- Train the wide and deep neural network ------#
wdnn = get_wide_deep()
wdnn.fit(X_train, alpha_matrix[train], epochs=100, verbose=False, validation_data=[X_val,alpha_matrix[val]])
mc_dropout = K.Function(wdnn.inputs + [K.learning_phase()], wdnn.outputs)
#wdnn_probs = ensemble(X_val, y_val, mc_dropout)
wdnn_probs = wdnn.predict(X_val)
with open('/mnt/raid1/TB_data/preds/val_probs.csv', 'a') as f:
df = pd.DataFrame(wdnn_probs)
df.to_csv(f, header=False, index=False)
with open('/mnt/raid1/TB_data/preds/y_val.csv', 'a') as f:
df = pd.DataFrame(y_val)
df.to_csv(f, header=False, index = False)
for i, drug in enumerate(drugs):
non_missing_val = np.where(y_val[:,i] != -1)[0]
auc_y = np.reshape(y_val[non_missing_val,i],(len(non_missing_val), 1))
auc_preds = np.reshape(wdnn_probs[non_missing_val,i],(len(non_missing_val), 1))
val_auc = roc_auc_score(auc_y, auc_preds)
val_auc_pr = average_precision_score(1-y_val[non_missing_val,i], 1-wdnn_probs[non_missing_val,i])
results.loc[results_index] = ['WDNN',drug,val_auc,val_auc_pr]
#print (drug + '\t' + str(val_auc) + '\t' + str(val_auc_pr))
results_index += 1
#------- Train a deep neural network ------#
deep = get_deep()
deep.fit(X_train, alpha_matrix[train], epochs=100, verbose=False, validation_data=[X_val,alpha_matrix[val]])
#wdnn_probs = ensemble(X_val, y_val, mc_dropout)
deep_probs = deep.predict(X_val)
for i, drug in enumerate(drugs):
non_missing_val = np.where(y_val[:,i] != -1)[0]
auc_y = np.reshape(y_val[non_missing_val,i],(len(non_missing_val), 1))
auc_preds = np.reshape(deep_probs[non_missing_val,i],(len(non_missing_val), 1))
val_auc = roc_auc_score(auc_y, auc_preds)
val_auc_pr = average_precision_score(1-y_val[non_missing_val,i], 1-deep_probs[non_missing_val,i])
results.loc[results_index] = ['Deep MLP',drug,val_auc,val_auc_pr]
#print (drug + '\t' + str(val_auc) + '\t' + str(val_auc_pr))
results_index += 1
for i, drug in enumerate(drugs):
y_drug = y[:, i]
# Disregard rows for which no resistance data exists
y_non_missing = y_drug[y_drug != -1]
X_non_missing = X[y_drug != -1]
cross_val_split = KFold(n_splits=cv_splits, shuffle=True)
for train, val in cross_val_split.split(X_non_missing):
X_train = X_non_missing[train]
X_val = X_non_missing[val]
y_train = y_non_missing[train]
y_val = y_non_missing[val]
# Train and predict on random forest classifier
random_forest = RandomForestClassifier(n_estimators=1000, max_features='auto', min_samples_leaf=0.002)
random_forest.fit(X_train, y_train)
pred_rf = random_forest.predict_proba(X_val)
# Get AUC of drug for RF
rf_auc = roc_auc_score(y_val, pred_rf[:,1])
rf_auc_pr = average_precision_score(1-y_val, 1-pred_rf[:,1])
results.loc[results_index] = ['Random Forest', drug, rf_auc, rf_auc_pr]
results_index += 1
# Train and predict on regularized logistic regression model
log_reg = LogisticRegression(penalty='l2', solver='liblinear')
Cs = np.logspace(-5, 5, 10)
estimator = GridSearchCV(estimator=log_reg, param_grid={'C': Cs}, cv=5, scoring='roc_auc')
estimator.fit(X_train, y_train)
pred_lm = estimator.predict_proba(X_val)
lm_auc = roc_auc_score(y_val, pred_lm[:,1])
lm_auc_pr = average_precision_score(1-y_val, 1-pred_lm[:, 1])
results.loc[results_index] = ['Logistic Regression', drug, lm_auc, lm_auc_pr]
results_index += 1
# Train single task WDNN
for i, drug in enumerate(drugs):
y_drug = y[:, i]
# Disregard rows for which no resistance data exists
y_non_missing = y_drug[y_drug != -1]
X_non_missing = X[y_drug != -1]
cross_val_split = KFold(n_splits=cv_splits, shuffle=True)
for train, val in cross_val_split.split(X_non_missing):
print(r)
# Training and validation label data
X_train = X_non_missing[train]
X_val = X_non_missing[val]
y_train = y_non_missing[train]
y_val = y_non_missing[val]
# Train and predict on random forest classifier
wdnn_single = get_wide_deep_single()
wdnn_single.fit(X_train, y_train, nb_epoch=100, validation_data=(X_val, y_val), verbose =False)
# Create and predict on multitask MLP with dropout at test time
wdnn_single_mc_dropout = K.Function(wdnn_single.inputs + [K.learning_phase()], wdnn_single.outputs)
#wdnn_single_preds = ensemble(X_val, np.expand_dims(y_val, axis=1), wdnn_single_mc_dropout)
wdnn_single_preds = wdnn_single.predict(X_val)
# Get AUC, specificity, and sensitivity of drug for single task WDNN
wdnn_single_auc = roc_auc_score(y_val.reshape(len(y_val),1), wdnn_single_preds.reshape((len(wdnn_single_preds),1)))
wdnn_single_auc_pr = average_precision_score(1-y_val.reshape(len(y_val),1), 1-wdnn_single_preds.reshape((len(wdnn_single_preds),1)))
results.loc[results_index] = ['WDNN Single Task', drug, wdnn_single_auc, wdnn_single_auc_pr]
results_index += 1
#K.clear_session()


results.to_csv('/mnt/raid1/TB_data/results.csv',index=False)
results.to_csv('results_020719/results_pr.csv',index=False)

34 changes: 34 additions & 0 deletions feature_importance.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
library("UpSetR")
require(cowplot)
library(readr)

# Drugs
drugs = c('RIF', 'INH', 'PZA', 'EMB', 'STR', 'CIP', 'CAP', 'AMK', 'MOXI', 'OFLX', 'KAN')

# Get feature category names
feature_list <- read.csv("feature_names020318.csv", header=F, stringsAsFactors=FALSE)$V1

# Create dataframe of significant mutations
feature_res = as.data.frame(matrix(0, nrow=length(feature_list), ncol=11), row.names=feature_list)
colnames(feature_res) <- drugs
for (drug in drugs) {
file <- paste("snpsF", drug, "020318.csv", sep="")
df_drug <- as.data.frame(read_csv(file, col_names = TRUE))
resistant <- df_drug[1][df_drug['S/R'] == 'resistant']
feature_res[drug][resistant,] = 1
}

# Plot
feature_res_reorder <- feature_res[,c(2,1,4,3,5,8,11,7,10,6,9)]
feature_res_reorder$CIP <- NULL
pdf("feature_importance_021218.pdf", paper="a4r", width=10, height=7, onefile = FALSE)
upset(feature_res_reorder, sets = colnames(feature_res_reorder),
sets.bar.color = "#56B4E9",
matrix.color = "#7A7775",
mainbar.y.label = "Number of Feature Intersections",
sets.x.label = "Features per Drug",
order.by = c("freq"),
#nintersects=12,
keep.order = TRUE)
dev.off()

89 changes: 89 additions & 0 deletions feature_importance.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,89 @@
from helpers import *
import pandas as pd
import numpy as np
from keras.layers import Dense, Dropout, Input, merge
from keras.models import Model
from keras.optimizers import Adam
from keras import regularizers

# Definitions
crit_value = 0.05
num_permute = 100000
num_drugs = 11

# Gene names
genes = ['ahpC', 'alr', 'ddl', 'embA', 'embB', 'embC', 'ethA', 'gid', 'gyrA', 'gyrB', 'inhA', 'iniA',
'iniB', 'iniC', 'kasA', 'katG', 'murA-rrs', 'fabG1', 'ndh', 'pncA', 'rpoB', 'rpsL', 'rrl',
'rrs', 'thyA', 'tlyA', 'gyrB-gyrA', 'gyrA-Rv0007', 'iniB-iniA-iniC', 'iniB-iniA', 'iniC-lpqJ',
'rpoB-rpoC', 'fabG1-inhA', 'rrs-rrl', 'rrl-rrf', 'inhA-hemZ', 'katG-furA', 'kasA-kasB',
'ahpC-ahpD', 'dfrA-thyA', 'alr-Rv3792', 'embA-embB', 'embB-Rv3796', 'menG-ethA', 'rpsA', 'eis',
"oxyR'", 'acpM']

# Get training and phenotypic data
df_X = pd.read_csv("X_features_with_names.csv", index_col=0)
X = np.loadtxt("X_features.csv", delimiter=',')
alpha_matrix = np.loadtxt("alpha_matrix.csv", delimiter=',')
y_true = np.loadtxt("labels.csv", delimiter=',')

valid_snp_inds_all = np.squeeze(np.where((big_X == 1).sum(axis=0) >= 30))
X = big_X[:,valid_snp_inds_all]

# Train and get predictions
clf = get_wide_deep()
clf.fit(X, alpha_matrix, nb_epoch=100)
#clf_dom = K.Function(clf.inputs + [K.learning_phase()], clf.outputs)
y_pred_dom = clf_dom.predict(X)

# Create permutation distribution
def distribute(snp_data, y_pred):
diff = np.zeros((num_permute, num_drugs), dtype=np.float64)
for index in range(num_permute):
shuffled = np.random.permutation(snp_data)
prob_1_given_1_shuffle = y_pred[(np.where(shuffled == 1))]
prob_1_given_0_shuffle = y_pred[(np.where(shuffled == 0))]
diff[index] = (np.mean(prob_1_given_1_shuffle, axis=0) - np.mean(prob_1_given_0_shuffle, axis=0))
return diff

# Used to store data
snp_data = np.zeros((num_drugs, len(valid_snp_inds_all), 4), dtype=np.object)
final_sig_snps = np.zeros((num_drugs, 4), dtype=np.object)

s = 0
# Get mutations, p-value from permutation test, and exact difference in probabilities
for snp in valid_snp_inds_all:
X_curr = big_X[:,snp][big_X[:,snp] != 0.5]
y_curr = y_pred_dom[big_X[:,snp] != 0.5]
permute_data = distribute(X_curr, y_curr)
for drug in range(num_drugs):
prob_1_given_1 = y_curr[:,drug][(np.where(X_curr == 0))]
prob_1_given_0 = y_curr[:,drug][(np.where(X_curr == 1))]
diff_exact = np.mean(prob_1_given_1) - np.mean(prob_1_given_0)
permute_data_drug = permute_data[:,drug]
sig = ((permute_data_drug > np.abs(diff_exact)) |
(permute_data_drug < -np.abs(diff_exact))).sum()
p_value = float(sig) / len(permute_data)
snp_data[drug, s, 0] = snp
snp_data[drug, s, 1] = 'sensitive' if diff_exact < 0 else 'resistant'
snp_data[drug, s, 2] = p_value
snp_data[drug, s, 3] = diff_exact
s += 1

# Get mutations that meet critical value
for drug in range(num_drugs):
swapped = np.swapaxes(snp_data, 1, 2)
snp_inds = np.where(swapped[drug][2] < crit_value / len(valid_snp_inds_all))
final_sig_snps[drug, 0] = list(df_X.columns[list(swapped[drug][0][snp_inds])])
final_sig_snps[drug, 1] = list(swapped[drug][1][snp_inds])
final_sig_snps[drug, 2] = list(swapped[drug][2][snp_inds])
final_sig_snps[drug, 3] = list(swapped[drug][3][snp_inds])

# Save significant features
outarr = np.zeros(num_drugs, dtype=np.object)
for i in range(num_drugs):
outarr[i] = np.vstack((final_sig_snps[i]))
np.savetxt('snpsF{drug}020318.csv'.format(drug=drugs[i]), np.transpose(outarr[i]), fmt='%s', delimiter=',',
header='SNPs, S/R, p-value, (y=S|s=1) - (y=S|s=0)')

# Save valid feature names
derived_names_orig = list(df_X.columns.values[valid_snp_inds_all])
np.savetxt('feature_names020318.csv', derived_names_orig, fmt='%s', delimiter=',')
Loading

0 comments on commit aadd822

Please sign in to comment.