Skip to content

Commit

Permalink
Merge pull request #10 from jjc2718/bug_fixes
Browse files Browse the repository at this point in the history
Subset gene features by MAD and clean up data preprocessing code
  • Loading branch information
jjc2718 authored Aug 11, 2020
2 parents 66ca105 + 9058f90 commit 5c96290
Show file tree
Hide file tree
Showing 4 changed files with 170 additions and 62 deletions.
3 changes: 3 additions & 0 deletions pancancer_utilities/classify_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -86,6 +86,7 @@ def train_model(x_train, x_test, y_train, alphas, l1_ratios, seed, n_folds=5, ma

return cv_pipeline, y_predict_train, y_predict_test, y_cv


def extract_coefficients(cv_pipeline, feature_names, signal, seed):
"""
Pull out the coefficients from the trained classifiers
Expand Down Expand Up @@ -114,6 +115,7 @@ def extract_coefficients(cv_pipeline, feature_names, signal, seed):

return coef_df


def get_threshold_metrics(y_true, y_pred, drop=False):
"""
Retrieve true/false positive rates and auroc/aupr for class predictions
Expand Down Expand Up @@ -145,6 +147,7 @@ def get_threshold_metrics(y_true, y_pred, drop=False):

return {"auroc": auroc, "aupr": aupr, "roc_df": roc_df, "pr_df": pr_df}


def summarize_results(results, gene, holdout_cancer_type, signal, seed,
data_type, fold_no):
"""
Expand Down
57 changes: 49 additions & 8 deletions pancancer_utilities/data_utilities.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
import os
import sys

import numpy as np
import pandas as pd
import pickle as pkl
from sklearn.model_selection import KFold
Expand All @@ -17,7 +18,6 @@ def load_expression_data(scale_input=False, verbose=False):
Arguments
---------
subset_mad_genes (int): TODO still need to implement this
scale_input (bool): whether or not to scale the expression data
verbose (bool): whether or not to print verbose output
Expand All @@ -30,11 +30,6 @@ def load_expression_data(scale_input=False, verbose=False):

rnaseq_df = pd.read_csv(cfg.rnaseq_data, index_col=0, sep='\t')

# TODO: this needs to be added to data loading script
# if subset_mad_genes is not None:
# rnaseq_df = subset_genes_by_mad(rnaseq_df, cfg.mad_data,
# subset_mad_genes)

# Scale RNAseq matrix the same way RNAseq was scaled for
# compression algorithms
if scale_input:
Expand All @@ -47,6 +42,7 @@ def load_expression_data(scale_input=False, verbose=False):

return rnaseq_df


def load_pancancer_data(gene_list, verbose=False):
"""Load pan-cancer relevant data from previous Greene Lab repos.
Expand Down Expand Up @@ -97,7 +93,8 @@ def load_pancancer_data(gene_list, verbose=False):
with open(cfg.pancan_data, 'wb') as f:
pkl.dump(pancan_data, f)

return (genes_df, pancan_data)
return genes_df, pancan_data


def load_top_50():
"""Load top 50 mutated genes in TCGA from BioBombe repo.
Expand All @@ -113,6 +110,7 @@ def load_top_50():
genes_df = pd.read_csv(file, sep='\t')
return genes_df


def load_pancancer_data_from_repo():
"""Load data to build feature matrices from pancancer repo. """

Expand Down Expand Up @@ -142,11 +140,13 @@ def load_pancancer_data_from_repo():
mut_burden_df
)


def load_sample_info(verbose=False):
if verbose:
print('Loading sample info...', file=sys.stderr)
return pd.read_csv(cfg.sample_info, sep='\t', index_col='sample_id')


def split_by_cancer_type(rnaseq_df, sample_info_df, holdout_cancer_type,
use_pancancer=False, num_folds=4, fold_no=1,
seed=cfg.default_seed):
Expand Down Expand Up @@ -190,7 +190,8 @@ def split_by_cancer_type(rnaseq_df, sample_info_df, holdout_cancer_type,
else:
rnaseq_train_df = cancer_type_train_df

return (rnaseq_train_df, rnaseq_test_df)
return rnaseq_train_df, rnaseq_test_df


def split_single_cancer_type(cancer_type_df, num_folds, fold_no, seed):
"""Split data for a single cancer type into train and test sets."""
Expand All @@ -201,6 +202,7 @@ def split_single_cancer_type(cancer_type_df, num_folds, fold_no, seed):
test_df = cancer_type_df.iloc[test_ixs]
return train_df, test_df


def summarize_results(results, gene, holdout_cancer_type, signal, z_dim,
seed, algorithm, data_type):
"""
Expand Down Expand Up @@ -252,4 +254,43 @@ def summarize_results(results, gene, holdout_cancer_type, signal, z_dim,

return metrics_out_, roc_df_, pr_df_

def subset_by_mad(X_train_df, X_test_df, gene_features, subset_mad_genes, verbose=False):
"""Subset features by mean absolute deviation.
Takes the top subset_mad_genes genes (sorted in descending order),
calculated on the training set.
Arguments
---------
X_train_df: training data, samples x genes
X_test_df: test data, samples x genes
gene_features: numpy bool array, indicating which features are genes (and should be subsetted/standardized)
subset_mad_genes (int): number of genes to take
Returns
-------
(train_df, test_df, gene_features) datasets with filtered features
"""
if verbose:
print('Taking subset of gene features', file=sys.stderr)

mad_genes_df = (
X_train_df.loc[:, gene_features]
.mad(axis=0)
.sort_values(ascending=False)
.reset_index()
)
mad_genes_df.columns = ['gene_id', 'mean_absolute_deviation']
mad_genes = mad_genes_df.iloc[:subset_mad_genes, :].gene_id.astype(str).values

non_gene_features = X_train_df.columns.values[~gene_features]
valid_features = np.concatenate((mad_genes, non_gene_features))

gene_features = np.concatenate((
np.ones(mad_genes.shape[0]).astype('bool'),
np.zeros(non_gene_features.shape[0]).astype('bool')
))
train_df = X_train_df.reindex(valid_features, axis='columns')
test_df = X_test_df.reindex(valid_features, axis='columns')
return train_df, test_df, gene_features

110 changes: 73 additions & 37 deletions pancancer_utilities/scripts/classify_cancer_type.py
Original file line number Diff line number Diff line change
Expand Up @@ -4,10 +4,9 @@
Adapted from:
https://github.com/greenelab/BioBombe/blob/master/9.tcga-classify/classify-with-raw-expression.py
"""
import os
import argparse
import logging
import pickle as pkl
from pathlib import Path

import numpy as np
import pandas as pd
Expand All @@ -16,8 +15,8 @@
import pancancer_utilities.data_utilities as du
from pancancer_utilities.tcga_utilities import (
process_y_matrix,
process_y_matrix_cancertype,
align_matrices,
standardize_gene_features,
check_status
)
from pancancer_utilities.classify_utilities import (
Expand All @@ -27,6 +26,10 @@
summarize_results
)

#########################################
### 1. Process command line arguments ###
#########################################

p = argparse.ArgumentParser()
p.add_argument('--gene', type=str, required=True,
help='Provide a gene to run mutation classification for.')
Expand All @@ -41,13 +44,34 @@
p.add_argument('--seed', type=int, default=cfg.default_seed)
p.add_argument('--shuffle_labels', action='store_true',
help='Include flag to shuffle labels as a negative control')
p.add_argument('--subset_mad_genes', type=int, default=-1,
help='If included, subset gene features to this number of\
features having highest mean absolute deviation.')
p.add_argument('--verbose', action='store_true')
args = p.parse_args()

np.random.seed(args.seed)
if args.verbose:
logging.basicConfig(level=logging.DEBUG, format='%(message)s')

# create directory for the gene
dirname = 'pancancer' if args.use_pancancer else 'single_cancer'
gene_dir = Path(args.results_dir, dirname, args.gene).resolve()
gene_dir.mkdir(parents=True, exist_ok=True)

# check if gene has been processed already
# TODO: probably want to add a "resume" option for this in the future
signal = 'shuffled' if args.shuffle_labels else 'signal'
check_file = Path(gene_dir,
"{}_{}_{}_coefficients.tsv.gz".format(
args.gene, args.holdout_cancer_type, signal)).resolve()
if check_status(check_file):
exit('Results file already exists, exiting')

#######################################################
### 2. Load gene expression and mutation label data ###
#######################################################

# load and unpack pancancer data
genes_df, pancan_data = du.load_pancancer_data([args.gene], verbose=args.verbose)
gene_idx = 0
Expand Down Expand Up @@ -85,19 +109,9 @@
gene_coef_list = []
gene_metrics_list = []

# create directory for the gene
dirname = 'pancancer' if args.use_pancancer else 'single_cancer'
gene_dir = os.path.join(args.results_dir, dirname, gene_name)
os.makedirs(gene_dir, exist_ok=True)

# check if this experiment has been run already
# TODO: probably want to add a "resume" option for this in the future
signal = 'shuffled' if args.shuffle_labels else 'signal'
check_file = os.path.join(gene_dir,
"{}_{}_{}_coefficients.tsv.gz".format(
gene_name, args.holdout_cancer_type, signal))
if check_status(check_file):
exit()
#################################################
### 3. Process labels and filter cancer types ###
#################################################

# process the y matrix for the given gene or pathway
y_mutation_df = mutation_df.loc[:, gene_name]
Expand Down Expand Up @@ -128,32 +142,54 @@
hyper_filter=5
)

use_samples, rnaseq_df, y_df, gene_features = align_matrices(
x_file_or_df=rnaseq_df,
y=y_df,
add_cancertype_covariate=args.use_pancancer,
add_mutation_covariate=True
)

# shuffle mutation status labels if necessary
if args.shuffle_labels:
y_df.status = np.random.permutation(y_df.status.values)

############################################
### 4. Split data and fit/evaluate model ###
############################################

for fold_no in range(args.num_folds):

X_train_raw_df, X_test_raw_df = du.split_by_cancer_type(
rnaseq_df, sample_info_df, args.holdout_cancer_type,
num_folds=args.num_folds, fold_no=fold_no,
use_pancancer=args.use_pancancer, seed=args.seed)
logging.debug('Splitting data and preprocessing features...')

# split data into train and test sets
try:
train_samples, X_train_df, y_train_df = align_matrices(
x_file_or_df=X_train_raw_df, y=y_df, add_cancertype_covariate=False
)
test_samples, X_test_df, y_test_df = align_matrices(
x_file_or_df=X_test_raw_df, y=y_df, add_cancertype_covariate=False
)
X_train_raw_df, X_test_raw_df = du.split_by_cancer_type(
rnaseq_df, sample_info_df, args.holdout_cancer_type,
num_folds=args.num_folds, fold_no=fold_no,
use_pancancer=args.use_pancancer, seed=args.seed)
except ValueError:
exit('No test samples found for cancer type: {}, gene: {}\n'.format(
args.holdout_cancer_type, args.gene))

y_train_df = y_df.reindex(X_train_raw_df.index)
y_test_df = y_df.reindex(X_test_raw_df.index)

# data processing/feature selection, needs to happen for train and
# test sets independently
if args.subset_mad_genes > 0:
X_train_raw_df, X_test_raw_df, gene_features_filtered = du.subset_by_mad(
X_train_raw_df, X_test_raw_df, gene_features, args.subset_mad_genes
)
X_train_df = standardize_gene_features(X_train_raw_df, gene_features_filtered)
X_test_df = standardize_gene_features(X_test_raw_df, gene_features_filtered)
else:
X_train_df = standardize_gene_features(X_train_raw_df, gene_features)
X_test_df = standardize_gene_features(X_test_raw_df, gene_features)

# fit the model
logging.debug('Training model for fold {}'.format(fold_no))
logging.debug(X_train_df.shape)
logging.debug(X_test_df.shape)
logging.debug('-- training dimensions: {}'.format(X_train_df.shape))
logging.debug('-- testing dimensions: {}'.format(X_test_df.shape))
cv_pipeline, y_pred_train_df, y_pred_test_df, y_cv_df = train_model(
x_train=X_train_df,
x_test=X_test_df,
Expand Down Expand Up @@ -212,35 +248,35 @@

gene_coef_list.append(coef_df)

#######################################
### 5. Save results to output files ###
#######################################

gene_auc_df = pd.concat(gene_auc_list)
gene_aupr_df = pd.concat(gene_aupr_list)
gene_coef_df = pd.concat(gene_coef_list)
gene_metrics_df = pd.concat(gene_metrics_list)

# save metric dataframes and label info to files
gene_coef_df.to_csv(
check_file, sep="\t", index=False, compression="gzip", float_format="%.5g"
)

output_file = os.path.join(
output_file = Path(
gene_dir, "{}_{}_{}_auc_threshold_metrics.tsv.gz".format(
gene_name, args.holdout_cancer_type, signal)
)
gene_name, args.holdout_cancer_type, signal)).resolve()
gene_auc_df.to_csv(
output_file, sep="\t", index=False, compression="gzip", float_format="%.5g"
)

output_file = os.path.join(
output_file = Path(
gene_dir, "{}_{}_{}_aupr_threshold_metrics.tsv.gz".format(
gene_name, args.holdout_cancer_type, signal)
)
gene_name, args.holdout_cancer_type, signal)).resolve()
gene_aupr_df.to_csv(
output_file, sep="\t", index=False, compression="gzip", float_format="%.5g"
)

output_file = os.path.join(gene_dir, "{}_{}_{}_classify_metrics.tsv.gz".format(
gene_name, args.holdout_cancer_type, signal))
output_file = Path(gene_dir, "{}_{}_{}_classify_metrics.tsv.gz".format(
gene_name, args.holdout_cancer_type, signal)).resolve()
gene_metrics_df.to_csv(
output_file, sep="\t", index=False, compression="gzip", float_format="%.5g"
)
Expand Down
Loading

0 comments on commit 5c96290

Please sign in to comment.