Merge pull request #10 from jjc2718/bug_fixes

Subset gene features by MAD and clean up data preprocessing code
greenelab · Aug 11, 2020 · 5c96290 · 5c96290
2 parents 66ca105 + 9058f90
commit 5c96290
Show file tree

Hide file tree

Showing 4 changed files with 170 additions and 62 deletions.
diff --git a/pancancer_utilities/classify_utilities.py b/pancancer_utilities/classify_utilities.py
@@ -86,6 +86,7 @@ def train_model(x_train, x_test, y_train, alphas, l1_ratios, seed, n_folds=5, ma
 
     return cv_pipeline, y_predict_train, y_predict_test, y_cv
 
+
 def extract_coefficients(cv_pipeline, feature_names, signal, seed):
     """
     Pull out the coefficients from the trained classifiers
@@ -114,6 +115,7 @@ def extract_coefficients(cv_pipeline, feature_names, signal, seed):
 
     return coef_df
 
+
 def get_threshold_metrics(y_true, y_pred, drop=False):
     """
     Retrieve true/false positive rates and auroc/aupr for class predictions
@@ -145,6 +147,7 @@ def get_threshold_metrics(y_true, y_pred, drop=False):
 
     return {"auroc": auroc, "aupr": aupr, "roc_df": roc_df, "pr_df": pr_df}
 
+
 def summarize_results(results, gene, holdout_cancer_type, signal, seed,
                       data_type, fold_no):
     """

diff --git a/pancancer_utilities/data_utilities.py b/pancancer_utilities/data_utilities.py
@@ -5,6 +5,7 @@
 import os
 import sys
 
+import numpy as np
 import pandas as pd
 import pickle as pkl
 from sklearn.model_selection import KFold
@@ -17,7 +18,6 @@ def load_expression_data(scale_input=False, verbose=False):
 
     Arguments
     ---------
-    subset_mad_genes (int): TODO still need to implement this
     scale_input (bool): whether or not to scale the expression data
     verbose (bool): whether or not to print verbose output
 
@@ -30,11 +30,6 @@ def load_expression_data(scale_input=False, verbose=False):
 
     rnaseq_df = pd.read_csv(cfg.rnaseq_data, index_col=0, sep='\t')
 
-    # TODO: this needs to be added to data loading script
-    # if subset_mad_genes is not None:
-    #     rnaseq_df = subset_genes_by_mad(rnaseq_df, cfg.mad_data,
-    #                                     subset_mad_genes)
-
     # Scale RNAseq matrix the same way RNAseq was scaled for
     # compression algorithms
     if scale_input:
@@ -47,6 +42,7 @@ def load_expression_data(scale_input=False, verbose=False):
 
     return rnaseq_df
 
+
 def load_pancancer_data(gene_list, verbose=False):
     """Load pan-cancer relevant data from previous Greene Lab repos.
 
@@ -97,7 +93,8 @@ def load_pancancer_data(gene_list, verbose=False):
         with open(cfg.pancan_data, 'wb') as f:
             pkl.dump(pancan_data, f)
 
-    return (genes_df, pancan_data)
+    return genes_df, pancan_data
+
 
 def load_top_50():
     """Load top 50 mutated genes in TCGA from BioBombe repo.
@@ -113,6 +110,7 @@ def load_top_50():
     genes_df = pd.read_csv(file, sep='\t')
     return genes_df
 
+
 def load_pancancer_data_from_repo():
     """Load data to build feature matrices from pancancer repo. """
 
@@ -142,11 +140,13 @@ def load_pancancer_data_from_repo():
         mut_burden_df
     )
 
+
 def load_sample_info(verbose=False):
     if verbose:
         print('Loading sample info...', file=sys.stderr)
     return pd.read_csv(cfg.sample_info, sep='\t', index_col='sample_id')
 
+
 def split_by_cancer_type(rnaseq_df, sample_info_df, holdout_cancer_type,
                          use_pancancer=False, num_folds=4, fold_no=1,
                          seed=cfg.default_seed):
@@ -190,7 +190,8 @@ def split_by_cancer_type(rnaseq_df, sample_info_df, holdout_cancer_type,
     else:
         rnaseq_train_df = cancer_type_train_df
 
-    return (rnaseq_train_df, rnaseq_test_df)
+    return rnaseq_train_df, rnaseq_test_df
+
 
 def split_single_cancer_type(cancer_type_df, num_folds, fold_no, seed):
     """Split data for a single cancer type into train and test sets."""
@@ -201,6 +202,7 @@ def split_single_cancer_type(cancer_type_df, num_folds, fold_no, seed):
             test_df = cancer_type_df.iloc[test_ixs]
     return train_df, test_df
 
+
 def summarize_results(results, gene, holdout_cancer_type, signal, z_dim,
                       seed, algorithm, data_type):
     """
@@ -252,4 +254,43 @@ def summarize_results(results, gene, holdout_cancer_type, signal, z_dim,
 
     return metrics_out_, roc_df_, pr_df_
 
+def subset_by_mad(X_train_df, X_test_df, gene_features, subset_mad_genes, verbose=False):
+    """Subset features by mean absolute deviation.
+
+    Takes the top subset_mad_genes genes (sorted in descending order),
+    calculated on the training set.
+
+    Arguments
+    ---------
+    X_train_df: training data, samples x genes
+    X_test_df: test data, samples x genes
+    gene_features: numpy bool array, indicating which features are genes (and should be subsetted/standardized)
+    subset_mad_genes (int): number of genes to take
+
+    Returns
+    -------
+    (train_df, test_df, gene_features) datasets with filtered features
+    """
+    if verbose:
+        print('Taking subset of gene features', file=sys.stderr)
+
+    mad_genes_df = (
+        X_train_df.loc[:, gene_features]
+                  .mad(axis=0)
+                  .sort_values(ascending=False)
+                  .reset_index()
+    )
+    mad_genes_df.columns = ['gene_id', 'mean_absolute_deviation']
+    mad_genes = mad_genes_df.iloc[:subset_mad_genes, :].gene_id.astype(str).values
+
+    non_gene_features = X_train_df.columns.values[~gene_features]
+    valid_features = np.concatenate((mad_genes, non_gene_features))
+
+    gene_features = np.concatenate((
+        np.ones(mad_genes.shape[0]).astype('bool'),
+        np.zeros(non_gene_features.shape[0]).astype('bool')
+    ))
+    train_df = X_train_df.reindex(valid_features, axis='columns')
+    test_df = X_test_df.reindex(valid_features, axis='columns')
+    return train_df, test_df, gene_features
 
diff --git a/pancancer_utilities/scripts/classify_cancer_type.py b/pancancer_utilities/scripts/classify_cancer_type.py
@@ -4,10 +4,9 @@
 Adapted from:
 https://github.com/greenelab/BioBombe/blob/master/9.tcga-classify/classify-with-raw-expression.py
 """
-import os
 import argparse
 import logging
-import pickle as pkl
+from pathlib import Path
 
 import numpy as np
 import pandas as pd
@@ -16,8 +15,8 @@
 import pancancer_utilities.data_utilities as du
 from pancancer_utilities.tcga_utilities import (
     process_y_matrix,
-    process_y_matrix_cancertype,
     align_matrices,
+    standardize_gene_features,
     check_status
 )
 from pancancer_utilities.classify_utilities import (
@@ -27,6 +26,10 @@
     summarize_results
 )
 
+#########################################
+### 1. Process command line arguments ###
+#########################################
+
 p = argparse.ArgumentParser()
 p.add_argument('--gene', type=str, required=True,
                help='Provide a gene to run mutation classification for.')
@@ -41,13 +44,34 @@
 p.add_argument('--seed', type=int, default=cfg.default_seed)
 p.add_argument('--shuffle_labels', action='store_true',
                help='Include flag to shuffle labels as a negative control')
+p.add_argument('--subset_mad_genes', type=int, default=-1,
+               help='If included, subset gene features to this number of\
+                     features having highest mean absolute deviation.')
 p.add_argument('--verbose', action='store_true')
 args = p.parse_args()
 
 np.random.seed(args.seed)
 if args.verbose:
     logging.basicConfig(level=logging.DEBUG, format='%(message)s')
 
+# create directory for the gene
+dirname = 'pancancer' if args.use_pancancer else 'single_cancer'
+gene_dir = Path(args.results_dir, dirname, args.gene).resolve()
+gene_dir.mkdir(parents=True, exist_ok=True)
+
+# check if gene has been processed already
+# TODO: probably want to add a "resume" option for this in the future
+signal = 'shuffled' if args.shuffle_labels else 'signal'
+check_file = Path(gene_dir,
+                  "{}_{}_{}_coefficients.tsv.gz".format(
+                  args.gene, args.holdout_cancer_type, signal)).resolve()
+if check_status(check_file):
+    exit('Results file already exists, exiting')
+
+#######################################################
+### 2. Load gene expression and mutation label data ###
+#######################################################
+
 # load and unpack pancancer data
 genes_df, pancan_data = du.load_pancancer_data([args.gene], verbose=args.verbose)
 gene_idx = 0
@@ -85,19 +109,9 @@
 gene_coef_list = []
 gene_metrics_list = []
 
-# create directory for the gene
-dirname = 'pancancer' if args.use_pancancer else 'single_cancer'
-gene_dir = os.path.join(args.results_dir, dirname, gene_name)
-os.makedirs(gene_dir, exist_ok=True)
-
-# check if this experiment has been run already
-# TODO: probably want to add a "resume" option for this in the future
-signal = 'shuffled' if args.shuffle_labels else 'signal'
-check_file = os.path.join(gene_dir,
-                          "{}_{}_{}_coefficients.tsv.gz".format(
-                              gene_name, args.holdout_cancer_type, signal))
-if check_status(check_file):
-    exit()
+#################################################
+### 3. Process labels and filter cancer types ###
+#################################################
 
 # process the y matrix for the given gene or pathway
 y_mutation_df = mutation_df.loc[:, gene_name]
@@ -128,32 +142,54 @@
     hyper_filter=5
 )
 
+use_samples, rnaseq_df, y_df, gene_features = align_matrices(
+    x_file_or_df=rnaseq_df,
+    y=y_df,
+    add_cancertype_covariate=args.use_pancancer,
+    add_mutation_covariate=True
+)
+
 # shuffle mutation status labels if necessary
 if args.shuffle_labels:
     y_df.status = np.random.permutation(y_df.status.values)
 
+############################################
+### 4. Split data and fit/evaluate model ###
+############################################
+
 for fold_no in range(args.num_folds):
 
-    X_train_raw_df, X_test_raw_df = du.split_by_cancer_type(
-       rnaseq_df, sample_info_df, args.holdout_cancer_type,
-       num_folds=args.num_folds, fold_no=fold_no,
-       use_pancancer=args.use_pancancer, seed=args.seed)
+    logging.debug('Splitting data and preprocessing features...')
 
+    # split data into train and test sets
     try:
-        train_samples, X_train_df, y_train_df = align_matrices(
-            x_file_or_df=X_train_raw_df, y=y_df, add_cancertype_covariate=False
-        )
-        test_samples, X_test_df, y_test_df = align_matrices(
-            x_file_or_df=X_test_raw_df, y=y_df, add_cancertype_covariate=False
-        )
+        X_train_raw_df, X_test_raw_df = du.split_by_cancer_type(
+           rnaseq_df, sample_info_df, args.holdout_cancer_type,
+           num_folds=args.num_folds, fold_no=fold_no,
+           use_pancancer=args.use_pancancer, seed=args.seed)
     except ValueError:
         exit('No test samples found for cancer type: {}, gene: {}\n'.format(
                args.holdout_cancer_type, args.gene))
 
+    y_train_df = y_df.reindex(X_train_raw_df.index)
+    y_test_df = y_df.reindex(X_test_raw_df.index)
+
+    # data processing/feature selection, needs to happen for train and
+    # test sets independently
+    if args.subset_mad_genes > 0:
+        X_train_raw_df, X_test_raw_df, gene_features_filtered = du.subset_by_mad(
+            X_train_raw_df, X_test_raw_df, gene_features, args.subset_mad_genes
+        )
+        X_train_df = standardize_gene_features(X_train_raw_df, gene_features_filtered)
+        X_test_df = standardize_gene_features(X_test_raw_df, gene_features_filtered)
+    else:
+        X_train_df = standardize_gene_features(X_train_raw_df, gene_features)
+        X_test_df = standardize_gene_features(X_test_raw_df, gene_features)
+
     # fit the model
     logging.debug('Training model for fold {}'.format(fold_no))
-    logging.debug(X_train_df.shape)
-    logging.debug(X_test_df.shape)
+    logging.debug('-- training dimensions: {}'.format(X_train_df.shape))
+    logging.debug('-- testing dimensions: {}'.format(X_test_df.shape))
     cv_pipeline, y_pred_train_df, y_pred_test_df, y_cv_df = train_model(
         x_train=X_train_df,
         x_test=X_test_df,
@@ -212,35 +248,35 @@
 
     gene_coef_list.append(coef_df)
 
+#######################################
+### 5. Save results to output files ###
+#######################################
 
 gene_auc_df = pd.concat(gene_auc_list)
 gene_aupr_df = pd.concat(gene_aupr_list)
 gene_coef_df = pd.concat(gene_coef_list)
 gene_metrics_df = pd.concat(gene_metrics_list)
 
-# save metric dataframes and label info to files
 gene_coef_df.to_csv(
     check_file, sep="\t", index=False, compression="gzip", float_format="%.5g"
 )
 
-output_file = os.path.join(
+output_file = Path(
     gene_dir, "{}_{}_{}_auc_threshold_metrics.tsv.gz".format(
-        gene_name, args.holdout_cancer_type, signal)
-)
+        gene_name, args.holdout_cancer_type, signal)).resolve()
 gene_auc_df.to_csv(
     output_file, sep="\t", index=False, compression="gzip", float_format="%.5g"
 )
 
-output_file = os.path.join(
+output_file = Path(
     gene_dir, "{}_{}_{}_aupr_threshold_metrics.tsv.gz".format(
-        gene_name, args.holdout_cancer_type, signal)
-)
+        gene_name, args.holdout_cancer_type, signal)).resolve()
 gene_aupr_df.to_csv(
     output_file, sep="\t", index=False, compression="gzip", float_format="%.5g"
 )
 
-output_file = os.path.join(gene_dir, "{}_{}_{}_classify_metrics.tsv.gz".format(
-    gene_name, args.holdout_cancer_type, signal))
+output_file = Path(gene_dir, "{}_{}_{}_classify_metrics.tsv.gz".format(
+    gene_name, args.holdout_cancer_type, signal)).resolve()
 gene_metrics_df.to_csv(
     output_file, sep="\t", index=False, compression="gzip", float_format="%.5g"
 )