Skip to content

Commit

Permalink
Merge pull request #42 from jjc2718/add_cancer_types
Browse files Browse the repository at this point in the history
Add cancer types to single-cancer mutation prediction models
  • Loading branch information
jjc2718 authored Jan 19, 2021
2 parents eb23253 + 8b3f9c7 commit 3908b29
Show file tree
Hide file tree
Showing 10 changed files with 1,856 additions and 17 deletions.
1,121 changes: 1,121 additions & 0 deletions 04_add_cancer_types/plot_add_cancer_results.ipynb

Large diffs are not rendered by default.

257 changes: 257 additions & 0 deletions 04_add_cancer_types/run_add_cancer_classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
"""
Script to run "add cancer" experiments. The general idea is to add cancers
in a particular order (either random or determined by cancer type similarity).
The prediction goal is the same as in 02_cancer_type_classification
experiments; that is, predicting presence/absence of a given mutation in
a particular cancer type.
"""
import sys
import argparse
import itertools as it
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

import pancancer_evaluation.config as cfg
from pancancer_evaluation.data_models.tcga_data_model import TCGADataModel
from pancancer_evaluation.exceptions import (
NoTrainSamplesError,
NoTestSamplesError,
OneClassError,
ResultsFileExistsError
)
from pancancer_evaluation.utilities.classify_utilities import run_cv_cancer_type
import pancancer_evaluation.utilities.data_utilities as du
import pancancer_evaluation.utilities.file_utilities as fu

def process_args():
p = argparse.ArgumentParser()
p.add_argument('--custom_genes', nargs='*', default=None,
help='currently this needs to be a subset of top_50')
p.add_argument('--debug', action='store_true',
help='use subset of data for fast debugging')
p.add_argument('--gene_set', type=str,
choices=['top_50', 'vogelstein', 'custom'],
default='top_50',
help='choose which gene set to use. top_50 and vogelstein are '
'predefined gene sets (see data_utilities), and custom allows '
'any gene or set of genes in TCGA, specified in --custom_genes')
p.add_argument('--holdout_cancer_types', nargs='*', default=None,
help='provide a list of cancer types to hold out, uses all '
'cancer types in TCGA if none are provided')
p.add_argument('--how_to_add', type=str,
choices=['random', 'similarity'],
default='random',
help='Method for choosing cancer types to add to the '
'training dataset; see data model for details')
p.add_argument('--log_file', default=None,
help='name of file to log skipped cancer types to')
p.add_argument('--num_folds', type=int, default=4,
help='number of folds of cross-validation to run')
p.add_argument('--results_dir', default=cfg.results_dir,
help='where to write results to')
p.add_argument('--seed', type=int, default=cfg.default_seed)
p.add_argument('--subset_mad_genes', type=int, default=cfg.num_features_raw,
help='if included, subset gene features to this number of '
'features having highest mean absolute deviation')
p.add_argument('--verbose', action='store_true')
args = p.parse_args()

if args.gene_set == 'custom':
if args.custom_genes is None:
p.error('must include --custom_genes when --gene_set=\'custom\'')
args.gene_set = args.custom_genes
del args.custom_genes
elif (args.gene_set != 'custom' and args.custom_genes is not None):
p.error('must use option --gene_set=\'custom\' if custom genes are included')

sample_info_df = du.load_sample_info(args.verbose)
tcga_cancer_types = list(np.unique(sample_info_df.cancer_type))
if args.holdout_cancer_types is None:
args.holdout_cancer_types = tcga_cancer_types
else:
not_in_tcga = set(args.holdout_cancer_types) - set(tcga_cancer_types)
if len(not_in_tcga) > 0:
p.error('some cancer types not present in TCGA: {}'.format(
' '.join(not_in_tcga)))

args.results_dir = Path(args.results_dir).resolve()

if args.log_file is None:
args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()

return args, sample_info_df


if __name__ == '__main__':

# process command line arguments
args, sample_info_df = process_args()

# create results dir if it doesn't exist
args.results_dir.mkdir(parents=True, exist_ok=True)

# create empty log file if it doesn't exist
log_columns = [
'gene',
'cancer_type',
'shuffle_labels',
'skip_reason'
]
if args.log_file.exists() and args.log_file.is_file():
log_df = pd.read_csv(args.log_file, sep='\t')
else:
log_df = pd.DataFrame(columns=log_columns)
log_df.to_csv(args.log_file, sep='\t')

tcga_data = TCGADataModel(sample_info=sample_info_df,
seed=args.seed,
subset_mad_genes=args.subset_mad_genes,
verbose=args.verbose,
debug=args.debug)

genes_df = tcga_data.load_gene_set(args.gene_set)

# we want to run mutation prediction experiments:
# - for signal and shuffled labels
# (shuffled labels acts as our lower baseline)
# - for all genes in the given gene set
# - for all cancer types in the given holdout cancer types (or all of TCGA)
# - for all numbers of cancers to add to training set
# (cfg.num_train_cancer_types)
for shuffle_labels in (False, True):

print('shuffle_labels: {}'.format(shuffle_labels))

progress_1 = tqdm(genes_df.iterrows(),
total=genes_df.shape[0],
ncols=100,
file=sys.stdout)

for gene_idx, gene_series in progress_1:
gene = gene_series.gene
classification = gene_series.classification
progress_1.set_description('gene: {}'.format(gene))

gene_dir = fu.make_gene_dir(args.results_dir, gene, add_cancer=True)

progress_2 = tqdm(args.holdout_cancer_types,
ncols=100,
file=sys.stdout)

for test_cancer_type in progress_2:

progress_2.set_description('cancer type: {}'.format(test_cancer_type))
cancer_type_log_df = None

progress_3 = tqdm(cfg.num_train_cancer_types,
ncols=100,
file=sys.stdout)

for num_train_cancer_types in progress_3:

progress_3.set_description('num train cancers: {}'.format(
num_train_cancer_types))

try:
tcga_data.process_data_for_gene_and_cancer(gene,
classification,
test_cancer_type,
gene_dir,
num_train_cancer_types,
how_to_add=args.how_to_add,
shuffle_labels=shuffle_labels)
except NoTrainSamplesError:
if args.verbose:
print('Skipping due to no train samples: gene {}, '
'cancer type {}'.format(gene, test_cancer_type),
file=sys.stderr)
cancer_type_log_df = fu.generate_log_df(
log_columns,
[gene, test_cancer_type, shuffle_labels, 'no_train_samples']
)
continue

try:
# check if results file already exists, if not skip it
check_file = fu.check_add_cancer_file(gene_dir,
gene,
test_cancer_type,
num_train_cancer_types,
args.how_to_add,
args.seed,
shuffle_labels)
except ResultsFileExistsError:
if args.verbose:
print('Skipping because results file exists already: '
'gene {}, cancer type {}'.format(gene, test_cancer_type),
file=sys.stderr)
cancer_type_log_df = fu.generate_log_df(
log_columns,
[gene, test_cancer_type, shuffle_labels, 'file_exists']
)
continue

try:
# run cross-validation for the given cancer type
#
# since we already filtered the dataset to the cancer
# types of interest, we can just use this function with
# the "pancancer" option (you can think of it as a a
# pancancer model where the "universe" of all cancers
# is limited by our previous filtering, kinda).
results = run_cv_cancer_type(tcga_data,
gene,
test_cancer_type,
sample_info_df,
args.num_folds,
use_pancancer=True,
use_pancancer_only=False,
shuffle_labels=shuffle_labels)
except NoTrainSamplesError:
if args.verbose:
print('Skipping due to no train samples: gene {}, '
'cancer type {}'.format(gene, test_cancer_type),
file=sys.stderr)
cancer_type_log_df = fu.generate_log_df(
log_columns,
[gene, test_cancer_type, shuffle_labels, 'no_train_samples']
)
except NoTestSamplesError:
if args.verbose:
print('Skipping due to no test samples: gene {}, '
'cancer type {}'.format(gene, test_cancer_type),
file=sys.stderr)
cancer_type_log_df = fu.generate_log_df(
log_columns,
[gene, test_cancer_type, shuffle_labels, 'no_test_samples']
)
except OneClassError:
if args.verbose:
print('Skipping due to one holdout class: gene {}, '
'cancer type {}'.format(gene, test_cancer_type),
file=sys.stderr)
cancer_type_log_df = fu.generate_log_df(
log_columns,
[gene, test_cancer_type, shuffle_labels, 'one_class']
)
else:
# only save results if no exceptions
fu.save_results_add_cancer(gene_dir,
check_file,
results,
gene,
test_cancer_type,
tcga_data.y_df.DISEASE.unique(),
num_train_cancer_types,
args.how_to_add,
args.seed,
shuffle_labels)

if cancer_type_log_df is not None:
fu.write_log_file(cancer_type_log_df, args.log_file)

File renamed without changes.
8 changes: 8 additions & 0 deletions pancancer_evaluation/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,11 @@
cross_cancer_types = [
'THCA', 'COAD', 'GBM', 'LGG', 'SKCM'
]

# parameters for "add cancer" experiments

# how many cancer types to add to target cancer
# 0 = just use target cancer, -1 = use all cancers (pan-cancer model)
num_train_cancer_types = [0, 1, 2, 4, -1]
# similarity matrix to use for 'similarity' addition option
similarity_matrix_file = data_dir / 'expression_confusion_matrix.tsv'
Loading

0 comments on commit 3908b29

Please sign in to comment.