Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add cancer types to single-cancer mutation prediction models #42

Merged
merged 22 commits into from
Jan 19, 2021
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
22 commits
Select commit Hold shift + click to select a range
f51d020
get_cancers_to_add function seems to work
jjc2718 Jan 8, 2021
2d9057c
small tweaks to cancer types function
jjc2718 Jan 8, 2021
c0d848d
add run cancer script + get things running somewhat
jjc2718 Jan 8, 2021
6e6d0bd
fix order of random cancer type addition
jjc2718 Jan 9, 2021
2fc767f
add simple unit test for add cancers function
jjc2718 Jan 9, 2021
947d174
add a few more tests
jjc2718 Jan 9, 2021
160c0e8
remove testing code from data model
jjc2718 Jan 9, 2021
14b3065
sort out filename formatting options
jjc2718 Jan 9, 2021
0bf9e5b
fix a few logging details
jjc2718 Jan 11, 2021
37afee5
should be able to use existing CV code
jjc2718 Jan 11, 2021
1a36501
small changes to make similarity ranking work
jjc2718 Jan 11, 2021
5aaec8b
add code to save add cancer experiments results
jjc2718 Jan 11, 2021
d8ff806
write results to add_cancer directory
jjc2718 Jan 11, 2021
4efad66
catch and skip case where no test cancer samples
jjc2718 Jan 11, 2021
bc21692
start working on add cancer analysis script
jjc2718 Jan 12, 2021
c09e5c7
add cancer types to single id plot
jjc2718 Jan 12, 2021
59d9082
add comparison against single-cancer and pancancer classifiers
jjc2718 Jan 12, 2021
7d710fd
rearrange cells to make repeat runs easier/quicker
jjc2718 Jan 12, 2021
885217a
add single-cancer vs. pancancer plot
jjc2718 Jan 12, 2021
30838eb
update some documentation
jjc2718 Jan 14, 2021
15316af
add a bit of explanation to results notebook
jjc2718 Jan 14, 2021
8b3f9c7
use results from current experiments for box plot
jjc2718 Jan 19, 2021
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1,121 changes: 1,121 additions & 0 deletions 04_add_cancer_types/plot_add_cancer_results.ipynb

Large diffs are not rendered by default.

257 changes: 257 additions & 0 deletions 04_add_cancer_types/run_add_cancer_classification.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,257 @@
"""
Script to run "add cancer" experiments. The general idea is to add cancers
in a particular order (either random or determined by cancer type similarity).

The prediction goal is the same as in 02_cancer_type_classification
experiments; that is, predicting presence/absence of a given mutation in
a particular cancer type.

"""
import sys
import argparse
import itertools as it
from pathlib import Path

import numpy as np
import pandas as pd
from tqdm import tqdm

import pancancer_evaluation.config as cfg
from pancancer_evaluation.data_models.tcga_data_model import TCGADataModel
from pancancer_evaluation.exceptions import (
NoTrainSamplesError,
NoTestSamplesError,
OneClassError,
ResultsFileExistsError
)
from pancancer_evaluation.utilities.classify_utilities import run_cv_cancer_type
import pancancer_evaluation.utilities.data_utilities as du
import pancancer_evaluation.utilities.file_utilities as fu

def process_args():
    """Parse and validate command line arguments for the add-cancer script.

    Besides parsing, this loads the TCGA sample info (needed to validate
    --holdout_cancer_types) and normalizes a few arguments:

    * with --gene_set custom, args.gene_set is replaced by the list passed
      to --custom_genes and args.custom_genes is removed
    * args.holdout_cancer_types defaults to every cancer type present in
      the sample info
    * args.results_dir and args.log_file are always resolved Path objects,
      whether they were passed explicitly or defaulted

    Invalid argument combinations are reported through ArgumentParser.error,
    which prints a usage message and exits.

    Returns:
        (args, sample_info_df): the parsed argparse namespace and the sample
        info dataframe returned by du.load_sample_info.
    """
    p = argparse.ArgumentParser()
    p.add_argument('--custom_genes', nargs='*', default=None,
                   help='currently this needs to be a subset of top_50')
    p.add_argument('--debug', action='store_true',
                   help='use subset of data for fast debugging')
    p.add_argument('--gene_set', type=str,
                   choices=['top_50', 'vogelstein', 'custom'],
                   default='top_50',
                   help='choose which gene set to use. top_50 and vogelstein are '
                        'predefined gene sets (see data_utilities), and custom allows '
                        'any gene or set of genes in TCGA, specified in --custom_genes')
    p.add_argument('--holdout_cancer_types', nargs='*', default=None,
                   help='provide a list of cancer types to hold out, uses all '
                        'cancer types in TCGA if none are provided')
    p.add_argument('--how_to_add', type=str,
                   choices=['random', 'similarity'],
                   default='random',
                   help='Method for choosing cancer types to add to the '
                        'training dataset; see data model for details')
    p.add_argument('--log_file', default=None,
                   help='name of file to log skipped cancer types to')
    p.add_argument('--num_folds', type=int, default=4,
                   help='number of folds of cross-validation to run')
    p.add_argument('--results_dir', default=cfg.results_dir,
                   help='where to write results to')
    p.add_argument('--seed', type=int, default=cfg.default_seed)
    p.add_argument('--subset_mad_genes', type=int, default=cfg.num_features_raw,
                   help='if included, subset gene features to this number of '
                        'features having highest mean absolute deviation')
    p.add_argument('--verbose', action='store_true')
    args = p.parse_args()

    if args.gene_set == 'custom':
        if args.custom_genes is None:
            p.error('must include --custom_genes when --gene_set=\'custom\'')
        # from here on, gene_set holds the actual list of custom genes
        args.gene_set = args.custom_genes
        del args.custom_genes
    elif args.custom_genes is not None:
        p.error('must use option --gene_set=\'custom\' if custom genes are included')

    sample_info_df = du.load_sample_info(args.verbose)
    tcga_cancer_types = list(np.unique(sample_info_df.cancer_type))
    if args.holdout_cancer_types is None:
        args.holdout_cancer_types = tcga_cancer_types
    else:
        not_in_tcga = set(args.holdout_cancer_types) - set(tcga_cancer_types)
        if len(not_in_tcga) > 0:
            # sorted so the error message is deterministic across runs
            p.error('some cancer types not present in TCGA: {}'.format(
                ' '.join(sorted(not_in_tcga))))

    args.results_dir = Path(args.results_dir).resolve()

    # Always hand back a Path: the caller uses Path methods (.exists(),
    # .is_file()) on log_file, which would fail on the raw string that
    # argparse produces for an explicitly supplied --log_file.
    if args.log_file is None:
        args.log_file = Path(args.results_dir, 'log_skipped.tsv').resolve()
    else:
        args.log_file = Path(args.log_file).resolve()

    return args, sample_info_df


if __name__ == '__main__':

    # process command line arguments
    args, sample_info_df = process_args()

    # create results dir if it doesn't exist
    args.results_dir.mkdir(parents=True, exist_ok=True)

    # create empty log file if it doesn't exist
    log_columns = [
        'gene',
        'cancer_type',
        'shuffle_labels',
        'skip_reason'
    ]
    if args.log_file.exists() and args.log_file.is_file():
        log_df = pd.read_csv(args.log_file, sep='\t')
    else:
        # NOTE(review): the header row is written with pandas' default index
        # column; confirm this lines up with the rows fu.write_log_file appends
        log_df = pd.DataFrame(columns=log_columns)
        log_df.to_csv(args.log_file, sep='\t')

    def log_skip(gene, cancer_type, shuffle_labels, skip_reason, description):
        """Report one skipped experiment and append it to the log file.

        The record is written immediately, so it cannot be lost when the
        caller `continue`s to the next experiment, nor carried over into a
        later iteration.
        """
        if args.verbose:
            print('Skipping {}: gene {}, cancer type {}'.format(
                      description, gene, cancer_type),
                  file=sys.stderr)
        skip_df = fu.generate_log_df(
            log_columns,
            [gene, cancer_type, shuffle_labels, skip_reason]
        )
        fu.write_log_file(skip_df, args.log_file)

    tcga_data = TCGADataModel(sample_info=sample_info_df,
                              seed=args.seed,
                              subset_mad_genes=args.subset_mad_genes,
                              verbose=args.verbose,
                              debug=args.debug)

    genes_df = tcga_data.load_gene_set(args.gene_set)

    # we want to run mutation prediction experiments:
    # - for signal and shuffled labels
    #   (shuffled labels acts as our lower baseline)
    # - for all genes in the given gene set
    # - for all cancer types in the given holdout cancer types (or all of TCGA)
    # - for all numbers of cancers to add to training set
    #   (cfg.num_train_cancer_types)
    for shuffle_labels in (False, True):

        print('shuffle_labels: {}'.format(shuffle_labels))

        progress_1 = tqdm(genes_df.iterrows(),
                          total=genes_df.shape[0],
                          ncols=100,
                          file=sys.stdout)

        for gene_idx, gene_series in progress_1:
            gene = gene_series.gene
            classification = gene_series.classification
            progress_1.set_description('gene: {}'.format(gene))

            gene_dir = fu.make_gene_dir(args.results_dir, gene, add_cancer=True)

            progress_2 = tqdm(args.holdout_cancer_types,
                              ncols=100,
                              file=sys.stdout)

            for test_cancer_type in progress_2:

                progress_2.set_description('cancer type: {}'.format(test_cancer_type))

                progress_3 = tqdm(cfg.num_train_cancer_types,
                                  ncols=100,
                                  file=sys.stdout)

                for num_train_cancer_types in progress_3:

                    progress_3.set_description('num train cancers: {}'.format(
                        num_train_cancer_types))

                    try:
                        tcga_data.process_data_for_gene_and_cancer(gene,
                                                                   classification,
                                                                   test_cancer_type,
                                                                   gene_dir,
                                                                   num_train_cancer_types,
                                                                   how_to_add=args.how_to_add,
                                                                   shuffle_labels=shuffle_labels)
                    except NoTrainSamplesError:
                        log_skip(gene, test_cancer_type, shuffle_labels,
                                 'no_train_samples', 'due to no train samples')
                        continue

                    try:
                        # check if results file already exists, if so skip it
                        check_file = fu.check_add_cancer_file(gene_dir,
                                                              gene,
                                                              test_cancer_type,
                                                              num_train_cancer_types,
                                                              args.how_to_add,
                                                              args.seed,
                                                              shuffle_labels)
                    except ResultsFileExistsError:
                        log_skip(gene, test_cancer_type, shuffle_labels,
                                 'file_exists',
                                 'because results file exists already')
                        continue

                    try:
                        # run cross-validation for the given cancer type
                        #
                        # since we already filtered the dataset to the cancer
                        # types of interest, we can just use this function with
                        # the "pancancer" option (you can think of it as a
                        # pancancer model where the "universe" of all cancers
                        # is limited by our previous filtering, kinda).
                        results = run_cv_cancer_type(tcga_data,
                                                     gene,
                                                     test_cancer_type,
                                                     sample_info_df,
                                                     args.num_folds,
                                                     use_pancancer=True,
                                                     use_pancancer_only=False,
                                                     shuffle_labels=shuffle_labels)
                    except NoTrainSamplesError:
                        log_skip(gene, test_cancer_type, shuffle_labels,
                                 'no_train_samples', 'due to no train samples')
                    except NoTestSamplesError:
                        log_skip(gene, test_cancer_type, shuffle_labels,
                                 'no_test_samples', 'due to no test samples')
                    except OneClassError:
                        log_skip(gene, test_cancer_type, shuffle_labels,
                                 'one_class', 'due to one holdout class')
                    else:
                        # only save results if no exceptions
                        fu.save_results_add_cancer(gene_dir,
                                                   check_file,
                                                   results,
                                                   gene,
                                                   test_cancer_type,
                                                   tcga_data.y_df.DISEASE.unique(),
                                                   num_train_cancer_types,
                                                   args.how_to_add,
                                                   args.seed,
                                                   shuffle_labels)


File renamed without changes.
8 changes: 8 additions & 0 deletions pancancer_evaluation/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -58,3 +58,11 @@
cross_cancer_types = [
    'THCA',
    'COAD',
    'GBM',
    'LGG',
    'SKCM',
]

# ---- settings for the "add cancer" experiments ----

# Number of cancer types to add to the target cancer when building the
# training set:
#    0 -> use only the target cancer type
#   -1 -> use all cancer types (i.e. a pan-cancer model)
num_train_cancer_types = [0, 1, 2, 4, -1]

# Similarity matrix used by the 'similarity' addition option
similarity_matrix_file = data_dir / 'expression_confusion_matrix.tsv'
Loading