Merge pull request #571 from d3b-center/analysis-module-action

d3b-center · May 30, 2024 · d98b38c · d98b38c
2 parents 994d4b3 + 384916d
commit d98b38c
Show file tree

Hide file tree

Showing 32 changed files with 394,845 additions and 293,293 deletions.
diff --git a/.github/workflows/run_analysis.yml b/.github/workflows/run_analysis.yml
@@ -17,14 +17,14 @@ jobs:
  docker rmi $(docker image ls -aq)
  df -h
  - name: Download Data for Consensus CN Manta
- uses: docker://pgc-images.sbgenomics.com/d3b-bixu/open-pedcan:latest
+ uses: docker://pgc-images.sbgenomics.com/d3b-bixu/openpedcanverse:latest
  with:
  entrypoint: ./download-data.sh
  env:
  OPENPEDCAN_URL: https://s3.amazonaws.com/d3b-openaccess-us-east-1-prd-pbta/open-targets
  OPENPEDCAN_RELEASE: testing
  - name: Run Consensus CN Manta
- uses: docker://pgc-images.sbgenomics.com/d3b-bixu/open-pedcan:latest
+ uses: docker://pgc-images.sbgenomics.com/d3b-bixu/openpedcanverse:latest
  with:
  entrypoint: ./analyses/copy_number_consensus_call_manta/run_consensus_call.sh
 
@@ -42,14 +42,14 @@ jobs:
  docker rmi $(docker image ls -aq)
  df -h
  - name: Download Data for Consensus CN
- uses: docker://pgc-images.sbgenomics.com/d3b-bixu/open-pedcan:latest
+ uses: docker://pgc-images.sbgenomics.com/d3b-bixu/openpedcanverse:latest
  with:
  entrypoint: ./download-data.sh
  env:
  OPENPEDCAN_URL: https://s3.amazonaws.com/d3b-openaccess-us-east-1-prd-pbta/open-targets
  OPENPEDCAN_RELEASE: testing
  - name: Run Consensus CN
- uses: docker://pgc-images.sbgenomics.com/d3b-bixu/open-pedcan:latest
+ uses: docker://pgc-images.sbgenomics.com/d3b-bixu/openpedcanverse:latest
  with:
  entrypoint: ./analyses/copy_number_consensus_call/run_consensus_call.sh
 
@@ -118,6 +118,11 @@ jobs:
  entrypoint: independent-samples/run-independent-samples.sh
  run_for_subtyping: 1
 
+ - name: Gene set enrichment
+ entrypoint: gene-set-enrichment-analysis/run-gsea.sh
+ run_for_subtyping: 1
+ openpbta_testing: 1
+
  - name: TP53/NF1 scores
  entrypoint: tp53_nf1_score/run_classifier.sh
  openpedcan_polya_strand: 0
@@ -129,11 +134,11 @@ jobs:
  entrypoint: fusion-summary/run-new-analysis.sh
  openpbta_subset: 0
 
- # - name: Consensus CN Manta
- # entrypoint: copy_number_consensus_call_manta/run_consensus_call.sh
+ - name: Consensus CN Manta
+ entrypoint: copy_number_consensus_call_manta/run_consensus_call.sh
 
- # - name: Consensus CN
- # entrypoint: copy_number_consensus_call/run_consensus_call.sh
+ - name: Consensus CN
+ entrypoint: copy_number_consensus_call/run_consensus_call.sh
 
  - name: Consensus CN annotation
  entrypoint: focal-cn-file-preparation/run-prepare-cn.sh
@@ -184,19 +189,20 @@ jobs:
  docker rmi $(docker image ls -aq)
  df -h
  - name: Download Data
- uses: docker://pgc-images.sbgenomics.com/d3b-bixu/open-pedcan:latest
+ uses: docker://pgc-images.sbgenomics.com/d3b-bixu/openpedcanverse:latest
  with:
  entrypoint: ./download-data.sh
  env:
  OPENPEDCAN_URL: https://s3.amazonaws.com/d3b-openaccess-us-east-1-prd-pbta/open-targets
  OPENPEDCAN_RELEASE: testing
 
  - name: Run Analysis
- uses: docker://pgc-images.sbgenomics.com/d3b-bixu/open-pedcan:latest
+ uses: docker://pgc-images.sbgenomics.com/d3b-bixu/openpedcanverse:latest
  with:
  entrypoint: analyses/${{ matrix.entrypoint }}
  env:
  OPENPBTA_SUBSET: ${{ matrix.openpbta_subset }}
  OPENPBTA_TESTING: ${{ matrix.openpbta_testing }}
  RUN_FOR_SUBTYPING: ${{ matrix.run_for_subtyping }}
- OPENPEDCAN_POLYA_STRAND: ${{ matrix.openpedcan_polya_strand }}
+ OPENPEDCAN_POLYA_STRAND: ${{ matrix.openpedcan_polya_strand }}
+
diff --git a/analyses/copy_number_consensus_call/run_consensus_call.sh b/analyses/copy_number_consensus_call/run_consensus_call.sh
@@ -39,15 +39,15 @@ python3 scripts/merged_to_individual_files.py \
 ## The snakemake flag options are:
 ## -s : Point to the location of the Snakemake file
 ## --configfile : Point to the location of the config file
-## -j : Set available cores, in this case, when no number is provided, thus use all available cores
+## -j : Set available cores, in this case, when $(nproc) is provided, thus use all available cores
 ## -p : Print shell command that will be executed
 ## --restart-times : Define the times a job restarts when run into an error before giving up
 ## --latency-wait: Define the number of seconds to wait for a file to show up after that file has been created
 
 snakemake \
  -s Snakefile \
  --configfile $SCRATCHDIR/config_snakemake.yaml \
- -j \
+ -j $(nproc) \
  --restart-times 2
 
 # merge cnv files

diff --git a/analyses/copy_number_consensus_call_manta/run_consensus_call.sh b/analyses/copy_number_consensus_call_manta/run_consensus_call.sh
@@ -47,13 +47,13 @@ python3 scripts/merged_to_individual_files.py \
 ## The snakemake flag options are:
 ## -s : Point to the location of the Snakemake file
 ## --configfile : Point to the location of the config file
-## -j : Set available cores, in this case, when no number is provided, thus use all available cores
+## -j : Set available cores, in this case, when $(nproc) is provided, thus use all available cores
 ## -p : Print shell command that will be executed
 ## --restart-times : Define the times a job restarts when run into an error before giving up
 ## --latency-wait: Define the number of seconds to wait for a file to show up after that file has been created
 
 snakemake \
  -s Snakefile \
  --configfile $SCRATCHDIR/config_snakemake.yaml \
- -j \
+ -j $(nproc) \
  --restart-times 2 
diff --git a/analyses/gene-set-enrichment-analysis/01-conduct-gsea-analysis.R b/analyses/gene-set-enrichment-analysis/01-conduct-gsea-analysis.R
@@ -9,7 +9,7 @@
 # + "For pathways containing genes strongly acting in both directions, the deviations with cancel each other out and show little or no enrichment."
 #
 # Written by Stephanie J. Spielman for CCDL ALSF, 2020
-#
+# Edited by Jo lynne Rokita for D3b, 2024
 #
 #
 # ####### USAGE, assumed to be run from top-level of project:
@@ -61,6 +61,12 @@ option_list <- list(
  type = "character",
  default = NA,
  help = "Histology file containing clinical information in TSV format."
+ ),
+ optparse::make_option(
+ c("--library"),
+ type = "character",
+ default = "rna_library",
+ help = "RNA library to run in GHA."
  )
 )
 
@@ -87,14 +93,16 @@ if (!file.exists(expression_data_file)) stop("\n\nERROR: Provided input file doe
 scores_output_file <- file.path(output_dir, basename(opt$output_file))
 
 #### Load input files --------------------------------------------------------------------
-expression_data <- as.data.frame( readr::read_rds(expression_data_file) )
+expression_data <- as.data.frame(readr::read_rds(expression_data_file))
 human_hallmark <- msigdbr::msigdbr(species = "Homo sapiens", category = "H") ## human hallmark genes from `migsdbr` package. The loaded data is a tibble.
 
 histology_df <- readr::read_tsv(opt$histology, guess_max = 100000)
 
 #### Prepare hallmark genes: Create a list of hallmarks, each of which is a list of genes -----------------------------------------------
-human_hallmark_twocols <- human_hallmark %>% dplyr::select(gs_name, human_gene_symbol)
-human_hallmark_list <- base::split(human_hallmark_twocols$human_gene_symbol, list(human_hallmark_twocols$gs_name))
+human_hallmark_twocols <- human_hallmark %>% 
+ dplyr::select(gs_name, human_gene_symbol)
+human_hallmark_list <- base::split(human_hallmark_twocols$human_gene_symbol, 
+ list(human_hallmark_twocols$gs_name))
 
 #### Perform gene set enrichment analysis --------------------------------------------------------------------
 
@@ -103,27 +111,36 @@ human_hallmark_list <- base::split(human_hallmark_twocols$human_gene_symbol,
 histology_rna_df <- histology_df %>% 
  dplyr::filter(experimental_strategy == "RNA-Seq") %>% 
  dplyr::filter(!is.na(RNA_library)) %>%
- dplyr::filter(cohort != "TCGA") %>%
- dplyr::filter(!is.na(broad_histology)) %>%
- dplyr::filter(broad_histology != "Non-tumor") %>% 
- dplyr::filter(broad_histology != "Other tumor") #%>%
- #dplyr::filter(!is.na(cancer_group)) # using histologies base for won't work 
-
+ dplyr::filter(!cohort %in% c("TCGA", "GTEx")) %>%
+ dplyr::filter(!is.na(broad_histology))
 
 # First filter expression data to exclude GTEx and TCGA
 expression_data <- expression_data %>% 
- dplyr::select(histology_rna_df$Kids_First_Biospecimen_ID)
+ dplyr::select(any_of(histology_rna_df$Kids_First_Biospecimen_ID))
 
 # for each type of the RNA library, we subset the expression matrix accordingly and run gsea scores for each RNA library 
-rna_library_list <- histology_rna_df %>% pull(RNA_library) %>% unique()
+rna_library_list <- histology_rna_df %>% 
+ pull(RNA_library) %>% 
+ unique()
+
 # Further subset to each cohort to deal with size issues
-cohort_list <- histology_rna_df %>% pull(cohort) %>% unique()
+cohort_list <- histology_rna_df %>% 
+ pull(cohort) %>% 
+ unique()
 
 gsea_scores_df_tidy <- data.frame()
 
 # iterate through each cohort and RNA library type 
 for(i in 1:length(rna_library_list)){
- rna_library = rna_library_list[i]
+
+ if (opt$library == "rna_library") {
+ rna_library = rna_library_list[i]
+ }
+
+ else if (opt$library == "stranded") {
+ rna_library = "stranded"
+ }
+
  # get bs id for one particular rna library type
  rna_library_type_bs_id <- histology_rna_df %>% 
  dplyr::filter(RNA_library == rna_library) %>% 
@@ -133,40 +150,45 @@ for(i in 1:length(rna_library_list)){
  # Filter the expression data to this RNA library type
  # Subset to the remaining samples 
  expression_data_each <- expression_data %>% 
- dplyr::select(rna_library_type_bs_id)
+ dplyr::select(any_of(rna_library_type_bs_id))
 
  ### Rownames are genes and column names are samples
- expression_data_each_log2_matrix <- as.matrix( log2(expression_data_each + 1) )
+ expression_data_each_log2 <- log2(expression_data_each + 1)
+ expression_data_each_log2_matrix <- as.matrix(expression_data_each_log2)
 
- # Renmove genes with 0 variance
+ # Remove genes with 0 variance
  #keep <- apply(expression_data_each_log2_matrix, 1, function(x) var(x, na.rm = TRUE)) > 0 
  #expression_data_each_log2_matrix_keep <- data.matrix(expression_data_each_log2_matrix[keep,])
-
- #We then calculate the Gaussian-distributed scores
- gsea_scores_each <- GSVA::gsva(expression_data_each_log2_matrix,
- human_hallmark_list,
- method = "gsva",
- min.sz=1, max.sz=1500,## Arguments from K. Rathi
- parallel.sz = 8, # For the bigger dataset, this ensures this won't crash due to memory problems
- mx.diff = TRUE) ## Setting this argument to TRUE computes Gaussian-distributed scores (bimodal score distribution if FALSE)
+
+ # We then calculate the Gaussian-distributed scores
+ gsea_scores_param <- gsvaParam(expression_data_each_log2_matrix,
+ geneSets = human_hallmark_list,
+ kcdf = "Gaussian",
+ assay = NA_character_,
+ annotation = NA_character_,
+ tau = 1,
+ minSize = 1, 
+ maxSize = 1500, ## Arguments from K. Rathi
+ maxDiff = TRUE) ## Setting this argument to TRUE computes Gaussian-distributed scores (bimodal score distribution if FALSE)
+
+ gsea_scores_each <- gsva(gsea_scores_param, verbose = TRUE)
 
  ### Clean scoring into tidy format
  gsea_scores_each_df <- as.data.frame(gsea_scores_each) %>%
  rownames_to_column(var = "hallmark_name")
-
- #first/last_bs needed for use in gather (we are not on tidyr1.0)
- first_bs <- head(colnames(gsea_scores_each), n=1)
- last_bs <- tail(colnames(gsea_scores_each), n=1)
-
+
  rna_library<-gsub(" ", "_", rna_library)
  rna_library<-stringr::str_to_lower(gsub("-", "", rna_library))
 
  gsea_scores_each_df_tidy <- gsea_scores_each_df %>%
- tidyr::gather(Kids_First_Biospecimen_ID, gsea_score, !!first_bs : !!last_bs) %>%
+ tidyr::pivot_longer(cols = -hallmark_name, 
+ names_to = "Kids_First_Biospecimen_ID", 
+ values_to = "gsea_score") %>%
  dplyr::select(Kids_First_Biospecimen_ID, hallmark_name, gsea_score) %>%
  dplyr::mutate(data_type = rna_library)
 
- gsea_scores_df_tidy <- bind_rows(gsea_scores_df_tidy , gsea_scores_each_df_tidy)
+ gsea_scores_df_tidy <- bind_rows(gsea_scores_df_tidy, 
+ gsea_scores_each_df_tidy)
 }
 
 

diff --git a/analyses/gene-set-enrichment-analysis/02-model-gsea.Rmd b/analyses/gene-set-enrichment-analysis/02-model-gsea.Rmd
@@ -1,7 +1,7 @@
 ---
 title: "GSVA Score Modeling"
-author: "Stephanie J. Spielman for ALSF CCDL"
-date: '2020'
+author: "Stephanie J. Spielman for ALSF CCDL, Jo Lynne Rokita for D3b"
+date: '2020, 2024'
 output:
  html_document:
  df_print: paged
@@ -177,18 +177,18 @@ for(i in 1:length(rna_library_list)){
 
 # CI part is commented out since we want to run in CAVATICA later
 ### Don't run polyA samples in CI due to data limitations, won't be enough levels for ANOVA
-# if (!(running_in_ci)){
-# cancer_group_polya_model_results <- gsva_anova_tukey(metadata_with_gsva, cancer_group, 'polya', SIGNIFICANCE_THRESHOLD)
-# broad_histology_polya_model_results <- gsva_anova_tukey(metadata_with_gsva, broad_histology, 'polya', SIGNIFICANCE_THRESHOLD)
-# head(cancer_group_polya_model_results)
-# head(broad_histology_polya_model_results)
-# 
-# ### Save polya library results
-# write_tsv(cancer_group_polya_model_results[["anova"]],  file_anova_polya_cancer_group)
-# write_tsv(broad_histology_polya_model_results[["anova"]],  file_anova_polya_broad_histology)
-# write_tsv(cancer_group_polya_model_results[["tukey"]],  file_tukey_polya_cancer_group)
-# write_tsv(broad_histology_polya_model_results[["tukey"]],  file_tukey_polya_broad_histology)
-# }
+ if (!(running_in_ci)){
+ cancer_group_polya_model_results <- gsva_anova_tukey(metadata_with_gsva, cancer_group, 'polya', SIGNIFICANCE_THRESHOLD)
+ broad_histology_polya_model_results <- gsva_anova_tukey(metadata_with_gsva, broad_histology, 'polya', SIGNIFICANCE_THRESHOLD)
+ head(cancer_group_polya_model_results)
+ head(broad_histology_polya_model_results)
+ 
+ ### Save polya library results
+ write_tsv(cancer_group_polya_model_results[["anova"]], "results/gsva_anova_polya_cancer_group.tsv")
+ write_tsv(broad_histology_polya_model_results[["anova"]], "results/gsva_anova_polya_broad_histology.tsv")
+ write_tsv(cancer_group_polya_model_results[["tukey"]], "results/gsva_tukey_polya_cancer_group.tsv")
+ write_tsv(broad_histology_polya_model_results[["tukey"]], "results/gsva_tukey_polya_broad_histology.tsv")
+ }
 
 ```
 
@@ -214,10 +214,10 @@ for(i in 1:length(rna_library_list)){
 }
 
 # CI part is commented out since we want to run in CAVATICA later
-# if (!(running_in_ci))
-# {
-# cancer_group_polya_model_results[["anova"]] %>% count(significant_anova)
-# }
+ if (!(running_in_ci))
+ {
+ cancer_group_polya_model_results[["anova"]] %>% count(significant_anova)
+ }
 ```
 
 > All are significantly different for stranded, none for polya. Likely due to data/power limitations.
@@ -244,10 +244,10 @@ for(i in 1:length(rna_library_list)){
 }
 
 # CI part is commented out since we want to run in CAVATICA later
-# if (!(running_in_ci))
-# {
-# broad_histology_polya_model_results[["anova"]] %>% count(significant_anova)
-# }
+ if (!(running_in_ci))
+ {
+ broad_histology_polya_model_results[["anova"]] %>% count(significant_anova)
+ }
 ```
 
 

diff --git a/analyses/gene-set-enrichment-analysis/02-model-gsea.html b/analyses/gene-set-enrichment-analysis/02-model-gsea.html
diff --git a/analyses/gene-set-enrichment-analysis/02-model-gsea.nb.html b/analyses/gene-set-enrichment-analysis/02-model-gsea.nb.html
diff --git a/analyses/gene-set-enrichment-analysis/README.md b/analyses/gene-set-enrichment-analysis/README.md
@@ -1,12 +1,12 @@
-# Gene Set Enrichment Analysis
+# Gene Set Enrichment Analysis using GSVA
 
 Written by Stephanie J. Spielman to supercede previous analyses in [`ssgsea-hallmark`](https://github.com/AlexsLemonade/OpenPBTA-analysis/tree/master/analyses/ssgsea-hallmark).
+Edited for R 4.4, GSVA 1.52.0, tidyR >1.0 by Jo Lynne Rokita
 
 Primary goals include:
 
 1. Score hallmark pathways based on expression data using GSVA analysis, using a strategy that produces Gaussian-distributed scores.
 2. Analyze scores for highly significant differences among tumor classifications 
-3. [Pending] generate heatmaps and templates for boxplot figures 
 
 ## Usage:
 
@@ -18,7 +18,7 @@ using OPENPBTA_BASE_SUBTYPING=1 to run this module using the pbta-histologies-ba
 OPENPBTA_BASE_SUBTYPING=1 analyses/gene-set-enrichment-analysis/run-gsea.sh
 ```
 
-OR by default uses pbta-histologies.tsv from data folder
+OR by default uses histologies.tsv from data folder
 ```sh
 bash analyses/gene-set-enrichment-analysis/run-gsea.sh
 ```
@@ -31,13 +31,11 @@ bash analyses/gene-set-enrichment-analysis/run-gsea.sh
 
 + `02-model-gsea.Rmd` performs ANOVA and Tukey tests on GSVA scores to evaluate, for each hallmark pathway, differences in GSVA across groups (e.g. short histology or disease type).
 
-+ FORTHCOMING: `03-visualize-gsea.Rmd` will create heatmaps and boxplots to visualize scores.
-
 + `results/gsva_scores.tsv` represents GSVA scores calculated from `gene-expression-rsem-tpm-collapsed.rds` with `Rscript --vanilla 01-conduct-gsea-analysis.R`
 
 + `results/gsva_scores_polya.tsv` represents GSVA scores calculated from `pbta-gene-expression-rsem-fpkm-collapsed.polya.rds` with with `Rscript --vanilla 01-conduct-gsea-analysis.R`
 
 + Files named as `results/gsva_<tukey/anova>_<all_possible_RNA_library>_<broad_histology/cancer_group)>.tsv` represent results from modeling
- + Files created with: `Rscript --vanilla 02-model-gsea.R`
+ + Files created with: `Rscript -e "rmarkdown::render('02-model-gsea.Rmd', clean = TRUE, params=list(is_ci = ${IS_CI}))"`
  + Assumes `results/gsva_scores.tsv`