Merge pull request #132 from griffithlab/improved_compare_junctions

Fixes docker files Improves stats script Updates readthedocs
griffithlab · Jan 8, 2020 · 7b80c9c · 7b80c9c
2 parents eab866a + 7438f66
commit 7b80c9c
Show file tree

Hide file tree

Showing 4 changed files with 136 additions and 83 deletions.
diff --git a/Dockerfile b/Dockerfile
@@ -1,10 +1,3 @@
-################################################################################
-##################### Add Container Labels #####################################
-LABEL "Regtools_License"="MIT"
-LABEL "Description"="Software package which integrate DNA-seq and RNA-seq data\
-                     to help interpret mutations in a regulatory and splicing\
-                     context."
-
 ################################################################################
 ##################### Set Inital Image to work from ############################
 
@@ -34,6 +27,13 @@ RUN apt-get update -y && apt-get install -y \
   cmake \
   python3
 
+################################################################################
+##################### Add Container Labels #####################################
+LABEL "Regtools_License"="MIT"
+LABEL "Description"="Software package which integrate DNA-seq and RNA-seq data\
+                     to help interpret mutations in a regulatory and splicing\
+                     context."
+
 ################################################################################
 ####################### Install R ##############################################
 
@@ -55,18 +55,21 @@ RUN R --vanilla -e 'install.packages(c("data.table", "plyr", "tidyverse"), repos
 ##################### Install Regtools #########################################
 
 # clone git repository
-RUN git clone https://github.com/griffithlab/regtools.git
+RUN cd / && git clone https://github.com/griffithlab/regtools.git
 
 # make a build directory for regtools
 WORKDIR /regtools/
-RUN mkdir build
+
 
 # compile from source
-RUN cd /regtools/build && cmake ..
-RUN cd /regtools/build && make
+RUN mkdir build && cd build && cmake .. && make
 
 ################################################################################
 ###################### set environment path    #################################
 
+# make a build directory for regtools
+WORKDIR /regtools/scripts/
+
 # add regtools executable to path
-ENV PATH="/regtools/build:${PATH}"
+ENV PATH="/regtools/build:/usr/local/bin/R-${r_version}:${PATH}"
+
diff --git a/docs/workflow.md b/docs/workflow.md
@@ -2,9 +2,10 @@
 
 This is an example workflow for running RegTools on a cohort of samples. This analysis requires that there be a vcf and RNA bam/cram file for each samples. The outline described below was used to run our own analysis on TCGA data.
 
-By the end of the analysis, the directory structure should look like so:
+By the end of the analysis, the directory structure should look like the example below. The `*` in the example below refers to the tag/parameter used to run `regtools cis-splice-effects identify` with.
 
-- Project/ (SCLC/)
+```bash
+- Project/
   - all_splicing_variants*.bed
   - paths.tsv
   - make_vcfs.sh
@@ -23,64 +24,82 @@ By the end of the analysis, the directory structure should look like so:
         - cse_identify_filtered_*
         - cse_identify_filtered_compare_*
         - variants*.bed
+    - Sample_2/
+      - tumor_rna_alignments.bam
+      - tumor_rna_alignments.bam.bai
+      - variants.per_gene.vep.vcf.gz
+      - variants.per_gene.vep.vcf.gz.tbi
+      - variants.ensembl
+      - logs/
+      - output/
+        - cse_identify_filtered_*
+        - cse_identify_filtered_compare_*
+        - variants*.bed
   - compare_junctions/
     - hist/
       - junction_pvalues_*.tsv
+```
+
+### Set tag and parameter shell arguments
+
+```bash
+tag=<tag>
+param=<run option>
+# (e.g. tag=default param=""; tag=E param="-E"; tag=i20e5 param="-i 20 -e 5")
+```
 
-## Set tag and parameter shell arguments
+### Run `regtools cis-splice-effects identify` with desired options for selecting variant and window size
 
-tag=<tag> param=<run option> (e.g. tag=default param=""; tag=E param="-E"; tag=i20e5 param="-i 20 -e 5")
+```bash
+for i in samples/*/; do regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_$tag.tsv -j ${i}/output/cse_identify_filtered_$tag.bed -v ${i}/output/cse_identify_filtered_$tag.vcf ${i}/variants.per_gene.vep.vcf.gz ${i}/tumor_rna_alignments.bam /reference.fa reference.gtf; done
+```
 
-# run regtools cse identify with desired options for selecting variant and window size
-for i in samples/*/; do bsub -oo $i/logs/regtools_actual_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_$tag.tsv -j ${i}/output/cse_identify_filtered_$tag.bed -v ${i}/output/cse_identify_filtered_$tag.vcf ${i}/variants.per_gene.vep.vcf.gz ${i}/tumor_rna_alignments.bam /gscmnt/gc2602/griffithlab/regtools/yafeng/GRCh37.fa /gscmnt/gc2602/griffithlab/regtools/GRCh37.87.exons.sorted.gtf; done
+### Make `variant.bed` for each sample
 
-# make variant.bed
-for i in samples/*/; do bsub -oo $i/logs/make_variant_bed_$tag.lsf bash /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/variants.sh ${i}/output/cse_identify_filtered_$tag.tsv ${i}/output/variants_$tag.bed; done 
+```bash
+for i in samples/*/; do bash variants.sh ${i}/output/cse_identify_filtered_$tag.tsv ${i}/output/variants_$tag.bed; done
+```
 
-# make bed (really just tsv with columns: chrom start end samples) with all variants that were deemed significant to splicing across all samples
+### Combine each sample's `variant.bed` file per tag to get all variants that were deemed significant to splicing across all samples for a given tag
+
+```bash
 echo -e 'chrom\tstart\tend\tsamples' > all_splicing_variants_$tag.bed
 for i in samples/*/; do j=${i##samples/}; uniq ${i}output/variants_$tag.bed | awk -v var=${j%%/} '{print $0 "\t" var}' >> all_splicing_variants_$tag.bed; done
+```
+
+### Make vcf of all variants across all samples (from each sample's variants.vcf). Then, compress it and index it
+
+```bash
+vcf-concat samples/*/variants.vcf.gz | vcf-sort > all_variants_sorted.vcf
 
-# make vcf of all variants across all samples (from variants.vcf)
-vcf-concat samples/*/variants.per_gene.vep.vcf.gz | vcf-sort > all_variants_sorted.vcf
-bgzip all_variants_sorted.vcf 
-tabix all_variants_sorted.vcf.gz 
+###### optional ######
+bgzip all_variants_sorted.vcf
 
-# run cis-splice effects identify on all samples with all variants (with $tag options as example)
-for i in samples/*/; do bsub -oo $i/logs/regtools_compare_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_compare_$tag.tsv -j ${i}/output/cse_identify_filtered_compare_$tag.bed -v ${i}/output/cse_identify_filtered_compare_$tag.vcf variants_all_sorted.vcf.gz ${i}/tumor_rna_alignments.bam /gscmnt/gc2602/griffithlab/regtools/yafeng/GRCh37.fa /gscmnt/gc2602/griffithlab/regtools/GRCh37.87.exons.sorted.gtf; done
+tabix all_variants_sorted.vcf.gz
+```
 
-<===== JUNCTION COMPARISON ANALYSIS =====>
+### Run `regtools cis-splice effects identify` on all samples with all variants (with `$tag` options as example)
 
-# make directories
-mkdir compare_junctions/
-mkdir compare_junctions/outlier
-mkdir compare_junctions/ratio
+```bash
+for i in samples/*/; do bsub -oo $i/logs/regtools_compare_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_compare_$tag.tsv -j ${i}/output/cse_identify_filtered_compare_$tag.bed -v ${i}/output/cse_identify_filtered_compare_$tag.vcf all_variants_sorted.vcf.gz ${i}/tumor_rna_alignments.bam reference.fa reference.gtf; done
+```
 
-# outlier analysis
-bsub -M 650000000 -R 'select[mem>65000] span[hosts=1] rusage[mem=65000]' /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/compare_junctions_outlier.R $tag
+## Beginning of compare junctions analysis
 
-# ratio analysis
-bsub -M 650000000 -R 'select[mem>65000] span[hosts=1] rusage[mem=65000]' /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/compare_junctions_ratio.R $tag
+### Make directory to store comparison results
 
-# vep comparison (outlier)
-bsub /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/vep_compare.R outlier $tag
+```bash
+mkdir -p compare_junctions/hist
+```
 
-# vep comparison (ratio)
-bsub /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/vep_compare.R ratio $tag
+### Run `compare_junctions_hist.R` on sample data
 
-NOTE: in the vep comparison, since regtools doesn't really have a concept of "variants" per se but rather "variant positions" (doesn't care about the actual alleles), we are really counting (positions of variants found splicing-significant by vep AND found significant by regtools) / (positions of variants found significant by found significant by regtools)
+```bash
+Rscript --vanilla compare_junctions_hist.R <tag>
+```
 
-To count:
+### Run `filter_and_BH.R` to adjust p values and filter results
 
-echo -e "anchor\tvep_significant_vars\ttotal_significant_vars\tpercent_vep_significant" > comparison_counts.tsv
-for i in default i20e5 i50e5 E I; do 
-	echo -n $i >> comparison_counts.tsv; 
-	echo -en "\t" >> comparison_counts.tsv; 
-	vep=$(cut -f 1,2,7 vep_comparison_$i.tsv | sort | uniq | grep "TRUE" | wc -l)
-	echo -n $vep >> comparison_counts.tsv; 
-	echo -en "\t" >> comparison_counts.tsv;
-	total=$(($(cut -f 1,2,7 vep_comparison_$i.tsv | sort | uniq | wc -l)-1)) 
-	echo -n $total >> comparison_counts.tsv;
-	echo -en "\t" >> comparison_counts.tsv;
-	echo $(bc -l <<< "$vep/$total") >> comparison_counts.tsv;
-done
+```bash
+Rscript --vanilla filter_and_BH.R <tag>
+```
diff --git a/scripts/compare_junctions_hist_v2.R b/scripts/compare_junctions_hist_v2.R
@@ -9,21 +9,21 @@ library(tidyverse)
 
 debug = F
 
-system.time({
-if (debug){
-  tag = paste("_", "default", sep="")
-} else {
-  # get options tag
-  args = commandArgs(trailingOnly = TRUE)
-  tag = args[1]
-  input_file = args[2]
-  if ( substr(tag, 2, 3) == "--"){
-    stop("Please specify an option tag (e.g. \"default\", \"i20e5\")")
-  }
-}
+# system.time({
+# if (debug){
+#   tag = paste("_", "default", sep="")
+# } else {
+#   # get options tag
+#   args = commandArgs(trailingOnly = TRUE)
+#   tag = args[1]
+#   input_file = args[2]
+#   if ( substr(tag, 2, 3) == "--"){
+#     stop("Please specify an option tag (e.g. \"default\", \"i20e5\")")
+#   }
+# }
 
-# tag = 'I'
-# input_file = '~/Desktop/CHOL/all_splicing_variants_I.bed'
+tag = 'E'
+input_file = 'all_splicing_variants_E.bed'
 
 # All splicing relevant variants (union of rows from variants.bed files; add column with comma-separated list of sample names)
 all_splicing_variants = unique(data.table::fread(input_file), sep = '\t', header = T, stringsAsFactors = FALSE)
@@ -150,21 +150,14 @@ regtools_data = subset(regtools_data, select=columns_to_keep)
 
 
 # zeroes need to be added in for some samples
-a <- function(x, y, z){
+a <- function(x, y){
   toAdd <- y - length(x) - 1
   # browser()
   toAdd <- rep(0.0000000, toAdd)
   x <- c(x, toAdd)
   return(x)
 }
-x <- mapply(a, regtools_data$norm_scores_non, length(all_samples), regtools_data$samples)
-
-
-# if (typeof(x) == 'list') {
-#   x <- matrix(pad(unlist(x), ncols),nrow = rows, byrow = TRUE, ncol = cols)
-#   x <- t(x)
-#   }
-# browser()
+x <- mapply(a, regtools_data$norm_scores_non, length(all_samples))
 
 get_num_zeros_to_rm <- function(z){
   num_zeroes_to_rm = str_count(z, ',') 
@@ -188,6 +181,47 @@ if (max(num_zeroes_to_rm > 0)) {
 x <- mapply(rm_zeroes, regtools_data$norm_scores_non, regtools_data$zeroes_to_rm)
 regtools_data$norm_scores_non = x
 }
+
+get_mean <- function(x){
+  x <- mean(as.numeric(x))
+  return(x)
+}
+
+x <- mapply(get_mean, regtools_data$norm_scores_non) 
+regtools_data$mean_norm_score_non <- x
+
+get_sd <- function(x){
+  x <- sd(as.numeric(x))
+  return(x)
+}
+
+x <- mapply(get_sd, regtools_data$norm_scores_non) 
+regtools_data$sd_norm_score_non <- x
+
+a <- function(x, y){
+  # if(y == "TCGA-ZH-A8Y2-01A,TCGA-ZH-A8Y5-01A"){
+  #    browser()
+  # }
+  toAdd <- (str_count(y, ',') + 1) - (str_count(x, ',') + 1) 
+  # browser()
+  if (toAdd > 0) {
+    toAdd <- rep(0.0000000, toAdd)
+    x <- c(x, toAdd)
+  } else {
+    x <- unlist(strsplit(x, ","))
+  }
+  x <- list(x)
+  return(x)
+}
+x <- mapply(a, regtools_data$norm_scores_variant, regtools_data$samples)
+regtools_data$norm_scores_variant = x
+
+x <- mapply(get_mean, regtools_data$norm_scores_variant) 
+regtools_data$mean_norm_score_variant <- x
+
+x <- mapply(get_sd, regtools_data$norm_scores_variant) 
+regtools_data$sd_norm_score_variant <- x
+
 print("test7")
 
 ################ calculate p-values ############################################
@@ -207,9 +241,6 @@ a <- function(x){
                   breaks = seq(0.5, max(non_variant_norm_scores_ranked)+1.5, by=1), plot=F)
   mids = histinfo$mids
   cd = cumsum(histinfo$density)
-  # if(x$info == "chr1_729955_735423_D_chr1:809966-809967"){
-  #   browser()
-  # }
   underestimate = max(which(mids <= variant_norm_score_ranked))
   pvalue = 1-cd[underestimate]
   return(pvalue)
@@ -243,5 +274,5 @@ regtools_data = regtools_data %>% distinct()
 
 
 write.table(regtools_data, file=paste(input_file, "_out.tsv", sep=""), quote=FALSE, sep='\t', row.names = F)
-
-})
+# 
+# })
diff --git a/scripts/stats_wrapper.py b/scripts/stats_wrapper.py
@@ -36,7 +36,7 @@
 files.sort()
 number_of_in_files = len(files)
 for file in files:
-    subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True)
+    subprocess.run(f'Rscript --vanilla compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True)
 output_files = glob.glob("*_out.tsv")
 output_files.sort()# glob lacks reliable ordering, so impose your own if output order matters
 number_of_out_files = len(output_files)