Skip to content

Commit

Permalink
Merge pull request #132 from griffithlab/improved_compare_junctions
Browse files Browse the repository at this point in the history
Fixes docker files
Improves stats script
Updates readthedocs
  • Loading branch information
kcotto authored Jan 8, 2020
2 parents eab866a + 7438f66 commit 7b80c9c
Show file tree
Hide file tree
Showing 4 changed files with 136 additions and 83 deletions.
27 changes: 15 additions & 12 deletions Dockerfile
Original file line number Diff line number Diff line change
@@ -1,10 +1,3 @@
################################################################################
##################### Add Container Labels #####################################
LABEL "Regtools_License"="MIT"
LABEL "Description"="Software package which integrate DNA-seq and RNA-seq data\
to help interpret mutations in a regulatory and splicing\
context."

################################################################################
##################### Set Inital Image to work from ############################

Expand Down Expand Up @@ -34,6 +27,13 @@ RUN apt-get update -y && apt-get install -y \
cmake \
python3

################################################################################
##################### Add Container Labels #####################################
LABEL "Regtools_License"="MIT"
LABEL "Description"="Software package which integrate DNA-seq and RNA-seq data\
to help interpret mutations in a regulatory and splicing\
context."

################################################################################
####################### Install R ##############################################

Expand All @@ -55,18 +55,21 @@ RUN R --vanilla -e 'install.packages(c("data.table", "plyr", "tidyverse"), repos
##################### Install Regtools #########################################

# clone git repository
RUN git clone https://github.com/griffithlab/regtools.git
RUN cd / && git clone https://github.com/griffithlab/regtools.git

# make a build directory for regtools
WORKDIR /regtools/
RUN mkdir build


# compile from source
RUN cd /regtools/build && cmake ..
RUN cd /regtools/build && make
RUN mkdir build && cd build && cmake .. && make

################################################################################
###################### set environment path #################################

# make a build directory for regtools
WORKDIR /regtools/scripts/

# add regtools executable to path
ENV PATH="/regtools/build:${PATH}"
ENV PATH="/regtools/build:/usr/local/bin/R-${r_version}:${PATH}"

103 changes: 61 additions & 42 deletions docs/workflow.md
Original file line number Diff line number Diff line change
Expand Up @@ -2,9 +2,10 @@

This is an example workflow for running RegTools on a cohort of samples. This analysis requires that there be a vcf and RNA bam/cram file for each samples. The outline described below was used to run our own analysis on TCGA data.

By the end of the analysis, the directory structure should look like so:
By the end of the analysis, the directory structure should look like the example below. The `*` in the example below refers to the tag/parameter used to run `regtools cis-splice-effects identify` with.

- Project/ (SCLC/)
```bash
- Project/
- all_splicing_variants*.bed
- paths.tsv
- make_vcfs.sh
Expand All @@ -23,64 +24,82 @@ By the end of the analysis, the directory structure should look like so:
- cse_identify_filtered_*
- cse_identify_filtered_compare_*
- variants*.bed
- Sample_2/
- tumor_rna_alignments.bam
- tumor_rna_alignments.bam.bai
- variants.per_gene.vep.vcf.gz
- variants.per_gene.vep.vcf.gz.tbi
- variants.ensembl
- logs/
- output/
- cse_identify_filtered_*
- cse_identify_filtered_compare_*
- variants*.bed
- compare_junctions/
- hist/
- junction_pvalues_*.tsv
```

### Set tag and parameter shell arguments

```bash
tag=<tag>
param=<run option>
# (e.g. tag=default param=""; tag=E param="-E"; tag=i20e5 param="-i 20 -e 5")
```

## Set tag and parameter shell arguments
### Run `regtools cis-splice-effects identify` with desired options for selecting variant and window size

tag=<tag> param=<run option> (e.g. tag=default param=""; tag=E param="-E"; tag=i20e5 param="-i 20 -e 5")
```bash
for i in samples/*/; do regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_$tag.tsv -j ${i}/output/cse_identify_filtered_$tag.bed -v ${i}/output/cse_identify_filtered_$tag.vcf ${i}/variants.per_gene.vep.vcf.gz ${i}/tumor_rna_alignments.bam /reference.fa reference.gtf; done
```

# run regtools cse identify with desired options for selecting variant and window size
for i in samples/*/; do bsub -oo $i/logs/regtools_actual_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_$tag.tsv -j ${i}/output/cse_identify_filtered_$tag.bed -v ${i}/output/cse_identify_filtered_$tag.vcf ${i}/variants.per_gene.vep.vcf.gz ${i}/tumor_rna_alignments.bam /gscmnt/gc2602/griffithlab/regtools/yafeng/GRCh37.fa /gscmnt/gc2602/griffithlab/regtools/GRCh37.87.exons.sorted.gtf; done
### Make `variant.bed` for each sample

# make variant.bed
for i in samples/*/; do bsub -oo $i/logs/make_variant_bed_$tag.lsf bash /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/variants.sh ${i}/output/cse_identify_filtered_$tag.tsv ${i}/output/variants_$tag.bed; done
```bash
for i in samples/*/; do bash variants.sh ${i}/output/cse_identify_filtered_$tag.tsv ${i}/output/variants_$tag.bed; done
```

# make bed (really just tsv with columns: chrom start end samples) with all variants that were deemed significant to splicing across all samples
### Combine each sample's `variant.bed` file per tag to get all variants that were deemed significant to splicing across all samples for a given tag

```bash
echo -e 'chrom\tstart\tend\tsamples' > all_splicing_variants_$tag.bed
for i in samples/*/; do j=${i##samples/}; uniq ${i}output/variants_$tag.bed | awk -v var=${j%%/} '{print $0 "\t" var}' >> all_splicing_variants_$tag.bed; done
```

### Make vcf of all variants across all samples (from each sample's variants.vcf). Then, compress it and index it

```bash
vcf-concat samples/*/variants.vcf.gz | vcf-sort > all_variants_sorted.vcf

# make vcf of all variants across all samples (from variants.vcf)
vcf-concat samples/*/variants.per_gene.vep.vcf.gz | vcf-sort > all_variants_sorted.vcf
bgzip all_variants_sorted.vcf
tabix all_variants_sorted.vcf.gz
###### optional ######
bgzip all_variants_sorted.vcf

# run cis-splice effects identify on all samples with all variants (with $tag options as example)
for i in samples/*/; do bsub -oo $i/logs/regtools_compare_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_compare_$tag.tsv -j ${i}/output/cse_identify_filtered_compare_$tag.bed -v ${i}/output/cse_identify_filtered_compare_$tag.vcf variants_all_sorted.vcf.gz ${i}/tumor_rna_alignments.bam /gscmnt/gc2602/griffithlab/regtools/yafeng/GRCh37.fa /gscmnt/gc2602/griffithlab/regtools/GRCh37.87.exons.sorted.gtf; done
tabix all_variants_sorted.vcf.gz
```

<===== JUNCTION COMPARISON ANALYSIS =====>
### Run `regtools cis-splice effects identify` on all samples with all variants (with `$tag` options as example)

# make directories
mkdir compare_junctions/
mkdir compare_junctions/outlier
mkdir compare_junctions/ratio
```bash
for i in samples/*/; do bsub -oo $i/logs/regtools_compare_$tag.lsf regtools cis-splice-effects identify $param -o ${i}/output/cse_identify_filtered_compare_$tag.tsv -j ${i}/output/cse_identify_filtered_compare_$tag.bed -v ${i}/output/cse_identify_filtered_compare_$tag.vcf all_variants_sorted.vcf.gz ${i}/tumor_rna_alignments.bam reference.fa reference.gtf; done
```

# outlier analysis
bsub -M 650000000 -R 'select[mem>65000] span[hosts=1] rusage[mem=65000]' /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/compare_junctions_outlier.R $tag
## Beginning of compare junctions analysis

# ratio analysis
bsub -M 650000000 -R 'select[mem>65000] span[hosts=1] rusage[mem=65000]' /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/compare_junctions_ratio.R $tag
### Make directory to store comparison results

# vep comparison (outlier)
bsub /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/vep_compare.R outlier $tag
```bash
mkdir -p compare_junctions/hist
```

# vep comparison (ratio)
bsub /gscuser/zskidmor/R-3.3.0/bin/Rscript --vanilla /gscmnt/gc2602/griffithlab/regtools/yafeng/scripts/vep_compare.R ratio $tag
### Run `compare_junctions_hist.R` on sample data

NOTE: in the vep comparison, since regtools doesn't really have a concept of "variants" per se but rather "variant positions" (doesn't care about the actual alleles), we are really counting (positions of variants found splicing-significant by vep AND found significant by regtools) / (positions of variants found significant by found significant by regtools)
```bash
Rscript --vanilla compare_junctions_hist.R <tag>
```

To count:
### Run `filter_and_BH.R` to adjust p values and filter results

echo -e "anchor\tvep_significant_vars\ttotal_significant_vars\tpercent_vep_significant" > comparison_counts.tsv
for i in default i20e5 i50e5 E I; do
echo -n $i >> comparison_counts.tsv;
echo -en "\t" >> comparison_counts.tsv;
vep=$(cut -f 1,2,7 vep_comparison_$i.tsv | sort | uniq | grep "TRUE" | wc -l)
echo -n $vep >> comparison_counts.tsv;
echo -en "\t" >> comparison_counts.tsv;
total=$(($(cut -f 1,2,7 vep_comparison_$i.tsv | sort | uniq | wc -l)-1))
echo -n $total >> comparison_counts.tsv;
echo -en "\t" >> comparison_counts.tsv;
echo $(bc -l <<< "$vep/$total") >> comparison_counts.tsv;
done
```bash
Rscript --vanilla filter_and_BH.R <tag>
```
87 changes: 59 additions & 28 deletions scripts/compare_junctions_hist_v2.R
Original file line number Diff line number Diff line change
Expand Up @@ -9,21 +9,21 @@ library(tidyverse)

debug = F

system.time({
if (debug){
tag = paste("_", "default", sep="")
} else {
# get options tag
args = commandArgs(trailingOnly = TRUE)
tag = args[1]
input_file = args[2]
if ( substr(tag, 2, 3) == "--"){
stop("Please specify an option tag (e.g. \"default\", \"i20e5\")")
}
}
# system.time({
# if (debug){
# tag = paste("_", "default", sep="")
# } else {
# # get options tag
# args = commandArgs(trailingOnly = TRUE)
# tag = args[1]
# input_file = args[2]
# if ( substr(tag, 2, 3) == "--"){
# stop("Please specify an option tag (e.g. \"default\", \"i20e5\")")
# }
# }

# tag = 'I'
# input_file = '~/Desktop/CHOL/all_splicing_variants_I.bed'
tag = 'E'
input_file = 'all_splicing_variants_E.bed'

# All splicing relevant variants (union of rows from variants.bed files; add column with comma-separated list of sample names)
all_splicing_variants = unique(data.table::fread(input_file), sep = '\t', header = T, stringsAsFactors = FALSE)
Expand Down Expand Up @@ -150,21 +150,14 @@ regtools_data = subset(regtools_data, select=columns_to_keep)


# zeroes need to be added in for some samples
a <- function(x, y, z){
a <- function(x, y){
toAdd <- y - length(x) - 1
# browser()
toAdd <- rep(0.0000000, toAdd)
x <- c(x, toAdd)
return(x)
}
x <- mapply(a, regtools_data$norm_scores_non, length(all_samples), regtools_data$samples)


# if (typeof(x) == 'list') {
# x <- matrix(pad(unlist(x), ncols),nrow = rows, byrow = TRUE, ncol = cols)
# x <- t(x)
# }
# browser()
x <- mapply(a, regtools_data$norm_scores_non, length(all_samples))

get_num_zeros_to_rm <- function(z){
num_zeroes_to_rm = str_count(z, ',')
Expand All @@ -188,6 +181,47 @@ if (max(num_zeroes_to_rm > 0)) {
x <- mapply(rm_zeroes, regtools_data$norm_scores_non, regtools_data$zeroes_to_rm)
regtools_data$norm_scores_non = x
}

get_mean <- function(x){
x <- mean(as.numeric(x))
return(x)
}

x <- mapply(get_mean, regtools_data$norm_scores_non)
regtools_data$mean_norm_score_non <- x

get_sd <- function(x){
x <- sd(as.numeric(x))
return(x)
}

x <- mapply(get_sd, regtools_data$norm_scores_non)
regtools_data$sd_norm_score_non <- x

a <- function(x, y){
# if(y == "TCGA-ZH-A8Y2-01A,TCGA-ZH-A8Y5-01A"){
# browser()
# }
toAdd <- (str_count(y, ',') + 1) - (str_count(x, ',') + 1)
# browser()
if (toAdd > 0) {
toAdd <- rep(0.0000000, toAdd)
x <- c(x, toAdd)
} else {
x <- unlist(strsplit(x, ","))
}
x <- list(x)
return(x)
}
x <- mapply(a, regtools_data$norm_scores_variant, regtools_data$samples)
regtools_data$norm_scores_variant = x

x <- mapply(get_mean, regtools_data$norm_scores_variant)
regtools_data$mean_norm_score_variant <- x

x <- mapply(get_sd, regtools_data$norm_scores_variant)
regtools_data$sd_norm_score_variant <- x

print("test7")

################ calculate p-values ############################################
Expand All @@ -207,9 +241,6 @@ a <- function(x){
breaks = seq(0.5, max(non_variant_norm_scores_ranked)+1.5, by=1), plot=F)
mids = histinfo$mids
cd = cumsum(histinfo$density)
# if(x$info == "chr1_729955_735423_D_chr1:809966-809967"){
# browser()
# }
underestimate = max(which(mids <= variant_norm_score_ranked))
pvalue = 1-cd[underestimate]
return(pvalue)
Expand Down Expand Up @@ -243,5 +274,5 @@ regtools_data = regtools_data %>% distinct()


write.table(regtools_data, file=paste(input_file, "_out.tsv", sep=""), quote=FALSE, sep='\t', row.names = F)

})
#
# })
2 changes: 1 addition & 1 deletion scripts/stats_wrapper.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@
files.sort()
number_of_in_files = len(files)
for file in files:
subprocess.run(f'Rscript --vanilla /home/ec2-user/workspace/regtools/scripts/compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True)
subprocess.run(f'Rscript --vanilla compare_junctions_hist_v2.R {tag} {file}', shell=True, check=True)
output_files = glob.glob("*_out.tsv")
output_files.sort()# glob lacks reliable ordering, so impose your own if output order matters
number_of_out_files = len(output_files)
Expand Down

0 comments on commit 7b80c9c

Please sign in to comment.