diff --git a/qc_metrics/rna-seq/script/README.md b/qc_metrics/rna-seq/script/README.md index b5bbbde..9b36acd 100644 --- a/qc_metrics/rna-seq/script/README.md +++ b/qc_metrics/rna-seq/script/README.md @@ -1,26 +1,23 @@ -This folder contains a .PDF document with the definitions and shell script for computation of the RNA-seq QC metrics. +# RNA-seq QC metrics -To calculate the metrics please use: - -- samtools v 1.3.1 - -- picard v 2.9.0 +This folder contains a document with the definitions and shell script for computation of the RNA-seq QC metrics. +To calculate the metrics please use: +- sambamba v0.6.6 ## Prerequisites: -For the script to run you have to define the following environment variables: - +For the script to run you have to define the following environment variable: - export PICARD_290= + export SAMBAMBA= - export SAMTOOLS_131= +Temporary files are stored in `/tmp` by default. This setting can be overriden by: - export $TMP_DIR= + export $TMPDIR= Required computational resources: -- 1 CPU Core +- at least 1 CPU Core - 4 GB RAM ## How to use: @@ -29,13 +26,15 @@ To run the statistics on your BAM file mapped to the human genome (assembly GRCh [sullrich@cluster]$ ./rna_stats.sh - USAGE: ./rna_stats.sh - The script computes RNA-seq quality metrics according to IHEC standards + USAGE: rna_stats.sh [] + The script computes RNA-seq qualtity metrics according to IHEC standards + + Input files in the following order are required: + 1. path to BAM file to be analyzed + 2. Bed file with intergenic coordinates + 3. Bed file with rRNA coordinates - Input files in the following order are required: - 1. path to BAM file to be analyzed - 2. Bed file with intergenic coordinates - 3. Bed file with rRNA coordinates + An optional 4th argument can be used to specify the maximum number of threads to use (default: 1) The script will create a folder in the chosen working directory with the name of the bam file. A statistics output file will be written inside in text format. This allows running many samples in parallel in your computational environment. Temporary files (e.g. duplicate marked BAM files) are removed to minimize disk usage. @@ -43,7 +42,7 @@ The script will create a folder in the chosen working directory with the name of ## Example: -./rna_stats.sh SAMPLE_ID.bam intergeneic_regions_gencode22_GRCh38.bed rRNA_Mt-rRNA_gencode22_GRCh38.bed +./rna_stats.sh SAMPLE_ID.bam intergeneic_regions_gencode22_GRCh38.bed rRNA_Mt-rRNA_gencode22_GRCh38.bed 4 The metrics are found in the text file SAMPLE_ID_read_stats.txt created in the directory SAMPLE_ID and have the following format: diff --git a/qc_metrics/rna-seq/script/rna_stats.sh b/qc_metrics/rna-seq/script/rna_stats.sh index 8c01555..b7f7a64 100644 --- a/qc_metrics/rna-seq/script/rna_stats.sh +++ b/qc_metrics/rna-seq/script/rna_stats.sh @@ -4,7 +4,7 @@ set -euf -o pipefail if [ $# -lt 3 ]; then echo - echo " USAGE: $0 " + echo " USAGE: $0 []" echo " The script computes RNA-seq qualtity metrics according to IHEC standards" echo "" echo " Input files in the following order are required:" @@ -12,17 +12,33 @@ if [ $# -lt 3 ]; then echo " 2. Bed file with intergenic coordinates" echo " 3. Bed file with rRNA coordinates" echo + echo " An optional 4th argument can be used to specify the maximum number of threads to use (default: 1)" + echo exit 1 fi -if [[ -z ${PICARD_290+x} || -z ${SAMTOOLS_131+x} ]]; then - echo "Environment variable PICARD_290 (path to picard jar v2.9.0) and SAMTOOLS_131 (path to samtools v1.3.1) must be set" +if [[ -z ${SAMBAMBA+x} ]]; then + echo "Environment variable SAMBAMBA (path to sambamba binary) must be set" exit 1 fi -SAMPLE_PATH=$1 -INTERGENIC_BED=$2 -RRNA_BED=$3 +resolve() { + if [[ "$1" = /* ]];then + echo $1 + return + fi + HEAD=$1 + if [ ! -d $1 ]; then + HEAD=$(dirname $1) + TAIL="/$(basename $1)" + fi + cd $HEAD && echo "$(pwd -P)${TAIL-}" +} + +SAMPLE_PATH=$(resolve $1) +INTERGENIC_BED=$(resolve $2) +RRNA_BED=$(resolve $3) +THREADS=${4-"1"} SAMPLE_NAME=$(basename $SAMPLE_PATH .bam) @@ -30,31 +46,31 @@ WORKING_DIR=${QC_DIR:-"."} # by default working directory is current mkdir -p $WORKING_DIR/$SAMPLE_NAME || true cd $WORKING_DIR/$SAMPLE_NAME -#computation of IHEC RNA-seq stats -#get the fraction of mapped reads -MAPPED_READS=$($SAMTOOLS_131/samtools view -c -F 0x904 $SAMPLE_PATH) +BASE_FILTER='not (secondary_alignment or supplementary)' +MAPPED_FILTER='not (secondary_alignment or supplementary or unmapped)' -N_NO_MULTIMAP=$($SAMTOOLS_131/samtools view -b -c -F 0x900 $SAMPLE_PATH) +# computation of IHEC RNA-seq stats +# get the fraction of mapped reads +MAPPED_READS=$($SAMBAMBA view -c -t $THREADS -F "$MAPPED_FILTER" $SAMPLE_PATH) + +N_NO_MULTIMAP=$($SAMBAMBA view -c -t $THREADS -F "$BASE_FILTER" $SAMPLE_PATH) echo $N_NO_MULTIMAP | awk -v MAPPED_READS=$MAPPED_READS '{print "FRACTION_MAPPED\t"MAPPED_READS/$1}' > ${SAMPLE_NAME}_read_stats.txt -#get the number of reads falling into intergenic regions with samtools -$SAMTOOLS_131/samtools view -c -F 0x900 $SAMPLE_PATH -L $INTERGENIC_BED | awk -v MAPPED_READS=$MAPPED_READS '{print "FRACTION_INTERGENIC\t"$1/MAPPED_READS}' >> ${SAMPLE_NAME}_read_stats.txt +# get the number of reads falling into intergenic regions +$SAMBAMBA view -t $THREADS -c -F "$BASE_FILTER" $SAMPLE_PATH -L $INTERGENIC_BED | awk -v MAPPED_READS=$MAPPED_READS '{print "FRACTION_INTERGENIC\t"$1/MAPPED_READS}' >> ${SAMPLE_NAME}_read_stats.txt -#get the number of reads from rRNA with samtools -$SAMTOOLS_131/samtools view -c -F 0x900 $SAMPLE_PATH -L $RRNA_BED | awk -v MAPPED_READS=$MAPPED_READS '{print "FRACTION_RRNA\t"$1/MAPPED_READS}' >> ${SAMPLE_NAME}_read_stats.txt +# get the number of reads from rRNA regions +$SAMBAMBA view -t $THREADS -c -F "$BASE_FILTER" $SAMPLE_PATH -L $RRNA_BED | awk -v MAPPED_READS=$MAPPED_READS '{print "FRACTION_RRNA\t"$1/MAPPED_READS}' >> ${SAMPLE_NAME}_read_stats.txt -#get number of duplicates -PICARD_MARK_DUP_CMD="java -Xmx4g -jar $PICARD_290/picard.jar MarkDuplicates VALIDATION_STRINGENCY=LENIENT I=$SAMPLE_PATH O=${SAMPLE_NAME}_noMULTI_noDUP.bam M=${SAMPLE_NAME}_duplicated.txt" -#add -TMP_DIR argument to mark dups so avoid running out of space. -if [[ -z $TMP_DIR ]]; then - $PICARD_MARK_DUP_CMD -else - TEMP="$TMP_DIR/$SAMPLE_NAME/temp" +# get number of duplicates +# add TMPDIR to mark dups +TEMP="/tmp" +if [[ ! -z ${TMPDIR+x} ]]; then + TEMP="$TMPDIR/$SAMPLE_NAME/temp" mkdir -p $TEMP || true - $PICARD_MARK_DUP_CMD TMP_DIR=$TEMP fi -awk -F $'\t' '/PERCENT_DUPLICATION/{getline; print "FRACTION_DUPLICATED\t"$9}' ${SAMPLE_NAME}_duplicated.txt >> ${SAMPLE_NAME}_read_stats.txt - -rm ${SAMPLE_NAME}_noMULTI_noDUP.bam -rm ${SAMPLE_NAME}_duplicated.txt +$SAMBAMBA markdup -t $THREADS $SAMPLE_PATH --tmpdir ${TEMP} ${SAMPLE_NAME}_noMULTI_noDUP.bam +$SAMBAMBA view -t 8 -c -F 'duplicate' ${SAMPLE_NAME}_noMULTI_noDUP.bam | awk -v MAPPED_READS=$MAPPED_READS '{print "FRACTION_DUPLICATED\t"$1/MAPPED_READS}' >> ${SAMPLE_NAME}_read_stats.txt +# cleanup +rm ${SAMPLE_NAME}_noMULTI_noDUP.bam{,.bai}