count_circs.sh

#!/bin/bash

####
# T.J.Blätte
# S. Lux
# 2016
####
#
# Generates read count files to input into DESeq2,
#       whose format is based on that of HTSeq.
#       Counts are for circular RNAs / backsplice junctions.
#
# Args:
#   PREFIX: Prefix for output files.
#   [...]: The samples to generate count files for, format is
#       that of *_circsAnnotatedFinal_linearJunctionsCounts.tsv as
#       generated by get_circs.sh.
#       Note: All samples to be later processed together in one DESeq2
#       analysis must also be pre-processed together by this
#       script to ensure that counts are generated for the same
#       circRNAs. Those detected in only a single sample will be listed
#       with zero counts in the other samples' files.
#
# Output:
#   $PREFIX_circs.tsv: Read counts of each circular RNA / backsplice.
#   $PREFIX_circs-per-gene.tsv: Read counts of circular RNAs / backsplices
#       discovered per gene - when multiple circular RNAs / backsplices
#       are detected within the same gene, counts are summed.
#
####


PREFIX=$1
#SAMPLES= ALL OTHER INPUTS

KEY="${PREFIX}_circs_key.tsv"
LIST="${PREFIX}_circs_list.tsv"

mkdir -p intermediate_files

# create a list of all circRNA backsplice junctions encountered in the provided samples
# columns 1-3 are chr, start, stop coordinates of the backsplice
# column 4 is the gene symbol to which the junction was annotated
tail -q -n +2 "${@:2}" | cut -f1-3,4 | sed -e 's/\t/_/' -e 's/\t/_/' | sort | uniq > $KEY

# create a dummy counts file, setting counts to 0 for all features
# -> then go through each of the provided input files and add the observed number of reads
#    supporting each backsplice on top
awk -v OFS='\t' '{print $1,0}' $KEY > $LIST

for FILE in "${@:2}"
do
    echo "$FILE"
    # prepare sample
    # -> columns 1-3 are chr, start, stop coordinates of the backsplice
    # -> column 5 is the number of circular reads supporting the junction
    SAMPLE="$(tail -n +2 "$FILE" | cut -f1-3,5 | sed -e 's/\t/_/' -e 's/\t/_/' | sort | uniq)"
    SAMPLE_LONG="$(cat <(echo "$SAMPLE") "$LIST" | sort)"

    UNIFORM="intermediate_files/$(basename ${FILE%.tsv}_circs.counts)"
    ENSEMBL="intermediate_files/$(basename ${FILE%.tsv}_circs-per-gene.counts)"

    echo "$SAMPLE_LONG" | awk -v OFS='\t' '{if(PREV==0){PREV=$1;} ID=$1; if(ID==PREV){SUM=SUM+$2}else{print(PREV,SUM); SUM=$2;}; PREV=ID;} END {print(ID,SUM);}' > $UNIFORM

    # create the same output but with ENSEMBL gene IDs as identifiers instead of custom junction coords
    join -t '	' $KEY $UNIFORM  | cut -f2,3 | sed 's/.*"\([^"]\+\)"/\1/' | sort | awk -v OFS='\t' 'NR==1 {PREV=$1; SUM=0} {ID=$1; if(ID==PREV){SUM=SUM+$2}else{print(PREV,SUM); SUM=$2;}; PREV=ID;} END {print(ID,SUM);}' > $ENSEMBL

    echo '__no_feature	0
__ambiguous	0
__too_low_aQual	0
__not_aligned	0
__alignment_not_unique	0' >> $UNIFORM

    echo '__no_feature	0
__ambiguous	0
__too_low_aQual	0
__not_aligned	0
__alignment_not_unique	0' >> $ENSEMBL
done

# create DESeq2 dummy design table
# -> one with counts per circRNA junction, one with counts per gene / ENSEMBL ID
echo -e 'sampleName\tfileName\tcondition' > ${PREFIX}_circs.tsv
ls -C $(find intermediate_files -type f -name "*_circs.counts" -o -name ".*_circs.counts" ) | sed -e 's/.*\///' -e 's/\(.*\)/\1\t\1/' >> ${PREFIX}_circs.tsv


echo -e 'sampleName\tfileName\tcondition' > ${PREFIX}_circs-per-gene.tsv
ls -C $(find intermediate_files -type f -name "*_circs-per-gene.counts" -o -name ".*_circs-per-gene.counts" ) | sed -e 's/.*\///' -e 's/\(.*\)/\1\t\1/' >> ${PREFIX}_circs-per-gene.tsv


rm "$LIST"