forked from tjblaette/ngs
-
Notifications
You must be signed in to change notification settings - Fork 0
/
count_circs.sh
executable file
·91 lines (73 loc) · 3.5 KB
/
count_circs.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
#!/bin/bash
####
# T.J.Blätte
# S. Lux
# 2016
####
#
# Generates read count files to input into DESeq2,
# whose format is based on that of HTSeq.
# Counts are for circular RNAs / backsplice junctions.
#
# Args:
# PREFIX: Prefix for output files.
# [...]: The samples to generate count files for, format is
# that of *_circsAnnotatedFinal_linearJunctionsCounts.tsv as
# generated by get_circs.sh.
# Note: All samples to be later processed together in one DESeq2
# analysis must also be pre-processed together by this
# script to ensure that counts are generated for the same
# circRNAs. Those detected in only a single sample will be listed
# with zero counts in the other samples' files.
#
# Output:
# $PREFIX_circs.tsv: Read counts of each circular RNA / backsplice.
# $PREFIX_circs-per-gene.tsv: Read counts of circular RNAs / backsplices
# discovered per gene - when multiple circular RNAs / backsplices
# are detected within the same gene, counts are summed.
#
####
PREFIX=$1
#SAMPLES= ALL OTHER INPUTS
KEY="${PREFIX}_circs_key.tsv"
LIST="${PREFIX}_circs_list.tsv"
mkdir -p intermediate_files
# create a list of all circRNA backsplice junctions encountered in the provided samples
# columns 1-3 are chr, start, stop coordinates of the backsplice
# column 4 is the gene symbol to which the junction was annotated
tail -q -n +2 "${@:2}" | cut -f1-3,4 | sed -e 's/\t/_/' -e 's/\t/_/' | sort | uniq > $KEY
# create a dummy counts file, setting counts to 0 for all features
# -> then go through each of the provided input files and add the observed number of reads
# supporting each backsplice on top
awk -v OFS='\t' '{print $1,0}' $KEY > $LIST
for FILE in "${@:2}"
do
echo "$FILE"
# prepare sample
# -> columns 1-3 are chr, start, stop coordinates of the backsplice
# -> column 5 is the number of circular reads supporting the junction
SAMPLE="$(tail -n +2 "$FILE" | cut -f1-3,5 | sed -e 's/\t/_/' -e 's/\t/_/' | sort | uniq)"
SAMPLE_LONG="$(cat <(echo "$SAMPLE") "$LIST" | sort)"
UNIFORM="intermediate_files/$(basename ${FILE%.tsv}_circs.counts)"
ENSEMBL="intermediate_files/$(basename ${FILE%.tsv}_circs-per-gene.counts)"
echo "$SAMPLE_LONG" | awk -v OFS='\t' '{if(PREV==0){PREV=$1;} ID=$1; if(ID==PREV){SUM=SUM+$2}else{print(PREV,SUM); SUM=$2;}; PREV=ID;} END {print(ID,SUM);}' > $UNIFORM
# create the same output but with ENSEMBL gene IDs as identifiers instead of custom junction coords
join -t ' ' $KEY $UNIFORM | cut -f2,3 | sed 's/.*"\([^"]\+\)"/\1/' | sort | awk -v OFS='\t' 'NR==1 {PREV=$1; SUM=0} {ID=$1; if(ID==PREV){SUM=SUM+$2}else{print(PREV,SUM); SUM=$2;}; PREV=ID;} END {print(ID,SUM);}' > $ENSEMBL
echo '__no_feature 0
__ambiguous 0
__too_low_aQual 0
__not_aligned 0
__alignment_not_unique 0' >> $UNIFORM
echo '__no_feature 0
__ambiguous 0
__too_low_aQual 0
__not_aligned 0
__alignment_not_unique 0' >> $ENSEMBL
done
# create DESeq2 dummy design table
# -> one with counts per circRNA junction, one with counts per gene / ENSEMBL ID
echo -e 'sampleName\tfileName\tcondition' > ${PREFIX}_circs.tsv
ls -C $(find intermediate_files -type f -name "*_circs.counts" -o -name ".*_circs.counts" ) | sed -e 's/.*\///' -e 's/\(.*\)/\1\t\1/' >> ${PREFIX}_circs.tsv
echo -e 'sampleName\tfileName\tcondition' > ${PREFIX}_circs-per-gene.tsv
ls -C $(find intermediate_files -type f -name "*_circs-per-gene.counts" -o -name ".*_circs-per-gene.counts" ) | sed -e 's/.*\///' -e 's/\(.*\)/\1\t\1/' >> ${PREFIX}_circs-per-gene.tsv
rm "$LIST"