-
Notifications
You must be signed in to change notification settings - Fork 0
/
IndicaRTD.sh
112 lines (70 loc) · 8.14 KB
/
IndicaRTD.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
# Raw sequencing data curation with AdapterRemoval tool:
AdapterRemoval --file1 $INFILE1 --file2 $INFILE2 --threads 15 --adapter-list adap.fa --minquality 25 --basename ${FName}
# Downloaded the Oryza sativa ssp. indica genome (ASM465v1.43) fasta file, gff3 and GTF files from Ensembl plant database.
# Quality filtered sequencing data was used for STAR mapping :
### STAR genome index generated with the following parameters using the downloaded genome files:
################################################################################################
STAR --runThreadN 10 --runMode genomeGenerate --genomeDir /home/ns43567/CEIB_rice_project/JHI/ensembl_indica --genomeFastaFiles Oryza_indica.ASM465v1.dna.toplevel.fa --sjdbGTFfile ./Oryza_indica.ASM465v1.43.gtf --sjdbOverhang 100 --genomeChrBinNbits 8 --genomeSAindexNbases 13 --limitGenomeGenerateRAM 32000000000 --genomeSAsparseD 5
### First-pass STAR mapping - Mapping the quality filtered reads to the genome file using STAR
#############################################################################################
STAR --runMode alignReads --genomeDir /home/ns43567/CEIB_rice_project/JHI/ensembl_indica --readFilesType Fastx --readFilesIn $INFILE --sjdbOverhang 100 --outSAMprimaryFlag AllBestScore --outFilterMismatchNmax 2/0 --outSJfilterCountTotalMin 10 5 5 5 --outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonical --alignIntronMin 60 --alignIntronMax 6000 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /home/ns43567/CEIB_rice_project/transcriptome/NGS_P5366/pe/${FName}_ --outTmpDir /home/ns43567/CEIB_rice_project/transcriptome/NGS_P5366/pe/${FName}_STAR
#STAR output file sj.out.tab file specifications:
#column 1: chromosome
#column 2: first base of the intron (1-based)
#column 3: last base of the intron (1-based)
#column 4: strand (0: undefined, 1: +, 2: -)
#column 5: intron motif: 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT
#column 6: 0: unannotated, 1: annotated (only if splice junction database is used)
#column 7: number of uniquely mapping reads crossing the junction
#column 8: number of multi-mapping reads crossing the junction
#column 9: maximum spliced alignment overhang
### Novel SJ filteration after first-pass STAR mapping (unannotated sj):
########################################################################
cat *.tab | awk '($5 > 0 && $7 >= 1 && $6==0)' | cut -f1-6 | sort | uniq > NovelSJ_paired.tab
### STAR genome index generated with the following parameters for the genome files along with the NovelSJ_paired.tab:
#####################################################################################################################
STAR --runThreadN 10 --runMode genomeGenerate --genomeDir /home/ns43567/CEIB_rice_project/JHI/ensembl_indica --genomeFastaFiles Oryza_indica.ASM465v1.dna.toplevel.fa --sjdbGTFfile ./Oryza_indica.ASM465v1.43.gtf --sjdbOverhang 100 --genomeChrBinNbits 8 --genomeSAindexNbases 13 --limitGenomeGenerateRAM 32000000000 --genomeSAsparseD 5 --sjdbFileChrStartEnd /home/ns43567/CEIB_rice_project/transcriptome/NGS_P5366/pe/StarMapping_paired/NovelSJ_paired.tab
### Second-pass STAR mapping - Mapping the quality filtered reads to the genome file using STAR:
################################################################################################
STAR --runThreadN 2 --runMode alignReads --genomeDir /home/ns43567/CEIB_rice_project/JHI/ensembl_indica --readFilesType Fastx --readFilesIn $INFILE --sjdbOverhang 100 --outSAMprimaryFlag AllBestScore --outFilterMismatchNmax 0 --outFilterScoreMinOverLread 0 --outFilterMatchNminOverLread 0 --outSJfilterCountTotalMin 10 5 5 5 --outSAMstrandField intronMotif --outFilterIntronMotifs RemoveNoncanonical --alignIntronMin 60 --alignIntronMax 6000 --alignMatesGapMax 400 --outSAMtype BAM SortedByCoordinate --outFileNamePrefix /home/ns43567/CEIB_rice_project/transcriptome/NGS_P5366/pe/StarMappingpaired2ndPass/${FName}_paired_2ndPass_
### Total number of SJ after second-pass STAR mapping (both annotated, unannotated and cannonica sj):
##################################################################################################
cat *.tab | awk '($5 > 0 && $7 >= 1)' | cut -f1-6 | sort | uniq |wc -l
### Assembling the mapped reads with 3 different assemblers Scallop, Stringtie and Cufflinks with the default parameters
########################################################################################################################
scallop -i $INFILE -o ${FName}_Scallop.gtf
cufflinks -g Oryza_indica.ASM465v1.43.gtf -o ${FName}_cufflinks $INFILE
stringtie2 $INFILE -o ${FName}_stringtie2.gtf -G ~/Oryza_indica.ASM465v1.43.gtf -t -f 0
###Count number of genes in the GTF file:
#########################################
for i in *.gtf; do printf $i":" && awk 'BEGIN{FS="\t"}{print $9}' $i| awk 'BEGIN{FS=";"}{print $1}'| sed 's/gene_id "//g;s/"//g;/^$/d'|sort -u | wc -l; done
### Merging the individual assemblies of all samples for each assembler with cuffmerge, stringtie and taco with default parameters
#################################################################################################################################
stringtie --merge -p 10 -o StringtieMergedScallop.gtf -F 0 -T 0 -f 0 -g 0 -i ScallopGtf.txt
stringtie --merge -p 10 -o StringtieMergedCufflinks.gtf -F 0 -T 0 -f 0 -g 0 -i CufflinksGtf.txt
stringtie --merge -p 10 -o StringtieMergedStringtie.gtf -F 0 -T 0 -f 0 -g 0 -i StringtieGtf.txt
#-F <min_fpkm> : minimum input transcript FPKM to include in the merge (default: 1.0)
#-T <min_tpm> : minimum input transcript TPM to include in the merge (default: 1.0)
#-f <min_iso> : minimum isoform fraction (default: 0.01)
#-g <gap_len> : gap between transcripts to merge together (default: 250)
#-i : keep merged transcripts with retained introns; by default these are not kept unless there is strong evidence for them
#### Reference Transcriptome data (RTD) construction, creating the splice junction database (sj.db) and filtration of RTD with high-confident SJs
#################################################################################################################################################
cat StringtieMerged*.gtf > stringtiemergedgtflist.txt
./stringtie --merge -p 10 -o stringtiemerge3assemblers.gtf -F 0 -T 0 -g 0 -i stringtiemergedgtflist.txt
Rscript --vanilla ../multifilesgread.R stringtiemerge3assemblers.gtf > stringtiemerge3assemblers_intron.txt
awk '{print $11"\t"$1"_"$5"_"$2"_"$3}' stringtiemerge3assemblers_intron.txt > stringtiemerge3assemblers_intron_modified.txt
cat *sj.out.tab | awk '($5 > 0 && $7 >= 5 && $9 >= 10)' | cut -f1-4 | sort |uniq > sj.filtered.out.tab
#column 5: intron motif: 0: non-canonical; 1: GT/AG, 2: CT/AC, 3: GC/AG, 4: CT/GC, 5: AT/AC, 6: GT/AT
#column 7: number of uniquely mapping reads crossing the junction
#column 9: maximum spliced alignment overhang
awk '$4=="1" { print $1"_+_"$2"_"$3} $4=="2" { print $1"_-_"$2"_"$3}' sj.filtered.out.tab > sj.db
awk 'FNR==NR{a[$1];next}($2 in a){print}' sj.filtered.out.modified.txt stringtiemerge3assemblers_intron_modified.txt > tmp.txt
awk '{print $1}' stringtiemerge3assemblers_intron_modified.txt | sort | uniq -c | sed 's/^ *//g' > transcriptID_count.txt
awk '{print $1}' tmp.txt| sort | uniq -c | sed 's/^ *//g'| sed 's/ /_/g'|sort | comm -12 - transcriptID_count.txt > matchedtranscript.txt
awk 'BEGIN{FS="_"}{print $2}' matchedtranscript.txt |sort -u|fgrep -w -f - stringtiemerge3assemblers.gtf > matchedtranscript.gtf
### Extract & merge single exon transcripts:
############################################
tail -n +3 stringtiemerge3assemblers.gtf |awk '{print $12}'|sed 's/"//g;s/;//'|sort|uniq -c > stringtiemerge3assemblers_transcriptID_count.txt
sed 's/^[ ]*//g' stringtiemerge3assemblers_transcriptID_count.txt |awk 'BEGIN{FS=" "}{if ($1==2) print $2}'|fgrep -w -f - stringtiemerge3assemblers.gtf > stringtiemerge3assemblers_single_exon.gtf
stringtie --merge -p 10 -G /home/ns43567/CEIB_rice_project/JHI/ensembl_indica/Oryza_indica.ASM465v1.43.gtf -o IndicaRTD.gtf -F 0 -T 0 -g 0 -i matchedtranscript.gtf stringtiemerge3assemblers_single_exon.gtf