-
Notifications
You must be signed in to change notification settings - Fork 14
/
Copy pathrunBITACORA_genome_mode.sh
197 lines (141 loc) · 7.27 KB
/
runBITACORA_genome_mode.sh
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
#!/bin/bash
##########################################################
## ##
## BITACORA ##
## ##
## Bioinformatics tool to assist the ##
## comprehensive annotation of gene families ##
## ##
## ##
## Developed by Joel Vizueta ##
## Contact: via github or [email protected] ##
## ##
##########################################################
VERSION=1.4
##########################################################
## EXPORT EXECUTABLES TO PATH ##
##########################################################
# Perl needs to be installed in the system.
# HMMER and BLAST executables need to be included in $PATH
export PATH=$PATH:/path/to/blast/bin
export PATH=$PATH:/path/to/hmmer/bin
# PATH to BITACORA Scripts folder.
SCRIPTDIR=/path/to/Scripts
# In case of using GeMoMa (set as default), specify the PATH to jar file. Otherwise, set GEMOMA=F in editable parameters section to use the close-proximity method, which does not require any external software
GEMOMAP=/path/to/GeMoMa.jar
##########################################################
## PREPARE THE DATA ##
##########################################################
# Include here the name of your species. i.e. NAME=Dmel
NAME=OrganismName
# Write all files with its full or relative PATH
# Include the fasta file containing the genome sequences
GENOME=/path/to/genome.fasta
# Include the folder containing the FPDB databases (Including a fasta and HMM file named as YOURFPDB_db.fasta and YOURFPDB_db.hmm); Multiple FPDB can be included in the folder to be searched
QUERYDIR=/path/to/query_folder
##########################################################
## EDITABLE PARAMETERS ##
##########################################################
# Set CLEAN=T if you want to clean the output folder. Intermediate files will not be erased but saved in the Intermediate_files folder. Otherwise, set CLEAN=F to keep all files in the same output folder
CLEAN=T
# You can modify the E-value used to filter BLAST and HMMER. Default is 1e-5
EVALUE=1e-3
# Number of threads to be used in blast searches
THREADS=1
# (Default) GEMOMA=T (with upper case) will use GeMoMa software to predict novel genes from TBLASTN alignments (PATH to jar file need to be specified in GEMOMAP variable)
# Otherwise, set GEMOMA=F to predict new genes by exon proximity (close-proximity method)
GEMOMA=T
# (Used when GEMOMA=F; close-proximity method) Maximum length of an intron used to join putative exons of a gene. Default value is conservative and can also join exons from different genes (labeled in output files with _Ndom)
# The provided script in Scripts/Tools/get_intron_size_fromgff.pl can estimate intron length statistics for a specific GFF. See the manual for more details
MAXINTRON=15000
# Set GENOMICBLASTP=T in order to conduct both BLASTP and HMMER to curate novel annotated genes (Note that this option is the most sensitive but greatly depends on the database quality and could result in false positives)
# Otherwise, BITACORA will only use the protein domain (HMMER) to validate new annotated genes (In this case, the probability of detecting all copies is lower, but it will avoid to identify unrelated genes)
GENOMICBLASTP=F
# An additional validation and filtering of the resulting annotations can be conducted using the option ADDFILTER.
# If ADDFILTER=T, BITACORA will cluster highly similar sequences (with 98% identity; being isoforms or resulting from putative assembly artifacts), and will discard all annotations with a length lower than the specified in FILTERLENGTH parameter.
ADDFILTER=T
FILTERLENGTH=30
# Alternatively, BITACORA can report all annotated genes, without any clustering of identical copies. Set RETAINNONFILTER=T in this case.
RETAINNONFILTER=F
##########################################################
## HOW TO RUN ##
##########################################################
# Once you have included all of the above variables, you can run BITACORA as in:
#$ bash runBITACORA_genome_mode.sh
##########################################################
## PIPELINE - CODE ##
##########################################################
echo -e "\n####################### Running BITACORA Genome mode #######################";
echo "BITACORA genome-mode version $VERSION";
date
# Checking if provided data is ok
if [[ ! -f $SCRIPTDIR/check_data_genome_mode.pl ]] ; then
echo -e "BITACORA can't find Scripts folder in $SCRIPTDIR. Be sure to add also Scripts at the end of the path as /path/Scripts";
echo -e "BITACORA died with error\n";
exit 1;
fi
if [ $GEMOMA == "T" ] ; then
perl $SCRIPTDIR/check_data_genome_mode.pl $GENOME $QUERYDIR $GEMOMA $GEMOMAP 2>BITACORAstd.err
fi
if [ $GEMOMA != "T" ] ; then
perl $SCRIPTDIR/check_data_genome_mode.pl $GENOME $QUERYDIR $GEMOMA 2>BITACORAstd.err
fi
ERRORCHECK="$(grep -c 'ERROR' BITACORAstd.err)"
if [ $ERRORCHECK != 0 ]; then
cat BITACORAstd.err;
echo -e "BITACORA died with error\n";
exit 1;
fi
# Run step 2
if [ $GEMOMA == "T" ] ; then
if [ $GENOMICBLASTP == "T" ] ; then
perl $SCRIPTDIR/runanalysis_2ndround_v2_genomic_nogff_gemoma.pl $NAME $QUERYDIR $GENOME $EVALUE $MAXINTRON $THREADS $GEMOMAP 2>>BITACORAstd.err 2>BITACORAstd.err
fi
if [ $GENOMICBLASTP != "T" ] ; then
perl $SCRIPTDIR/runanalysis_2ndround_genomic_nogff_gemoma.pl $NAME $QUERYDIR $GENOME $EVALUE $MAXINTRON $THREADS $GEMOMAP 2>>BITACORAstd.err 2>BITACORAstd.err
fi
fi
if [ $GEMOMA != "T" ] ; then
if [ $GENOMICBLASTP == "T" ] ; then
perl $SCRIPTDIR/runanalysis_2ndround_v2_genomic_nogff.pl $NAME $QUERYDIR $GENOME $EVALUE $MAXINTRON $THREADS 2>>BITACORAstd.err 2>BITACORAstd.err
fi
if [ $GENOMICBLASTP != "T" ] ; then
perl $SCRIPTDIR/runanalysis_2ndround_genomic_nogff.pl $NAME $QUERYDIR $GENOME $EVALUE $MAXINTRON $THREADS 2>>BITACORAstd.err 2>BITACORAstd.err
fi
fi
ERRORCHECK="$(grep -c 'ERROR' BITACORAstd.err)"
if [ $ERRORCHECK != 0 ]; then
cat BITACORAstd.err;
echo -e "BITACORA died with error\n";
exit 1;
fi
ERRORCHECK="$(grep -c 'Segmentation' BITACORAstd.err)"
if [ $ERRORCHECK != 0 ]; then
cat BITACORAstd.err;
echo -e "BITACORA died with error\n";
exit 1;
fi
# Run additional filtering and clustering
if [ $ADDFILTER == "T" ] ; then
perl $SCRIPTDIR/runfiltering_genome_mode.pl $NAME $QUERYDIR $FILTERLENGTH 2>>BITACORAstd.err 2>BITACORAstd.err
fi
ERRORCHECK="$(grep -c 'ERROR' BITACORAstd.err)"
if [ $ERRORCHECK != 0 ]; then
cat BITACORAstd.err;
echo -e "BITACORA died with error\n";
exit 1;
fi
# Cleaning
if [ $RETAINNONFILTER == "T" ]; then
perl $SCRIPTDIR/runcleaning_genome_mode_allcopies.pl $NAME $QUERYDIR
echo -e "Cleaning output folders\n";
fi
if [ $RETAINNONFILTER != "T" ]; then
if [ $CLEAN == "T" ]; then
perl $SCRIPTDIR/runcleaning_genome_mode.pl $NAME $QUERYDIR
echo -e "Cleaning output folders\n";
fi
fi
rm BITACORAstd.err
echo -e "BITACORA completed without errors :)";
date