-
Notifications
You must be signed in to change notification settings - Fork 0
/
wblast
134 lines (123 loc) · 3.74 KB
/
wblast
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
#!/usr/bin/env bash
set -e
helpstr="WBLAST - Will's wrapper script for command-line BLAST
Usage: wblast [OPTIONS] [-q QUERY_FILE | -Q QUERY_SEQUENCE]
[-s SUBJECT_FILE | -d DATABASE]
Input arguments:
-q: Query file path; OR
-Q: Query sequence
-s: Subject file path; OR
-d: Database prefix path
All else being equal, aligning to a database is much faster
than aligning to a subject file.
Output filtering options (defaults):
-l: Minimum alignment length (0)
-i, -y: Minimum % identity (0)
-c: Minimum query coverage (0)
-b: Best-Hits parameters (0.1 0.1)
Best-Hits controls how stringently BLAST filters overlapping
matches. Lowering the first parameter
or increasing the second will decrease the stringency;
acceptable values are 0.1-0.25 and 0.05-0.25, respectively.
Other options (defaults)
-B: BLAST script to run (blastn)
-t: BLAST algorithm to use (megablast)
-F: Return full pairwise alignment information instead of summary table
-o: Other options to pass to BLAST
-C: Print the full BLAST command to be executed and exit
-h: Print this help message and exit
Suitable arguments for -B depend on the nature of the query and subject
sequences (nucleotide/NT vs protein/PR):
-----------------NT---------SUBJECT----------PR----------------
NT | blastn, tblastx | blastx |
QUERY |------------------------------|------------------------------|
PR | tblastn | blastp, psiblast, deltablast |
---------------------------------------------------------------
The values -t can take depends upon the value of -B:
-B | -t
--------|---------------------------------------------------------
blastn | blastn, blastn-short, dc-megablast, megablast, rmblastn
blastx | blastx, blastx-fast
tblastn | tblastn, tblastn-fast
blastp | blastp, blastp-fast, blastp-short
See the BLAST documentation for more details. For tblastx, psiblast &
deltablast -t should be left unspecified."
# Defaults and unsetting
fmt="6 qseqid sseqid pident qcovhsp length mismatch gapopen gaps sstrand qstart qend sstart send evalue bitscore qlen slen"
incmd=cat
outcmd=eval
minlength=0
ident=0
bscript=blastn
besthits="0.1 0.1"
unset q bstring
# Get arguments
while getopts "q:Q:s:d:l:i:y:c:b:B:t:Fo:Ch" opt; do
case $opt in
q)
q=$OPTARG
;;
Q)
q=$OPTARG
incmd=echo
;;
s)
bstring="$bstring -subject $OPTARG"
;;
d)
bstring="$bstring -db $OPTARG"
;;
l)
minlength=$OPTARG
;;
[iy])
ident=$OPTARG
;;
c)
bstring="$bstring -qcov_hsp_perc $OPTARG"
;;
b)
besthits="$OPTARG"
;;
B)
bscript=$OPTARG
;;
t)
bstring="$bstring -task $OPTARG"
;;
F)
fmt=0
;;
o)
bstring="$bstring $OPTARG"
;;
C)
outcmd=echo
;;
h)
echo "$helpstr"
exit
;;
*)
echo "Invalid argument:" $opt >&2
print "$helpstr"
exit 1
;;
esac
done
bstring="$bstring -best_hit_overhang ${besthits%% *}"
bstring="$bstring -best_hit_score_edge ${besthits##* }"
# Argument errors and warnings:
if [[ -z "$q" ]]; then
>&2 echo "Error: query file (-q) or sequence (-Q) required."
exit 1
fi
# Main script:
if [[ "$fmt" == "0" ]]; then
bcmd="$incmd $q | $bscript $bstring"
else
bcmd="$incmd $q | $bscript $bstring -outfmt '$fmt' | column -t"
bcmd="$bcmd | awk -v m=$minlength -v i=$ident '{if (\$5>=m && \$3>=i)print }'"
fi
$outcmd $bcmd
exit