-
Notifications
You must be signed in to change notification settings - Fork 5
/
cnv_blast2gff.pl
223 lines (195 loc) · 7.73 KB
/
cnv_blast2gff.pl
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
#!/usr/bin/perl -w
#-----------------------------------------------------------+
# |
# Blast2Gff.pl |
# |
#-----------------------------------------------------------+
# AUTHOR: James C. Estill |
# CONTACT: JamesEstill_at_gmail.com |
# STARTED: 04/17/2007 |
# UPDATED: 04/18/2007 |
# |
# DESCRIPTION: |
# Converts BLAST output to GFF format. This the the GFF |
# format that is used with the Apollo Genome Annotation |
# curation program. |
# Currently this only works with m8 blast output. |
# |
# VERSION: |
# $Id:: $: |
# |
#-----------------------------------------------------------+
=head1 NAME
Blast2Gff.pl - Convert BLAST output to GFF format.
=cut
#-----------------------------+
# INCLUDES |
#-----------------------------+
use strict; # Keeps thing running clean
use Getopt::Std; # Get options from command line
=head1 VARIABLES
=cut
#-----------------------------+
# VARIABLES |
#-----------------------------+
my $GffAppend; # BOOLEAN. Append to GFF file
my $InFile; # Full path to the input blast output file
my $OutFile; # Full path to the gff formatted output file
my $AlignFormat; # Alignment format of the blast output file
# ie. -m = 0,8, or 9
my $PrintHelp; # Boolean, print the Usage statement
my $BlastDb; # Blast database
my $BlastProg; # Blast program (ie. blastn, blastx
my $SeqName; # Name of the sequence file used for query
my $Usage = "USAGE:\n".
"Blast2Gff.pl -i InFile.Fasta -o OutFile.gff -d BlastDb\n".
" -p blastprogram -m AligFormat -s SeqName -a\n\n".
" -i Full path to the BLAST output file[STRING]\n".
" -o Full path for the GFF formated file [STRING]\n".
" Default is the intput file path with gff extension.\n".
" -d Blast database that was blasted against [STRING]\n".
" This is required\n".
" -s ".
" -m Format of the algnment outout from blast [INTEGER]\n".
" Default value is 8. Valid values are 0,8,9".
" -p Blast program used [STRING]\n".
" Default is blastn\n".
" -a Append results to the gff file [BOOLEAN]\n".
" Default is to overwrite any exiting file.\n".
" -h Print this help statement [BOOLEAN]\n";
=head1 COMMAND LINE VARIABLES
=cut
#-----------------------------+
# COMMAND LINE VARIABLES |
#-----------------------------+
my %Options;
getopts('d:i:o:m:p:s:ha', \%Options);
$PrintHelp = $Options{h};
if ($PrintHelp)
{
print $Usage;
exit;
}
$SeqName = $Options{s} ||
die "\aERROR: A sequence file name must be specified\n$Usage\n";
$GffAppend = $Options{a};
$InFile = $Options{i} ||
die "\aERROR: An input file must be specified.\n\n$Usage\n";
# Default output is the full path of the input file with the gff extension
$BlastProg = $Options{p} ||
"blastn";
$BlastDb = $Options{d} ||
die "\aERROR: A blast database should be indicated.\n\n$Usage\n";
$OutFile = $Options{o} ||
$InFile.".gff";
$AlignFormat = $Options{m} ||
"8"; # Default format is tab delim
#-----------------------------+
# CHECK FILE EXISTENCE |
#-----------------------------+
unless (-e $InFile)
{
print "The input file could not be found\n$InFile\n";
exit;
}
#-----------------------------+
# CONVERT BLAST FILE TO GFF |
#-----------------------------+
# Test Blast2Gff subfunction
if ($AlignFormat == "8")
{
&TabBlast2Gff ($InFile, $OutFile, $BlastDb, $SeqName, "blastn");
} else {
print "A valid BLAST alignment format was not selected.\n";
}
#-----------------------------------------------------------+
# SUBFUNCTIONS |
#-----------------------------------------------------------+
sub TabBlast2Gff
{
my $In = $_[0]; # Path to blast intput file
my $Out = $_[1]; # Path to gff output file
my $Db = $_[2]; # The BLAST databas the hits are derived from
my $Name = $_[3]; # Seqname
my $Prog = $_[4]; # BLAST program used
my $GStart; # GFF Start position
my $GEnd; # GFF End position
# my $Format = $_[4]; # Format of the blast file
# # 8,9, 0 etc
# my $UseScore = $_[5]; # Score format to use
my $HitNum = "0";
#-----------------------------+
# FILE I/O |
#-----------------------------+
open (BLASTIN, "<".$In) ||
die "Can not open BLAST input file.$In.\n";
# If append was selected, just append gff data to the
# output file
if ($GffAppend)
{
open (GFFOUT, ">>".$Out) ||
die "Can not open GFF ouput file.$Out.\n";
} else {
open (GFFOUT, ">".$Out) ||
die "Can not open GFF ouput file.$Out.\n";
}
while (<BLASTIN>)
{
$HitNum++;
my ($QryId, $SubId, $PID, $Len,
$MisMatch, $GapOpen,
$QStart,$QEnd, $SStart, $SEnd,
$EVal, $BitScore) = split(/\t/);
my $Strand;
my $Frame = ".";
# Set the start to be less then the end
# This info can be used to dedeuct the strand
if ($SStart < $SEnd)
{
$GStart = $SStart;
$GEnd = $SEnd;
$Strand = "+";
} elsif ($SStart > $SEnd) {
$GStart = $SEnd;
$GEnd = $SStart;
$Strand = "-";
} else {
die "Unexpected Query Start and End:\n\tS:$QStart\n\tE:$QEnd";
}
# Trim leading white space from Bit score
$BitScore =~ s/^\s*(.*?)\s*$/$1/;
# Currently working with this to get it to draw
# the items as separate items
print GFFOUT
# I initially used the following
# $Name."\t". # SeqName
$SubId."\t".
$Prog.":".$Db."\t". # Source (BLAST PROGRAM)
# $Prog.":".$Db."\t". # Feature (Database)
# $SubId."\t". # Feature (Database)
$Prog."\t". # Feature (Database)
$GStart."\t". # Start
$GEnd."\t". # End
"."."\t". # Score
$Strand."\t". # Strand
$Frame."\tID=". # Frame
$SubId. # Attribute
"\n";
} # END OF WHILE BLASTIN
} # END OF Blast2Gff Subfunction
#-----------------------------------------------------------+
# HISTORY |
#-----------------------------------------------------------+
# 04/17/2007
# - Program started
# - Started Blast2Gff subfunction with tab delim format
#
# 04/18/2007
# - Adding command line options
# - Working on Blast2Gff tab delim format
#
# 04/23/2007
# - Uploaded to google account
#
# 07/12/2007
# - Added SVN version variable to header