Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Use indexed BLAST DB if already exists #124

Open
wants to merge 13 commits into
base: master
Choose a base branch
from
Open
110 changes: 42 additions & 68 deletions bin/prokka
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ my $AUTHOR = 'Torsten Seemann <[email protected]>';
my $URL = 'https://github.com/tseemann/prokka';
my $PROKKA_PMID = '24642063';
my $PROKKA_DOI = '10.1093/bioinformatics/btu153';
my $DBDIR = "$FindBin::RealBin/../db";
my $DBDIR = "/srv/databases/proteins/prokka/";
my $HYPO = 'hypothetical protein';
my $UNANN = 'unannotated protein';
my $MAXCONTIGIDLEN = 37; # Genbank rule
Expand Down Expand Up @@ -519,7 +519,6 @@ else {
-frame => 0,
-tag => {
'product' => $product,
'inference' => "COORDINATES:profile:$tool",
@gene,
}
);
Expand Down Expand Up @@ -571,7 +570,6 @@ if ($kingdom ne 'Viruses' and !$norrna) {
-frame => 0,
-tag => {
'product' => $desc,
'inference' => "COORDINATES:profile:$tool", # FIXME version ?
}
);
msg(join "\t", $num_rrna, $desc, $sid, $entry->{start}[0], $entry->{stop}[0], $entry->{direction}[0]);
Expand Down Expand Up @@ -619,7 +617,6 @@ if ($rfam) {
-frame => 0,
-tag => {
'product' => $x[0],
'inference' => "COORDINATES:profile:$tool",
}
);
$num_ncrna++;
Expand Down Expand Up @@ -699,7 +696,6 @@ while (<$PRODIGAL>) {
-score => undef,
-frame => 0,
-tag => {
'inference' => "ab initio prediction:$tool",
}
);
my $overlap;
Expand Down Expand Up @@ -792,7 +788,6 @@ if ($tools{signalp}->{HAVE}) {
# 'ID' => $ID,
# 'Parent' => $x[0], # don't have proper IDs yet....
'product' => "putative signal peptide",
'inference' => "ab initio prediction:$tool",
'note' => "predicted cleavage at residue $x[3] with probability $prob",
}
);
Expand All @@ -819,7 +814,6 @@ if ($tools{signalp}->{HAVE}) {
# 'ID' => $ID,
# 'Parent' => $x[0], # don't have proper IDs yet....
'product' => "putative signal peptide",
'inference' => "ab initio prediction:$tool",
'note' => "predicted cleavage at residue $x[2]",
}
);
Expand All @@ -843,7 +837,7 @@ if ($tools{signalp}->{HAVE}) {
my @database = (
{
DB => "$DBDIR/kingdom/$kingdom/sprot",
SRC => 'similar to AA sequence:UniProtKB:',
SRC => 'UniProtKB',
FMT => 'blast',
CMD => $BLASTPCMD,
},
Expand All @@ -856,7 +850,7 @@ unless ($kingdom eq 'Viruses') {
if (-r $dbfile) {
push @database, {
DB => $dbfile,
SRC => "protein motif:$name:",
SRC => $name,
FMT => 'hmmer3',
CMD => $HMMER3CMD,
VERSION => 3, # without this, latest Bioperl goes into infinite loop
Expand All @@ -876,7 +870,7 @@ if ($usegenus) {
msg("Using custom $genus database for annotation");
unshift @database, {
DB => $blastdb,
SRC => 'similar to AA sequence:RefSeq:',
SRC => 'RefSeq',
FMT => 'blast',
CMD => $BLASTPCMD,
};
Expand All @@ -896,10 +890,10 @@ if (-r $hmms) {
my $src = $hmms;
$src =~ s{^.*/}{};
$src =~ s/.hmm$//;
msg("Using /inference source as '$src'");
msg("Using inference source as '$src'");
unshift @database, {
DB => $hmms,
SRC => "protein motif:$src:",
SRC => $src,
FMT => 'hmmer3',
CMD => $HMMER3CMD,
VERSION => 3, # without this, latest Bioperl goes into infinite loop
Expand All @@ -909,25 +903,33 @@ if (-r $hmms) {
# if user supplies a trusted set of proteins, we try these first!
if (-r $proteins) {
msg("Preparing user-supplied primary BLAST annotation source: $proteins");
my $faa_file = $proteins;
my $format = Bio::Tools::GuessSeqFormat->new(-file=>$proteins)->guess
or err("could not determine format of --proteins file '$proteins'");
msg("Guessed source was in $format format.");
if ($format =~ m/^(embl|genbank)$/) {
$faa_file = "$outdir/proteins.faa";
msg("Converting $format '$proteins' into Prokka FASTA '$faa_file'");
runcmd("prokka-genbank_to_fasta_db --format $format \Q$proteins\E > \Q$faa_file\E 2> /dev/null");
my $blastdb;
unless (-f "$proteins.pin" || -f "$proteins.pal") {
my $faa_file = $proteins;
my $format = Bio::Tools::GuessSeqFormat->new(-file=>$proteins)->guess
or err("could not determine format of --proteins file '$proteins'");
msg("Guessed source was in $format format.");
if ($format =~ m/^(embl|genbank)$/) {
$faa_file = "$outdir/proteins.faa";
msg("Converting $format '$proteins' into Prokka FASTA '$faa_file'");
runcmd("prokka-genbank_to_fasta_db --format $format \Q$proteins\E > \Q$faa_file\E 2> /dev/null");
}
elsif ($format ne 'fasta') {
err("Option --proteins only supports FASTA, BLAST database, GenBank, and EMBL formats.");
}
$blastdb = "$outdir/proteins";
runcmd("makeblastdb -dbtype prot -in \Q$faa_file\E -out \Q$blastdb\E -logfile /dev/null");
}
elsif ($format ne 'fasta') {
err("Option --proteins only supports FASTA, GenBank, and EMBL formats.");
else {
msg("Using existing BLAST database: $proteins");
$blastdb = $proteins;
}
runcmd("makeblastdb -dbtype prot -in \Q$faa_file\E -out \Q$outdir/proteins\E -logfile /dev/null");
my $src = $proteins;
$src =~ s{^.*/}{};
msg("Using /inference source as '$src'");
msg("Using inference source as '$src'");
unshift @database, {
DB => "$outdir/proteins",
SRC => "similar to AA sequence:$src:",
DB => $blastdb,
SRC => $src,
FMT => 'blast',
CMD => $BLASTPCMD,
};
Expand Down Expand Up @@ -1013,16 +1015,17 @@ else {
}
$cds{$pid}->add_tag_value('product', $cleanprod);
$cds{$pid}->add_tag_value('EC_number', $EC) if $EC;
$cds{$pid}->add_tag_value('gene', $gene) if $gene;
$cds{$pid}->add_tag_value('inference', $db->{SRC}.$hit->name);
$cds{$pid}->add_tag_value('gene_feature', $gene) if $gene;
$cds{$pid}->add_tag_value('gene', $hit->name);
$cds{$pid}->add_tag_value('inference', $db->{SRC});
}
msg("Cleaned $num_cleaned /product names") if $num_cleaned > 0;
msg("Cleaned $num_cleaned product names") if $num_cleaned > 0;
delfile($faa_name, $bls_name);
}
}

if ($proteins) {
delfile( map { "$outdir/proteins.$_" } qw(psq phr pin) );
delfile( map { "$outdir/proteins.$_" } qw(psq phr pin) )
}

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Expand Down Expand Up @@ -1055,37 +1058,6 @@ for my $sid (@seq) {
}
}

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# Fix colliding /gene names in CDS (before we add 'gene' features)
# (this could be written as such a nice map/map/grep one day...)

my %collide;

for my $sid (@seq) {
for my $f ( sort { $a->start <=> $b->start } @{ $seq{$sid}{FEATURE} }) {
next unless $f->primary_tag eq 'CDS';
my $gene = TAG($f, 'gene') or next;
$gene =~ s/_(\d+)$//; # remove existing de-duplicated suffix
push @{ $collide{$gene} }, $f;
}
}
msg("Found", scalar(keys(%collide)), "unique /gene codes.");

my $num_collide=0;
for my $gene (keys %collide) {
my @cds = @{$collide{$gene}};
next unless @cds > 1;
my $n=0;
for my $f (@cds) {
$f->remove_tag('gene');
$n++;
$f->add_tag_value('gene', "${gene}_${n}");
}
msg("Fixed $n duplicate /gene -", map { $_->get_tag_values('gene') } @cds);
$num_collide++;
}
msg("Fixed $num_collide colliding /gene names.");

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
# Add locus_tags and protein_id[CDS only] (and Parent genes if asked)

Expand Down Expand Up @@ -1232,7 +1204,7 @@ msg("Generating Genbank and Sequin files");
#runcmd("tbl2asn -a s -q F -A $prefix -N 1 -y 'Annotated using $EXE $VERSION from $URL' -Z $outdir/$prefix.err -M n -V b -i $outdir/$prefix.fsa -f $outdir/$prefix.tbl 2> /dev/null");
#runcmd("tbl2asn -V b -a s -A $prefix -N 1 -y 'Annotated using $EXE $VERSION from $URL' -Z $outdir/$prefix.err -i $outdir/$prefix.fsa");
#runcmd("tbl2asn -V b -a s -N 1 -y 'Annotated using $EXE $VERSION from $URL' -Z $outdir/$prefix.err -i $outdir/$prefix.fsa 2> /dev/null");
runcmd(
system(
"tbl2asn -V b -a r10k -l paired-ends $tbl2asn_opt -N 1 -y 'Annotated using $EXE $VERSION from $URL'".
" -Z \Q$outdir/$prefix.err\E -i \Q$outdir/$prefix.fsa\E 2> /dev/null"
);
Expand Down Expand Up @@ -1364,12 +1336,14 @@ sub runcmd {

sub delfile {
for my $file (@_) {
if ($debug) {
msg("In --debug mode, saving temporary file:", $file);
}
else {
msg("Deleting unwanted file:", $file);
unlink $file;
if (-f $file) {
if ($debug) {
msg("In --debug mode, saving temporary file:", $file);
}
else {
msg("Deleting unwanted file:", $file);
unlink $file;
}
}
}
}
Expand Down
Binary file removed db/cm/Bacteria
Binary file not shown.
29 changes: 0 additions & 29 deletions db/cm/README

This file was deleted.

Binary file removed db/cm/Viruses
Binary file not shown.
Loading