Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added support for partial genes prokka-1.12 #219

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
9 changes: 3 additions & 6 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ sudo cpan Time::Piece XML::Simple Bio::Perl Digest::MD5
```

There are currently 3 ways to install the main Prokka software:
[Github](#github), [Tarball](#tarball) or [Homebrew](#homebrew).
[Github](#github), [Tarball](#tarball) or Homebrew(#homebrew).

###Github

Expand Down Expand Up @@ -191,7 +191,7 @@ export PATH=$PATH:$HOME/prokka-1.11/bin
###Wizard
```bash
# Watch and learn
% prokka --outdir mydir --locustag EHEC --proteins NewToxins.faa --evalue 0.001 --gram neg --addgenes contigs.fa
% prokka --outdir mydir --locustag EHEC --proteins NewToxins.faa --evalue 0.001 --partialgenes --gram neg --addgenes contigs.fa

# Check to see if anything went really wrong
% less mydir/EHEC_06072012.err
Expand Down Expand Up @@ -305,6 +305,7 @@ export PATH=$PATH:$HOME/prokka-1.11/bin
--proteins [X] Fasta file of trusted proteins to first annotate from (default '')
--hmms [X] Trusted HMM to first annotate from (default '')
--metagenome Improve gene predictions for highly fragmented genomes (default OFF)
--partialgenes Allow genes to run off edges, yielding incomplete genes (no closed ends option in prodigal) (default OFF)
--rawproduct Do not clean up /product annotation (default OFF)
Computation:
--fast Fast mode - skip CDS /product searching (default OFF)
Expand Down Expand Up @@ -475,10 +476,6 @@ _Camacho C et al. BLAST+: architecture and applications. BMC Bioinformatics. 200
Finds protein-coding features (CDS)
_Hyatt D et al. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics. 2010 Mar 8;11:119._

* __TBL2ASN__
Prepare sequence records for Genbank submission
[Tbl2asn home page](https://www.ncbi.nlm.nih.gov/genbank/tbl2asn2/)

### Recommended

* __Aragorn__
Expand Down
101 changes: 64 additions & 37 deletions bin/prokka
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,7 @@ use Bio::Seq;
use Bio::SeqFeature::Generic;
use Bio::Tools::GFF;
use Bio::Tools::GuessSeqFormat;
use Bio::Location::Fuzzy;
use FindBin;
use lib "$FindBin::RealBin/../perl5"; # for bundled Perl modules

Expand Down Expand Up @@ -221,7 +222,7 @@ my(@Options, $quiet, $debug, $kingdom, $fast, $force, $outdir, $prefix, $cpus,
$genus, $species, $strain, $plasmid,
$usegenus, $proteins, $hmms, $centre, $scaffolds,
$rfam, $norrna, $notrna, $rnammer, $rawproduct, $noanno,
$metagenome, $compliant, $listdb, $citation);
$metagenome, $partialgenes, $compliant, $listdb, $citation);
setOptions();

# . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .
Expand Down Expand Up @@ -684,47 +685,67 @@ if ($tools{'minced'}->{HAVE}) {
# CDS

msg("Predicting coding sequences");
my $tool = "Prodigal:".$tools{prodigal}->{VERSION};
my $totalbp = sum( map { $seq{$_}{DNA}->length } @seq);
my $prodigal_mode = ($totalbp >= 100000 && !$metagenome) ? 'single' : 'meta';
msg("Contigs total $totalbp bp, so using $prodigal_mode mode");
my $prodigal_closedends = $partialgenes ? '' : '-c';
msg("Contigs total $totalbp bp, so using $prodigal_mode mode, " .
($partialgenes ? 'not' : '') . " allowing genes to run off contig edges.");
my $num_cds=0;
my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E -c -m -g $gcode -p $prodigal_mode -f sco -q";
my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E $prodigal_closedends -m -g $gcode -p $prodigal_mode -f gff -q";
msg("Running: $cmd");
open my $PRODIGAL, '-|', $cmd;
my $sid;
while (<$PRODIGAL>) {
if (m/seqhdr="([^\s\"]+)"/) {
$sid = $1;
# msg("CDS $sid");
next;
}
elsif (m/^>\d+_(\d+)_(\d+)_([+-])$/) {
my $tool = "Prodigal:".$tools{prodigal}->{VERSION}; # FIXME: why inner loop?
my $cds = Bio::SeqFeature::Generic->new(
-primary => 'CDS',
-seq_id => $sid,
-source => $tool,
-start => $1,
-end => $2,
-strand => ($3 eq '+' ? +1 : -1),
-score => undef,
-frame => 0,
-tag => {
'inference' => "ab initio prediction:$tool",
}
);
my $gff_in = Bio::Tools::GFF->new(-fh => $PRODIGAL,
-gff_version => 3);
while (my $cds = $gff_in->next_feature){
my $sid = $cds->seq_id;
# Determine whether the CDS is partial, i.e. it is running off
# contig edges. Add the appropriate /codon_start attribute if necessary
# Assuming prodigal only returns one single 'partial' tag
# Modify start & end attributes, because gff output is wrong
my ($partial) = $cds->get_tag_values('partial');
my ($codon_start);
my ($start, $end) = ($cds->start, $cds->end);
my $ctg_length = $seq{$sid}{DNA}->length;
if ($partial ne '00') {
if ($partial =~ /^1/){
$codon_start = $cds->start if ($cds->strand == 1);
$start = '<1';
}
if ($partial =~ /1$/){
$codon_start = ($ctg_length - $cds->end + 1)
if ($cds->strand == -1);
$end = ">$ctg_length";
}
my $fuzzyloc = Bio::Location::Fuzzy->new(
-start => $start,
-end => $end,
-location_type => '..',
-strand => $cds->strand);
$cds->location($fuzzyloc);
}
# Removing all tags
foreach my $tag ($cds->get_all_tags) {
$cds->remove_tag($tag);
}
# Adding only codon_start, if applicable, and the inference tag
if ($codon_start){
$cds->add_tag_value('codon_start', $codon_start);
$cds->frame($codon_start-1);
}
$cds->add_tag_value('inference', "ab initio prediction:$tool");
my $overlap;
for my $rna (@allrna) {
# same contig, overlapping (could check same strand too? not sure)
if ($rna->seq_id eq $sid and $cds->overlaps($rna)) {
$overlap = $rna;
last;
}
}
}
# mitochondria are highly packed, so don't exclude as CDS/tRNA often overlap.
if ($overlap and ! $cds_rna_olap) {
my $type = $overlap->primary_tag;
msg("Excluding CDS which overlaps existing RNA ($type) at $sid:$1..$2 on $3 strand");
msg("Excluding CDS which overlaps existing RNA ($type) at $sid:$start..$end on strand " . $cds->strand);
}
else {
$num_cds++;
Expand All @@ -735,7 +756,6 @@ while (<$PRODIGAL>) {
}

}
}
}
msg("Found $num_cds CDS");

Expand Down Expand Up @@ -769,7 +789,7 @@ if ($tools{signalp}->{HAVE}) {
for my $f (@{ $seq{$sid}{FEATURE} }) {
next unless $f->primary_tag eq 'CDS';
$cds{++$count} = $f;
my $seq = $f->seq->translate(-codontable_id=>$gcode, -complete=>1);
my $seq = $f->seq->translate(-codontable_id=>$gcode);
$seq->display_id($count);
$spout->write_seq($seq);
}
Expand Down Expand Up @@ -983,7 +1003,7 @@ else {
next if $f->has_tag('product');
$cds{++$count} = $f;
print $faa ">$count\n",
$f->seq->translate(-codontable_id=>$gcode, -complete=>1)->seq,"\n";
$f->seq->translate(-codontable_id=>$gcode)->seq,"\n";
}
}
close $faa;
Expand Down Expand Up @@ -1132,9 +1152,6 @@ for my $sid (@seq) {
my $g = Bio::SeqFeature::Generic->new(
-primary => 'gene',
-seq_id => $f->seq_id,
-start => $f->start,
-end => $f->end,
-strand => $f->strand,
-source_tag => $EXE,
-tag => {
'locus_tag' => $ID,
Expand All @@ -1144,6 +1161,7 @@ for my $sid (@seq) {
# Make a Parent tag from the CDS to the gene
$f->add_tag_value('Parent', $gene_id);
# copy the /gene from the CDS
$g->location($f->location);
if (my $gENE = TAG($f, 'gene')) {
$g->add_tag_value('gene', $gENE);
}
Expand Down Expand Up @@ -1210,12 +1228,19 @@ for my $sid (@seq) {
if (my $name = TAG($f, 'gene')) {
$f->add_tag_value('Name', $name);
}
# Make sure we have valid frames/phases (GFF column 8)
$f->frame( $f->primary_tag eq 'CDS' ? 0 : '.' );

# Make sure we have valid frames/phases (GFF column 8: 0/1/2 for CDS,
# . for other primary tags.)
$f->frame( $f->primary_tag eq 'CDS' ? $f->frame : '.' );

print $gff_fh $f->gff_string($gff_factory),"\n";

my ($L,$R) = ($f->strand >= 0) ? ($f->start,$f->end) : ($f->end,$f->start);
my ($start, $end) = ($f->start, $f->end);
$start = (($f->strand >= 0) ? '<' : '>') . $start
if ($f->location->start_pos_type eq 'BEFORE');
$end = (($f->strand >= 0) ? '>' : '<') . $end
if ($f->location->end_pos_type eq 'AFTER');
my ($L, $R) = ($f->strand >= 0) ? ($start, $end) : ($end, $start);
print {$tbl_fh} "$L\t$R\t",$f->primary_tag,"\n";
for my $tag ($f->get_all_tags) {
next if $tag =~ m/^[A-Z]/ and $tag !~ m/EC_number/i; # remove GFF specific tags (start with uppercase letter)
Expand All @@ -1235,7 +1260,8 @@ for my $sid (@seq) {
# $ffn_fh->write_seq($p);
# }
if ($f->primary_tag eq 'CDS') {
$faa_fh->write_seq( $p->translate(-codontable_id=>$gcode, -complete=>1) );
$faa_fh->write_seq( $p->translate(-codontable_id=>$gcode,
-frame => $f->frame) );
}
if ($f->primary_tag =~ m/^(CDS|rRNA|tmRNA|tRNA|misc_RNA)$/) {
$ffn_fh->write_seq($p);
Expand Down Expand Up @@ -1635,6 +1661,7 @@ sub setOptions {
{OPT=>"proteins=s", VAR=>\$proteins, DEFAULT=>'', DESC=>"FASTA or GBK file to use as 1st priority"},
{OPT=>"hmms=s", VAR=>\$hmms, DEFAULT=>'', DESC=>"Trusted HMM to first annotate from"},
{OPT=>"metagenome!", VAR=>\$metagenome, DEFAULT=>0, DESC=>"Improve gene predictions for highly fragmented genomes"},
{OPT=>"partialgenes!", VAR=>\$partialgenes, DEFAULT=>0, DESC=>"Allow genes to run off edges, yielding incomplete genes (no closed ends option in prodigal)"},
{OPT=>"rawproduct!", VAR=>\$rawproduct, DEFAULT=>0, DESC=>"Do not clean up /product annotation"},
{OPT=>"cdsrnaolap!", VAR=>\$cds_rna_olap, DEFAULT=>0, DESC=>"Allow [tr]RNA to overlap CDS"},
'Computation:',
Expand Down