From c53a70b93abc4c1c5ed2fb408741f2df061d8d82 Mon Sep 17 00:00:00 2001 From: djennah Date: Thu, 23 Feb 2017 14:06:54 +0100 Subject: [PATCH] Added partialgene support --- README.md | 9 ++--- bin/prokka | 101 +++++++++++++++++++++++++++++++++-------------------- 2 files changed, 67 insertions(+), 43 deletions(-) diff --git a/README.md b/README.md index ebdd6d3..52a198e 100644 --- a/README.md +++ b/README.md @@ -40,7 +40,7 @@ sudo cpan Time::Piece XML::Simple Bio::Perl Digest::MD5 ``` There are currently 3 ways to install the main Prokka software: -[Github](#github), [Tarball](#tarball) or [Homebrew](#homebrew). +[Github](#github), [Tarball](#tarball) or Homebrew(#homebrew). ###Github @@ -191,7 +191,7 @@ export PATH=$PATH:$HOME/prokka-1.11/bin ###Wizard ```bash # Watch and learn -% prokka --outdir mydir --locustag EHEC --proteins NewToxins.faa --evalue 0.001 --gram neg --addgenes contigs.fa +% prokka --outdir mydir --locustag EHEC --proteins NewToxins.faa --evalue 0.001 --partialgenes --gram neg --addgenes contigs.fa # Check to see if anything went really wrong % less mydir/EHEC_06072012.err @@ -305,6 +305,7 @@ export PATH=$PATH:$HOME/prokka-1.11/bin --proteins [X] Fasta file of trusted proteins to first annotate from (default '') --hmms [X] Trusted HMM to first annotate from (default '') --metagenome Improve gene predictions for highly fragmented genomes (default OFF) + --partialgenes Allow genes to run off edges, yielding incomplete genes (no closed ends option in prodigal) (default OFF) --rawproduct Do not clean up /product annotation (default OFF) Computation: --fast Fast mode - skip CDS /product searching (default OFF) @@ -475,10 +476,6 @@ _Camacho C et al. BLAST+: architecture and applications. BMC Bioinformatics. 200 Finds protein-coding features (CDS) _Hyatt D et al. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics. 2010 Mar 8;11:119._ -* __TBL2ASN__ -Prepare sequence records for Genbank submission -[Tbl2asn home page](https://www.ncbi.nlm.nih.gov/genbank/tbl2asn2/) - ### Recommended * __Aragorn__ diff --git a/bin/prokka b/bin/prokka index 4077ecc..bdde5b6 100755 --- a/bin/prokka +++ b/bin/prokka @@ -34,6 +34,7 @@ use Bio::Seq; use Bio::SeqFeature::Generic; use Bio::Tools::GFF; use Bio::Tools::GuessSeqFormat; +use Bio::Location::Fuzzy; use FindBin; use lib "$FindBin::RealBin/../perl5"; # for bundled Perl modules @@ -221,7 +222,7 @@ my(@Options, $quiet, $debug, $kingdom, $fast, $force, $outdir, $prefix, $cpus, $genus, $species, $strain, $plasmid, $usegenus, $proteins, $hmms, $centre, $scaffolds, $rfam, $norrna, $notrna, $rnammer, $rawproduct, $noanno, - $metagenome, $compliant, $listdb, $citation); + $metagenome, $partialgenes, $compliant, $listdb, $citation); setOptions(); # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . @@ -684,47 +685,67 @@ if ($tools{'minced'}->{HAVE}) { # CDS msg("Predicting coding sequences"); +my $tool = "Prodigal:".$tools{prodigal}->{VERSION}; my $totalbp = sum( map { $seq{$_}{DNA}->length } @seq); my $prodigal_mode = ($totalbp >= 100000 && !$metagenome) ? 'single' : 'meta'; -msg("Contigs total $totalbp bp, so using $prodigal_mode mode"); +my $prodigal_closedends = $partialgenes ? '' : '-c'; +msg("Contigs total $totalbp bp, so using $prodigal_mode mode, " . + ($partialgenes ? 'not' : '') . " allowing genes to run off contig edges."); my $num_cds=0; -my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E -c -m -g $gcode -p $prodigal_mode -f sco -q"; +my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E $prodigal_closedends -m -g $gcode -p $prodigal_mode -f gff -q"; msg("Running: $cmd"); open my $PRODIGAL, '-|', $cmd; -my $sid; -while (<$PRODIGAL>) { - if (m/seqhdr="([^\s\"]+)"/) { - $sid = $1; -# msg("CDS $sid"); - next; - } - elsif (m/^>\d+_(\d+)_(\d+)_([+-])$/) { - my $tool = "Prodigal:".$tools{prodigal}->{VERSION}; # FIXME: why inner loop? - my $cds = Bio::SeqFeature::Generic->new( - -primary => 'CDS', - -seq_id => $sid, - -source => $tool, - -start => $1, - -end => $2, - -strand => ($3 eq '+' ? +1 : -1), - -score => undef, - -frame => 0, - -tag => { - 'inference' => "ab initio prediction:$tool", - } - ); +my $gff_in = Bio::Tools::GFF->new(-fh => $PRODIGAL, + -gff_version => 3); +while (my $cds = $gff_in->next_feature){ + my $sid = $cds->seq_id; + # Determine whether the CDS is partial, i.e. it is running off + # contig edges. Add the appropriate /codon_start attribute if necessary + # Assuming prodigal only returns one single 'partial' tag + # Modify start & end attributes, because gff output is wrong + my ($partial) = $cds->get_tag_values('partial'); + my ($codon_start); + my ($start, $end) = ($cds->start, $cds->end); + my $ctg_length = $seq{$sid}{DNA}->length; + if ($partial ne '00') { + if ($partial =~ /^1/){ + $codon_start = $cds->start if ($cds->strand == 1); + $start = '<1'; + } + if ($partial =~ /1$/){ + $codon_start = ($ctg_length - $cds->end + 1) + if ($cds->strand == -1); + $end = ">$ctg_length"; + } + my $fuzzyloc = Bio::Location::Fuzzy->new( + -start => $start, + -end => $end, + -location_type => '..', + -strand => $cds->strand); + $cds->location($fuzzyloc); + } + # Removing all tags + foreach my $tag ($cds->get_all_tags) { + $cds->remove_tag($tag); + } + # Adding only codon_start, if applicable, and the inference tag + if ($codon_start){ + $cds->add_tag_value('codon_start', $codon_start); + $cds->frame($codon_start-1); + } + $cds->add_tag_value('inference', "ab initio prediction:$tool"); my $overlap; for my $rna (@allrna) { # same contig, overlapping (could check same strand too? not sure) if ($rna->seq_id eq $sid and $cds->overlaps($rna)) { $overlap = $rna; last; - } + } } # mitochondria are highly packed, so don't exclude as CDS/tRNA often overlap. if ($overlap and ! $cds_rna_olap) { my $type = $overlap->primary_tag; - msg("Excluding CDS which overlaps existing RNA ($type) at $sid:$1..$2 on $3 strand"); + msg("Excluding CDS which overlaps existing RNA ($type) at $sid:$start..$end on strand " . $cds->strand); } else { $num_cds++; @@ -735,7 +756,6 @@ while (<$PRODIGAL>) { } } - } } msg("Found $num_cds CDS"); @@ -769,7 +789,7 @@ if ($tools{signalp}->{HAVE}) { for my $f (@{ $seq{$sid}{FEATURE} }) { next unless $f->primary_tag eq 'CDS'; $cds{++$count} = $f; - my $seq = $f->seq->translate(-codontable_id=>$gcode, -complete=>1); + my $seq = $f->seq->translate(-codontable_id=>$gcode); $seq->display_id($count); $spout->write_seq($seq); } @@ -983,7 +1003,7 @@ else { next if $f->has_tag('product'); $cds{++$count} = $f; print $faa ">$count\n", - $f->seq->translate(-codontable_id=>$gcode, -complete=>1)->seq,"\n"; + $f->seq->translate(-codontable_id=>$gcode)->seq,"\n"; } } close $faa; @@ -1132,9 +1152,6 @@ for my $sid (@seq) { my $g = Bio::SeqFeature::Generic->new( -primary => 'gene', -seq_id => $f->seq_id, - -start => $f->start, - -end => $f->end, - -strand => $f->strand, -source_tag => $EXE, -tag => { 'locus_tag' => $ID, @@ -1144,6 +1161,7 @@ for my $sid (@seq) { # Make a Parent tag from the CDS to the gene $f->add_tag_value('Parent', $gene_id); # copy the /gene from the CDS + $g->location($f->location); if (my $gENE = TAG($f, 'gene')) { $g->add_tag_value('gene', $gENE); } @@ -1210,12 +1228,19 @@ for my $sid (@seq) { if (my $name = TAG($f, 'gene')) { $f->add_tag_value('Name', $name); } - # Make sure we have valid frames/phases (GFF column 8) - $f->frame( $f->primary_tag eq 'CDS' ? 0 : '.' ); + + # Make sure we have valid frames/phases (GFF column 8: 0/1/2 for CDS, + # . for other primary tags.) + $f->frame( $f->primary_tag eq 'CDS' ? $f->frame : '.' ); print $gff_fh $f->gff_string($gff_factory),"\n"; - my ($L,$R) = ($f->strand >= 0) ? ($f->start,$f->end) : ($f->end,$f->start); + my ($start, $end) = ($f->start, $f->end); + $start = (($f->strand >= 0) ? '<' : '>') . $start + if ($f->location->start_pos_type eq 'BEFORE'); + $end = (($f->strand >= 0) ? '>' : '<') . $end + if ($f->location->end_pos_type eq 'AFTER'); + my ($L, $R) = ($f->strand >= 0) ? ($start, $end) : ($end, $start); print {$tbl_fh} "$L\t$R\t",$f->primary_tag,"\n"; for my $tag ($f->get_all_tags) { next if $tag =~ m/^[A-Z]/ and $tag !~ m/EC_number/i; # remove GFF specific tags (start with uppercase letter) @@ -1235,7 +1260,8 @@ for my $sid (@seq) { # $ffn_fh->write_seq($p); # } if ($f->primary_tag eq 'CDS') { - $faa_fh->write_seq( $p->translate(-codontable_id=>$gcode, -complete=>1) ); + $faa_fh->write_seq( $p->translate(-codontable_id=>$gcode, + -frame => $f->frame) ); } if ($f->primary_tag =~ m/^(CDS|rRNA|tmRNA|tRNA|misc_RNA)$/) { $ffn_fh->write_seq($p); @@ -1635,6 +1661,7 @@ sub setOptions { {OPT=>"proteins=s", VAR=>\$proteins, DEFAULT=>'', DESC=>"FASTA or GBK file to use as 1st priority"}, {OPT=>"hmms=s", VAR=>\$hmms, DEFAULT=>'', DESC=>"Trusted HMM to first annotate from"}, {OPT=>"metagenome!", VAR=>\$metagenome, DEFAULT=>0, DESC=>"Improve gene predictions for highly fragmented genomes"}, + {OPT=>"partialgenes!", VAR=>\$partialgenes, DEFAULT=>0, DESC=>"Allow genes to run off edges, yielding incomplete genes (no closed ends option in prodigal)"}, {OPT=>"rawproduct!", VAR=>\$rawproduct, DEFAULT=>0, DESC=>"Do not clean up /product annotation"}, {OPT=>"cdsrnaolap!", VAR=>\$cds_rna_olap, DEFAULT=>0, DESC=>"Allow [tr]RNA to overlap CDS"}, 'Computation:',