From c53a70b93abc4c1c5ed2fb408741f2df061d8d82 Mon Sep 17 00:00:00 2001
From: djennah <jennah.dharamshi@icm.uu.se>
Date: Thu, 23 Feb 2017 14:06:54 +0100
Subject: [PATCH] Added partialgene support

---
 README.md  |   9 ++---
 bin/prokka | 101 +++++++++++++++++++++++++++++++++--------------------
 2 files changed, 67 insertions(+), 43 deletions(-)

diff --git a/README.md b/README.md
index ebdd6d3..52a198e 100644
--- a/README.md
+++ b/README.md
@@ -40,7 +40,7 @@ sudo cpan Time::Piece XML::Simple Bio::Perl Digest::MD5
 ```
 
 There are currently 3 ways to install the main Prokka software: 
-[Github](#github), [Tarball](#tarball) or [Homebrew](#homebrew).
+[Github](#github), [Tarball](#tarball) or Homebrew(#homebrew).
 
 ###Github
 
@@ -191,7 +191,7 @@ export PATH=$PATH:$HOME/prokka-1.11/bin
 ###Wizard
 ```bash
 # Watch and learn
-% prokka --outdir mydir --locustag EHEC --proteins NewToxins.faa --evalue 0.001 --gram neg --addgenes contigs.fa
+% prokka --outdir mydir --locustag EHEC --proteins NewToxins.faa --evalue 0.001 --partialgenes --gram neg --addgenes contigs.fa
 
 # Check to see if anything went really wrong
 % less mydir/EHEC_06072012.err
@@ -305,6 +305,7 @@ export PATH=$PATH:$HOME/prokka-1.11/bin
       --proteins [X]    Fasta file of trusted proteins to first annotate from (default '')
       --hmms [X]        Trusted HMM to first annotate from (default '')
       --metagenome      Improve gene predictions for highly fragmented genomes (default OFF)
+      --partialgenes    Allow genes to run off edges, yielding incomplete genes (no closed ends option in prodigal) (default OFF)
       --rawproduct      Do not clean up /product annotation (default OFF)
     Computation:
       --fast            Fast mode - skip CDS /product searching (default OFF)
@@ -475,10 +476,6 @@ _Camacho C et al. BLAST+: architecture and applications. BMC Bioinformatics. 200
 Finds protein-coding features (CDS)  
 _Hyatt D et al. Prodigal: prokaryotic gene recognition and translation initiation site identification. BMC Bioinformatics. 2010 Mar 8;11:119._
 
-* __TBL2ASN__
-Prepare sequence records for Genbank submission
-[Tbl2asn home page](https://www.ncbi.nlm.nih.gov/genbank/tbl2asn2/)
-
 ### Recommended
 
 * __Aragorn__  
diff --git a/bin/prokka b/bin/prokka
index 4077ecc..bdde5b6 100755
--- a/bin/prokka
+++ b/bin/prokka
@@ -34,6 +34,7 @@ use Bio::Seq;
 use Bio::SeqFeature::Generic;
 use Bio::Tools::GFF;
 use Bio::Tools::GuessSeqFormat;
+use Bio::Location::Fuzzy;
 use FindBin;
 use lib "$FindBin::RealBin/../perl5"; # for bundled Perl modules
 
@@ -221,7 +222,7 @@ my(@Options, $quiet, $debug, $kingdom, $fast, $force, $outdir, $prefix, $cpus,
              $genus, $species, $strain, $plasmid, 
              $usegenus, $proteins, $hmms, $centre, $scaffolds,
              $rfam, $norrna, $notrna, $rnammer, $rawproduct, $noanno,
-	     $metagenome, $compliant, $listdb, $citation);
+	     $metagenome, $partialgenes, $compliant, $listdb, $citation);
 setOptions();
 
 # . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 
@@ -684,47 +685,67 @@ if ($tools{'minced'}->{HAVE}) {
 # CDS
 
 msg("Predicting coding sequences");
+my $tool = "Prodigal:".$tools{prodigal}->{VERSION};
 my $totalbp = sum( map { $seq{$_}{DNA}->length } @seq);
 my $prodigal_mode = ($totalbp >= 100000 && !$metagenome) ? 'single' : 'meta';
-msg("Contigs total $totalbp bp, so using $prodigal_mode mode");
+my $prodigal_closedends = $partialgenes ? '' : '-c';
+msg("Contigs total $totalbp bp, so using $prodigal_mode mode, " . 
+	($partialgenes ? 'not' : '') . " allowing genes to run off contig edges.");
 my $num_cds=0;
-my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E -c -m -g $gcode -p $prodigal_mode -f sco -q";
+my $cmd = "prodigal -i \Q$outdir/$prefix.fna\E $prodigal_closedends -m -g $gcode -p $prodigal_mode -f gff -q";
 msg("Running: $cmd");
 open my $PRODIGAL, '-|', $cmd;
-my $sid;
-while (<$PRODIGAL>) {
-  if (m/seqhdr="([^\s\"]+)"/) {  
-    $sid = $1;
-#    msg("CDS $sid");
-    next;
-  }
-  elsif (m/^>\d+_(\d+)_(\d+)_([+-])$/) {   
-    my $tool = "Prodigal:".$tools{prodigal}->{VERSION}; # FIXME: why inner loop?
-    my $cds = Bio::SeqFeature::Generic->new( 
-      -primary    => 'CDS',
-      -seq_id     => $sid,
-      -source     => $tool,
-      -start      => $1,
-      -end        => $2,
-      -strand     => ($3 eq '+' ? +1 : -1),
-      -score      => undef,
-      -frame      => 0,
-      -tag        => {
-	'inference' => "ab initio prediction:$tool", 
-      }
-    );
+my $gff_in = Bio::Tools::GFF->new(-fh => $PRODIGAL,
+				  -gff_version => 3);
+while (my $cds = $gff_in->next_feature){
+    my $sid = $cds->seq_id;
+    # Determine whether the CDS is partial, i.e. it is running off 
+    # contig edges. Add the appropriate /codon_start attribute if necessary
+    # Assuming prodigal only returns one single 'partial' tag
+    # Modify start & end attributes, because gff output is wrong
+    my ($partial) = $cds->get_tag_values('partial');
+    my ($codon_start); 
+    my ($start, $end) = ($cds->start, $cds->end);
+    my $ctg_length = $seq{$sid}{DNA}->length;
+    if ($partial ne '00') {
+	if ($partial =~ /^1/){
+	    $codon_start = $cds->start if ($cds->strand == 1);
+	    $start = '<1';
+	}
+	if ($partial =~ /1$/){
+	    $codon_start = ($ctg_length - $cds->end + 1)
+		if ($cds->strand == -1);
+	    $end = ">$ctg_length";
+	}
+	my $fuzzyloc = Bio::Location::Fuzzy->new(
+	    -start  => $start,
+	    -end    => $end,
+	    -location_type => '..',
+	    -strand => $cds->strand);
+	$cds->location($fuzzyloc);
+    }
+    # Removing all tags
+    foreach my $tag ($cds->get_all_tags) {
+	$cds->remove_tag($tag);
+    }
+    # Adding only codon_start, if applicable, and the inference tag
+    if ($codon_start){
+	$cds->add_tag_value('codon_start', $codon_start);
+	$cds->frame($codon_start-1);
+    }
+    $cds->add_tag_value('inference', "ab initio prediction:$tool");
     my $overlap;
     for my $rna (@allrna) {
       # same contig, overlapping (could check same strand too? not sure)
       if ($rna->seq_id eq $sid and $cds->overlaps($rna)) { 
         $overlap = $rna;
 	last;
-      }	
+      }
     }
     # mitochondria are highly packed, so don't exclude as CDS/tRNA often overlap.
     if ($overlap and ! $cds_rna_olap) {
       my $type = $overlap->primary_tag;
-      msg("Excluding CDS which overlaps existing RNA ($type) at $sid:$1..$2 on $3 strand");
+      msg("Excluding CDS which overlaps existing RNA ($type) at $sid:$start..$end on strand " . $cds->strand);
     }
     else {
       $num_cds++;
@@ -735,7 +756,6 @@ while (<$PRODIGAL>) {
       }
       
     }
-  }
 }
 msg("Found $num_cds CDS");
 
@@ -769,7 +789,7 @@ if ($tools{signalp}->{HAVE}) {
         for my $f (@{ $seq{$sid}{FEATURE} }) {
           next unless $f->primary_tag eq 'CDS';
           $cds{++$count} = $f;
-          my $seq = $f->seq->translate(-codontable_id=>$gcode, -complete=>1);
+          my $seq = $f->seq->translate(-codontable_id=>$gcode);
           $seq->display_id($count);
           $spout->write_seq($seq);
         }
@@ -983,7 +1003,7 @@ else {
 	next if $f->has_tag('product');
 	$cds{++$count} = $f;
 	print $faa ">$count\n",
-          $f->seq->translate(-codontable_id=>$gcode, -complete=>1)->seq,"\n";
+          $f->seq->translate(-codontable_id=>$gcode)->seq,"\n";
       }
     }
     close $faa;
@@ -1132,9 +1152,6 @@ for my $sid (@seq) {
       my $g = Bio::SeqFeature::Generic->new(
         -primary    => 'gene',
         -seq_id     => $f->seq_id,
-        -start      => $f->start,
-        -end        => $f->end,
-        -strand     => $f->strand,
         -source_tag => $EXE,
         -tag        => { 
            'locus_tag' => $ID, 
@@ -1144,6 +1161,7 @@ for my $sid (@seq) {
       # Make a Parent tag from the CDS to the gene
       $f->add_tag_value('Parent', $gene_id);
       # copy the /gene from the CDS
+      $g->location($f->location);
       if (my $gENE = TAG($f, 'gene')) {
    	$g->add_tag_value('gene', $gENE);
       }
@@ -1210,12 +1228,19 @@ for my $sid (@seq) {
     if (my $name = TAG($f, 'gene')) {
       $f->add_tag_value('Name', $name);
     }
-    # Make sure we have valid frames/phases (GFF column 8)
-    $f->frame( $f->primary_tag eq 'CDS' ? 0 : '.' );
+    
+    # Make sure we have valid frames/phases (GFF column 8: 0/1/2 for CDS,
+    # . for other primary tags.)
+    $f->frame( $f->primary_tag eq 'CDS' ? $f->frame : '.' );
     
     print $gff_fh $f->gff_string($gff_factory),"\n";
     
-    my ($L,$R) = ($f->strand >= 0) ? ($f->start,$f->end) : ($f->end,$f->start);
+    my ($start, $end) = ($f->start, $f->end);
+    $start = (($f->strand >= 0) ? '<' : '>') . $start 
+      if ($f->location->start_pos_type eq 'BEFORE');
+    $end = (($f->strand >= 0) ? '>' : '<') . $end 
+      if ($f->location->end_pos_type eq 'AFTER');
+    my ($L, $R) = ($f->strand >= 0) ? ($start, $end) : ($end, $start);
     print {$tbl_fh} "$L\t$R\t",$f->primary_tag,"\n";
     for my $tag ($f->get_all_tags) {
       next if $tag =~ m/^[A-Z]/ and $tag !~ m/EC_number/i; # remove GFF specific tags (start with uppercase letter)
@@ -1235,7 +1260,8 @@ for my $sid (@seq) {
 #      $ffn_fh->write_seq($p);      
 #    }
     if ($f->primary_tag eq 'CDS') {
-      $faa_fh->write_seq( $p->translate(-codontable_id=>$gcode, -complete=>1) ); 
+      $faa_fh->write_seq( $p->translate(-codontable_id=>$gcode,
+			  -frame => $f->frame) ); 
     }
     if ($f->primary_tag =~ m/^(CDS|rRNA|tmRNA|tRNA|misc_RNA)$/) {
       $ffn_fh->write_seq($p);
@@ -1635,6 +1661,7 @@ sub setOptions {
     {OPT=>"proteins=s",  VAR=>\$proteins, DEFAULT=>'', DESC=>"FASTA or GBK file to use as 1st priority"},
     {OPT=>"hmms=s",  VAR=>\$hmms, DEFAULT=>'', DESC=>"Trusted HMM to first annotate from"},
     {OPT=>"metagenome!",  VAR=>\$metagenome, DEFAULT=>0, DESC=>"Improve gene predictions for highly fragmented genomes"},
+    {OPT=>"partialgenes!",  VAR=>\$partialgenes, DEFAULT=>0, DESC=>"Allow genes to run off edges, yielding incomplete genes (no closed ends option in prodigal)"},
     {OPT=>"rawproduct!",  VAR=>\$rawproduct, DEFAULT=>0, DESC=>"Do not clean up /product annotation"},
     {OPT=>"cdsrnaolap!",  VAR=>\$cds_rna_olap, DEFAULT=>0, DESC=>"Allow [tr]RNA to overlap CDS"},
     'Computation:',