tseemann · aleimba · May 30, 2014 · May 30, 2014 · Jun 24, 2014 · Nov 26, 2014
diff --git a/bin/prokka b/bin/prokka
diff --git a/bin/prokka-biocyc_to_fasta_db b/bin/prokka-biocyc_to_fasta_db
@@ -32,7 +32,12 @@ while (<$ec_fh>) {
   my @x = split m/\t/;
   next unless exists $seq{ $x[1] };
   next unless $x[7] =~ m/^EC:(.*)$/;
-  $seq{$x[1]}{EC} = $1;  
+  if ($seq{$x[1]}{EC}) { # concatenate EC_numbers via ';' if several present
+      next if ($seq{$x[1]}{EC} =~ /$1/); # skip EC numbers that are already included
+      $seq{$x[1]}{EC} .= ";$1";
+  } else {
+      $seq{$x[1]}{EC} = $1;
+  }
 }
 
 while (<$gene_fh>) {
@@ -41,7 +46,7 @@ while (<$gene_fh>) {
   # remove HTML markup
   $x[3] =~ s/<.*?>//g;
   # special case for regular entities
-  $x[3] =~ s/&(alpha|beta|gamma|delta|epsilon);/$1/g; 
+  $x[3] =~ s/&(alpha|beta|gamma|delta|epsilon);/$1/g;
   # http://stackoverflow.com/questions/576095/how-can-i-decode-html-entities
   $x[3] = unidecode(decode_entities($x[3]));
   $seq{$id}{PRODUCT} = $x[3];
@@ -112,5 +117,5 @@ sub usage {
   }
   exit(1);
 }
- 
+
 #----------------------------------------------------------------------
diff --git a/bin/prokka-genbank_to_fasta_db b/bin/prokka-genbank_to_fasta_db
@@ -3,8 +3,10 @@ use strict;
 use Bio::SeqIO;
 use Data::Dumper;
 
-my(@Options, $verbose, $format, $hypo, $idtag, $sep, $blank, $pseudo, $minlen);
+my(@Options, $verbose, $format, $gcode, $hypo, $idtag, $sep, $blank, $pseudo, $minlen);
 setOptions();
+($gcode < 1 or $gcode > 24) and die "Invalid genetic code, must be 1..24\n";
+print STDERR "Using genetic code table $gcode.\n";
 
 my $in = Bio::SeqIO->new(-fh=>\*ARGV, -format=>$format);
 my $out = Bio::SeqIO->new(-fh=>\*STDOUT, -format=>'Fasta');
@@ -13,23 +15,23 @@ while (my $seq = $in->next_seq) {
   print STDERR "\rParsing: ",$seq->display_id;
   my $counter = 0;
   for my $f ($seq->get_SeqFeatures)   {
-    
+
     next unless $f->primary_tag eq 'CDS';
-        
+
     next unless $f->length >= $minlen;
-    
+
     next if !$pseudo and $f->has_tag('pseudo');
 
     my $prod = TAG($f, 'product') || $blank;
 
-    next if $prod eq 'hypothetical protein';
+    next if !$hypo and $prod eq 'hypothetical protein';
 
     my $cds = $f->spliced_seq;  # don't forget eukaryotes!
 
     my $id = TAG($f, $idtag) or die "feature #$counter does not have /$idtag tag!";
     $counter++;
     print STDERR "Processing: [$counter] ", $seq->display_id, " | $id | $prod\n" if $verbose;
-    
+
     # HANDLE CODON START FOR FUZZY FEATURES!
     if ($f->has_tag('codon_start')) {
       my($cs) = $f->get_tag_values('codon_start');
@@ -41,20 +43,15 @@ while (my $seq = $in->next_seq) {
     #END
 
 
-    # DNA -> AA    
-    $cds = $cds->translate;
-
-    # remove stop codon
-    my $aa = $cds->seq;
-    $aa =~ s/\*$//; 
-    $cds->seq($aa);
-
+    # DNA -> AA
+    $cds = $cds->translate(-codontable_id=>$gcode, -complete => 1);
+
     my $ec = TAG($f, 'EC_number') || $blank;
     my $gene = TAG($f, 'gene') || $blank;
-    
+
     $cds->desc("$ec$sep$gene$sep$prod");
     $cds->display_id($id);
-    
+
     $out->write_seq($cds);
   }
   print STDERR "\n";
@@ -69,6 +66,7 @@ sub TAG {
   # Seems new submissions have 2 protein IDs - one useful and one annoying!
   # /protein_id="REF_PRJNA193299:EFAU085_p1001"
   # /protein_id="YP_008390691.1"
+  return join(';', $f->get_tag_values($tag)) if ($tag eq 'EC_number'); # concatenate EC_numbers via ';' if several present
   return ( grep { ! m/PRJNA/ } $f->get_tag_values($tag) ) [0];
 }
 
@@ -82,6 +80,7 @@ sub setOptions {
     {OPT=>"help",      VAR=>\&usage,               DESC=>"This help"},
     {OPT=>"verbose!",  VAR=>\$verbose, DEFAULT=>0, DESC=>"Verbose progress"},
     {OPT=>"format=s",  VAR=>\$format,  DEFAULT=>'genbank', DESC=>"Input format"},
+    {OPT=>"gcode=i",  VAR=>\$gcode, DEFAULT=>1, DESC=>"Genetic code / Translation table (1..24)"},
     {OPT=>"idtag=s",   VAR=>\$idtag,   DEFAULT=>'protein_id', DESC=>"What tag to use as Fasta ID"},
     {OPT=>"sep=s",     VAR=>\$sep,     DEFAULT=>'~~~', DESC=>"Separator between EC/gene/product" },
     {OPT=>"blank=s",   VAR=>\$blank,   DEFAULT=>'',    DESC=>"Replace empty EC/gene/product with this"},
@@ -110,5 +109,5 @@ sub usage {
   }
   exit(1);
 }
- 
+
 #----------------------------------------------------------------------
diff --git a/bin/prokka-genpept_to_fasta_db b/bin/prokka-genpept_to_fasta_db
@@ -12,11 +12,11 @@ while (my $seq = $in->next_seq) {
 #  print STDERR "\rParsing: ",$seq->display_id;
   my $counter = 0;
   for my $f ($seq->get_SeqFeatures)   {
-    
+
     next unless $f->primary_tag eq 'Protein';
-        
+
     next unless $f->length >= $minlen;
-    
+
     next if !$pseudo and $f->has_tag('pseudo');
 
     my $prod = TAG($f, 'product') || $blank;
@@ -27,16 +27,16 @@ while (my $seq = $in->next_seq) {
 
 #    my $id = TAG($f, $idtag) or die "feature #$counter does not have /$idtag tag!";
    my $id = $seq->accession;
-   
+
     $counter++;
     print STDERR "Processing: ", $seq->display_id, " | $counter | $id | $prod\n" if $verbose;
-    
+
     my $ec = TAG($f, 'EC_number') || $blank;
     my $gene = TAG($f, 'gene') || $blank;
-    
+
     $cds->desc("$ec$sep$gene$sep$prod");
     $cds->display_id($id);
-    
+
     $out->write_seq($cds);
   }
 #  print STDERR "\n";
@@ -48,6 +48,7 @@ print STDERR "\nDone\n";
 sub TAG {
   my($f, $tag) = @_;
   return unless $f->has_tag($tag);
+  return join(';', $f->get_tag_values($tag)) if ($tag eq 'EC_number'); # concatenate EC_numbers via ';' if several present
   return ($f->get_tag_values($tag))[0];
 }
 
@@ -89,5 +90,5 @@ sub usage {
   }
   exit(1);
 }
- 
+
 #----------------------------------------------------------------------
diff --git a/bin/prokka-hamap_to_hmm b/bin/prokka-hamap_to_hmm
@@ -32,7 +32,7 @@ while (<$fam_fh>) {
   if (m{^//}) {
     #print STDERR "AC=$AC EC=$EC Bac=$Bac\n";
     if ($AC and $EC and exists $DESC{$AC}) {
-      $DESC{$AC} = $EC.$DESC{$AC}; 
+      $DESC{$AC} = $EC.$DESC{$AC};
     }
     if ($AC and ! $Bac) {
       print STDERR "Removing $AC as non-Bacterial\n";
@@ -44,7 +44,8 @@ while (<$fam_fh>) {
     $AC = $1;
   }
   elsif (m/EC=([\d.]+);/) {
-    $EC = $1;
+    $EC .= ";$1" if ($EC && $EC !~ $1); # include each EC_number only once
+    $EC = $1 if (!$EC);
   }
   elsif (m/^ Bacteria/) {
     $Bac = 1;
@@ -131,12 +132,12 @@ sub setOptions {
 }
 
 sub usage {
-  print "Usage: $0 [options] [--datadir hamap_data_subdir] > hamap.hmm\n";
+  print "Usage: $0 [options] [--datadir hamap_data_subdir]\n";
   foreach (@Options) {
     printf "  --%-13s %s%s.\n",$_->{OPT},$_->{DESC},
            defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
   }
   exit(1);
 }
- 
+
 #----------------------------------------------------------------------
diff --git a/bin/prokka-uniprot_to_fasta_db b/bin/prokka-uniprot_to_fasta_db
@@ -8,15 +8,15 @@ use Data::Dumper;
 my(@Options, $verbose, $frag, $evlev, $minlen, $sep, $blank, $term, $hypo);
 setOptions();
 
-my $HYPO = 'hypothetical protein'; 
+my $HYPO = 'hypothetical protein';
 
 # Read an entire record at a time
 local $/ = "\n//\n";
 
 my $in=0;
 my $out=0;
 
-while (<ARGV>) 
+while (<ARGV>)
 {
   # Read the entry
   my $entry = SWISS::Entry->fromText($_);
@@ -28,17 +28,17 @@ while (<ARGV>)
 
   # Too short?
   next if $minlen > 0 and length($entry->SQs->seq) < $minlen;
- 
+
   # Reject on evidence level
   # grep ^PE uniprot_sprot.dat | sort | uniq -c
   #  74284 PE   1: Evidence at protein level;
   #  67762 PE   2: Evidence at transcript level;
   # 376894 PE   3: Inferred from homology;
   #  14424 PE   4: Predicted;
   #   1884 PE   5: Uncertain;
-  if ($evlev < 5) { 
+  if ($evlev < 5) {
     $entry->PE->text =~ m/^(\d+)/;
-    next unless $1 <= $evlev; 
+    next unless $1 <= $evlev;
   }
 
   # Only specified organism class
@@ -47,31 +47,29 @@ while (<ARGV>)
     next unless grep { $_ eq $term } @$tax ;
   }
 
-  # /gene code  
+  # /gene code
   my $gene = $entry->GNs->getFirst || '';
   $gene = '' if $gene =~ m/\d{2}/ or $gene =~ m/\./;
 
-  my $ec = ''; 
-#  my $prod = 'hypothetical protein'; 
-  my $prod = ''; 
+  my $ec = '';
+#  my $prod = 'hypothetical protein';
+  my $prod = '';
 
-  if (1) {  
     for my $de ($entry->DEs->elements) {
       if ($de->type eq 'EC') {
-	$ec = $de->text;
-	$ec =~ s/^\D*//;
-#	last;
+    $ec .= $de->text;
+#   last;
       }
       elsif ($de->type eq 'Full' and $de->category eq 'RecName') {
-	$prod = $de->text;
-	if ($prod =~ m/^UPF\d|^Uncharacterized protein|^ORF|^Protein /) {
-	  $prod = $HYPO;
-	}
+    $prod = $de->text;
+    if ($prod =~ m/^UPF\d|^Uncharacterized protein|^ORF|^Protein /) {
+      $prod = $HYPO;
+    }
       }
-      last if $prod and $ec;  # we have some data now, exit out
     }
-  }
-
+      $ec =~ s/^\D*//;
+      $ec =~ s/EC /;/g;
+
   $prod ||= $HYPO;
 
   # skip hypthetical proteins, unless user has overridden this with --hypo
@@ -80,10 +78,10 @@ while (<ARGV>)
   $ec ||= $blank;
   $gene ||= $blank;
   $prod ||= $blank;
-  
+
   print STDERR join("\t", $entry->AC, $ec, $gene, $prod), "\n" if $verbose;
   print ">", $entry->AC, " $ec$sep$gene$sep$prod\n", $entry->SQs->seq, "\n";
-  
+
   $out++;
 
 }
@@ -121,12 +119,12 @@ sub setOptions {
 }
 
 sub usage {
-  print "Usage: $0 [options] <uniprot.dat>\n";
+  print "Usage: $0 [options] <uniprot.dat> > sprot\n";
   foreach (@Options) {
     printf "  --%-13s %s%s.\n",$_->{OPT},$_->{DESC},
            defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
   }
   exit(1);
 }
- 
+
 #----------------------------------------------------------------------