Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Multiple EC_number tags for CDSs, lower case initials in /product descriptions for prokka-1.9.1-testing #18

Open
wants to merge 8 commits into
base: master
Choose a base branch
from
249 changes: 129 additions & 120 deletions bin/prokka

Large diffs are not rendered by default.

11 changes: 8 additions & 3 deletions bin/prokka-biocyc_to_fasta_db
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,12 @@ while (<$ec_fh>) {
my @x = split m/\t/;
next unless exists $seq{ $x[1] };
next unless $x[7] =~ m/^EC:(.*)$/;
$seq{$x[1]}{EC} = $1;
if ($seq{$x[1]}{EC}) { # concatenate EC_numbers via ';' if several present
next if ($seq{$x[1]}{EC} =~ /$1/); # skip EC numbers that are already included
$seq{$x[1]}{EC} .= ";$1";
} else {
$seq{$x[1]}{EC} = $1;
}
}

while (<$gene_fh>) {
Expand All @@ -41,7 +46,7 @@ while (<$gene_fh>) {
# remove HTML markup
$x[3] =~ s/<.*?>//g;
# special case for regular entities
$x[3] =~ s/&(alpha|beta|gamma|delta|epsilon);/$1/g;
$x[3] =~ s/&(alpha|beta|gamma|delta|epsilon);/$1/g;
# http://stackoverflow.com/questions/576095/how-can-i-decode-html-entities
$x[3] = unidecode(decode_entities($x[3]));
$seq{$id}{PRODUCT} = $x[3];
Expand Down Expand Up @@ -112,5 +117,5 @@ sub usage {
}
exit(1);
}

#----------------------------------------------------------------------
33 changes: 16 additions & 17 deletions bin/prokka-genbank_to_fasta_db
Original file line number Diff line number Diff line change
Expand Up @@ -3,8 +3,10 @@ use strict;
use Bio::SeqIO;
use Data::Dumper;

my(@Options, $verbose, $format, $hypo, $idtag, $sep, $blank, $pseudo, $minlen);
my(@Options, $verbose, $format, $gcode, $hypo, $idtag, $sep, $blank, $pseudo, $minlen);
setOptions();
($gcode < 1 or $gcode > 24) and die "Invalid genetic code, must be 1..24\n";
print STDERR "Using genetic code table $gcode.\n";

my $in = Bio::SeqIO->new(-fh=>\*ARGV, -format=>$format);
my $out = Bio::SeqIO->new(-fh=>\*STDOUT, -format=>'Fasta');
Expand All @@ -13,23 +15,23 @@ while (my $seq = $in->next_seq) {
print STDERR "\rParsing: ",$seq->display_id;
my $counter = 0;
for my $f ($seq->get_SeqFeatures) {

next unless $f->primary_tag eq 'CDS';

next unless $f->length >= $minlen;

next if !$pseudo and $f->has_tag('pseudo');

my $prod = TAG($f, 'product') || $blank;

next if $prod eq 'hypothetical protein';
next if !$hypo and $prod eq 'hypothetical protein';

my $cds = $f->spliced_seq; # don't forget eukaryotes!

my $id = TAG($f, $idtag) or die "feature #$counter does not have /$idtag tag!";
$counter++;
print STDERR "Processing: [$counter] ", $seq->display_id, " | $id | $prod\n" if $verbose;

# HANDLE CODON START FOR FUZZY FEATURES!
if ($f->has_tag('codon_start')) {
my($cs) = $f->get_tag_values('codon_start');
Expand All @@ -41,20 +43,15 @@ while (my $seq = $in->next_seq) {
#END


# DNA -> AA
$cds = $cds->translate;

# remove stop codon
my $aa = $cds->seq;
$aa =~ s/\*$//;
$cds->seq($aa);

# DNA -> AA
$cds = $cds->translate(-codontable_id=>$gcode, -complete => 1);

my $ec = TAG($f, 'EC_number') || $blank;
my $gene = TAG($f, 'gene') || $blank;

$cds->desc("$ec$sep$gene$sep$prod");
$cds->display_id($id);

$out->write_seq($cds);
}
print STDERR "\n";
Expand All @@ -69,6 +66,7 @@ sub TAG {
# Seems new submissions have 2 protein IDs - one useful and one annoying!
# /protein_id="REF_PRJNA193299:EFAU085_p1001"
# /protein_id="YP_008390691.1"
return join(';', $f->get_tag_values($tag)) if ($tag eq 'EC_number'); # concatenate EC_numbers via ';' if several present
return ( grep { ! m/PRJNA/ } $f->get_tag_values($tag) ) [0];
}

Expand All @@ -82,6 +80,7 @@ sub setOptions {
{OPT=>"help", VAR=>\&usage, DESC=>"This help"},
{OPT=>"verbose!", VAR=>\$verbose, DEFAULT=>0, DESC=>"Verbose progress"},
{OPT=>"format=s", VAR=>\$format, DEFAULT=>'genbank', DESC=>"Input format"},
{OPT=>"gcode=i", VAR=>\$gcode, DEFAULT=>1, DESC=>"Genetic code / Translation table (1..24)"},
{OPT=>"idtag=s", VAR=>\$idtag, DEFAULT=>'protein_id', DESC=>"What tag to use as Fasta ID"},
{OPT=>"sep=s", VAR=>\$sep, DEFAULT=>'~~~', DESC=>"Separator between EC/gene/product" },
{OPT=>"blank=s", VAR=>\$blank, DEFAULT=>'', DESC=>"Replace empty EC/gene/product with this"},
Expand Down Expand Up @@ -110,5 +109,5 @@ sub usage {
}
exit(1);
}

#----------------------------------------------------------------------
17 changes: 9 additions & 8 deletions bin/prokka-genpept_to_fasta_db
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,11 @@ while (my $seq = $in->next_seq) {
# print STDERR "\rParsing: ",$seq->display_id;
my $counter = 0;
for my $f ($seq->get_SeqFeatures) {

next unless $f->primary_tag eq 'Protein';

next unless $f->length >= $minlen;

next if !$pseudo and $f->has_tag('pseudo');

my $prod = TAG($f, 'product') || $blank;
Expand All @@ -27,16 +27,16 @@ while (my $seq = $in->next_seq) {

# my $id = TAG($f, $idtag) or die "feature #$counter does not have /$idtag tag!";
my $id = $seq->accession;

$counter++;
print STDERR "Processing: ", $seq->display_id, " | $counter | $id | $prod\n" if $verbose;

my $ec = TAG($f, 'EC_number') || $blank;
my $gene = TAG($f, 'gene') || $blank;

$cds->desc("$ec$sep$gene$sep$prod");
$cds->display_id($id);

$out->write_seq($cds);
}
# print STDERR "\n";
Expand All @@ -48,6 +48,7 @@ print STDERR "\nDone\n";
sub TAG {
my($f, $tag) = @_;
return unless $f->has_tag($tag);
return join(';', $f->get_tag_values($tag)) if ($tag eq 'EC_number'); # concatenate EC_numbers via ';' if several present
return ($f->get_tag_values($tag))[0];
}

Expand Down Expand Up @@ -89,5 +90,5 @@ sub usage {
}
exit(1);
}

#----------------------------------------------------------------------
9 changes: 5 additions & 4 deletions bin/prokka-hamap_to_hmm
Original file line number Diff line number Diff line change
Expand Up @@ -32,7 +32,7 @@ while (<$fam_fh>) {
if (m{^//}) {
#print STDERR "AC=$AC EC=$EC Bac=$Bac\n";
if ($AC and $EC and exists $DESC{$AC}) {
$DESC{$AC} = $EC.$DESC{$AC};
$DESC{$AC} = $EC.$DESC{$AC};
}
if ($AC and ! $Bac) {
print STDERR "Removing $AC as non-Bacterial\n";
Expand All @@ -44,7 +44,8 @@ while (<$fam_fh>) {
$AC = $1;
}
elsif (m/EC=([\d.]+);/) {
$EC = $1;
$EC .= ";$1" if ($EC && $EC !~ $1); # include each EC_number only once
$EC = $1 if (!$EC);
}
elsif (m/^ Bacteria/) {
$Bac = 1;
Expand Down Expand Up @@ -131,12 +132,12 @@ sub setOptions {
}

sub usage {
print "Usage: $0 [options] [--datadir hamap_data_subdir] > hamap.hmm\n";
print "Usage: $0 [options] [--datadir hamap_data_subdir]\n";
foreach (@Options) {
printf " --%-13s %s%s.\n",$_->{OPT},$_->{DESC},
defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
}
exit(1);
}

#----------------------------------------------------------------------
46 changes: 22 additions & 24 deletions bin/prokka-uniprot_to_fasta_db
Original file line number Diff line number Diff line change
Expand Up @@ -8,15 +8,15 @@ use Data::Dumper;
my(@Options, $verbose, $frag, $evlev, $minlen, $sep, $blank, $term, $hypo);
setOptions();

my $HYPO = 'hypothetical protein';
my $HYPO = 'hypothetical protein';

# Read an entire record at a time
local $/ = "\n//\n";

my $in=0;
my $out=0;

while (<ARGV>)
while (<ARGV>)
{
# Read the entry
my $entry = SWISS::Entry->fromText($_);
Expand All @@ -28,17 +28,17 @@ while (<ARGV>)

# Too short?
next if $minlen > 0 and length($entry->SQs->seq) < $minlen;

# Reject on evidence level
# grep ^PE uniprot_sprot.dat | sort | uniq -c
# 74284 PE 1: Evidence at protein level;
# 67762 PE 2: Evidence at transcript level;
# 376894 PE 3: Inferred from homology;
# 14424 PE 4: Predicted;
# 1884 PE 5: Uncertain;
if ($evlev < 5) {
if ($evlev < 5) {
$entry->PE->text =~ m/^(\d+)/;
next unless $1 <= $evlev;
next unless $1 <= $evlev;
}

# Only specified organism class
Expand All @@ -47,31 +47,29 @@ while (<ARGV>)
next unless grep { $_ eq $term } @$tax ;
}

# /gene code
# /gene code
my $gene = $entry->GNs->getFirst || '';
$gene = '' if $gene =~ m/\d{2}/ or $gene =~ m/\./;

my $ec = '';
# my $prod = 'hypothetical protein';
my $prod = '';
my $ec = '';
# my $prod = 'hypothetical protein';
my $prod = '';

if (1) {
for my $de ($entry->DEs->elements) {
if ($de->type eq 'EC') {
$ec = $de->text;
$ec =~ s/^\D*//;
# last;
$ec .= $de->text;
# last;
}
elsif ($de->type eq 'Full' and $de->category eq 'RecName') {
$prod = $de->text;
if ($prod =~ m/^UPF\d|^Uncharacterized protein|^ORF|^Protein /) {
$prod = $HYPO;
}
$prod = $de->text;
if ($prod =~ m/^UPF\d|^Uncharacterized protein|^ORF|^Protein /) {
$prod = $HYPO;
}
}
last if $prod and $ec; # we have some data now, exit out
}
}

$ec =~ s/^\D*//;
$ec =~ s/EC /;/g;

$prod ||= $HYPO;

# skip hypthetical proteins, unless user has overridden this with --hypo
Expand All @@ -80,10 +78,10 @@ while (<ARGV>)
$ec ||= $blank;
$gene ||= $blank;
$prod ||= $blank;

print STDERR join("\t", $entry->AC, $ec, $gene, $prod), "\n" if $verbose;
print ">", $entry->AC, " $ec$sep$gene$sep$prod\n", $entry->SQs->seq, "\n";

$out++;

}
Expand Down Expand Up @@ -121,12 +119,12 @@ sub setOptions {
}

sub usage {
print "Usage: $0 [options] <uniprot.dat>\n";
print "Usage: $0 [options] <uniprot.dat> > sprot\n";
foreach (@Options) {
printf " --%-13s %s%s.\n",$_->{OPT},$_->{DESC},
defined($_->{DEFAULT}) ? " (default '$_->{DEFAULT}')" : "";
}
exit(1);
}

#----------------------------------------------------------------------
Loading