From 24f4bcc1d540fa9320fbc1f429272b6743ce6458 Mon Sep 17 00:00:00 2001 From: kgalinsky Date: Wed, 28 Oct 2009 22:02:32 +0000 Subject: [PATCH] Updated trunk to 0.5.0 git-svn-id: https://jcvi-tools.svn.sourceforge.net/svnroot/jcvi-tools/JCVI-Translator/trunk@14 a16988d2-67c9-4db9-a897-0603be59747b --- Changes | 6 +- MANIFEST | 2 + Makefile.PL | 2 + lib/JCVI/Translator.pm | 1139 +++++----------------------------- lib/JCVI/Translator/Base.pm | 322 ++++++++++ lib/JCVI/Translator/Table.pm | 943 ++++++++++++++++++++++++++++ lib/JCVI/Translator/Utils.pm | 54 +- t/00-load.t | 8 +- t/02-translate_exons.t | 2 +- t/03-translate6.t | 15 +- t/04-custom.t | 2 +- 11 files changed, 1484 insertions(+), 1011 deletions(-) create mode 100644 lib/JCVI/Translator/Base.pm create mode 100644 lib/JCVI/Translator/Table.pm diff --git a/Changes b/Changes index 3379c19..56b9373 100644 --- a/Changes +++ b/Changes @@ -1,7 +1,11 @@ Revision history for JCVI-Translator +0.5.0 Apr 2 2009 + - Split out the table into its own package + - Split out helper translator functions into their own package + 0.4.3 Apr 1 2009 -- Enforced that functions are called using func(@mandatory, %optional) + - Enforced that functions are called using func(@mandatory, %optional) 0.4.2 Apr 1 2009 - Added find method into utils diff --git a/MANIFEST b/MANIFEST index 1922701..8a25936 100644 --- a/MANIFEST +++ b/MANIFEST @@ -5,6 +5,8 @@ Makefile.PL README lib/JCVI/Translator.pm lib/JCVI/Translator/Utils.pm +lib/JCVI/Translator/Table.pm +lib/JCVI/Translator/Base.pm t/00-load.t t/boilerplate.t t/pod-coverage.t diff --git a/Makefile.PL b/Makefile.PL index 1e9a07b..6d8fc12 100755 --- a/Makefile.PL +++ b/Makefile.PL @@ -10,6 +10,8 @@ WriteMakefile( PL_FILES => {}, PREREQ_PM => { 'Test::More' => 0, + 'JCVI::DNATools' => 0, + 'JCVI::AATools' => 0 }, dist => { COMPRESS => 'gzip -9f', SUFFIX => 'gz', }, clean => { FILES => 'JCVI-Translator-*' }, diff --git a/lib/JCVI/Translator.pm b/lib/JCVI/Translator.pm index f154460..edc38d4 100644 
--- a/lib/JCVI/Translator.pm +++ b/lib/JCVI/Translator.pm @@ -11,7 +11,7 @@ JCVI::Translator - Translate DNA sequences =head1 SYNOPSIS -use JCVI::Translator; + use JCVI::Translator; my $translator = new JCVI::Translator(); my $translator = new JCVI::Translator(11); @@ -67,14 +67,18 @@ use strict; use warnings; use version; -our $VERSION = qv('0.4.3'); +our $VERSION = qv('0.5.0'); use base qw(Class::Accessor::Fast); -__PACKAGE__->mk_accessors(qw(id names _table _starts _reverse)); +__PACKAGE__->mk_accessors(qw(table base)); +use Carp; use Log::Log4perl qw(:easy); use Params::Validate; +use JCVI::Translator::Table; +use JCVI::Translator::Base; + use JCVI::DNATools qw( %degenerate_map $degen_match @@ -102,57 +106,20 @@ our $DEFAULT_SANITIZED = 0; =cut +sub _new { + my $class = shift; + $class->SUPER::new( + { table => shift, base => JCVI::Translator::Base->new() } ); +} + =head2 new my $translator = new JCVI::Translator(); my $translator = new JCVI::Translator( $id ); my $translator = new JCVI::Translator( $id, \%params ); -This method creates a translator by loading a translation table from the -internal list. Pass an ID and the type of ID. By default, it will load the -tranlation table with id 1. The type of ID may be "id" or "name," which -correspond to the numeric id of the translation table or the long name of the -translation table. For instance, below are the headers for the first 3 -translation tables. - - { - name "Standard" , - name "SGC0" , - id 1 , - ... - }, - { - name "Vertebrate Mitochondrial" , - name "SGC1" , - id 2 , - ... - }, - { - name "Yeast Mitochondrial" , - name "SGC2" , - id 3 , - ... - }, - ... - -By default, the "Standard" translation table will be loaded. 
You may create a -translator with this table by calling any of the following: - - my $t = new JCVI::Translator(); # default table - my $t = new JCVI::Translator(1); # explicitly set id - my $t = new JCVI::Translator( 1, { type => 'id' } ); # set id and type - my $t = new JCVI::Translator( 'Standard', { type => 'name' } ); - my $t = new JCVI::Translator( 'SGC0', { type => 'name' } ); - my $t = new JCVI::Translator( 'standard', { type => 'name' } ); - my $t = new JCVI::Translator( 'stan', { type => 'name' } ); - -For partial matches, JCVI::Translator will use the first matching translation -table. - - my $t = new JCVI::Translator( 'mitochondrial', { type => 'name' } ); - -This will use translation table with ID 2, "Vertebrate Mitochondrial," because -that is the first match (even though "Yeast Mitochondrial" would also match). +Create a translator with a translation table provided by $id. Please see +JCVI::Translator::Table for the full list of options. =cut @@ -160,47 +127,11 @@ sub new { TRACE('new called'); my $class = shift; + my $table = JCVI::Translator::Table->new(@_); - my ( $id, @p ); - - ( $id, $p[0] ) = validate_pos( - @_, - { default => $DEFAULT_ID }, - { type => Params::Validate::HASHREF, default => {} } - ); - - my %p = - validate( @p, - { type => { default => $DEFAULT_TYPE, regex => qr/id|name/ } } ); - - TRACE( uc( $p{type} ) . ': ' . $id ); - - # Get the beginning DATA so that we can seek back to it - my $start_pos = tell DATA; - - # Set up regular expression for searching. - my $match = ( $p{type} eq 'id' ) ? qr/id $id\b/ : qr/name ".*$id.*"/i; - - # Go through every internal table until it matches on id or name. - my $found = 0; - local $/ = "}"; - local $_; - while () { - if ( $_ =~ $match ) { - $found = 1; - last; - } - } - - # Reset DATA - seek DATA, $start_pos, 0; + return undef unless ($table); - # Call custom with internal table. Complete is set to 1. 
- return $class->custom( \$_, { complete => 1 } ) if ($found); - - # Internal table not matched. - ERROR("Table with $p{type} of $id not found"); - return undef; + $class->_new($table); } =head2 custom() @@ -208,338 +139,26 @@ sub new { my $translator = $translator->custom( $table_ref ); my $translator = $translator->custom( $table_ref, \%params ); -Create a translator table based off a passed table reference for custom -translation tables. Loads degenerate nucleotides if $complete isn't set (this -can take a little time). The format of the translation table should reflect -those of the internal tables: - - name "Names separated; by semicolons" - name "May have multiple lines" - id 99 - ncbieaa "AMINOACIDS...", - sncbieaa "-M--------..." - -- Base1 AAAAAAAAAA... - -- Base2 AAAACCCCGG... - -- Base3 ACGTACTGAC... - -JCVI::Translator is a bit more permissive, see the $TABLE_REGEX regular -expression to see that actual format. - -Examples: - - $translator = new Translator( - table_ref => \'name "All Alanines; All the Time" - id 9000 - ncbieaa "AAAAAAAA" - sncbieaa "----M---" - base1 AAAAAAAA - base2 AACCGGTT - base3 ACACACAC' - ); - - $translator = new Translator( - table_ref => \$table, - complete => 1 - ); +Create a translator with a custom translation table. Please see +JCVI::Translator::Table for the full list of options. =cut -# Regular expression which should match translation tables and also extracts -# relevant information. -our $TABLE_REGEX = qr/ - ( (?:name\s+".+?".*?) 
+ ) - id\s+(\d+).* - ncbieaa\s+"([a-z*]+)".* - sncbieaa\s+"([a-z-]+)".* - base1\s+([a-z]+).* - base2\s+([a-z]+).* - base3\s+([a-z]+).* - /isx; - sub custom { TRACE('custom called'); my $class = shift; + my $table = JCVI::Translator::Table->custom(@_); - my ($table_ref, @p); - - ( $table_ref, $p[0] ) = validate_pos( - @_, - { type => Params::Validate::SCALARREF }, - { type => Params::Validate::HASHREF, default => {} } -); - -my %p = validate(@p, { - complete => { - default => $DEFAULT_COMPLETE, - regex => qr/^[01]$/ - }} - ); - - # Match the table or return undef. - unless ( $$table_ref =~ $TABLE_REGEX ) { - ERROR( 'Translation table is in invalid format', $$table_ref ); - return undef; - } - - # Store the data that has been stripped using descriptive names; - my $names = $1; - my $id = $2; - my $residues = $3; - my $starts = $4; - my $base1 = $5; - my $base2 = $6; - my $base3 = $7; - - my $self = $class->_new(); - - $self->id($id); - - # Extract each name, massage, and push it onto names array - while ( $names =~ /"(.+?)"/gis ) { - my @names = split( /;/, $1 ); - local $_; - foreach (@names) { - s/^\s+//; - s/\s+$//; - s/\n/ /g; - s/\s{2,}/ /g; - push @{ $self->names }, $_ if $_; - } - } - - # Store all the hashes in $self so we don't have to keep using accessors - my $forward_hash = $self->_table->[0]; - my $rc_forward_hash = $self->_table->[1]; - - my $starts_hash = $self->_starts->[0]; - my $rc_starts_hash = $self->_starts->[1]; + return undef unless ($table); - my $reverse_hash = $self->_reverse->[0]; - my $rc_reverse_hash = $self->_reverse->[1]; - - # Chop is used to efficiently get the last character from each string - while ( my $residue = uc( chop $residues ) ) { - my $start = uc( chop $starts ); - my $codon = uc( chop($base1) . chop($base2) . 
chop($base3) ); - - my $rc_codon_ref = reverse_complement( \$codon ); - - # If the residue is valid, store it - if ( $residue ne 'X' ) { - $forward_hash->{$codon} = $residue; - $rc_forward_hash->{$$rc_codon_ref} = $residue; - - push @{ $reverse_hash->{$residue} }, $codon; - push @{ $rc_reverse_hash->{$residue} }, $$rc_codon_ref; - } - - # If the start is valid, store it - if ( ( $start ne '-' ) ) { - $starts_hash->{$codon} = $start; - $rc_starts_hash->{$$rc_codon_ref} = $start; - - push @{ $reverse_hash->{start} }, $codon; - push @{ $rc_reverse_hash->{start} }, $$rc_codon_ref; - } - } - - # Unroll the translation table unless it has been marked complete - $self->bootstrap() unless ($p{complete}); - - return $self; -} - -# Helper constructor. Instantiates the object with arrayrefs and hashrefs in -# the right places -sub _new { - my $self = shift->SUPER::new( - { - names => [], - _table => [], - _starts => [], - _reverse => [] - } - ); - - foreach my $func (qw( _table _starts _reverse )) { - foreach my $rc ( 0 .. 1 ) { - $self->$func->[$rc] = {}; - } - } - - return $self; + $class->_new($table); } =head1 METHODS =cut -=head2 add_translation - - $translator->add_translation( $codon, $residue ); - $translator->add_translation( $codon, $residue, \%params ); - -Add a codon-to-residue translation to the translation table. $start inidicates -if this is a start codon. - -Examples: - - # THESE AREN'T REAL!!! 
- $translator->add_translation( 'ABA', 'G' ); - $translator->add_translation( 'ABA', 'M', 1 ); - -=cut - -sub add_translation { - TRACE('add_translation called'); - - my $self = shift; - - my ( $codon, $residue, @p ); - - ( $codon, $residue, $p[0] ) = validate_pos( - @_, - { regex => qr/^${nuc_match}{3}$/ }, - { regex => qr/^$aa_match$/ }, - { type => Params::Validate::HASHREF, default => {} } - ); - - my %p = validate( - @p, - { - strand => { - default => 1, - regex => qr/^[+-]?1$/, - type => Params::Validate::SCALAR - }, - start => { - default => 0, - regex => qr/^[01]$/, - type => Params::Validate::SCALAR - } - } - ); - - my $codon_ref; - my $rc_codon_ref; - - if ( $p{strand} == 1 ) { - $codon_ref = \$codon; - $rc_codon_ref = reverse_complement( \$codon ); - } - else { - $rc_codon_ref = \$codon; - $codon_ref = reverse_complement( \$codon ); - } - - # Store residue in the starts or regular translation table. - my $table = $p{start} ? '_starts' : '_table'; - $self->$table->[0]->{$$codon_ref} = $residue; - $self->$table->[1]->{$$rc_codon_ref} = $residue; - - # Store the reverse lookup - $residue = 'start' if ( $p{start} ); - push @{ $self->_reverse->[0]->{$residue} }, $$codon_ref; - push @{ $self->_reverse->[1]->{$residue} }, $$rc_codon_ref; -} - -=head2 bootstrap - - $translator->bootstrap(); - -Bootstrap the translation table. Find every possible translation, even those -that involve degenerate nucleotides or ambiguous amino acids. - -=cut - -sub bootstrap { - TRACE('bootstrap called'); - - my $self = shift; - - # Loop through every nucleotide combination and run _translate_codon on - # each. - foreach my $n1 (@nucs) { - foreach my $n2 (@nucs) { - foreach my $n3 (@nucs) { - $self->_translate_codon( $n1 . $n2 . $n3, $self->_table->[0] ); - $self->_translate_codon( - $n1 . $n2 . 
$n3, - $self->_starts->[0], - { start => 1 } - ); - } - } - } -} - -=head2 table_string - - my $table_string_ref = $translator->_table_string(); - my $table_string_ref = $translator->_table_string( \%params ); - -Returns the table string. %params can specify whether or not this table should -try to bootstrap itself using the bootstrap function above. By default, it will -try to. - -Examples: - - my $table_string_ref = $translator->_table_string(); - my $table_string_ref = $translator->_table_string( { bootstrap => 0 } ); - -=cut - -sub table_string { - TRACE('table_string called'); - - my $self = shift; - - my $bootstrap = - validate_pos( @_, - { default => $DEFAULT_BOOTSTRAP, regex => qr/^[01]$/ } ); - - # Bootstrap if necessary - $self->bootstrap() if ($bootstrap); - - # Generate the names string - my $names = join( '; ', @{ $self->names } ); - - my ( $residues, $starts ); # starts/residues string - my @base = (undef) x 3; # this will store the base strings - - # Loop over all stored codons. Sort the codons in the translation table and - # starts table, then use grep to get the unique ones with the help of $prev - # which stores the previous value - my $prev = ''; - foreach my $codon ( - grep ( ( $_ ne $prev ) && ( $prev = $_ ), - sort { $a cmp $b } ( - keys( %{ $self->_table->[0] } ), - keys( %{ $self->_starts->[0] } ) - ) ) - ) - { - $residues .= $self->_table->[0]->{$codon} || 'X'; - $starts .= $self->_starts->[0]->{$codon} || '-'; - - # Chop up the codon because the bases are stored on separate lines - $base[ -$_ ] .= chop $codon foreach ( 1 .. 3 ); - } - - # Generate the string - my $string = join( "\n", - '{', - qq(name "$names" ,), - qq(id $self->{id} ,), - qq(ncbieaa "$residues",), - qq(sncbieaa "$starts"), - map( {"-- Base$_ $base[$_ - 1]"} ( 1 .. 
3 ) ), - '}' ); - - return \$string; -} - =head2 translate $pep_ref = $translator->translate( $seq_ref, \%params ); @@ -641,20 +260,26 @@ sub translate { my %p = validate( @p, { + + # Make sure strand is 1 or -1 strand => { default => $DEFAULT_STRAND, regex => qr/^[+-]?1$/, type => Params::Validate::SCALAR }, + + # Make sure lower is an integer within the sequence lower => { default => 0, - regex => qr/^[0-9]+$/, + regex => qr/^\d+$/, type => Params::Validate::SCALAR, callbacks => { 'lower >= 0' => sub { $_[0] >= 0 }, 'lower <= seq_length' => sub { $_[0] <= length($$seq_ref) } } }, + + # Make sure upper is an integer within the sequence upper => { default => length($$seq_ref), regex => qr/^[0-9]+$/, @@ -664,28 +289,50 @@ sub translate { 'upper <= seq_length' => sub { $_[0] <= length($$seq_ref) } } }, - partial => { default => $DEFAULT_PARTIAL }, - sanitized => { default => $DEFAULT_SANITIZED } + + # Make sure the offset is 0, 1 or 2. + offset => { + default => 0, + regex => qr/^[012]$/, + type => Params::Validate::SCALAR + }, + + # Make sure they are boolean values + partial => { + default => $DEFAULT_PARTIAL, + regex => qr/^[01]$/, + type => Params::Validate::SCALAR + }, + sanitized => { + default => $DEFAULT_SANITIZED, + regex => qr/^[01]$/, + type => Params::Validate::SCALAR + } } ); # Die if upper < lower if ( $p{upper} < $p{lower} ) { FATAL "Upper $p{upper} < Lower $p{lower}"; - die "Upper $p{upper} < Lower $p{lower}"; + croak "Upper $p{upper} < Lower $p{lower}"; } - $seq_ref = cleanDNA($seq_ref) unless ( $p{sanitized} ); + # Return undef if the offset is bigger than the space between bounds + return undef if ( $p{upper} <= $p{lower} + $p{offset} ); - # These are necessary for the _translate function - my $prep = $self->_prepare( $p{strand} ); - my $ends = $self->_endpoints( @p{qw(strand lower upper)} ); + # Clean the sequence and cache it + $seq_ref = cleanDNA($seq_ref) unless ( $p{sanitized} ); + $self->base->set_seq($seq_ref); - my $peptide = ''; + # Set 
the partial status + $self->base->set_partial( $p{partial} ); - $self->_start( $seq_ref, \$peptide, $ends, $prep ) unless ( $p{partial} ); - $self->_translate( $seq_ref, \$peptide, $ends, $prep ); + # Prepare for translation + $self->base->prepare( $p{strand}, $self->table ); + $self->base->endpoints( @p{qw(lower upper offset)} ); + # Translate and convert the resulting arrayref to a string + my $peptide = join( '', @{ $self->base->translate() } ); return \$peptide; } @@ -744,51 +391,62 @@ sub translate6 { my %p = validate( @p, { + + # Make sure lower is an integer within the sequence lower => { default => 0, - regex => qr/^[0-9]+$/, + regex => qr/^\d+$/, type => Params::Validate::SCALAR, callbacks => { 'lower >= 0' => sub { $_[0] >= 0 }, 'lower <= seq_length' => sub { $_[0] <= length($$seq_ref) } } }, + + # Make sure upper is an integer within the sequence upper => { default => length($$seq_ref), - regex => qr/^[0-9]+$/, + regex => qr/^\d+$/, type => Params::Validate::SCALAR, callbacks => { 'upper >= 0' => sub { $_[0] >= 0 }, 'upper <= seq_length' => sub { $_[0] <= length($$seq_ref) } } }, - partial => { default => $DEFAULT_PARTIAL }, - sanitized => { default => $DEFAULT_SANITIZED } + + # Make sure they are boolean values + partial => { + default => $DEFAULT_PARTIAL, + regex => qr/^[01]$/, + type => Params::Validate::SCALAR + }, + sanitized => { + default => $DEFAULT_SANITIZED, + regex => qr/^[01]$/, + type => Params::Validate::SCALAR + } } ); $seq_ref = cleanDNA($seq_ref) unless ( $p{sanitized} ); + $self->base->set_seq($seq_ref); + + $self->base->set_partial( $p{partial} ); my @peptides; - foreach my $strand ( -1, 1 ) { + foreach my $strand ( 1, -1 ) { - # We only need to calculate prep once for a given strand - my $prep = $self->_prepare($strand); - my $rc = $prep->[0]; # True if reverse complement - my $fw = ( $rc + 1 ) % 2; # True if forward strand - foreach ( 0 .. 
2 ) { + # We only need to prepare once for a given strand + $self->base->prepare( $strand, $self->table ); + + foreach my $offset ( 0 .. 2 ) { # Calculate endpoints and translate - my $ends = $self->_endpoints( - $strand, - $p{lower} + $fw * $_, - $p{upper} - $rc * $_ - ); - $self->_start( $seq_ref, \$peptides[ $rc * 3 + $_ ], $ends, $prep ) - unless ( $p{partial} ); - $self->_translate( $seq_ref, \$peptides[ $rc * 3 + $_ ], - $ends, $prep ); + $self->base->endpoints( @p{qw(lower upper)}, $offset ); + + # Translate and push onto array + push @peptides, join( '', @{ $self->base->translate() } ); } } @@ -837,174 +495,91 @@ sub translate_exons { { type => Params::Validate::HASHREF, default => {} } ); - validate_pos( - @$exons, - ( - { - type => Params::Validate::ARRAYREF, - callbacks => { - 'Bound not an integer' => sub { - foreach my $bound ( @{ $_[0] } ) { - return 0 unless ( $bound =~ /^\d+$/ ); - } - return 1; - }, - 'Bound out of range' => sub { - foreach my $bound ( @{ $_[0] } ) { - return 0 - unless ( ( $bound >= 0 ) - && ( $bound <= length $$seq_ref ) ); - } - return 1; - }, - 'lower <= upper' => sub { - return $_[0][0] <= $_[0][1]; - } - } - } - ) x @$exons - ); - + # Validate optional arguments my %p = validate( @p, { + + # strand must be 1 or -1 strand => { default => $DEFAULT_STRAND, regex => qr/^[+-]?1$/, type => Params::Validate::SCALAR }, - partial => { default => $DEFAULT_PARTIAL }, - sanitized => { default => $DEFAULT_SANITIZED } - } - ); - - my @exons = - sort { ( $a->[0] <=> $b->[0] || $a->[1] <=> $b->[1] ) * $p{strand} } - @$exons; - - my $prep = $self->_prepare( $p{strand} ); - my $leftover = ''; - my $peptide; - - EXON: foreach my $exon (@exons) { - my ( $lower, $upper ) = @$exon; - - LEFTOVER: { - - # Deal with leftovers. These are codons that have been cut by - # splicing. In the event that no codon has been cut, the leftover - # will be the first codon of the exon. 
- - my $to_go = 3 - length($leftover); - - # If the exon has fewer nucleotides than what is required to - # complete the codon, set $to_go to be the length of that exon. - if ( ( my $length = $upper - $lower ) < $to_go ) { - $to_go = $length; - } - - # Complete the leftover and increment the start index. - unless ( $prep->[0] ) { - $leftover .= substr( $$seq_ref, $lower, $to_go ); - $lower += $to_go; - } - else { - $upper -= $to_go; - $leftover = substr( $$seq_ref, $upper, $to_go ) . $leftover; - } - - # If leftover isn't long enough, then move to the next exon. - next EXON if ( length($leftover) < 3 ); - } - - START: { - - # Handle the start codon. After the start codon has been - # translated, set the partial flag so we don't try it again. - my $ends = [ 0, $prep->[1] ]; - unless ( $p{partial} ) { - $self->_start( \$leftover, \$peptide, $ends, $prep ); - $p{partial} = 1; + # Make sure they are boolean values + partial => { + default => $DEFAULT_PARTIAL, + regex => qr/^[01]$/, + type => Params::Validate::SCALAR + }, + sanitized => { + default => $DEFAULT_SANITIZED, + regex => qr/^[01]$/, + type => Params::Validate::SCALAR } - - $self->_translate( \$leftover, \$peptide, $ends, $prep ); } + ); - my $ends = $self->_endpoints( $p{strand}, $lower, $upper ); - BOUNDS: { - my $phase_diff = ( $upper - $lower ) % 3; - $leftover = - $prep->[0] - ? 
substr( $$seq_ref, $lower, $phase_diff ) - : substr( $$seq_ref, $ends->[1], $phase_diff ); - } + # Validate the exons and sort in the proper order for translation + $self->_validate_exons( $exons, $seq_ref ); + my @exons = sort { + ( ( $a->[0] <=> $b->[0] ) || ( $a->[1] <=> $b->[1] ) ) * $p{strand} + } @$exons; - $self->_translate( $seq_ref, \$peptide, $ends, $prep ); - } + $seq_ref = cleanDNA($seq_ref) unless ( $p{sanitized} ); + $self->base->set_seq($seq_ref); - return \$peptide; -} + $self->base->set_partial( $p{partial} ); -# Returns [ $is_this_a_reverse_complement, $increment_for_loop ] -sub _prepare { - my $self = shift; - my ($strand) = @_; - return [ ( $strand == 1 ? 0 : 1 ), $strand * 3 ]; -} + my $prep = $self->base->prepare( $p{strand}, $self->table ); + my @peptides; -# Convert (lower, upper) into endpoints for a loop. For the + strand, we just -# adjust upper so that it is in phase with lower. However, for the - strand, -# we not only adjust lower for phase, but also subtract 3 from everything so -# that we can take the substring properly. Here is a picture that might make -# sense of this: + EXON: foreach my $exon (@exons) { + $self->base->endpoints(@$exon); -# Positions: 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 -# Region of interest (-): . . . .4- - - - - - - - - - - - - - - -20 . . . -# For + strand: 4- - -|- - -|- - -|- - -|- - >19 -# For - strand: 2. . .|< - -|- - -|- - -|- - -17 - -| + next EXON unless ( $self->base->finish_leftover() ); -# For a + strand, it returns [4, 19]. For a - strand, it returns [17, 2]. To -# get the first codon on the - strand, we take the substring starting at 17, -# and the loop will end once we decrement our counter to 2. + push @peptides, [ $self->base->translate_leftover() ]; + push @peptides, $self->base->translate(); + $self->base->store_leftover(); + } -sub _endpoints { - my $self = shift; - my ( $strand, $lower, $upper ) = @_; + my $peptide = join( '', map { @$_ } @peptides ); - return $strand == 1 - ? 
[ $lower, $upper - ( ( $upper - $lower ) % 3 ) ] - : [ $upper - 3, $lower - 3 + ( ( $upper - $lower ) % 3 ) ]; + return \$peptide; } -# The actual translation function. Goes from start to stop, appends to the -# peptide sequence using the translation table provided. - -sub _translate { +# Validate the exons +sub _validate_exons { my $self = shift; - my ( $seq_ref, $pep_ref, $ends, $prep ) = @_; + my ( $exons, $seq_ref ) = @_; - my $table = $self->_table->[ $prep->[0] ]; - while ( $ends->[0] != $ends->[1] ) { - $$pep_ref .= $table->{ substr( $$seq_ref, $ends->[0], 3 ) } || 'X'; - $ends->[0] += $prep->[1]; - } -} + foreach my $exon (@$exons) { + my ( $lower, $upper ) = @$exon; -# Perform translation for only one frame and adjusts the start only if it finds -# a codon in the translation table. + # Make sure upper and lower bounds are integers + if ( $lower !~ m/^\d+$/ ) { + FATAL "Lower $lower not an integer"; + croak "Lower $lower not an integer"; + } -sub _start { - my $self = shift; - my ( $seq_ref, $pep_ref, $ends, $prep ) = @_; + if ( $upper !~ m/^\d+$/ ) { + FATAL "Upper $upper not an integer"; + croak "Upper $upper not an integer"; + } - return if ( $ends->[0] == $ends->[1] ); + # Make sure upper >= lower + if ( $upper < $lower ) { + FATAL "Upper $upper < Lower $lower"; + croak "Upper $upper < Lower $lower"; + } - my $start = - $self->_starts->[ $prep->[0] ]->{ substr( $$seq_ref, $ends->[0], 3 ) }; - if ($start) { - $$pep_ref = $start; - $ends->[0] += $prep->[1]; + # Make sure upper is within the sequence + if ( $upper > length($$seq_ref) ) { + FATAL "Upper $upper not in the sequence"; + croak "Upper $upper not in the sequence"; + } } } @@ -1016,9 +591,7 @@ sub _start { Translate a codon. Return 'X' or '-' if it isn't in the codon table. Handles degenerate nucleotides, so if all possible codons for an ambiguity map to the same residue, -return that residue. Will also handle ambiguous amino acids. 
-start dictates whether or not to translate this as a start -codon. Will also cache any new translations it finds. +return that residue. Example: @@ -1045,11 +618,15 @@ sub translate_codon { my %p = validate( @p, { + + # Make sure strand is 1 or -1 strand => { default => 1, regex => qr/^[+-]?1$/, type => Params::Validate::SCALAR }, + + # Make sure it is a boolean value start => { default => 0, regex => qr/^[01]$/, @@ -1063,256 +640,20 @@ sub translate_codon { my ( $table, $not_found ); unless ( $p{start} ) { - $table = $self->_table->[$rc]; + $table = $self->table->_forward->[$rc]; $not_found = 'X'; } else { - $table = $self->_starts->[$rc]; + $table = $self->table->_starts->[$rc]; $not_found = '-'; } - # return $table->{$codon} if ( defined $table->{$codon} ); - return $self->_translate_codon( $codon, $table, \%p ) || $not_found; -} - -# This is the helper function for translate_codon. It is designed to speed -# things up because it doesn't perform validation or try to figure out which -# tables to use, which can slow things down since this is a recursive function. -# Handles codons with degenerate nucleotides: [RYMKWS] [BDHV] or N. Several -# codons may map to the same amino acid. If all possible codons for an -# amibguity map to the same residue, return that residue rather than X. - -sub _translate_codon { - my $self = shift; - my $codon = shift; - my $table = shift; - - # Check for base case: no degenerate nucleotides; we can't unroll further. - unless ( $codon =~ /($degen_match)/ ) { - return $table->{$codon}; - } - - # Check to see if this degenerate-containing codon has been computed - return $table->{$codon} if ( $table->{$codon} ); - - my $consensus; - my $nuc = $1; - - # Replace the nucleotide with every possiblity from degenerate map hash. 
- foreach ( @{ $degenerate_map{$nuc} } ) { - my $new_codon = $codon; - $new_codon =~ s/$nuc/$_/; - - # Recursively call this function - my $residue = $self->_translate_codon( $new_codon, $table, @_ ); - - # If the new_codon didn't come to a consensus, or if the translation - # isn't defined for new_codon in a custom translation table, return - # undef. - return undef unless ( defined $residue ); - - # If consensus isn't set, set it to the current residue. - $consensus = $residue unless ($consensus); - - # This is an interesting step. If the residue isn't the same as the - # consensus, check to see if they map to the same ambiguous amino acid. - # If true, then change the consensus to that ambiguous acid and proceed. - # Otherwise, return undef (consensus could not be reached). - if ( $residue ne $consensus ) { - if ( - ( defined $ambiguous_forward{$residue} ) - && ( defined $ambiguous_forward{$consensus} ) - && ( $ambiguous_forward{$residue} eq - $ambiguous_forward{$consensus} ) - ) - { - $consensus = $ambiguous_forward{$consensus}; - } - else { - return undef; - } - } - } - - # If we got this far, it means that we have a valid consensus sequence for - # a degenerate-nucleotide-containing codon. Cache and return results. - DEBUG("New codon translation found: $codon => $consensus"); - $self->add_translation( $codon, $consensus, @_ ); - return $consensus; + return $self->table->_unroll( $codon, $table, { start => $p{start} } ) + || $not_found; } 1; -=head1 MISC - -These are the original translation tables. The translation tables used by this -module have been boostrapped - they include translations for degenerate -nucleotides and allow ambiguous amino acids to be the targets of translation -(e.g. every effort has been made to give a translation that isn't "X"). 
- - { - name "Standard" , - name "SGC0" , - id 1 , - ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", - sncbieaa "---M---------------M---------------M----------------------------" - -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - }, - { - name "Vertebrate Mitochondrial" , - name "SGC1" , - id 2 , - ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG", - sncbieaa "--------------------------------MMMM---------------M------------" - -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - }, - { - name "Yeast Mitochondrial" , - name "SGC2" , - id 3 , - ncbieaa "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG", - sncbieaa "----------------------------------MM----------------------------" - -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - }, - { - name "Mold Mitochondrial; Protozoan Mitochondrial;" - name "Coelenterate Mitochondrial; Mycoplasma; Spiroplasma" , - name "SGC3" , - id 4 , - ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", - sncbieaa "--MM---------------M------------MMMM---------------M------------" - -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - }, - { - name "Invertebrate Mitochondrial" , - name "SGC4" , - id 5 , - ncbieaa 
"FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG", - sncbieaa "---M----------------------------MMMM---------------M------------" - -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - }, - { - name "Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear" , - name "SGC5" , - id 6 , - ncbieaa "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", - sncbieaa "-----------------------------------M----------------------------" - -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - }, - { - name "Echinoderm Mitochondrial; Flatworm Mitochondrial" , - name "SGC8" , - id 9 , - ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", - sncbieaa "-----------------------------------M---------------M------------" - -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - }, - { - name "Euplotid Nuclear" , - name "SGC9" , - id 10 , - ncbieaa "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", - sncbieaa "-----------------------------------M----------------------------" - -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - }, - { - name "Bacterial and Plant Plastid" , - id 11 , - ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", - sncbieaa "---M---------------M------------MMMM---------------M------------" 
- -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - }, - { - name "Alternative Yeast Nuclear" , - id 12 , - ncbieaa "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", - sncbieaa "-------------------M---------------M----------------------------" - -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - }, - { - name "Ascidian Mitochondrial" , - id 13 , - ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG", - sncbieaa "---M------------------------------MM---------------M------------" - -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - }, - { - name "Alternative Flatworm Mitochondrial" , - id 14 , - ncbieaa "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", - sncbieaa "-----------------------------------M----------------------------" - -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - } , - { - name "Blepharisma Macronuclear" , - id 15 , - ncbieaa "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", - sncbieaa "-----------------------------------M----------------------------" - -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - } , 
- { - name "Chlorophycean Mitochondrial" , - id 16 , - ncbieaa "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", - sncbieaa "-----------------------------------M----------------------------" - -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - } , - { - name "Trematode Mitochondrial" , - id 21 , - ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG", - sncbieaa "-----------------------------------M---------------M------------" - -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - } , - { - name "Scenedesmus obliquus Mitochondrial" , - id 22 , - ncbieaa "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", - sncbieaa "-----------------------------------M----------------------------" - -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - } , - { - name "Thraustochytrium Mitochondrial" , - id 23 , - ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", - sncbieaa "--------------------------------M--M---------------M------------" - -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG - -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG - -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG - } - =head1 AUTHOR Kevin Galinsky, C<< >> @@ -1366,167 +707,3 @@ This program is free software; you can redistribute it and/or modify it under the same terms as Perl itself. 
=cut - -__DATA__ - -{ -name "Standard; SGC0" , -id 1 , -ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSS*CWCCLFLFLF*JXRRRJJXJJJJJZZZJXLLL", -sncbieaa "-----------------------------M-------------------------------------------M----------------------------------------------------------------------------------------------M-----M-----M---------M-M-" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTHHMMMMMMMMMMMSSSWWYYY --- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGTTTTTTRTTGGGTTTTTTTTAAATTTTT --- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTYACGTRYAAGAGRACGTHMWYAGRAGAGR -} -{ -name "Vertebrate Mitochondrial; SGC1" , -id 2 , -ncbieaa "KNKNKNTTTTTTTTTTTTTTT*S*S*SMIMIXXXXXXMXXXIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSSWCWCWCLFLFLFJJJXZZZLLL", -sncbieaa "---------------------------MMMMMMMMMMMMMMM-----------------------------------------------------------------------------------------M---------------------------------------------------M------" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTMMMRSSSYYY --- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTAAATTT --- Base3 
ACGTRYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTRYACGTRYCTYGAGRAGR -} -{ -name "Yeast Mitochondrial; SGC2" , -id 3 , -ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSMIMIMIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRTTTTTTTTTTTTTTTEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSSWCWCWCLFLFLFRRRZZZ", -sncbieaa "---------------------------M-M-M-------------------------------------------------------------------------------------------------------------------------------------------------" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTMMMSSS --- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTGGGAAA --- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTRYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTRYACGTRYAGRAGR -} -{ -name "Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma; SGC3" , -id 4 , -ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIXXIXIXXXXIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSSWCWCWCLFLFLFXXJXXRRRJJXJJJJJXXZZZXXJXXLLL", -sncbieaa "---------------------------MMMMMMMMMMMMMMM--------------------------------------M--------------------------------------------------M------------------------------------------M-M-M-MM-MM-----M-----MM---MMMMM-M-" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTBDHHKMMMMMMMMMMMNRSSSSVWWWYYY --- Base2 
AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTTGGGTTTTTTTTTTAAATTTTTTTT --- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTRYACGTRYGGAGGAGRACGTHMWYGGAGRGGAGRAGR -} -{ -name "Invertebrate Mitochondrial; SGC4" , -id 5 , -ncbieaa "KNKNKNTTTTTTTTTTTTTTTSSSSSSSSSSSSSSSMIMIXXXXXXMXXXIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSSWCWCWCLFLFLFXXJJJXZZZXLLL", -sncbieaa "------------------------------------MMMMMMMMMMMMMMM-----------------------------------------------------------------------------------------M--------------------------------------------M---MM---M---M---" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTDKMMMRSSSWYYY --- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTTTAAATTTT --- Base3 ACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTRYACGTRYGGCTYGAGRGAGR -} -{ -name "Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear; SGC5" , -id 6 , -ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBBQYQYQYSSSSSSSSSSSSSSS*CWCCLFLFLFZZZJZZZRRRJJJJJJJZZZJQQQLLL", -sncbieaa 
"-----------------------------M-------------------------------------------------------------------------------------------------------------------------------------------------------------------------" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTBBBHKKKMMMMMMMMMMSSSWYYYYYY --- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGTTTTTTAAATAAAGGGTTTTTTTAAATAAATTT --- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTYACGTRYAGRAAGRAGRACTHMWYAGRAAGRAGR -} -{ -name "Echinoderm Mitochondrial; Flatworm Mitochondrial; SGC8" , -id 9 , -ncbieaa "NNKNNNNNTTTTTTTTTTTTTTTSSSSSSSSSSSSSSSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSSWCWCWCLFLFLFJJJJJJJJXZZZJLLL", -sncbieaa "----------------------------------------M----------------------------------------------------------------------------------------------M--------------------------------------------------------M-------" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTHMMMMMMMRSSSWYYY --- Base2 AAAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTTTTTTAAATTTT --- Base3 ACGTHMWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTRYACGTRYAACTHMWYGAGRAAGR -} -{ -name "Euplotid 
Nuclear; SGC9" , -id 10 , -ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSSCCWCCCCCLFLFLFJRRRJJJJJJJZZZJLLL", -sncbieaa "-----------------------------M-------------------------------------------------------------------------------------------------------------------------------------------------------------------" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTHMMMMMMMMMMSSSWYYY --- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGGGGTTTTTTTGGGTTTTTTTAAATTTT --- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTHMWYACGTRYAAGRACTHMWYAGRAAGR -} -{ -name "Bacterial and Plant Plastid" , -id 11 , -ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIXXIXIXXXXIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSS*CWCCLFLFLF*XXJXXRRRJJXJJJJJXXZZZXXJXLLL", -sncbieaa "---------------------------MMMMMMMMMMMMMMM--------------------------------------M--------------------------------------------------M-------------------------------------------M----MM-MM-----M-----MM---MM-M-M-" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTBDHHKMMMMMMMMMMMNRSSSSVWWYYY --- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGTTTTTTRTTTTTGGGTTTTTTTTTTAAATTTTTTT --- Base3 
ACGTRYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTYACGTRYAGGAGGAGRACGTHMWYGGAGRGGAGAGR -} -{ -name "Alternative Yeast Nuclear" , -id 12 , -ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLSLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSS*CWCCLFLFLF*JRRRJJXJJJJJZZZJL", -sncbieaa "-----------------------------M-------------------------------------------M--------------------------------------------------------------------------------------------------M----------" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTHMMMMMMMMMMMSSSWY --- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGTTTTTTRTGGGTTTTTTTTAAATT --- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTYACGTRYAAAGRACGTHMWYAGRAA -} -{ -name "Ascidian Mitochondrial" , -id 13 , -ncbieaa "KNKNKNTTTTTTTTTTTTTTTGSGSGSMIMIMIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSSWCWCWCLFLFLFXXJJJGGGXZZZXLLL", -sncbieaa "---------------------------M-M-M------------------------------------------------------------------------------------------M--------------------------------------------M---MM------M---M---" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTDKMMMRRRRSSSWYYY --- Base2 
AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTTGGGTAAATTTT --- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTRYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTRYACGTRYGGCTYAGRGAGRGAGR -} -{ -name "Alternative Flatworm Mitochondrial" , -id 14 , -ncbieaa "NNKNNNNNTTTTTTTTTTTTTTTSSSSSSSSSSSSSSSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBBYY*YYYYYSSSSSSSSSSSSSSSWCWCWCLFLFLFJJJJJJJJZZZJLLL", -sncbieaa "----------------------------------------M----------------------------------------------------------------------------------------------------------------------------------------------------------------" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTHMMMMMMMSSSWYYY --- Base2 AAAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTTTTTAAATTTT --- Base3 ACGTHMWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTHMWYACGTBDHKMNRSVWYACGTRYACGTRYAACTHMWYAGRAAGR -} -{ -name "Blepharisma Macronuclear" , -id 15 , -ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*YQYYSSSSSSSSSSSSSSS*CWCCLFLFLF*ZJZRRRJJJJJJJZZZJQLLL", -sncbieaa "-----------------------------M-------------------------------------------------------------------------------------------------------------------------------------------------------------------" --- Base1 
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTBHKMMMMMMMMMMSSSWYYYY --- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAACCCCCCCCCCCCCCCGGGGGTTTTTTRATAGGGTTTTTTTAAATATTT --- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTYACGTBDHKMNRSVWYACGTYACGTRYAGAGAGRACTHMWYAGRAGAGR -} -{ -name "Chlorophycean Mitochondrial" , -id 16 , -ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*YLYYSSSSSSSSSSSSSSS*CWCCLFLFLF*LJRRRJJJJJJJZZZJLLL", -sncbieaa "-----------------------------M-----------------------------------------------------------------------------------------------------------------------------------------------------------------" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTHMMMMMMMMMMSSSWYYY --- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAACCCCCCCCCCCCCCCGGGGGTTTTTTRWTGGGTTTTTTTAAATTTT --- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTYACGTBDHKMNRSVWYACGTYACGTRYAGAAGRACTHMWYAGRAAGR -} -{ -name "Trematode Mitochondrial" , -id 21 , -ncbieaa "NNKNNNNNTTTTTTTTTTTTTTTSSSSSSSSSSSSSSSMIMIMIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSSWCWCWCLFLFLFJJJXZZZLLL", -sncbieaa 
"----------------------------------------M--------------------------------------------------------------------------------------------M---------------------------------------------------M------" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTMMMRSSSYYY --- Base2 AAAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTAAATTT --- Base3 ACGTHMWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTRYACGTRYCTYGAGRAGR -} -{ -name "Scenedesmus obliquus Mitochondrial" , -id 22 , -ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*YLYY*SSSSSSS*CWCCLFLFLF****LJRRRJJJJJJJZZZJLLL", -sncbieaa "-----------------------------M-------------------------------------------------------------------------------------------------------------------------------------------------------------" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTHMMMMMMMMMMSSSWYYY --- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAACCCCCCCCGGGGGTTTTTTMRSVWTGGGTTTTTTTAAATTTT --- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTYACGTBKSYACGTYACGTRYAAAAGAAGRACTHMWYAGRAAGR -} -{ -name "Thraustochytrium Mitochondrial" , -id 23 , -ncbieaa 
"KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIIXIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSS*CWCC*FLFF****RRRJJJJJJJXZZZL", -sncbieaa "-----------------------------MM-M--------------------------------------------------------------------------------------------M------------------------------------------------------------M----" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTMMMMMMMMMMRSSSY --- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGTTTTTDKRWGGGTTTTTTTTAAAT --- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTHKMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTYACGTYAAAAAGRACTHMWYGAGRG -} -{ -name "Strict Standard" , -ncbieaa "KNKKNNTTTTTTTTTTTTTTTRSRRSSIIMIIIIIQHQQHHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEEDDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGBBBVVVVVVVVVVVVVVVJRRRJJJJJJJZZZ*Y**YYSSSSSSSSSSSSSSS*CWCC*LFLLFFJLLL", -sncbieaa "-----------------------------M-----------------------------------------------------------------------------------------------------------------------------------------------------------------" --- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGHMMMMMMMMMMSSSTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTWYYY --- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGMMMTTTTTTTTTTTTTTTTGGGTTTTTTTAAAAAAAAACCCCCCCCCCCCCCCGGGGGRTTTTTTTTTT --- Base3 
ACGRTYABCDGHKMNRSTVWYACGRTYACGHMTWYACGRTYABCDGHKMNRSTVWYABCDGHKMNRSTVWYABCDGHKMNRSTVWYACGRTYABCDGHKMNRSTVWYABCDGHKMNRSTVWYCTYABCDGHKMNRSTVWYAAGRACHMTWYAGRACGRTYABCDGHKMNRSTVWYACGTYAACGRTYAAGR -} diff --git a/lib/JCVI/Translator/Base.pm b/lib/JCVI/Translator/Base.pm new file mode 100644 index 0000000..cc93cb7 --- /dev/null +++ b/lib/JCVI/Translator/Base.pm @@ -0,0 +1,322 @@ +# JCVI::Translator::Base +# +# $Author$ +# $Date$ +# $Revision$ +# $HeadURL$ + +=head1 NAME + +JCVI::Translator::Base - Contains translation methods for JCVI::Translator + +=head1 SYNOPSIS + + my $base = JCVI::Translator::Base->new(); + $base->set_seq($seq_ref); + $base->set_partial($partial); + $base->prepare($strand, $table); + $base->endpoints($lower, $upper, $offset); + my $pep_arrayref = $base->translate(); + +=head1 DESCRIPTION + +This package contains the actual methods that do the translation. + +=cut + +package JCVI::Translator::Base; + +use strict; +use warnings; + +=head1 CONSTRUCTOR + +=cut + +=head2 new + +=cut + +sub new { + my $class = shift; + my $self = {}; + bless $self, $class; +} + +=head1 METHODS + +=cut + +=head2 clear + +Clear all stored variables by emptying the object's hash + +=cut + +sub clear { + my ($self) = @_; + undef %$self; +} + +=head2 set_seq + +Cache the seq_ref (a reference to the sequence string) to be translated + +=cut + +sub set_seq { + my ( $self, $seq_ref ) = @_; + $self->{seq_ref} = $seq_ref; +} + +=head2 set_partial + +Set the partial status. When partial is true, translate does not try to translate a start codon. + +=cut + +sub set_partial { + my ( $self, $partial ) = @_; + $self->{partial} = $partial; +} + +=head2 prepare + +Prepare things related to the strand. Set up the increment, the rc boolean +value (stands for reverse complement - false for + strand, true for - strand), +and the translation tables that are being used.
+ +=cut + +sub prepare { + my ( $self, $strand, $table ) = @_; + + # This is as good a place as any to clear the leftover - see below for more + # info + $self->{leftover} = ''; + + $self->{strand} = $strand; + + $self->{increment} = 3 * $strand; + + my $rc = $strand == 1 ? 0 : 1; + $self->{rc} = $rc; + + # The translation tables are keyed on $rc in JCVI::Translator::Table + $self->{table} = $table->_forward->[$rc]; + $self->{starts} = $table->_starts->[$rc]; +} + +=head2 endpoints + +Set the endpoints for the translation loop. The translate method loops until the index +reaches the stop endpoint. For this to work, the stop must be in the same +frame as the start. For the + strand, adjust the upper bound so that it is in +phase with lower bound and offset. + +The - strand is trickier. Not only is the lower bound adjusted to be in +phase with the upper bound and offset, but 3 is also subtracted from the +bounds so that the right index for substring is present. Codons are indexed on +their lower bound, so 3 is subtracted to get from the upper end to the lower. + +Below is an example that might make sense of this. Suppose we are interested in +translating the sequence "CAGTTTAACAAGTCGAAACCGTTC" between positions 4 and 20: + + Positions: 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 + Sequence C A G T T T A A C A A G T C G A A A C C G T T C + Region of interest (-): . . . .4- - - - - - - - - - - - - - - -20 . . . + For + strand: 4- - -|- - -|- - -|- - -|- - >19 + For - strand: 2. . .|< - -|- - -|- - -|- - -17 - -| + +For the + strand, endpoints will set the start to 4, and the stop to 19. This will +grab the codons starting at bases 4, 7, 10, 13 and 16 (at base 19, the index will +equal the stop, and the loop will terminate). Thus, we'll have the codons TTA, +ACA, AGT, CGA and AAC. + +For a - strand, start is 17 and stop is 2. This will get the codons starting at +17 (which ends at base 20), 14, 11, 8 and 5.
It will not take the codon +starting at base 2, which is out of the specified bounds, because at that +point, the index will equal the stop and the loop will exit. + +=cut + +sub endpoints { + my ( $self, $lower, $upper, $offset ) = @_; + + # If offset isn't provided, designate the offset by how much is required to + # complete the leftover codon (3 when the leftover is empty). + $offset = 3 - length( $self->{leftover} ) unless ( defined $offset ); + + $self->{lower} = $lower; + $self->{upper} = $upper; + + # Calculate the phase difference between the upper and lower adjusting for + # offset. This will be the same as the number of bases left over after + # translation. + my $phase = ( $upper - $lower - $offset ) % 3; + $self->{phase} = $phase; + + # Here is where the endpoints are actually set up. + if ( $self->{rc} ) { + # Set the start by adjusting for offset and subtract 3 as explained + # above; the stop is the lower bound minus 3, moved up by the phase + $self->{start} = $upper - $offset - 3; + $self->{stop} = $lower - 3 + $phase; + } + else { + # Just adjust the start for the offset and the stop for the phase. + $self->{start} = $lower + $offset; + $self->{stop} = $upper - $phase; + } +} + +=head2 translate + +Perform the actual translation. Try to translate the start codon if partial +isn't set, and then do the translation. Return the results as an arrayref.
+ +=cut + +sub translate { + my ($self) = @_; + + my @residues; + + # Try to translate the start codon (start returns '' when there is none) + push @residues, $self->start() unless ( $self->{partial} ); + + my $seq_ref = $self->{seq_ref}; + my $index = $self->{start}; + + # Iterate until the index has reached or passed the stop; the product is positive only while stop is still ahead in the direction of increment, so an out-of-phase or already-passed stop cannot loop forever + until ( ( $self->{stop} - $index ) * $self->{increment} <= 0 ) { + + # Grab the codon, and look it up in the translation table + my $codon = substr( $$seq_ref, $index, 3 ); + push @residues, $self->{table}->{$codon}; + + # Increment the index + $index += $self->{increment}; + } + + return \@residues; +} + +=head2 start + +Translate the start codon if possible. Returns the start residue and advances the start index past it, or returns the empty string when there is nothing to translate as a start. + +=cut + +sub start { + my ($self) = @_; + + # If start has reached or passed stop (same test as in translate), don't do anything + return '' if ( ( $self->{stop} - $self->{start} ) * $self->{increment} <= 0 ); + + my $seq_ref = $self->{seq_ref}; + + # Grab the codon and look it up in the starts table. + my $codon = substr( $$seq_ref, $self->{start}, 3 ); + my $start = $self->{starts}->{$codon}; + + # Return the empty string if start isn't found in the translation table + return '' unless ($start); + + # Increment the start location and return the start codon + $self->{start} += $self->{increment}; + return $start; +} + +=head2 store_leftover + +Store the leftover bases from translation. These are codons that have been cut +by splice sites. + +=cut + +sub store_leftover { + my ($self) = @_; + + my $seq_ref = $self->{seq_ref}; + + if ( $self->{rc} ) { + # For the - strand, the leftover starts from the lower bound + $self->{leftover} = substr( $$seq_ref, $self->{lower}, $self->{phase} ); + } + else { + # For the + strand, the leftover ends at the upper bound and starts + # where translation finished. + $self->{leftover} = substr( $$seq_ref, $self->{stop}, $self->{phase} ); + } +} + +=head2 finish_leftover + +Extend the leftover towards a complete codon using bases from the current exon, if possible. Returns true if the leftover is then 3 bases long.
+ +=cut + +sub finish_leftover { + my ($self) = @_; + + # Calculate how many bases are required to finish the codon + my $to_go = 3 - length( $self->{leftover} ); + + # If the current exon is shorter than that number, adjust so that leftover + # codon doesn't run into the intron + if ( ( my $length = $self->{upper} - $self->{lower} ) < $to_go ) { + $to_go = $length; + } + + my $seq_ref = $self->{seq_ref}; + + if ( $self->{rc} ) { + # On the - strand, prefix the leftover with bases taken from the upper end + $self->{leftover} = + substr( $$seq_ref, $self->{upper} - $to_go, $to_go ) + . $self->{leftover}; + } + else { + # On the + strand, append bases taken from the lower end to the leftover + $self->{leftover} .= substr( $$seq_ref, $self->{lower}, $to_go ); + } + + return length( $self->{leftover} ) == 3; +} + +=head2 translate_leftover + +Translate the leftover codon. If partial isn't set, try the start table first +(falling back to the regular table) and then set the partial flag. + +=cut + +sub translate_leftover { + my ($self) = @_; + + my $leftover = $self->{leftover}; + + # If this is partial, then translate the leftover normally + return $self->{table}->{$leftover} if ( $self->{partial} ); + + # Try to translate the start, but mark partial as 1 so that it doesn't try + # to translate the start again + $self->{partial} = 1; + return $self->{starts}->{$leftover} || $self->{table}->{$leftover}; +} + +1; + +=head1 AUTHOR + +Kevin Galinsky, + +=head1 COPYRIGHT & LICENSE + +Copyright 2008-2009 J. Craig Venter Institute, all rights reserved. + +This program is free software; you can redistribute it and/or modify it +under the same terms as Perl itself.
+ +=cut \ No newline at end of file diff --git a/lib/JCVI/Translator/Table.pm b/lib/JCVI/Translator/Table.pm new file mode 100644 index 0000000..8509369 --- /dev/null +++ b/lib/JCVI/Translator/Table.pm @@ -0,0 +1,943 @@ +# JCVI::Translator::Table +# +# $Author$ +# $Date$ +# $Revision$ +# $HeadURL$ + +=head1 NAME + +JCVI::Translator::Table - Translation table for JCVI::Translator + +=head1 SYNOPSIS + + use JCVI::Translator::Table; + + my $table = JCVI::Translator::Table->new(); + my $table = JCVI::Translator::Table->new(11); + my $table = JCVI::Translator::Table->new( 12, { type => 'id' } ); + my $table = JCVI::Translator::Table->new( 'Yeast Mitochondrial', { type => 'name' } ); + my $table = JCVI::Translator::Table->new( 'mito', { type => 'name' } ); + + my $table = JCVI::Translator::Table->custom( \$custom_table ); + my $table = JCVI::Translator::Table->custom( \$custom_table, { bootstrap => 0 } ); + + +=cut + +package JCVI::Translator::Table; + +use strict; +use warnings; + +use base qw(Class::Accessor::Fast); +__PACKAGE__->mk_accessors(qw(id names _forward _starts _reverse)); + +use Log::Log4perl qw(:easy); +use Params::Validate; + +#use JCVI::Translator::_TablePair; + +use JCVI::DNATools qw( + %degenerate_map + $degen_match + @nucs + $nuc_match + reverse_complement +); + +use JCVI::AATools qw( + %ambiguous_forward + $aa_match +); + +our $DEFAULT_ID = 1; +our $DEFAULT_TYPE = 'id'; +our $DEFAULT_BOOTSTRAP = 1; + +# Helper constructor. Instantiates the object with an empty names arrayref and +# a fresh _TablePair for each of the _forward, _starts and _reverse tables +sub _new { + shift->SUPER::new( + { + names => [], + _forward => JCVI::Translator::_TablePair->new(), + _starts => JCVI::Translator::_TablePair->new(), + _reverse => JCVI::Translator::_TablePair->new() + } + ); +} + +=head1 CONSTRUCTORS + +=cut + +=head2 new + + my $table = JCVI::Translator::Table->new(); + my $table = JCVI::Translator::Table->new( $id ); + my $table = JCVI::Translator::Table->new( $id, \%params ); + +This method creates a translation table by loading a table string from the +internal list.
Pass an ID and the type of ID. By default, it will load the
translation table with id 1. The type of ID may be "id" or "name," which
correspond to the numeric id of the translation table or the long name of the
translation table. For instance, below are the headers for the first 3 table
strings.

    {
      name "Standard" ,
      name "SGC0" ,
      id 1 ,
      ...
    },
    {
      name "Vertebrate Mitochondrial" ,
      name "SGC1" ,
      id 2 ,
      ...
    },
    {
      name "Yeast Mitochondrial" ,
      name "SGC2" ,
      id 3 ,
      ...
    },
    ...

By default, the "Standard" translation table will be loaded. You may instantiate
this translation table by calling any of the following:

    my $t = JCVI::Translator::Table->new();
    my $t = JCVI::Translator::Table->new(1);
    my $t = JCVI::Translator::Table->new( 1, { type => 'id' } );
    my $t = JCVI::Translator::Table->new( 'Standard', { type => 'name' } );
    my $t = JCVI::Translator::Table->new( 'SGC0', { type => 'name' } );
    my $t = JCVI::Translator::Table->new( 'standard', { type => 'name' } );
    my $t = JCVI::Translator::Table->new( 'stan', { type => 'name' } );

For partial matches, this module will use the first matching translation
table.

    my $t = JCVI::Translator::Table->new( 'mitochondrial', { type => 'name' } );

This will use translation table with ID 2, "Vertebrate Mitochondrial," because
that is the first match (even though "Yeast Mitochondrial" would also match).

Returns undef (after logging an error) if no internal table matches.

=cut

sub new {
    TRACE('new called');

    my $class = shift;

    my ( $id, @p );

    # id has a default, but if supplied, must be a scalar
    ( $id, $p[0] ) = validate_pos(
        @_,
        { type => Params::Validate::SCALAR,  default => $DEFAULT_ID },
        { type => Params::Validate::HASHREF, default => {} }
    );

    # type must be either id or name
    my %p = validate(
        @p,
        {
            type => {
                default => $DEFAULT_TYPE,
                regex   => qr/id|name/
            }
        }
    );

    TRACE( uc( $p{type} ) . ': ' . $id );

    # Get the beginning DATA so that we can seek back to it
    my $start_pos = tell DATA;

    # Set up regular expression for searching.
    my $match = ( $p{type} eq 'id' ) ? qr/id $id\b/ : qr/name ".*$id.*"/i;

    # Go through every internal table until it matches on id or name. Records
    # are delimited by the closing brace of each table, so every read yields
    # exactly one table string.
    my $found = 0;
    local $/ = "}";
    local $_;
    while (<DATA>) {
        if ( $_ =~ $match ) {
            $found = 1;
            last;
        }
    }

    # Reset DATA so that the next call starts from the first table again
    seek DATA, $start_pos, 0;

    # Call custom with internal table. We don't want to bootstrap because the
    # internal tables are stored already bootstrapped.
    return $class->custom( \$_, { bootstrap => 0 } ) if ($found);

    # Internal table not matched.
    ERROR("Table with $p{type} of $id not found");
    return undef;
}

=head2 custom()

    my $table = JCVI::Translator::Table->custom( $table_ref );
    my $table = JCVI::Translator::Table->custom( $table_ref, \%params );

Create a translation table based off a passed table reference for custom
translation tables. Unless the bootstrap parameter is set to 0, the table is
bootstrapped, i.e. translations for degenerate nucleotides are derived (this
can take a little time). The format of the translation table should reflect
those of the internal tables:

    name "Names separated; by semicolons"
    name "May have multiple lines"
    id 99
    ncbieaa  "AMINOACIDS...",
    sncbieaa "-M--------..."
    -- Base1  AAAAAAAAAA...
    -- Base2  AAAACCCCGG...
    -- Base3  ACGTACTGAC...

This module is a bit more permissive than that; see the $TABLE_REGEX regular
expression to see the actual format.

Returns undef (after logging an error) if the table string cannot be parsed.

Examples:

    $table = JCVI::Translator::Table->custom(
        \'name "All Alanines; All the Time"
          id 9000
          ncbieaa  "AAAAAAAA"
          sncbieaa "----M---"
          base1     AAAAAAAA
          base2     AACCGGTT
          base3     ACACACAC'
    );

    $table = JCVI::Translator::Table->custom( \$table, { bootstrap => 0 } );

=cut

# Regular expression which should match translation tables and also extracts
# relevant information: names, id, residues, starts and the three base lines.
our $TABLE_REGEX = qr/
                        ( (?:name\s+".+?".*?) + )
                        id\s+(\d+).*
                        ncbieaa\s+"([a-z*]+)".*
                        sncbieaa\s+"([a-z-]+)".*
                        base1\s+([a-z]+).*
                        base2\s+([a-z]+).*
                        base3\s+([a-z]+).*
                     /isx;

sub custom {
    TRACE('custom called');

    my $class = shift;

    my ( $table_ref, @p );

    # table_ref is required and must be a reference to a scalar
    ( $table_ref, $p[0] ) = validate_pos(
        @_,
        { type => Params::Validate::SCALARREF },
        { type => Params::Validate::HASHREF, default => {} }
    );

    # get the bootstrap parameter
    my %p = validate(
        @p,
        {
            bootstrap => {
                default => $DEFAULT_BOOTSTRAP,
                regex   => qr/^[01]$/
            }
        }
    );

    # Match the table or return undef.
    unless ( $$table_ref =~ $TABLE_REGEX ) {
        ERROR( 'Translation table is in invalid format', $$table_ref );
        return undef;
    }

    # Store the data that has been stripped using descriptive names;
    my $names    = $1;
    my $id       = $2;
    my $residues = $3;
    my $starts   = $4;
    my $base1    = $5;
    my $base2    = $6;
    my $base3    = $7;

    my $self = $class->_new();

    $self->id($id);

    # Extract each name, massage, and push it onto names array
    while ( $names =~ /"(.+?)"/gis ) {
        my @names = split( /;/, $1 );
        local $_;
        foreach (@names) {
            s/^\s+//;
            s/\s+$//;
            s/\n/ /g;
            s/\s{2,}/ /g;
            push @{ $self->names }, $_ if $_;
        }
    }

    # Get all the table pairs so we don't have to keep using accessors
    my $forward_table = $self->_forward;
    my $starts_table  = $self->_starts;
    my $reverse_table = $self->_reverse;

    # Chop is used to efficiently get the last character from each string.
    # The loop ends when the residues string is exhausted (chop returns '').
    while ( my $residue = uc( chop $residues ) ) {
        my $start = uc( chop $starts );
        my $codon = uc( chop($base1) . chop($base2) . chop($base3) );

        my $reverse = ${ reverse_complement( \$codon ) };

        # If the residue is valid, store it
        if ( $residue ne 'X' ) {
            $forward_table->store( $residue, $codon, $reverse );
            $reverse_table->push( $residue, $codon, $reverse );
        }

        # If the start is valid, store it. Start codons are filed under '+'
        # in the reverse table.
        if ( ( $start ne '-' ) ) {
            $starts_table->store( $start, $codon, $reverse );
            $reverse_table->push( '+', $codon, $reverse );
        }
    }

    # Bootstrap the translation table
    $self->bootstrap() if ( $p{bootstrap} );

    return $self;
}

=head1 METHODS

=cut

=head2 add_translation

    $translator->add_translation( $codon, $residue );
    $translator->add_translation( $codon, $residue, \%params );

Add a codon-to-residue translation to the translation table. %params may
contain:

    strand - 1 (default) if $codon is given on the + strand, -1 if it is the
             reverse complement
    start  - 1 if this is a start codon translation, 0 (default) otherwise

Examples:

    # THESE AREN'T REAL!!!
    $translator->add_translation( 'ABA', 'G' );
    $translator->add_translation( 'ABA', 'M', { start => 1 } );

=cut

sub add_translation {
    TRACE('add_translation called');

    my $self = shift;

    my ( $codon, $residue, @p );

    ( $codon, $residue, $p[0] ) = validate_pos(
        @_,
        { regex => qr/^${nuc_match}{3}$/ },
        { regex => qr/^$aa_match$/ },
        { type  => Params::Validate::HASHREF, default => {} }
    );

    my %p = validate(
        @p,
        {
            strand => {
                default => 1,
                regex   => qr/^[+-]?1$/,
                type    => Params::Validate::SCALAR
            },
            start => {
                default => 0,
                regex   => qr/^[01]$/,
                type    => Params::Validate::SCALAR
            }
        }
    );

    # Work out which of the two sequences is the + strand codon
    my $codon_ref;
    my $rc_codon_ref;

    if ( $p{strand} == 1 ) {
        $codon_ref    = \$codon;
        $rc_codon_ref = reverse_complement( \$codon );
    }
    else {
        $rc_codon_ref = \$codon;
        $codon_ref    = reverse_complement( \$codon );
    }

    # Store residue in the starts or regular translation table.
    my $table = $p{start} ? '_starts' : '_forward';
    $table = $self->$table;

    $table->store( $residue, $$codon_ref, $$rc_codon_ref );

    # Store the reverse lookup. Start codons are filed under '+', the same key
    # custom() uses; everything else is filed under its own residue.
    my $reverse_key = $p{start} ? '+' : $residue;
    $self->_reverse->push( $reverse_key, $$codon_ref, $$rc_codon_ref );
}

=head2 bootstrap

    $translator->bootstrap();

Bootstrap the translation table. Find every possible translation, even those
that involve degenerate nucleotides or ambiguous amino acids.

=cut

sub bootstrap {
    TRACE('bootstrap called');

    my $self = shift;

    # Loop through every nucleotide combination and run _unroll on each, once
    # against the regular table and once against the starts table.
    foreach my $n1 (@nucs) {
        foreach my $n2 (@nucs) {
            foreach my $n3 (@nucs) {
                $self->_unroll( $n1 . $n2 . $n3, $self->_forward->[0] );
                $self->_unroll(
                    $n1 . $n2 . $n3,
                    $self->_starts->[0],
                    { start => 1 }
                );
            }
        }
    }
}

# This is the helper function for bootstrap. Handles codons with degenerate
# nucleotides: [RYMKWS] [BDHV] or N. Several codons may map to the same amino
# acid. If all possible codons for an ambiguity map to the same residue, store
# that residue.
#
# Arguments: the codon, the hash to look translations up in, and optionally the
# parameter hashref that is handed on to add_translation (e.g. { start => 1 }).
# Returns the residue, or undef if no consensus could be reached.

sub _unroll {
    my $self  = shift;
    my $codon = shift;
    my $table = shift;

    # Return the codon if we have it
    return $table->{$codon} if ( $table->{$codon} );

    # Check for base case: no degenerate nucleotides; we can't unroll further.
    unless ( $codon =~ /($degen_match)/ ) {
        return undef;
    }

    my $consensus;
    my $nuc = $1;

    # Replace the nucleotide with every possibility from degenerate map hash.
    foreach my $base ( @{ $degenerate_map{$nuc} } ) {
        my $new_codon = $codon;
        $new_codon =~ s/$nuc/$base/;

        # Recursively call this function
        my $residue = $self->_unroll( $new_codon, $table, @_ );

        # If the new_codon didn't come to a consensus, or if the translation
        # isn't defined for new_codon in a custom translation table, return
        # undef.
        return undef unless ( defined $residue );

        # If consensus isn't set, set it to the current residue.
        $consensus = $residue unless ($consensus);

        # This is an interesting step. If the residue isn't the same as the
        # consensus, check to see if they map to the same ambiguous amino acid.
        # If true, then change the consensus to that ambiguous acid and proceed.
        # Otherwise, return undef (consensus could not be reached).
        if ( $residue ne $consensus ) {
            if (
                   ( defined $ambiguous_forward{$residue} )
                && ( defined $ambiguous_forward{$consensus} )
                && ( $ambiguous_forward{$residue} eq
                    $ambiguous_forward{$consensus} )
              )
            {
                $consensus = $ambiguous_forward{$consensus};
            }
            else {
                return undef;
            }
        }
    }

    # If we got this far, it means that we have a valid consensus sequence for
    # a degenerate-nucleotide-containing codon. Cache and return results.
    DEBUG("New codon translation found: $codon => $consensus");
    $self->add_translation( $codon, $consensus, @_ );
    return $consensus;
}

=head2 string

    my $table_string_ref = $translator->string();
    my $table_string_ref = $translator->string( \%params );

Returns a reference to the table string. %params can specify whether or not
this table should try to bootstrap itself using the bootstrap function above.
By default, it will try to. For backwards compatibility a plain 0 or 1 is also
accepted in place of the hashref.

Examples:

    my $table_string_ref = $translator->string();
    my $table_string_ref = $translator->string( { bootstrap => 0 } );

=cut

sub string {
    TRACE('table_string called');

    my $self = shift;

    # Accept the documented hashref or the legacy positional 0/1 flag.
    # validate_pos must be called in list context: in scalar context it
    # returns an array reference, which is always true.
    my ($arg) = validate_pos(
        @_,
        {
            type => Params::Validate::SCALAR() | Params::Validate::HASHREF(),
            default => {}
        }
    );

    my @p = ( ref $arg ? $arg : { bootstrap => $arg } );
    my %p = validate(
        @p,
        {
            bootstrap => {
                default => $DEFAULT_BOOTSTRAP,
                regex   => qr/^[01]$/
            }
        }
    );

    # Bootstrap if necessary
    $self->bootstrap() if ( $p{bootstrap} );

    # Generate the names string
    my $names = join( '; ', @{ $self->names } );

    my ( $residues, $starts ) = ( '', '' );    # starts/residues string
    my @base = ('') x 3;    # this will store the base strings

    # Loop over all stored codons. Sort the codons in the translation table and
    # starts table, then use grep to get the unique ones with the help of $prev
    # which stores the previous value
    my $prev = '';
    foreach my $codon (
        grep ( ( $_ ne $prev ) && ( $prev = $_ ),
            sort { $a cmp $b } (
                keys( %{ $self->_forward->[0] } ),
                keys( %{ $self->_starts->[0] } )
            ) )
      )
    {
        $residues .= $self->_forward->[0]->{$codon} || 'X';
        $starts   .= $self->_starts->[0]->{$codon}  || '-';

        # Chop up the codon because the bases are stored on separate lines.
        # chop yields the last base first, hence the negative index.
        $base[ -$_ ] .= chop $codon foreach ( 1 .. 3 );
    }

    # Generate the string
    my $string = join( "\n",
        '{',
        qq(name "$names" ,),
        qq(id $self->{id} ,),
        qq(ncbieaa "$residues",),
        qq(sncbieaa "$starts"),
        map( {"-- Base$_ $base[$_ - 1]"} ( 1 .. 3 ) ),
        '}' );

    return \$string;
}

{

    # Pair of lookup hashes: index 0 is keyed for the + strand, index 1 for
    # the reverse complement.
    package JCVI::Translator::_TablePair;

    use strict;
    use warnings;

    use JCVI::DNATools qw(reverse_complement);

    sub new {
        my $class = shift;
        my $self  = [ {}, {} ];
        bless $self, $class;
    }

    # Map a codon (and its reverse complement) to a single residue.
    sub store {
        my ( $self, $residue, $codon, $reverse ) = @_;

        # reverse_complement takes and returns a scalar reference
        $reverse ||= ${ reverse_complement( \$codon ) };

        $self->[0]->{$codon}   = $residue;
        $self->[1]->{$reverse} = $residue;
    }

    # Append a codon (and its reverse complement) to the list kept for a
    # residue.
    sub push {
        my ( $self, $residue, $codon, $reverse ) = @_;

        # reverse_complement takes and returns a scalar reference
        $reverse ||= ${ reverse_complement( \$codon ) };

        $self->[0]->{$residue} ||= [];
        $self->[1]->{$residue} ||= [];

        # Qualify the builtin, since this package defines its own push
        CORE::push @{ $self->[0]->{$residue} }, $codon;
        CORE::push @{ $self->[1]->{$residue} }, $reverse;
    }

    1;
}

1;

=head1 MISC

These are the original translation tables. The translation tables used by this
module have been bootstrapped - they include translations for degenerate
nucleotides and allow ambiguous amino acids to be the targets of translation
(e.g. every effort has been made to give a translation that isn't "X").
+ + { + name "Standard" , + name "SGC0" , + id 1 , + ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "---M---------------M---------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Vertebrate Mitochondrial" , + name "SGC1" , + id 2 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG", + sncbieaa "--------------------------------MMMM---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Yeast Mitochondrial" , + name "SGC2" , + id 3 , + ncbieaa "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "----------------------------------MM----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Mold Mitochondrial; Protozoan Mitochondrial;" + name "Coelenterate Mitochondrial; Mycoplasma; Spiroplasma" , + name "SGC3" , + id 4 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "--MM---------------M------------MMMM---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Invertebrate Mitochondrial" , + name "SGC4" , + id 5 , + ncbieaa 
"FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG", + sncbieaa "---M----------------------------MMMM---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear" , + name "SGC5" , + id 6 , + ncbieaa "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "-----------------------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Echinoderm Mitochondrial; Flatworm Mitochondrial" , + name "SGC8" , + id 9 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", + sncbieaa "-----------------------------------M---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Euplotid Nuclear" , + name "SGC9" , + id 10 , + ncbieaa "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "-----------------------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Bacterial and Plant Plastid" , + id 11 , + ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "---M---------------M------------MMMM---------------M------------" 
+ -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Alternative Yeast Nuclear" , + id 12 , + ncbieaa "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "-------------------M---------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Ascidian Mitochondrial" , + id 13 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG", + sncbieaa "---M------------------------------MM---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Alternative Flatworm Mitochondrial" , + id 14 , + ncbieaa "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", + sncbieaa "-----------------------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Blepharisma Macronuclear" , + id 15 , + ncbieaa "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "-----------------------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , 
+ { + name "Chlorophycean Mitochondrial" , + id 16 , + ncbieaa "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "-----------------------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Trematode Mitochondrial" , + id 21 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG", + sncbieaa "-----------------------------------M---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Scenedesmus obliquus Mitochondrial" , + id 22 , + ncbieaa "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "-----------------------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Thraustochytrium Mitochondrial" , + id 23 , + ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "--------------------------------M--M---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } + +=head1 AUTHOR + +Kevin Galinsky, + +=head1 COPYRIGHT & LICENSE + +Copyright 2008-2009 J. Craig Venter Institute, all rights reserved. 
+ +This program is free software; you can redistribute it and/or modify it +under the same terms as Perl itself. + +=cut + +__DATA__ + +{ +name "Standard; SGC0" , +id 1 , +ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSS*CWCCLFLFLF*JXRRRJJXJJJJJZZZJXLLL", +sncbieaa "-----------------------------M-------------------------------------------M----------------------------------------------------------------------------------------------M-----M-----M---------M-M-" +-- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTHHMMMMMMMMMMMSSSWWYYY +-- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGTTTTTTRTTGGGTTTTTTTTAAATTTTT +-- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTYACGTRYAAGAGRACGTHMWYAGRAGAGR +} +{ +name "Vertebrate Mitochondrial; SGC1" , +id 2 , +ncbieaa "KNKNKNTTTTTTTTTTTTTTT*S*S*SMIMIXXXXXXMXXXIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSSWCWCWCLFLFLFJJJXZZZLLL", +sncbieaa "---------------------------MMMMMMMMMMMMMMM-----------------------------------------------------------------------------------------M---------------------------------------------------M------" +-- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTMMMRSSSYYY +-- Base2 
AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTAAATTT +-- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTRYACGTRYCTYGAGRAGR +} +{ +name "Yeast Mitochondrial; SGC2" , +id 3 , +ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSMIMIMIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRTTTTTTTTTTTTTTTEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSSWCWCWCLFLFLFRRRZZZ", +sncbieaa "---------------------------M-M-M-------------------------------------------------------------------------------------------------------------------------------------------------" +-- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTMMMSSS +-- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTGGGAAA +-- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTRYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTRYACGTRYAGRAGR +} +{ +name "Mold Mitochondrial; Protozoan Mitochondrial; Coelenterate Mitochondrial; Mycoplasma; Spiroplasma; SGC3" , +id 4 , +ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIXXIXIXXXXIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSSWCWCWCLFLFLFXXJXXRRRJJXJJJJJXXZZZXXJXXLLL", +sncbieaa "---------------------------MMMMMMMMMMMMMMM--------------------------------------M--------------------------------------------------M------------------------------------------M-M-M-MM-MM-----M-----MM---MMMMM-M-" +-- Base1 
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTBDHHKMMMMMMMMMMMNRSSSSVWWWYYY +-- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTTGGGTTTTTTTTTTAAATTTTTTTT +-- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTRYACGTRYGGAGGAGRACGTHMWYGGAGRGGAGRAGR +} +{ +name "Invertebrate Mitochondrial; SGC4" , +id 5 , +ncbieaa "KNKNKNTTTTTTTTTTTTTTTSSSSSSSSSSSSSSSMIMIXXXXXXMXXXIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSSWCWCWCLFLFLFXXJJJXZZZXLLL", +sncbieaa "------------------------------------MMMMMMMMMMMMMMM-----------------------------------------------------------------------------------------M--------------------------------------------M---MM---M---M---" +-- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTDKMMMRSSSWYYY +-- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTTTAAATTTT +-- Base3 ACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTRYACGTRYGGCTYGAGRGAGR +} +{ +name "Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear; SGC5" , +id 6 , +ncbieaa 
"KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBBQYQYQYSSSSSSSSSSSSSSS*CWCCLFLFLFZZZJZZZRRRJJJJJJJZZZJQQQLLL", +sncbieaa "-----------------------------M-------------------------------------------------------------------------------------------------------------------------------------------------------------------------" +-- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTBBBHKKKMMMMMMMMMMSSSWYYYYYY +-- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGTTTTTTAAATAAAGGGTTTTTTTAAATAAATTT +-- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTYACGTRYAGRAAGRAGRACTHMWYAGRAAGRAGR +} +{ +name "Echinoderm Mitochondrial; Flatworm Mitochondrial; SGC8" , +id 9 , +ncbieaa "NNKNNNNNTTTTTTTTTTTTTTTSSSSSSSSSSSSSSSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSSWCWCWCLFLFLFJJJJJJJJXZZZJLLL", +sncbieaa "----------------------------------------M----------------------------------------------------------------------------------------------M--------------------------------------------------------M-------" +-- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTHMMMMMMMRSSSWYYY +-- Base2 AAAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTTTTTTAAATTTT +-- Base3 
ACGTHMWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTRYACGTRYAACTHMWYGAGRAAGR +} +{ +name "Euplotid Nuclear; SGC9" , +id 10 , +ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSSCCWCCCCCLFLFLFJRRRJJJJJJJZZZJLLL", +sncbieaa "-----------------------------M-------------------------------------------------------------------------------------------------------------------------------------------------------------------" +-- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTHMMMMMMMMMMSSSWYYY +-- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGGGGTTTTTTTGGGTTTTTTTAAATTTT +-- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTHMWYACGTRYAAGRACTHMWYAGRAAGR +} +{ +name "Bacterial and Plant Plastid" , +id 11 , +ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIXXIXIXXXXIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSS*CWCCLFLFLF*XXJXXRRRJJXJJJJJXXZZZXXJXLLL", +sncbieaa "---------------------------MMMMMMMMMMMMMMM--------------------------------------M--------------------------------------------------M-------------------------------------------M----MM-MM-----M-----MM---MM-M-M-" +-- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTBDHHKMMMMMMMMMMMNRSSSSVWWYYY +-- Base2 
AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGTTTTTTRTTTTTGGGTTTTTTTTTTAAATTTTTTT +-- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTYACGTRYAGGAGGAGRACGTHMWYGGAGRGGAGAGR +} +{ +name "Alternative Yeast Nuclear" , +id 12 , +ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLSLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSS*CWCCLFLFLF*JRRRJJXJJJJJZZZJL", +sncbieaa "-----------------------------M-------------------------------------------M--------------------------------------------------------------------------------------------------M----------" +-- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTHMMMMMMMMMMMSSSWY +-- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGTTTTTTRTGGGTTTTTTTTAAATT +-- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTYACGTRYAAAGRACGTHMWYAGRAA +} +{ +name "Ascidian Mitochondrial" , +id 13 , +ncbieaa "KNKNKNTTTTTTTTTTTTTTTGSGSGSMIMIMIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSSWCWCWCLFLFLFXXJJJGGGXZZZXLLL", +sncbieaa "---------------------------M-M-M------------------------------------------------------------------------------------------M--------------------------------------------M---MM------M---M---" +-- Base1 
AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTDKMMMRRRRSSSWYYY +-- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTTGGGTAAATTTT +-- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTRYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTRYACGTRYGGCTYAGRGAGRGAGR +} +{ +name "Alternative Flatworm Mitochondrial" , +id 14 , +ncbieaa "NNKNNNNNTTTTTTTTTTTTTTTSSSSSSSSSSSSSSSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBBYY*YYYYYSSSSSSSSSSSSSSSWCWCWCLFLFLFJJJJJJJJZZZJLLL", +sncbieaa "----------------------------------------M----------------------------------------------------------------------------------------------------------------------------------------------------------------" +-- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTHMMMMMMMSSSWYYY +-- Base2 AAAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTTTTTAAATTTT +-- Base3 ACGTHMWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTHMWYACGTBDHKMNRSVWYACGTRYACGTRYAACTHMWYAGRAAGR +} +{ +name "Blepharisma Macronuclear" , +id 15 , +ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*YQYYSSSSSSSSSSSSSSS*CWCCLFLFLF*ZJZRRRJJJJJJJZZZJQLLL", +sncbieaa 
"-----------------------------M-------------------------------------------------------------------------------------------------------------------------------------------------------------------" +-- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTBHKMMMMMMMMMMSSSWYYYY +-- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAACCCCCCCCCCCCCCCGGGGGTTTTTTRATAGGGTTTTTTTAAATATTT +-- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTYACGTBDHKMNRSVWYACGTYACGTRYAGAGAGRACTHMWYAGRAGAGR +} +{ +name "Chlorophycean Mitochondrial" , +id 16 , +ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*YLYYSSSSSSSSSSSSSSS*CWCCLFLFLF*LJRRRJJJJJJJZZZJLLL", +sncbieaa "-----------------------------M-----------------------------------------------------------------------------------------------------------------------------------------------------------------" +-- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTHMMMMMMMMMMSSSWYYY +-- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAACCCCCCCCCCCCCCCGGGGGTTTTTTRWTGGGTTTTTTTAAATTTT +-- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTYACGTBDHKMNRSVWYACGTYACGTRYAGAAGRACTHMWYAGRAAGR +} +{ +name "Trematode Mitochondrial" , +id 21 , +ncbieaa 
"NNKNNNNNTTTTTTTTTTTTTTTSSSSSSSSSSSSSSSMIMIMIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSSWCWCWCLFLFLFJJJXZZZLLL", +sncbieaa "----------------------------------------M--------------------------------------------------------------------------------------------M---------------------------------------------------M------" +-- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTMMMRSSSYYY +-- Base2 AAAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTTAAATTT +-- Base3 ACGTHMWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTRYACGTRYCTYGAGRAGR +} +{ +name "Scenedesmus obliquus Mitochondrial" , +id 22 , +ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIIIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*YLYY*SSSSSSS*CWCCLFLFLF****LJRRRJJJJJJJZZZJLLL", +sncbieaa "-----------------------------M-------------------------------------------------------------------------------------------------------------------------------------------------------------" +-- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTHMMMMMMMMMMSSSWYYY +-- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAACCCCCCCCGGGGGTTTTTTMRSVWTGGGTTTTTTTAAATTTT +-- Base3 
ACGTRYACGTBDHKMNRSVWYACGTRYACGTHMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTYACGTBKSYACGTYACGTRYAAAAGAAGRACTHMWYAGRAAGR +} +{ +name "Thraustochytrium Mitochondrial" , +id 23 , +ncbieaa "KNKNKNTTTTTTTTTTTTTTTRSRSRSIIMIIXIIIQHQHQHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEDEDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGVVVVVVVVVVVVVVVBBB*Y*Y*YSSSSSSSSSSSSSSS*CWCC*FLFF****RRRJJJJJJJXZZZL", +sncbieaa "-----------------------------MM-M--------------------------------------------------------------------------------------------M------------------------------------------------------------M----" +-- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTMMMMMMMMMMRSSSY +-- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTMMMAAAAAACCCCCCCCCCCCCCCGGGGGTTTTTDKRWGGGTTTTTTTTAAAT +-- Base3 ACGTRYACGTBDHKMNRSVWYACGTRYACGTHKMWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTRYACGTBDHKMNRSVWYACGTBDHKMNRSVWYACGTBDHKMNRSVWYCTYACGTRYACGTBDHKMNRSVWYACGTYACGTYAAAAAGRACTHMWYGAGRG +} +{ +name "Strict Standard" , +ncbieaa "KNKKNNTTTTTTTTTTTTTTTRSRRSSIIMIIIIIQHQQHHPPPPPPPPPPPPPPPRRRRRRRRRRRRRRRLLLLLLLLLLLLLLLEDEEDDAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGBBBVVVVVVVVVVVVVVVJRRRJJJJJJJZZZ*Y**YYSSSSSSSSSSSSSSS*CWCC*LFLLFFJLLL", +sncbieaa "-----------------------------M-----------------------------------------------------------------------------------------------------------------------------------------------------------------" +-- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGHMMMMMMMMMMSSSTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTWYYY +-- Base2 
AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGMMMTTTTTTTTTTTTTTTTGGGTTTTTTTAAAAAAAAACCCCCCCCCCCCCCCGGGGGRTTTTTTTTTT +-- Base3 ACGRTYABCDGHKMNRSTVWYACGRTYACGHMTWYACGRTYABCDGHKMNRSTVWYABCDGHKMNRSTVWYABCDGHKMNRSTVWYACGRTYABCDGHKMNRSTVWYABCDGHKMNRSTVWYCTYABCDGHKMNRSTVWYAAGRACHMTWYAGRACGRTYABCDGHKMNRSTVWYACGTYAACGRTYAAGR +} diff --git a/lib/JCVI/Translator/Utils.pm b/lib/JCVI/Translator/Utils.pm index de8f897..c3931ba 100644 --- a/lib/JCVI/Translator/Utils.pm +++ b/lib/JCVI/Translator/Utils.pm @@ -49,21 +49,16 @@ use JCVI::AATools qw( $aa_match ); our $DEFAULT_STRAND = 0; our $DEFAULT_SANITIZED = 0; -=head1 METHODS - -=cut - sub _new { - my $self = shift->SUPER::_new(); - - $self->_regexes( [] ); - foreach my $rc ( 0 .. 1 ) { - $self->_regexes->[$rc] = {}; - } - + my $self = shift->SUPER::_new(@_); + $self->_regexes([ {}, {} ]); return $self; } +=head1 METHODS + +=cut + =head2 codons my $codon_array = $translator->codons( $residue); @@ -85,6 +80,7 @@ sub codons { { type => Params::Validate::HASHREF, default => {} } ); + # Make sure strand is 1 or -1 my %p = validate( @p, { @@ -95,15 +91,27 @@ sub codons { } ); - if ( $residue eq 'lower' ) { $residue = $p{strand} == 1 ? 'start' : '*' } - elsif ( $residue eq 'upper' ) { - $residue = $p{strand} == -1 ? 'start' : '*'; - } - elsif ( $residue eq 'start' ) { $residue = 'start' } + # Set the reverse complement variable + my $rc = $p{strand} == 1 ? 0 : 1; + + # Lower bound is "*" on the - strand, "start" on the + strand + if ( $residue eq 'lower' ) { $residue = $rc ? '*' : 'start' } + + # Upper bound is "start" on the - strand, or "*" on the + strand + elsif ( $residue eq 'upper' ) { $residue = $rc ? 'start' : '*' } + + # Do nothing if residue is "start" (don't want to capitalize) + elsif ( $residue eq 'start' ) { } + + # Capitalize all other residues else { $residue = uc $residue } - return [ - @{ $self->_reverse->[ $p{strand} == 1 ? 
0 : 1 ]->{$residue} ||= [] } ]; + # Get the codons array or set it to the empty array + my $codons = $self->table->_reverse->[$rc]->{$residue} || []; + + # Return a copy of the arrayref so that the internal array can't get + # modified + return [@$codons]; } =head2 regex @@ -132,7 +140,7 @@ sub regex { ( $residue, $p[0] ) = validate_pos( @_, { regex => qr/^(?:$aa_match|start|lower|upper)$/ }, - { type => Params::Validate::HASHREF, default => {} } + { type => Params::Validate::HASHREF, default => {} } ); my %p = validate( @@ -327,7 +335,7 @@ sub getORF { # Initialize lower bounds and regular expression for stop my @lowers = map { $_ + $p{lower} } ( 0 .. 2 ); - my $stop_regex = $self->regex( '*', $strand ); + my $stop_regex = $self->regex( '*', { strand => $strand } ); # Look for all the stops in our sequence using a regular expression. A # lookahead is used to cope with the possibility of overlapping stop @@ -485,8 +493,8 @@ sub getCDS { ); foreach my $strand ( $p{strand} == 0 ? ( -1, 1 ) : $p{strand} ) { - my $lower_regex = $self->regex( 'lower', $strand ); - my $upper_regex = $self->regex( 'upper', $strand ); + my $lower_regex = $self->regex( 'lower', { strand => $strand } ); + my $upper_regex = $self->regex( 'upper', { strand => $strand } ); # Initialize lowers. On the + strand, we don't set the lower bounds # unless strict is 0. On the - strand, we don't set the lower bounds if @@ -653,7 +661,7 @@ sub nonstop { my @frames; foreach my $strand ( $p{strand} == 0 ? ( 1, -1 ) : $p{strand} ) { - my $stop = $self->regex( '*', $strand ); + my $stop = $self->regex( '*', { strand => $strand } ); foreach my $frame ( 0 .. 
2 ) { my $regex = diff --git a/t/00-load.t b/t/00-load.t index ecfb6c0..3dd6960 100644 --- a/t/00-load.t +++ b/t/00-load.t @@ -3,8 +3,10 @@ use Test::More tests => 2; BEGIN { - use_ok( 'JCVI::Translator' ); - use_ok( 'JCVI::Translator::Utils' ); + use_ok('JCVI::Translator'); + use_ok('JCVI::Translator::Utils'); + use_ok('JCVI::Translator::Base'); + use_ok('JCVI::Translator::Table'); } -diag( "Testing JCVI::Translator $JCVI::Translator::VERSION, Perl $], $^X" ); \ No newline at end of file +diag("Testing JCVI::Translator $JCVI::Translator::VERSION, Perl $], $^X"); diff --git a/t/02-translate_exons.t b/t/02-translate_exons.t index d2ed94e..6e3b386 100644 --- a/t/02-translate_exons.t +++ b/t/02-translate_exons.t @@ -62,7 +62,7 @@ is( ) }, 'FWAHEEQEAHSGREWHA*YQ', - 'Translate frame -1 with break' + 'Translate frame -1 with small breaks' ); # Gaps diff --git a/t/03-translate6.t b/t/03-translate6.t index 5ccd911..3abd27e 100644 --- a/t/03-translate6.t +++ b/t/03-translate6.t @@ -7,4 +7,17 @@ use JCVI::Translator; my $translator = new JCVI::Translator; -ok( $translator->translate6( randomDNA() ), 'translate6 ran' ); +my $dna = randomDNA(); +my $peptide = $translator->translate6($dna); + +ok( $peptide, 'translate6 returned something' ); + +foreach my $strand ( 1, -1 ) { + foreach my $offset ( 0 .. 2 ) { + my $reference = + $translator->translate( $dna, + { strand => $strand, offset => $offset } ); + is( $peptide->[ $offset + ( ( $strand == 1 ? 0 : 1 ) * 3 ) ], + $$reference, 'result of translate6 matches translate' ); + } +} diff --git a/t/04-custom.t b/t/04-custom.t index bfccef0..d662007 100644 --- a/t/04-custom.t +++ b/t/04-custom.t @@ -23,5 +23,5 @@ sncbieaa "---M---------------M---------------M----------------------------" ); ok( - $translator->table_string(), 'Table string' + $translator->table->string (), 'Table string' ); \ No newline at end of file