diff --git a/Changes b/Changes index ce4c342..b11633a 100644 --- a/Changes +++ b/Changes @@ -1,5 +1,3 @@ Revision history for JCVI-Translator -0.01 Date/time - First version, released on an unsuspecting world. - +0.4.0 Major refactor. \ No newline at end of file diff --git a/MANIFEST b/MANIFEST index 8a1749d..1922701 100644 --- a/MANIFEST +++ b/MANIFEST @@ -4,6 +4,7 @@ META.yml # Will be created by "make dist" Makefile.PL README lib/JCVI/Translator.pm +lib/JCVI/Translator/Utils.pm t/00-load.t t/boilerplate.t t/pod-coverage.t diff --git a/README b/README index bf462f9..816bbaa 100644 --- a/README +++ b/README @@ -1,15 +1,8 @@ -JCVI-Translator - -The README is used to introduce the module and provide instructions on -how to install the module, any machine dependencies it may have (for -example C compilers and installed libraries) and any other information -that should be provided before the module is installed. - -A README file is required for CPAN modules since CPAN extracts the README -file from a module distribution so that people browsing the archive -can use it get an idea of the modules uses. It is usually a good idea -to provide version information here so that people can decide whether -fixes for the module are worth downloading. +JCVI::Translator tries to be a robust translator object featuring translation +tables based off the the ones provided by NCBI +(http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi). +Key features include the ability to handle degenerate nucleotides and to +translate to ambiguous amino acids. INSTALLATION diff --git a/lib/JCVI/Translator.pm b/lib/JCVI/Translator.pm index ac018d4..474950a 100644 --- a/lib/JCVI/Translator.pm +++ b/lib/JCVI/Translator.pm @@ -1,4 +1,4 @@ -# Translator +# JCVI::Translator # # $Author$ # $Date$ @@ -7,62 +7,59 @@ =head1 NAME -JCVI::Translator - JCVI Translator object +JCVI::Translator - Translate DNA sequences -=head1 SYNOPSES +=head1 SYNOPSIS - use JCVI::Translator; +use JCVI::Translator; - my $translator = new Translator( - id => $id, - name => $name, - tableRef => $tableRef - ); + my $translator = new JCVI::Translator(); + my $translator = new JCVI::Translator(11); + my $translator = new JCVI::Translator( 12, 'id' ); + my $translator = new JCVI::Translator( 'Yeast Mitochondrial', 'name' ); + my $translator = new JCVI::Translator( 'mito', 'name' ); + + my $translator = custom JCVI::Translator( \$custom_table ); + my $translator = custom JCVI::Translator( \$custom_table, 1 ); + + $translator->translate( \$seq ); + $translator->translate( \$seq, { strand => 1 } ); + $translator->translate( \$seq, { strand => -1 } ); =head1 DESCRIPTION -JCVI::Translator tries to be a robust translator object -featuring translation tables based off the the ones provided -by NCBI +JCVI::Translator tries to be a robust translator object featuring translation +tables based off the the ones provided by NCBI (http://www.ncbi.nlm.nih.gov/Taxonomy/Utils/wprintgc.cgi). -Key features include the ability to handle degenerate -nucleotides and to translate to ambiguous amino acids. +Key features include the ability to handle degenerate nucleotides and to +translate to ambiguous amino acids. + +The way to work with JCVI::Translator is you create a new translator using an +internal translation table or a provided one, this module will translate DNA +sequences for you. -The way to work with JCVI::Translator is you create a new -translator using an internal translation table or a provided -one, and from there you can perform translations and other -functions requiring knowledge of the translation table. +Translator uses interbase coordinates. See below for the difference between +interbase coordinates and traditional numbering methods: -Translator uses interbase numbering. See below for the -difference between interbase numbering and traditional -numbering methods: + Traditional 1 2 3 4 + A C G T ... + Interbase 0 1 2 3 4 - Traditional 1 2 3 4 - A C G T ... - Interbase 0 1 2 3 4 +Conversion methods between the two methods can depend upon what you are trying +to do, but the simple way to do this is: -Conversion methods between the two methods can depend upon -what you are trying to do, but in general, just add 1 to the -start base for interbase numbering to get the boundaries -for traditional numbering (i.e. 0-4 in interbase numbering -corresponds to bases 1-4). + strand = 3' end <=> 5' end + lower = min( 5' end, 3' end ) - 1 + lower = max( 5' end, 3' end ) For logging, it uses Log::Log4Perl. This needs to be -initialized to work. See http://log4perl.sourceforge.net/. +initialized to work. For parameter validation, uses Params::Validate. This introduces a bit of overhead, however, for scripts that are fully tested, validation can be disabled. See the Params::Validate documentation. -=head1 AUTHOR - -Kevin Galinsky, - -=head1 FUNCTIONS - -=over - =cut package JCVI::Translator; @@ -70,157 +67,117 @@ package JCVI::Translator; use strict; use warnings; -our $VERSION = '0.3.1'; - -use Log::Log4perl qw(:easy); -use Params::Validate qw(:all); +use version; +our $VERSION = qv('0.4.0'); -use JCVI::DNATools qw(%degenerateMap - $degenMatch - $nucs - $nucMatch - cleanDNA - reverseComplement); +use base qw(Class::Accessor::Fast); +__PACKAGE__->mk_accessors(qw(id names _table _starts _reverse)); -use JCVI::AATools qw(%ambiguousForward); - -my $DEFAULT_ID = 1; - -=item new() - -=item $translator = new Translator(%params); - -Creates a new translator using the given parameters. If no -parameters are given, the standard translation table will be -used. The parameters are: - - id - id of an internal translation table - name - name or part of the name of an internal table - tableRef - a reference to a table string - complete - does the table string contain all degenerate nucs? - -table takes precedence, and the format of the table string should -reflect that of the internal tables. - - name "(\w+(; )?)+" - name "(\w+(; )?)+" - id \d+ - ncbieaa "\w+" - sncbieaa "[M-]+" - base1 [ACGT]+ - base2 [ACGT]+ - base3 [ACGT]+ +use Log::Log4perl qw(:easy); +use Params::Validate; + +use JCVI::DNATools qw( + %degenerate_map + $degen_match + @nucs + $nuc_match + cleanDNA + reverse_complement +); + +use JCVI::AATools qw( + %ambiguous_forward + $aa_match +); + +# Defaults. Used by the validation functions. +our $DEFAULT_ID = 1; +our $DEFAULT_TYPE = 'id'; +our $DEFAULT_COMPLETE = 0; +our $DEFAULT_BOOTSTRAP = 1; +our $DEFAULT_STRAND = 1; +our $DEFAULT_PARTIAL = 0; +our $DEFAULT_SANITIZED = 0; + +=head1 CONSTRUCTORS -Examples: +=cut - $translator = new Translator(); # Default translator - $translator = new Translator('id' => 4); - $translator = new Translator('name' => 'mitochondrial'); - $translator = new Translator('table' => - 'name "All Alanines; All the Time" - id 9000 - ncbieaa "AAAAAAAA" - sncbieaa "----M---" - base1 AAAAAAAA - base2 AACCGGTT - base3 ACACACAC' - ); +=head2 new + + my $translator = new JCVI::Translator(); + my $translator = new JCVI::Translator( $id ); + my $translator = new JCVI::Translator( $id, $type ); + +This method creates a translator by loading a translation table from the +internal list. Pass an ID and the type of ID. By default, it will load the +tranlation table with id 1. The type of ID may be "id" or "name," which +correspond to the numeric id of the translation table or the long name of the +translation table. For instance, below are the headers for the first 3 +translation tables. + + { + name "Standard" , + name "SGC0" , + id 1 , + ... + }, + { + name "Vertebrate Mitochondrial" , + name "SGC1" , + id 2 , + ... + }, + { + name "Yeast Mitochondrial" , + name "SGC2" , + id 3 , + ... + }, + ... + +By default, the "Standard" translation table will be loaded. You may create a +translator with this table by calling any the following: + + my $t = new JCVI::Translator(); # default table + my $t = new JCVI::Translator(1); # explicitly set id + my $t = new JCVI::Translator( 1, 'id' ); # set id and type + my $t = new JCVI::Translator( 'Standard', 'name' ); # set name + my $t = new JCVI::Translator( 'SGC0', 'name' ); # alternate name + my $t = new JCVI::Translator( 'standard', 'name' ); # not case-sensitive + my $t = new JCVI::Translator( 'stan', 'name' ); # partial match ok + +For partial matches, JCVI::Translator will use the first matching translation +table. + + my $t = new JCVI::Translator( 'mitochondrial', 'name' ); + +This will use translation table with ID 2, "Vertebrate Mitochondrial," because +that is the first match. =cut sub new { - my $type = shift; - - my %params = validate( @_, - { id => { default => $DEFAULT_ID, - regex => qr/^\d+$/ - }, - name => { optional => 1, - type => SCALAR - }, - tableRef => { optional => 1, - type => SCALARREF - }, - complete => { default => 0, - regex => qr/^[01]$/ - } - } - ); - - my $self = { seqRef => undef }; - bless $self, $type; - - DEBUG("New $type"); - - TRACE("Params:"); - map { TRACE("$_ => $params{$_}") } keys %params; - - my $error; - - ######################################## - # If a tableref is passed, go directly - # to loadTable. Else, go to - # loadInternalTable which will then call - # loadTable. - - ######################################## - # Maybe this should be split into two - # constructors? - - if ( $params{tableRef} ) { - DEBUG('Loading custom table'); - $error = $self->_loadTable( @params{qw(tableRef complete)} ); - } - else { - DEBUG('Loading internal table'); - $error = $self->_loadInternalTable( @params{qw(id name)} ); - } - - if ($error) { - ERROR("Error: $error"); - return $error; - } - else { - DEBUG("New $type successful"); - return $self; - } -} - -=item _loadInternalTable() + TRACE('new called'); -=item $err = $translator->_loadInternalTable($id, $name); + my $class = shift; -This method loads a table from the internal list. Gets -called from "new" if no table string is provided. Passed -either $id or $name. Not recommend to pass both, but if both -are passed, then whichever matches first is used. - -=cut - -sub _loadInternalTable { - my $self = shift; - - my ( $id, $name ) = validate_pos( @_, { regex => qr/^\d+$/ }, 0 ); + my ( $id, $type ) = validate_pos( + @_, + { default => $DEFAULT_ID }, + { default => $DEFAULT_TYPE, regex => qr/id|name/ } + ); - ######################################## - # Set up regular expression match for - # searching. + TRACE( uc($type) . ': ' . $id ); - my $match = $name ? qr/name ".*$name.*"/i : qr/id $id/i; + # Get the beginning DATA so that we can seek back to it + my $start_pos = tell DATA; - DEBUG("_loadInternalTable called"); - TRACE("ID $id") if ( defined $id ); - TRACE("Name $name") if ( defined $name ); + # Set up regular expression for searching. + my $match = ( $type eq 'id' ) ? qr/id $id\b/ : qr/name ".*$id.*"/i; + # Go through every internal table until it matches on id or name. my $found = 0; - - # Get the beginning DATA input - my $startPos = tell DATA; - - ######################################## - # Go through every internal table until - # it matches on id or name. - local $/ = "}"; local $_; while () { @@ -230,243 +187,380 @@ sub _loadInternalTable { } } - # Reset DATA input - seek DATA, $startPos, 0; + # Reset DATA + seek DATA, $start_pos, 0; + + # Call custom with internal table. Complete is set to 1. + return $class->custom( \$_, 1 ) if ($found); - ######################################## - # Call loadTable with internal table. - # Complete is set to 1. - return $found - ? $self->_loadTable( \$_, 1 ) - : 'Translation table not found'; + # Internal table not matched. + ERROR("Table with $type of $id not found"); + return undef; } -=item _loadTable() +=head2 custom() + + my $translator = $translator->custom( $table_ref ); + my $translator = $translator->custom( $table_ref, $complete ); + +Create a translator table based off a passed table reference for custom +translation tables. Loads degenerate nucleotides if $complete isn't set (this +can take a little time). The format of the translation table should reflect +those of the internal tables: + + name "Names separated; by semicolons" + name "May have multiple lines" + id 99 + ncbieaa "AMINOACIDS...", + sncbieaa "-M--------..." + -- Base1 AAAAAAAAAA... + -- Base2 AAAACCCCGG... + -- Base3 ACGTACTGAC... -=item $err = $translator->loadTable($tableRef, $complete); +JCVI::Translator is a bit more permissive, see the $TABLE_REGEX regular +expression to see that actual format. + +Examples: + + $translator = new Translator( + table_ref => \'name "All Alanines; All the Time" + id 9000 + ncbieaa "AAAAAAAA" + sncbieaa "----M---" + base1 AAAAAAAA + base2 AACCGGTT + base3 ACACACAC' + ); -Loads a table based off a passed table reference for custom -translation tables. Gets called from "new" if a table string -is provided. Loads degenerate nucs if $complete isn't set. + $translator = new Translator( + table_ref => \$table, + complete => 1 + ); =cut -sub _loadTable { - my $self = shift; +# Regular expression which should match translation tables and also extracts +# relevant information. +our $TABLE_REGEX = qr/ + ( (?:name\s+".+?".*?) + ) + id\s+(\d+).* + ncbieaa\s+"([a-z*]+)".* + sncbieaa\s+"([a-z-]+)".* + base1\s+([a-z]+).* + base2\s+([a-z]+).* + base3\s+([a-z]+).* + /isx; - my ( $tableRef, $complete ) - = validate_pos( @_, - { type => SCALARREF }, - { default => 0, - regex => qr/^[01]$/ - } - ); +sub custom { + TRACE('custom called'); + + my $class = shift; + + my ( $table_ref, $complete ) = validate_pos( + @_, + { type => Params::Validate::SCALARREF }, + { + default => $DEFAULT_COMPLETE, + regex => qr/^[01]$/ + } + ); + + # Match the table or return undef. + unless ( $$table_ref =~ $TABLE_REGEX ) { + ERROR( 'Translation table is in invalid format', $$table_ref ); + return undef; + } + + # Store the data that has been stripped using descriptive names; + my $names = $1; + my $id = $2; + my $residues = $3; + my $starts = $4; + my $base1 = $5; + my $base2 = $6; + my $base3 = $7; - DEBUG("_loadTable called"); + my $self = $class->_new(); - ( $$self{info}{id} ) = $$tableRef =~ /id\s+(\d+)/i; + $self->id($id); - ######################################## - # Extract each name, massage, and push - # it onto names array - while ( $$tableRef =~ /name\s+"(.+?)"/gis ) { + # Extract each name, massage, and push it onto names array + while ( $names =~ /"(.+?)"/gis ) { my @names = split( /;/, $1 ); + local $_; foreach (@names) { s/^\s+//; s/\s+$//; s/\n/ /g; s/\s{2,}/ /g; - push @{ $$self{info}{names} }, $_ if $_; + push @{ $self->names }, $_ if $_; } } - ######################################## - # Pull each string to be used for - # translation table generation. + # Store all the hashes in $self so we don't have to keep using accessors + my $forward_hash = $self->_table->[0]; + my $rc_forward_hash = $self->_table->[1]; - my ($residues) = $$tableRef =~ /ncbieaa.+?([a-z*]+)/i; - my ($starts) = $$tableRef =~ /sncbieaa.+?([a-z-]+)/i; - my ($base1) = $$tableRef =~ /base1.+?([a-z]+)/i; - my ($base2) = $$tableRef =~ /base2.+?([a-z]+)/i; - my ($base3) = $$tableRef =~ /base3.+?([a-z]+)/i; + my $starts_hash = $self->_starts->[0]; + my $rc_starts_hash = $self->_starts->[1]; - ######################################## - # Chop is used to efficiently get the - # last character from each string; like - # pop, but for strings. + my $reverse_hash = $self->_reverse->[0]; + my $rc_reverse_hash = $self->_reverse->[1]; + # Chop is used to efficiently get the last character from each string while ( my $residue = uc( chop $residues ) ) { my $start = uc( chop $starts ); my $codon = uc( chop($base1) . chop($base2) . chop($base3) ); - my $rc_codon_ref = reverseComplement( \$codon ); + my $rc_codon_ref = reverse_complement( \$codon ); + # If the residue is valid, store it if ( $residue ne 'X' ) { - $$self{forward}{$codon} = $residue; - $$self{rc_forward}{$$rc_codon_ref} = $residue; + $forward_hash->{$codon} = $residue; + $rc_forward_hash->{$$rc_codon_ref} = $residue; + + push @{ $reverse_hash->{$residue} }, $codon; + push @{ $rc_reverse_hash->{$residue} }, $$rc_codon_ref; } + + # If the start is valid, store it if ( ( $start ne '-' ) ) { - $$self{starts}{$codon} = $start; - $$self{rc_starts}{$$rc_codon_ref} = $start; - } + $starts_hash->{$codon} = $start; + $rc_starts_hash->{$$rc_codon_ref} = $start; - push @{ $$self{reverse}{$residue} }, $codon; - push @{ $$self{rc_reverse}{$residue} }, $$rc_codon_ref; + push @{ $reverse_hash->{start} }, $codon; + push @{ $rc_reverse_hash->{start} }, $$rc_codon_ref; + } } - ######################################## - # Use the printTable command to fill in - # the gaps in the translation table - # unless the table has been marked as - # complete. + # Unroll the translation table unless it has been marked complete + $self->bootstrap() unless ($complete); - $self->printTable() unless $complete; + return $self; +} - return $$self{forward} ? 0 : 'Translation table could not be loaded'; +# Helper constructor. Instantiates the object with arrayrefs and hashrefs in +# the right places +sub _new { + my $class = shift; + my $self = $class->SUPER::new( + { + names => [], + _table => [], + _starts => [], + _reverse => [] + } + ); + + foreach my $func (qw( _table _starts _reverse )) { + foreach my $rc ( 0 .. 1 ) { + $self->$func->[$rc] = {}; + } + } + + return $self; } -=item loadSequence +=head1 METHODS + +=cut + +=head2 add_translation + + $translator->add_translation( $codon, $residue ); + $translator->add_translation( $codon, $residue, \%params ); + +Add a codon-to-residue translation to the translation table. $start inidicates +if this is a start codon. -=item $translator->loadSequence(\$seqRef); +Examples: -Translator can cache sequences internally to be translated -from later. This can save time when translating from the -same sequence multiple times, but not all the ranges are -available immediately. + # THESE AREN'T REAL!!! + $translator->add_translation( 'ABA', 'G' ); + $translator->add_translation( 'ABA', 'M', 1 ); =cut -sub loadSequence { +sub add_translation { + TRACE('add_translation called'); + my $self = shift; - my ( $seqRef, $sanitized ) = validate_pos( + my ( $codon, $residue, @p ); + + ( $codon, $residue, $p[0] ) = validate_pos( @_, - { type => SCALARREF, - callback => { - 'Sequence contains invalid nucleotides' => - sub { ${ $_[0] } !~ /[^$nucMatch]/ } - } - }, - 0 + { regex => qr/^${nuc_match}{3}$/ }, + { regex => qr/^$aa_match$/ }, + { type => Params::Validate::HASHREF, default => {} } + ); - cleanDNA($seqRef) unless ($sanitized); + my %p = validate( + @p, + { + strand => { + default => 1, + regex => qr/^[+-]?1$/, + type => Params::Validate::SCALAR + }, + start => { + default => 0, + regex => qr/^[01]$/, + type => Params::Validate::SCALAR + } + } + ); + + my $codon_ref; + my $rc_codon_ref; + + if ( $p{strand} == 1 ) { + $codon_ref = \$codon; + $rc_codon_ref = reverse_complement( \$codon ); + } + else { + $rc_codon_ref = \$codon; + $codon_ref = reverse_complement( \$codon ); + } - DEBUG('loadSequence called'); - TRACE( 'Sequence starts with ' . substr $$seqRef, 0, 5 ); - TRACE( 'Sequence length is ' . length $$seqRef ); + # Store residue in the starts or regular translation table. + my $table = $p{start} ? '_starts' : '_table'; + $self->$table->[0]->{$$codon_ref} = $residue; + $self->$table->[1]->{$$rc_codon_ref} = $residue; - $$self{seqRef} = $seqRef; + # Store the reverse lookup + $residue = 'start' if ( $p{start} ); + push @{ $self->_reverse->[0]->{$residue} }, $$codon_ref; + push @{ $self->_reverse->[1]->{$residue} }, $$rc_codon_ref; } -=item clearSequence +=head2 bootstrap -=item $translator->clearSequence(\$seqRef); + $translator->bootstrap(); -Clear cached sequences. +Bootstrap the translation table. Find every possible translation, even those +that involve degenerate nucleotides or ambiguous amino acids. =cut -sub clearSequence { - my $self = shift; +sub bootstrap { + TRACE('bootstrap called'); - DEBUG('clearSequence called'); + my $self = shift; - undef $$self{seqRef}; + # Loop through every nucleotide combination and run _translate_codon on + # each. + foreach my $n1 (@nucs) { + foreach my $n2 (@nucs) { + foreach my $n3 (@nucs) { + $self->_translate_codon( $n1 . $n2 . $n3, $self->_table->[0] ); + $self->_translate_codon( + $n1 . $n2 . $n3, + $self->_starts->[0], + { start => 1 } + ); + } + } + } } -=item printTable() +=head2 table_string + + my $table_string_ref = $translator->_table_string(); + my $table_string_ref = $translator->_table_string( $bootstrap ); -=item $tableString = $translator->printTable(); +Returns the table string. $bootstrap specifies whether or not this table should +try to bootstrap itself using the bootstrap function above. By default, it is +1. -Returns the complete, absolute version of the table string. -Unrolls all degenerates and everything. Due to the caching -nature of translateCodon, this routine will also store all -possibilities for translation. +Examples: + + my $table_string_ref = $translator->_table_string(); + my $table_string_ref = $translator->_table_string(0); # To not bootstrap =cut -sub printTable { +sub table_string { + TRACE('table_string called'); + my $self = shift; - DEBUG('printTable called'); - - my $names = join( '; ', @{ $$self{info}{names} } ); - my ( $residues, $starts, @base, @b ); - - my @NUCS = split '', $nucs; - - ######################################## - # Rigorous loop unrolls the nucleotide - # table. This causes the subroutine to - # take a while to run. - - foreach (@NUCS) { - $b[0] = $_; - foreach (@NUCS) { - $b[1] = $_; - foreach (@NUCS) { - $b[2] = $_; - my $aa = $self->translateCodon( join( '', @b ), 1 ); - my $st = $self->translateCodon( join( '', @b ), 1, 1 ); - - unless ( ( $aa eq 'X' ) - && ( $st eq '-' ) ) - { - $residues .= $aa; - $starts .= $st; - $base[$_] .= $b[$_] foreach ( 0 .. 2 ); - } - } - } + my $bootstrap = + validate_pos( @_, + { default => $DEFAULT_BOOTSTRAP, regex => qr/^[01]$/ } ); + + # Bootstrap if necessary + $self->bootstrap() if ($bootstrap); + + # Generate the names string + my $names = join( '; ', @{ $self->names } ); + + my ( $residues, $starts ); # starts/residues string + my @base = (undef) x 3; # this will store the bases for the loop + + # Loop over all stored codons. Sort the codons in the translation table and + # starts table, then use grep to get the unique ones with the help of $prev + # which stores the previous value + my $prev = ''; + foreach my $codon ( + grep ( ( $_ ne $prev ) && ( $prev = $_ ), + sort { $a cmp $b } ( + keys( %{ $self->_table->[0] } ), + keys( %{ $self->_starts->[0] } ) + ) ) + ) + { + $residues .= $self->_table->[0]->{$codon} || 'X'; + $starts .= $self->_starts->[0]->{$codon} || '-'; + + # Chop up the codon because the bases are stored on separate lines + $base[ -$_ ] .= chop $codon foreach ( 1 .. 3 ); } - return - join( "\n", - '{', - qq(name "$names" ,), - qq(id $$self{info}{id} ,), - qq(ncbieaa "$residues",), - qq(sncbieaa "$starts"), - map( {"-- Base$_ $base[$_ - 1]"} ( 1 .. 3 ) ), - '}' ); + # Generate the string + my $string = join( "\n", + '{', + qq(name "$names" ,), + qq(id $self->{id} ,), + qq(ncbieaa "$residues",), + qq(sncbieaa "$starts"), + map( {"-- Base$_ $base[$_ - 1]"} ( 1 .. 3 ) ), + '}' ); + + return \$string; } -=item translate() +=head2 translate -=item $pepRef = $translator->translate(%params); + $pep_ref = $translator->translate( $seq_ref, \%params ); -The basic function of this module. Translate the specified -region of the sequence and return a reference to the -translated string. The parameters are: +The basic function of this module. Translate the specified region of the +sequence (passed as $seq_ref) and return a reference to the translated string. +The parameters are: - strand - 1 or -1; mandatory - lower - integer between 0 and length; optional - defaults to 0 - upper - integer between 0 and length; optional - defaults to length - seqRef - reference to a string; optional - sequence - string; optional - partial - 0 or 1; optional + strand: 1 or -1; default = 1 + lower: integer between 0 and length; default = 0 + upper: integer between 0 and length; default = length + partial: 0 or 1; default = 0 + sanitized: 0 or 1; default = 0 -Translator uses interbase coordinates. lower and upper are -optional parameters such that: +Translator uses interbase coordinates. lower and upper are optional parameters +such that: - 0 <= lower <= upper <= length + 0 <= lower <= upper <= length -Translator will log and die if those conditions are not -satisfied. strand is the only mandatory parameter. sequence -and seqRef are both optional. If both are provided, sequence -takes priority. If neither is provided, then Translator will -use a previously loaded sequence. If no sequence has been -loaded, translator will log and die. +Translator will log and die if those conditions are not satisfied. -Partial sets whether or not the sequence is a 5' partial.By -default, partial is taken to be false, and the translator -will try to translate the first codon as if it is a start -codon. If you specify partial, the translator will skip -that step. +partial sets whether or not the sequence is a 5' partial. By default, partial +is taken to be false and the translator will try to translate the first codon +as if it were a start codon. You can specify that the sequence is 5' partial +and the translator will skip that step. +sanitized is a flag translator know that this sequence has been stripped of +whitespace and that all the codons are capitalized. Otherwise, translator will +do that in order to speed up the translation process +(see JCVI::DNATools::cleanDNA). To translate the following: @@ -474,705 +568,793 @@ To translate the following: C G C G C A G G A ----------> - $pepRef = $translator->translate(seqRef => \$sequence, - strand => 1, - lower => 1, - upper => 7); + $pep_ref = $translator->translate( + \$sequence, + { + strand => 1, + lower => 1, + upper => 7 + } + ); 0 1 2 3 4 5 6 7 8 9 C G C G C A G G A <---------- - $pepRef = $translator->translate(seqRef => \$sequence, - strand => -1, - lower => 2, - upper => 8); + $pep_ref = $translator->translate( + \$sequence, + { + strand => -1, + lower => 2, + upper => 8 + } + ); Examples: - $pepRef = $translator->translate(strand => -1); + my $pep_ref = $translator->translate( \'acttgacgt' ); - $pepRef = $translator->translate(strand => 1, - seqRef => \'acttgacgt'); + my $pep_ref = $translator->translate( \'acttgacgt', { strand => -1 } ); - $pepRef = $translator->translate(strand => -1, - sequence => 'acttgacgt', - lower => 2, - upper => 5); + my $pep_ref = $translator->translate( + \'acttgacgt', + { + strand => -1, + lower => 2, + upper => 5 + } + ); - $pepRef = $translator->translate(strand => +1, - seqRef => \'acttgacgt', - lower => 0, - upper => 8, - partial => 1); + my $pep_ref = $translator->translate( + \'acttgacgt', + { + strand => 1, + lower => 0, + upper => 8, + partial => 0 + } + ); =cut sub translate { - my $self = shift; + TRACE('translate called'); - DEBUG('translate called'); + my $self = shift; - my %params = validate( + my ( $seq_ref, @p ); + ( $seq_ref, $p[0] ) = validate_pos( @_, - { strand => { default => 1, - regex => qr/^[+-]?1$/, - type => SCALAR - }, - upper => 0, - lower => { default => 0 }, - seqRef => { - optional => 1, - type => SCALARREF, - callback => { - 'Sequence contains invalid nucleotides' => sub { - ${ $_[0] } !~ /[^$nucMatch]/; - } - } - }, - partial => 0, - sanitized => 0 + { type => Params::Validate::SCALARREF, }, + { type => Params::Validate::HASHREF, default => {} } + ); + + my %p = validate( + @p, + { + strand => { + default => $DEFAULT_STRAND, + regex => qr/^[+-]?1$/, + type => Params::Validate::SCALAR + }, + lower => { + default => 0, + regex => qr/^[0-9]+$/, + type => Params::Validate::SCALAR, + callbacks => { + 'lower >= 0' => sub { $_[0] >= 0 }, + 'lower <= seq_length' => sub { $_[0] <= length($$seq_ref) } + } + }, + upper => { + default => length($$seq_ref), + regex => qr/^[0-9]+$/, + type => Params::Validate::SCALAR, + callbacks => { + 'upper >= 0' => sub { $_[0] >= 0 }, + 'upper <= seq_length' => sub { $_[0] <= length($$seq_ref) } + } + }, + partial => { default => $DEFAULT_PARTIAL }, + sanitized => { default => $DEFAULT_SANITIZED } } ); - unless ( defined $params{upper} ) { - $params{upper} = - length( defined $params{seqRef} - ? ${ $params{seqRef} } - : $self->{seqRef} - ); + # Die if upper < lower + if ( $p{upper} < $p{lower} ) { + FATAL "Upper $p{upper} < Lower $p{lower}"; + die "Upper $p{upper} < Lower $p{lower}"; } - $params{exons} = [ [ $params{lower}, $params{upper} ] ]; - delete $params{lower}; - delete $params{upper}; + $seq_ref = cleanDNA($seq_ref) unless ( $p{sanitized} ); + + # These are necessary for the _translate function + my $prep = $self->_prepare( $p{strand} ); + my $ends = $self->_endpoints( @p{qw(strand lower upper)} ); + + my $peptide = ''; - return $self->translateExons(%params); + $self->_start( $seq_ref, \$peptide, $ends, $prep ) unless ( $p{partial} ); + $self->_translate( $seq_ref, \$peptide, $ends, $prep ); + + return \$peptide; } -=item translate6() +=head2 translate6 -=item $pepRefs = $translator->translate6($seqRef); + my $pep_refs = $translator->translate6( $seq_ref ); + my $pep_refs = $translator->translate6( $seq_ref, \%params ); -Translate the sequence in every possible way. Returns an -array reference of all the translations. The -structure of the array is as follows: +Translate the sequence in every possible way. Returns an array reference of all +the translations. The structure of the array is as follows: - 0: ----------> - 1: ---------> - 2: --------> - NNNN...NNNN - 3: <---------- - 4: <--------- - 5: <-------- + 0: ----------> + 1: ---------> + 2: --------> + NNNN...NNNN + 3: <---------- + 4: <--------- + 5: <-------- -I should stress that $$pepRefs[x] is not a sequence, but a -reference to a sequence. So to access a sequence, you need -to do the following: +The parameters are similar to those use in translate: - $sequence = ${$$pepRefs[x]} + lower: integer between 0 and length; default = 0 + upper: integer between 0 and length; default = length + partial: 0 or 1; default = 0 + sanitized: 0 or 1; default = 0 Example: - $pepRefs = $translator->translate6(\'acttgacgt'); + $pep_refs = $translator->translate6(\'acttgacgt'); Output: - $pepRefs = [$pepRef0, - $pepRef1, - $pepRef2, - $reversePepRef0, - $reversePepRef1, - $reversePepRef2] + $pep_refs = [ + $pep1, + $pep2, + $pep3, + $reverse_pep1, + $reverse_pep2, + $reverse_pep3 + ] =cut sub translate6 { - my ( $self, $seqRef, $sanitized ) = @_; + TRACE('translate6 called'); + + my $self = shift; - DEBUG("translate6 called"); + my ( $seq_ref, @p ); + ( $seq_ref, $p[0] ) = validate_pos( + @_, + { type => Params::Validate::SCALARREF }, + { type => Params::Validate::HASHREF, default => {} } + ); + + my %p = validate( + @p, + { + lower => { + default => 0, + regex => qr/^[0-9]+$/, + type => Params::Validate::SCALAR, + callbacks => { + 'lower >= 0' => sub { $_[0] >= 0 }, + 'lower <= seq_length' => sub { $_[0] <= length($$seq_ref) } + } + }, + upper => { + default => length($$seq_ref), + regex => qr/^[0-9]+$/, + type => Params::Validate::SCALAR, + callbacks => { + 'upper >= 0' => sub { $_[0] >= 0 }, + 'upper <= seq_length' => sub { $_[0] <= length($$seq_ref) } + } + }, + partial => { default => $DEFAULT_PARTIAL }, + sanitized => { default => $DEFAULT_SANITIZED } + } + ); + + $seq_ref = cleanDNA($seq_ref) unless ( $p{sanitized} ); + + my @peptides; - cleanDNA($seqRef) unless ($sanitized); + foreach my $strand ( -1, 1 ) { - my @pepRefs; + # We only need to calculate prep once for a given strand + my $prep = $self->_prepare($strand); + my $rc = $prep->[0]; # True if reverse complement + my $fw = ( $rc + 1 ) % 2; # True if forward strand + foreach ( 0 .. 2 ) { - push @pepRefs, - $self->translate( seqRef => $seqRef, - lower => $_, - strand => 1, - sanitized => 1 - ) foreach ( 0 .. 2 ); - push @pepRefs, - $self->translate( seqRef => $seqRef, - upper => length($$seqRef) - $_, - strand => -1, - sanitized => 1 - ) foreach ( 0 .. 2 ); + # Calculate endpoints and translate + my $ends = $self->_endpoints( + $strand, + $p{lower} + $fw * $_, + $p{upper} - $rc * $_ + ); + $self->_start( $seq_ref, \$peptides[ $rc * 3 + $_ ], $ends, $prep ) + unless ( $p{partial} ); + $self->_translate( $seq_ref, \$peptides[ $rc * 3 + $_ ], + $ends, $prep ); + } + } - return \@pepRefs; + return \@peptides; } -=item translateExons() +=head2 translate_exons -=item $pepRef = translateExons(%params); + my $pep_ref = translate_exons( $str_ref, $exons_array_ref ); + my $pep_ref = translate_exons( $str_ref, $exons_array_ref, \%params ); Translate a gene spanning multiple exons. Paramters are: - strand: 1 or -1; mandatory - seqRef: reference to sequence; optional if one is loaded - partial: '0' or '1'; optional, defaults to '0' + strand: 1 or -1; default = 1 + partial: 0 or 1; default = 0 + sanitized: 0 or 1; default = 0 Input: - $exonRangesRef = [ - [$start0, $stop0], - [$start1, $stop1], - ... - ] + $exons_array_ref = [ + [$start0, $stop0], + [$start1, $stop1], + ... + ]; + +The order of the exons in the array doesn't matter. translate_exons will sort +the exons. Example: - $pepRef = translateExons(\'actgcat', [ [0,2], [3,7] ], 1); + $pep_ref = translate_exons(\'actgcat', [ [0,2], [3,7] ]); + $pep_ref = translate_exons(\'actgcat', [ [0,2], [3,7] ], { strand => -1}); =cut -sub translateExons { +sub translate_exons { + TRACE('translate_exons called'); + my $self = shift; - my %params = validate( + my ( $seq_ref, $exons, @p ); + ( $seq_ref, $exons, $p[0] ) = validate_pos( @_, - { strand => { regex => qr/^[+-]?1$/, - type => SCALAR - }, - seqRef => { - default => $$self{seqRef}, - type => SCALARREF, - callback => { - 'Sequence contains invalid nucleotides' => sub { - ${ $_[0] } !~ /[^$nucMatch]/; - } - } - }, - exons => { - type => ARRAYREF, - callbacks => { - 'Bound not an integer' => sub { - foreach my $bounds ( @{ $_[0] } ) { - foreach my $bound (@$bounds) { - return 0 unless ( $bound =~ /^\d+$/ ); - } - } - return 1; - }, - 'Bound out of range' => sub { - foreach my $bounds ( @{ $_[0] } ) { - foreach my $bound (@$bounds) { - return 0 - unless ( ( $bound >= 0 ) - && ( $bound <= length ${ $_[1]{seqRef} } ) - ); - } - } - return 1; - } - } - }, - partial => 0, - sanitized => 0 - } + { type => Params::Validate::SCALARREF }, + { type => Params::Validate::ARRAYREF }, + { type => Params::Validate::HASHREF, default => {} } ); - my $seqRef; - - ######################################## - # Do some further validation. - -VALIDATION: { - if ( $params{seqRef} ) { - $seqRef = $params{seqRef}; - cleanDNA($seqRef) unless ( $params{sanitized} ); - } - else { - $seqRef = $$self{seqRef}; - } + validate_pos( + @$exons, + ( + { + type => Params::Validate::ARRAYREF, + callbacks => { + 'Bound not an integer' => sub { + foreach my $bound ( @{ $_[0] } ) { + return 0 unless ( $bound =~ /^\d+$/ ); + } + return 1; + }, + 'Bound out of range' => sub { + foreach my $bound ( @{ $_[0] } ) { + return 0 + unless ( ( $bound >= 0 ) + && ( $bound <= length $$seq_ref ) ); + } + return 1; + }, + 'lower <= upper' => sub { + return $_[0][0] <= $_[0][1]; + } + } + } + ) x @$exons + ); - unless ( defined $seqRef ) { - my $logger = get_logger(); - $logger->logcroak('Sequence undefined'); + my %p = validate( + @p, + { + strand => { + default => $DEFAULT_STRAND, + regex => qr/^[+-]?1$/, + type => Params::Validate::SCALAR + }, + partial => { default => $DEFAULT_PARTIAL }, + sanitized => { default => $DEFAULT_SANITIZED } } - } - - DEBUG('translateExons called'); - TRACE("Strand is $params{strand}"); - TRACE("5' Partial") if ( $params{partial} ); - TRACE( 'Sequence starts with ' . substr ${ $params{seqRef} }, 0, 5 ); - TRACE( 'Sequence length is ' . length ${ $params{seqRef} } ); + ); - my $increment; - my $prefix; - my $offset; + my @exons = + sort { ( $a->[0] <=> $b->[0] || $a->[1] <=> $b->[1] ) * $p{strand} } + @$exons; - if ( $params{strand} == 1 ) { - $increment = 3; - $prefix = ''; - $offset = 0; - } - else { - $increment = -3; - $prefix = 'rc_'; - $offset = -3; - } - - my $peptide = ''; + my $prep = $self->_prepare( $p{strand} ); my $leftover = ''; + my $peptide; -EXON: foreach my $i ( 0 .. $#{ $params{exons} } ) { - TRACE("Exon $i"); - - my ( $lower, $upper ); - - VALIDATE: { - my $exon - = $params{strand} == 1 - ? $params{exons}[$i] - : $params{exons}[ -( $i + 1 ) ]; - - ( $lower, $upper ) = validate_pos( - @$exon, - { 'Lower out of range' => sub { - ( ( 0 <= $_[0] ) && ( $_[0] <= $_[1][1] ) ); - } - }, - { 'Upper out of range' => sub { - ( ( $_[1][0] <= $_[0] ) - && ( $_[0] <= length $$seqRef ) ); - } - } - ); - - TRACE("Lower $lower"); - TRACE("Upper $upper"); - } - - LEFTOVER: { - - ######################################## - # Deal with leftovers (exons that cut - # codons) and exons that are short. - # If the exon has fewer nucleotides than - # what is required to complete the - # codon, dump the nucleotides into the - # growing codon and then continue. - # Otherwise, complete the exon and - # increment the start index. + EXON: foreach my $exon (@exons) { + my ( $lower, $upper ) = @$exon; - my $togo = 3 - length $leftover; + LEFTOVER: { - if ( ( my $length = $upper - $lower ) <= $togo ) { - WARN("Exon very short: $length <= $togo"); + # Deal with leftovers. These are codons that have been cut by + # splicing. In the event that no codon has been cut, the leftover + # will be the first codon of the exon. - ######################################## - # For forward direction, append to - # $leftover, otherwise, prepend. + my $to_go = 3 - length($leftover); - unless ($prefix) { - $leftover .= substr( $$seqRef, $lower, $length ); - } - else { - $leftover - = substr( $$seqRef, $lower, $length ) . $leftover; - } + # If the exon has fewer nucleotides than what is required to + # complete the codon, set $to_go to be the length of that exon. + if ( ( my $length = $upper - $lower ) < $to_go ) { + $to_go = $length; + } - next EXON; + # Complete the leftover and increment the start index. + unless ( $prep->[0] ) { + $leftover .= substr( $$seq_ref, $lower, $to_go ); + $lower += $to_go; } else { - unless ($prefix) { - $leftover .= substr( $$seqRef, $lower, $togo ); - $lower += $togo; - } - else { - $upper -= $togo; - $leftover = substr( $$seqRef, $upper, $togo ) . $leftover; - } + $upper -= $to_go; + $leftover = substr( $$seq_ref, $upper, $to_go ) . $leftover; } + + # If leftover isn't long enough, then move to the next exon. + next EXON if ( length($leftover) < 3 ); } - PARTIAL: { + START: { - ######################################## - # Handle 5' partials. The first exon may - # be the actual start of the gene, so - # the option to have the start codon - # translated is left in there. Otherwise - # just translate the leftover codon like - # a regular codon. + # Handle the start codon. After the start codon has been + # translated, set the partial flag so we don't try it again. - if ( $params{partial} ) { - $peptide .= $$self{ $prefix . 'forward' }{$leftover}; - } - else { - $peptide - = $$self{ $prefix . 'starts' }{$leftover} - || $$self{ $prefix . 'forward' }{$leftover} - || 'X'; - $params{partial} = 1; + my $ends = [ 0, $prep->[1] ]; + unless ( $p{partial} ) { + $self->_start( \$leftover, \$peptide, $ends, $prep ); + $p{partial} = 1; } + + $self->_translate( \$leftover, \$peptide, $ends, $prep ); } - my $start; - my $stop; + my $ends = $self->_endpoints( $p{strand}, $lower, $upper ); + BOUNDS: { + my $phase_diff = ( $upper - $lower ) % 3; + $leftover = + $prep->[0] + ? substr( $$seq_ref, $lower, $phase_diff ) + : substr( $$seq_ref, $ends->[1], $phase_diff ); + } - BOUNDS: { - ######################################## - # Similar to translate. Set up looping - # variables. + $self->_translate( $seq_ref, \$peptide, $ends, $prep ); + } - my $phaseDiff = ( $upper - $lower ) % 3; + return \$peptide; +} - unless ($prefix) { - $start = $lower; - $stop = $upper - $phaseDiff; - $leftover = substr( $$seqRef, $stop, $phaseDiff ); - } - else { - $start = $upper - 3; - $stop = $lower + $phaseDiff - 3; - $leftover = substr( $$seqRef, $lower, $phaseDiff ); - } - } +# Returns [ $is_this_a_reverse_complement, $increment_for_loop ] +sub _prepare { + my $self = shift; + my ($strand) = @_; + return [ ( $strand == 1 ? 0 : 1 ), $strand * 3 ]; +} - ######################################## - # The guts of the translation routine. - # This routine is very fast and is what - # should be running over the course of - # most of the gene. Dealing with where - # the exons start and stop slows down - # the execution. - - for ( $start = $start; $start != $stop; $start += $increment ) { - $peptide .= $$self{ $prefix . 'forward' } - { substr( $$seqRef, $start, 3 ) } || 'X'; - } +# Convert (lower, upper) into endpoints for a loop. For the + strand, we just +# adjust upper so that it is in phase with lower. However, for the - strand, +# we not only adjust lower for phase, but also subtract 3 from everything so +# that we can take the substring properly. Here is a picture that might make +# sense of this: + +# Positions: 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 +# Region of interest (-): . . . .4- - - - - - - - - - - - - - - -20 . . . +# For + strand: 4- - -|- - -|- - -|- - -|- - >19 +# For - strand: 2. . .|< - -|- - -|- - -|- - -17 - -| + +# For a + strand, it returns [4, 19]. For a - strand, it returns [17, 2]. To +# get the first codon on the - strand, we take the substring starting at 17, +# and the loop will end once we decrement our counter to 2. + +sub _endpoints { + my $self = shift; + my ( $strand, $lower, $upper ) = @_; + + return $strand == 1 + ? [ $lower, $upper - ( ( $upper - $lower ) % 3 ) ] + : [ $upper - 3, $lower - 3 + ( ( $upper - $lower ) % 3 ) ]; +} + +# The actual translation function. Goes from start to stop, appends to the +# peptide sequence using the translation table provided. + +sub _translate { + my $self = shift; + my ( $seq_ref, $pep_ref, $ends, $prep ) = @_; + + my $table = $self->_table->[ $prep->[0] ]; + while ( $ends->[0] != $ends->[1] ) { + $$pep_ref .= $table->{ substr( $$seq_ref, $ends->[0], 3 ) } || 'X'; + $ends->[0] += $prep->[1]; } +} - return \$peptide; +# Perform translation for only one frame and adjusts the start only if it finds +# a codon in the translation table. + +sub _start { + my $self = shift; + my ( $seq_ref, $pep_ref, $ends, $prep ) = @_; + + return if ( $ends->[0] == $ends->[1] ); + + my $start = + $self->_starts->[ $prep->[0] ]->{ substr( $$seq_ref, $ends->[0], 3 ) }; + if ($start) { + $$pep_ref = $start; + $ends->[0] += $prep->[1]; + } } -=item translateCodon() +=head2 translate_codon -=item $residue = $translator->translateCodon($codon, $start); + my $residue = $translator->translate_codon( $codon ); + my $residue = $translator->translate_codon( $codon, \%params ); Translate a codon. Return 'X' or '-' if it isn't in the codon table. Handles degenerate nucleotides, so if all possible codons for an ambiguity map to the same residue, return that residue. Will also handle ambiguous amino acids. -$start dictates whether or not to translate this as a start +start dictates whether or not to translate this as a start codon. Will also cache any new translations it finds. -For those looking for the translateStart routine, it has -been merged into translateCodon. - Example: - $residue = $translator->translateCodon('atg'); - $residue = $translator->translateCodon('tty', 1); - $residue = $translator->translateCodon('cat', -1, 1); + $residue = $translator->translate_codon('atg'); + $residue = $translator->translate_codon( 'tty', { strand => -1 } ); + $residue = $translator->translate_codon( 'cat', { start => 1 } ); =cut -sub translateCodon { +sub translate_codon { + TRACE("translate_codon called"); + my $self = shift; - my ( $codon, $strand, $start ) - = validate_pos( @_, - { regex => qr/^${nucMatch}{3}$/ }, - { default => 1, - regex => qr/^[+-]?1$/, - type => SCALAR - }, - { default => 0, - regex => qr/^[01]$/, - type => SCALAR - } - ); - $codon = uc $codon; + my ( $codon, @p ); + + ( $codon, $p[0] ) = validate_pos( + @_, + { regex => qr/^${nuc_match}{3}$/ }, + { type => Params::Validate::HASHREF, default => {} } + + ); - my $prefix = $strand == 1 ? '' : 'rc_'; + my %p = validate( + @p, + { + strand => { + default => 1, + regex => qr/^[+-]?1$/, + type => Params::Validate::SCALAR + }, + start => { + default => 0, + regex => qr/^[01]$/, + type => Params::Validate::SCALAR + } + } + ); - DEBUG("translateCodon called"); - TRACE("Codon $codon"); + $codon = uc $codon; + my $rc = $p{strand} == 1 ? 0 : 1; - my $table; - my $notFound; - unless ($start) { - $table = $prefix . 'forward'; - $notFound = 'X'; + my ( $table, $not_found ); + unless ( $p{start} ) { + $table = $self->_table->[$rc]; + $not_found = 'X'; } else { - $table = $prefix . 'starts'; - $notFound = '-'; + $table = $self->_starts->[$rc]; + $not_found = '-'; } - TRACE("Using $table table"); - - return $$self{$table}{$codon} if ( defined $$self{$table}{$codon} ); - - ######################################## - # Handles codons with degenerate - # nucleotides: [RYMKWS] [BDHV] or N - # Several codons may map to the same - # amino acid. If all possible codons for - # an amibguity map to the same residue, - # return that residue rather than X - - if ( my ($nuc) = $codon =~ /($degenMatch)/ ) { - my $consensus; - - ######################################## - # Replace the nucleotide with every - # possiblity from degenerate map hash. - - foreach ( @{ $degenerateMap{$nuc} } ) { - my $newCodon = $codon; - $newCodon =~ s/$nuc/$_/; - my $residue = $self->translateCodon( $newCodon, $strand, $start ); - - ######################################## - # If consensus isn't set, set it to the - # current residue. - - $consensus = $residue unless $consensus; - - ######################################## - # If the returned residue was an 'X' or - # '-', we return that. - - return $notFound if ( $residue eq $notFound ); - - ######################################## - # This is an interesting step. If the - # residue isn't the same as the - # consensus, check to see if they map to - # the same ambiguous amino acid. If - # true, then change the consensus to - # that ambiguous acid and proceed. - # Otherwise, return $notFound. - - if ( $residue ne $consensus ) { - if ( ( defined $ambiguousForward{$residue} ) - && ( defined $ambiguousForward{$consensus} ) - && ( $ambiguousForward{$residue} eq - $ambiguousForward{$consensus} ) - ) - { - $consensus = $ambiguousForward{$consensus}; - } - else { - return $notFound; - } + + # return $table->{$codon} if ( defined $table->{$codon} ); + return $self->_translate_codon( $codon, $table, \%p ) || $not_found; +} + +# This is the helper function for translate_codon. It is designed to speed +# things up because it doesn't perform validation or try to figure out which +# tables to use, which can slow things down since this is a recursive function. +# Handles codons with degenerate nucleotides: [RYMKWS] [BDHV] or N. Several +# codons may map to the same amino acid. If all possible codons for an +# amibguity map to the same residue, return that residue rather than X. + +sub _translate_codon { + my $self = shift; + my $codon = shift; + my $table = shift; + + # Check for base case: no degenerate nucleotides; we can't unroll further. + unless ( $codon =~ /($degen_match)/ ) { + return $table->{$codon}; + } + + # Check to see if this degenerate-containing codon has been computed + return $table->{$codon} if ( $table->{$codon} ); + + my $consensus; + my $nuc = $1; + + # Replace the nucleotide with every possiblity from degenerate map hash. + foreach ( @{ $degenerate_map{$nuc} } ) { + my $new_codon = $codon; + $new_codon =~ s/$nuc/$_/; + + # Recursively call this function + my $residue = $self->_translate_codon( $new_codon, $table, @_ ); + + # If the new_codon didn't come to a consensus, or if the translation + # isn't defined for new_codon in a custom translation table, return + # undef. + return undef unless ( defined $residue ); + + # If consensus isn't set, set it to the current residue. + $consensus = $residue unless ($consensus); + + # This is an interesting step. If the residue isn't the same as the + # consensus, check to see if they map to the same ambiguous amino acid. + # If true, then change the consensus to that ambiguous acid and proceed. + # Otherwise, return undef (consensus could not be reached). + if ( $residue ne $consensus ) { + if ( + ( defined $ambiguous_forward{$residue} ) + && ( defined $ambiguous_forward{$consensus} ) + && ( $ambiguous_forward{$residue} eq + $ambiguous_forward{$consensus} ) + ) + { + $consensus = $ambiguous_forward{$consensus}; + } + else { + return undef; } } + } + + # If we got this far, it means that we have a valid consensus sequence for + # a degenerate-nucleotide-containing codon. Cache and return results. + DEBUG("New codon translation found: $codon => $consensus"); + $self->add_translation( $codon, $consensus, @_ ); + return $consensus; +} + +1; + +=head1 MISC - ######################################## - # Cache the residue in the translation - # tables. +These are the original translation tables. The translation tables used by this +module have been boostrap - they include translations for degenerate +nucleotides and allow ambiguous amino acids to be the targets of translation +(e.g. every effort has been made to give a translation that isn't "X"). + + { + name "Standard" , + name "SGC0" , + id 1 , + ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "---M---------------M---------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Vertebrate Mitochondrial" , + name "SGC1" , + id 2 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG", + sncbieaa "--------------------------------MMMM---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Yeast Mitochondrial" , + name "SGC2" , + id 3 , + ncbieaa "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "----------------------------------MM----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Mold Mitochondrial; Protozoan Mitochondrial;" + name "Coelenterate Mitochondrial; Mycoplasma; Spiroplasma" , + name "SGC3" , + id 4 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "--MM---------------M------------MMMM---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Invertebrate Mitochondrial" , + name "SGC4" , + id 5 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG", + sncbieaa "---M----------------------------MMMM---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear" , + name "SGC5" , + id 6 , + ncbieaa "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "-----------------------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Echinoderm Mitochondrial; Flatworm Mitochondrial" , + name "SGC8" , + id 9 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", + sncbieaa "-----------------------------------M---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Euplotid Nuclear" , + name "SGC9" , + id 10 , + ncbieaa "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "-----------------------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Bacterial and Plant Plastid" , + id 11 , + ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "---M---------------M------------MMMM---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Alternative Yeast Nuclear" , + id 12 , + ncbieaa "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "-------------------M---------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Ascidian Mitochondrial" , + id 13 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG", + sncbieaa "---M------------------------------MM---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + }, + { + name "Alternative Flatworm Mitochondrial" , + id 14 , + ncbieaa "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", + sncbieaa "-----------------------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Blepharisma Macronuclear" , + id 15 , + ncbieaa "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "-----------------------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Chlorophycean Mitochondrial" , + id 16 , + ncbieaa "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "-----------------------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Trematode Mitochondrial" , + id 21 , + ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG", + sncbieaa "-----------------------------------M---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Scenedesmus obliquus Mitochondrial" , + id 22 , + ncbieaa "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "-----------------------------------M----------------------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } , + { + name "Thraustochytrium Mitochondrial" , + id 23 , + ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", + sncbieaa "--------------------------------M--M---------------M------------" + -- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG + -- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG + -- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG + } - DEBUG("New codon translation found: $codon => $consensus"); +=head1 AUTHOR - my $rc_codon_ref = reverseComplement( \$codon ); +Kevin Galinsky, C<< >> - $$self{$table}{$codon} = $consensus; - $$self{ 'rc_' . $table }{$$rc_codon_ref} = $consensus; +=head1 BUGS - ######################################## - # In the case of regular codons, push - # that codon onto the reverse - # translation array. +Please report any bugs or feature requests to +C, or through the web interface at +L. +I will be notified, and then you'll automatically be notified of progress on +your bug as I make changes. - unless ($start) { - push @{ $$self{reverse}{$consensus} }, $codon; - push @{ $$self{rc_reverse}{$consensus} }, $$rc_codon_ref; - } +=head1 SUPPORT - return $consensus; - } +You can find documentation for this module with the perldoc command. - return $notFound; -} + perldoc JCVI::Translator + +You can also look for information at: + +=over 4 + +=item * AnnoCPAN: Annotated CPAN documentation -1; #end of module +L + +=item * CPAN Ratings + +L + +=item * RT: CPAN's request tracker + +L + +=item * Search CPAN + +L =back -=cut +=head1 ACKNOWLEDGEMENTS -=head1 MISC +Log::Log4perl +Params::Validate -These are the original translation tables. The translation -tables below have been unrolled - degenerate nucleotides are -in place as well as ambiguous amino acids. +=head1 COPYRIGHT & LICENSE -{ -name "Standard" , -name "SGC0" , -id 1 , -ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", -sncbieaa "---M---------------M---------------M----------------------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -}, -{ -name "Vertebrate Mitochondrial" , -name "SGC1" , -id 2 , -ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSS**VVVVAAAADDEEGGGG", -sncbieaa "--------------------------------MMMM---------------M------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -}, -{ -name "Yeast Mitochondrial" , -name "SGC2" , -id 3 , -ncbieaa "FFLLSSSSYY**CCWWTTTTPPPPHHQQRRRRIIMMTTTTNNKKSSRRVVVVAAAADDEEGGGG", -sncbieaa "----------------------------------MM----------------------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -}, -{ -name "Mold Mitochondrial; Protozoan Mitochondrial;" -name "Coelenterate Mitochondrial; Mycoplasma; Spiroplasma" , -name "SGC3" , -id 4 , -ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", -sncbieaa "--MM---------------M------------MMMM---------------M------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -}, -{ -name "Invertebrate Mitochondrial" , -name "SGC4" , -id 5 , -ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSSSVVVVAAAADDEEGGGG", -sncbieaa "---M----------------------------MMMM---------------M------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -}, -{ -name "Ciliate Nuclear; Dasycladacean Nuclear; Hexamita Nuclear" , -name "SGC5" , -id 6 , -ncbieaa "FFLLSSSSYYQQCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", -sncbieaa "-----------------------------------M----------------------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -}, -{ -name "Echinoderm Mitochondrial; Flatworm Mitochondrial" , -name "SGC8" , -id 9 , -ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", -sncbieaa "-----------------------------------M---------------M------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -}, -{ -name "Euplotid Nuclear" , -name "SGC9" , -id 10 , -ncbieaa "FFLLSSSSYY**CCCWLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", -sncbieaa "-----------------------------------M----------------------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -}, -{ -name "Bacterial and Plant Plastid" , -id 11 , -ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", -sncbieaa "---M---------------M------------MMMM---------------M------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -}, -{ -name "Alternative Yeast Nuclear" , -id 12 , -ncbieaa "FFLLSSSSYY**CC*WLLLSPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", -sncbieaa "-------------------M---------------M----------------------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -}, -{ -name "Ascidian Mitochondrial" , -id 13 , -ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNKKSSGGVVVVAAAADDEEGGGG", -sncbieaa "---M------------------------------MM---------------M------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -}, -{ -name "Alternative Flatworm Mitochondrial" , -id 14 , -ncbieaa "FFLLSSSSYYY*CCWWLLLLPPPPHHQQRRRRIIIMTTTTNNNKSSSSVVVVAAAADDEEGGGG", -sncbieaa "-----------------------------------M----------------------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -} , -{ -name "Blepharisma Macronuclear" , -id 15 , -ncbieaa "FFLLSSSSYY*QCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", -sncbieaa "-----------------------------------M----------------------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -} , -{ -name "Chlorophycean Mitochondrial" , -id 16 , -ncbieaa "FFLLSSSSYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", -sncbieaa "-----------------------------------M----------------------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -} , -{ -name "Trematode Mitochondrial" , -id 21 , -ncbieaa "FFLLSSSSYY**CCWWLLLLPPPPHHQQRRRRIIMMTTTTNNNKSSSSVVVVAAAADDEEGGGG", -sncbieaa "-----------------------------------M---------------M------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -} , -{ -name "Scenedesmus obliquus Mitochondrial" , -id 22 , -ncbieaa "FFLLSS*SYY*LCC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", -sncbieaa "-----------------------------------M----------------------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -} , -{ -name "Thraustochytrium Mitochondrial" , -id 23 , -ncbieaa "FF*LSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", -sncbieaa "--------------------------------M--M---------------M------------" --- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG --- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG --- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG -} +Copyright 2008-2009 J. Craig Venter Institute, all rights reserved. + +This program is free software; you can redistribute it and/or modify it +under the same terms as Perl itself. =cut @@ -1338,4 +1520,4 @@ sncbieaa "-----------------------------M---------------------------------------- -- Base1 AAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAAACCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCCGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGGHMMMMMMMMMMSSSTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTTWYYY -- Base2 AAAAAACCCCCCCCCCCCCCCGGGGGGTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGTTTTTTTTTTTTTTTAAAAAACCCCCCCCCCCCCCCGGGGGGGGGGGGGGGMMMTTTTTTTTTTTTTTTTGGGTTTTTTTAAAAAAAAACCCCCCCCCCCCCCCGGGGGRTTTTTTTTTT -- Base3 ACGRTYABCDGHKMNRSTVWYACGRTYACGHMTWYACGRTYABCDGHKMNRSTVWYABCDGHKMNRSTVWYABCDGHKMNRSTVWYACGRTYABCDGHKMNRSTVWYABCDGHKMNRSTVWYCTYABCDGHKMNRSTVWYAAGRACHMTWYAGRACGRTYABCDGHKMNRSTVWYACGTYAACGRTYAAGR -} \ No newline at end of file +} diff --git a/lib/JCVI/Translator/Utils.pm b/lib/JCVI/Translator/Utils.pm index 13131ce..9d7cbcd 100644 --- a/lib/JCVI/Translator/Utils.pm +++ b/lib/JCVI/Translator/Utils.pm @@ -7,419 +7,614 @@ =head1 NAME -package Utils +JCVI::Translator::Utils - Utilities that requrie a translation table -=head1 SYNOPSES +=head1 SYNOPSIS - use JCVI::Translator::Utils; + use JCVI::Translator::Utils; - my $translator = new JCVI::Translator::Utils( - id => $id, - name => $name, - table => $table, - tableRef => $tableRef - ); + # Same constructor as JCVI::Translator + my $utils = new JCVI::Translator::Utils(); + my $utils = custom JCVI::Translator( \$custom_table ); -=head1 DESCRIPTION + my $codons = $utils->codons( $residue ); + my $regex = $utils->regex( $residue ); -See Translator for more info. Utils extends Translator and -adds a few more functions that are normally not used. - -=head1 AUTHOR + my $orf = $utils->getORF( $seq_ref ); + my $cds = $utils->getCDS( $seq_ref ); -Kevin Galinsky, + my $frames = $utils->nonstop( $seq_ref ); -=head1 FUNCTIONS +=head1 DESCRIPTION -=over +See Translator for more info. Utils extends Translator and +adds a few more functions that are normally not used. =cut package JCVI::Translator::Utils; -use base JCVI::Translator; use strict; use warnings; -our $VERSION = '0.2.2'; +use base qw(JCVI::Translator); +__PACKAGE__->mk_accessors(qw( _regexes )); use Log::Log4perl qw(:easy); -use Params::Validate qw(:all); +use Params::Validate; + +use JCVI::DNATools qw( cleanDNA ); +use JCVI::AATools qw( $aa_match ); + +our $DEFAULT_STRAND = 0; +our $DEFAULT_SANITIZED = 0; + +=head1 METHODS + +=cut + +sub _new { + my $class = shift; + my $self = $class->SUPER::_new(); + + $self->_regexes( [] ); + foreach my $rc ( 0 .. 1 ) { + $self->_regexes->[$rc] = {}; + } + + return $self; +} -=item getORF() +=head2 codons -=item [$start, $stop] = $translator->getORF($seqRef, $strand); + my $codon_array = $translator->codons( $residue); + my $codon_array = $translator->codons( $residue, $strand ); -This will get the longest region between stops and return -the strand, lower and upper bounds, inclusive: +Returns a list of codons for a particular residue or start codon. For start +codons, input "start" for the residue. + +=cut + +sub codons { + my $self = shift; + my ( $residue, $strand ) = validate_pos( + @_, + { regex => qr/^(?:$aa_match|start|lower|upper)$/ }, + { + default => 1, + regex => qr/^[+-]?1$/ + } + ); + + if ( $residue eq 'lower' ) { $residue = $strand == 1 ? 'start' : '*' } + elsif ( $residue eq 'upper' ) { $residue = $strand == -1 ? 'start' : '*' } + elsif ( $residue eq 'start' ) { $residue = 'start' } + else { $residue = uc $residue } + + return [ + @{ $self->_reverse->[ $strand == 1 ? 0 : 1 ]->{$residue} ||= [] } ]; +} + +=head2 regex + + my $regex = $translator->regex( $residue ); + my $regex = $translator->regex( $residue, $strand ); + +Returns a regular expression matching codons for a particular amino acid +residue. In addition, three special values are allowed: + + start: Start codons + lower: Start or stop codons, depending up on strand + lower: Start or stop codons, depending up on strand + +lower and upper match the respective ends of a CDS for a given strand (i.e. on +the positive strand, lower matches the start, and upper matches the stop). The +stop codon is stored as "*" by the translator. + +=cut + +sub regex { + my $self = shift; + my ( $residue, $strand ) = validate_pos( + @_, 1, + { + default => 1, + regex => qr/^[+-]?1$/ + } + ); + + my $rc = $strand == 1 ? 0 : 1; + + my $regex = $self->_regexes->[$rc]->{residue}; + + return $regex if ( defined $regex ); + + $regex = join '|', @{ $self->codons(@_) }; + $regex = qr/$regex/; + + $self->_regexes->[$rc]->{residue} = $regex; + return $regex; +} + +=head2 getORF + + my $orf_hash = $translator->getORF( $seq_ref ); + my $orf_hash = $translator->getORF( $seq_ref, \%params ); + +This will get the longest region between stops and return the strand, lower and +upper bounds, inclusive. The parameters are: + + strand: 0, 1 or -1; default = 0 (meaning search both strands) + lower: integer between 0 and length; default = 0 + upper: integer between 0 and length; default = length + sanitized: 0 or 1; default = 0 + +Lower and upper are used to specify bounds between which you are searching. +Suppose the following was the longest ORF: 0 1 2 3 4 5 6 7 8 9 10 T A A A T C T A A G ***** ***** <---------> -Will return [1, 3, 9]. You can also specify which strand -you are looking for the ORF to be on. +This will return: -For ORFs starting at the very beginning of the strand or -trailing off the end, but not in phase with the start or -ends, this method will cut at the last complete codon. + { + strand => 1, + lower => 3, + upper => 9 + } - Eg: +You can also specify which strand you are looking for the ORF to be on. - 0 1 2 3 4 5 6 7 8 9 10 - A C G T A G T T T A - ***** - <---------> +For ORFs starting at the very beginning of the strand or trailing off the end, +but not in phase with the start or ends, this method will cut at the last +complete codon. -Will return [-1, 1, 7]. The distance between lower and -upper will always be a multiple of 3. This is to make it -clear which frame the ORF is in. + Eg: -Example: + 0 1 2 3 4 5 6 7 8 9 10 + A C G T A G T T T A + ***** + <---------------> + +Will return: - $ref = $translator->getORF(\'TAGAAATAG'); + { + strand => 1, + lower => 1, + upper => 10 + } + +The distance between lower and upper will always be a multiple of 3. This is to +make it clear which frame the ORF is in. The resulting hash may be passed to +the translate method. -Output: +Example: - $ref = [$strand, $lower, $upper] + my $orf_ref = $translator->getORF( \'TAGAAATAG' ); + my $orf_ref = $translator->getORF( \$seq, { strand => -1 } ); + my $orf_ref = $translator->getORF( + \$seq, + { + lower => $lower, + upper => $upper + } + ); =cut sub getORF { + TRACE('getORF called'); + my $self = shift; - my ( $seqRef, $strand ) - = validate_pos( @_, - { type => SCALARREF }, - { default => 0, - regex => qr/^[+-]?[01]$/ - } - ); - - DEBUG('getORF called'); - - my ( $best_strand, $lower, $upper ) = ( 1, 0, 0 ); - - foreach my $cur_strand ( $strand == 0 ? ( -1, 1 ) : ($strand) ) { - my @lowers = ( 0 .. 2 ); - my $stopRegex = $self->regex( 'stop', $cur_strand ); - - ######################################## - # Rather than using a regular expression - # to find regions between stops, it - # should be more computationally - # efficient to find all the stops and - # compute from there. However, Perl's - # regular expression engine may be - # faster than code execution, so this - # may not be the case - - ######################################## - # A lookahead is used for two reasons: - # the main one is to get every position - # within two bases of the end of the - # sequence, and also to cope with the - # possibility of overlapping stop - # codons. - - while ( - $$seqRef =~ /(?= - ($stopRegex)|.{0,2}$ - )/gx - ) + my ( $seq_ref, @p ); + ( $seq_ref, $p[0] ) = validate_pos( + @_, + { type => Params::Validate::SCALARREF }, + { type => Params::Validate::HASHREF, default => {} } + ); + + my %p = validate( + @p, { - my $curUpper = pos $$seqRef; - my $frame = $curUpper % 3; - - $curUpper += length $1 if ( $1 && ( $cur_strand == 1 ) ); - - ######################################## - # If the current distance between start - # and stop is greater than the distance - # between the stored start and stop, - # change the stored start and stop to be - # the current one. - - if ( $upper - $lower < $curUpper - $lowers[$frame] ) { - $best_strand = $cur_strand; - $lower = $lowers[$frame]; - $upper = $curUpper; - } + strand => { + default => 0, + regex => qr/^[+-]?[01]$/, + type => Params::Validate::SCALAR + }, + lower => { + default => 0, + regex => qr/^[0-9]+$/, + type => Params::Validate::SCALAR, + callbacks => { + 'lower >= 0' => sub { $_[0] >= 0 }, + 'lower <= seq_length' => sub { $_[0] <= length($$seq_ref) } + } + }, + upper => { + default => length($$seq_ref), + regex => qr/^[0-9]+$/, + type => Params::Validate::SCALAR, + callbacks => { + 'upper >= 0' => sub { $_[0] >= 0 }, + 'upper <= seq_length' => sub { $_[0] <= length($$seq_ref) } + } + }, + sanitized => { default => $DEFAULT_SANITIZED } + } + ); + + return undef if ( $p{upper} < $p{lower} ); + + $seq_ref = cleanDNA($seq_ref) unless ( $p{sanitized} ); + + # Initialize the longest ORF. + my %ORF = ( + strand => 0, + lower => $p{lower}, + upper => $p{lower} + ); + + # Go through each strand which we are looking in + foreach my $strand ( $p{strand} == 0 ? ( -1, 1 ) : $p{strand} ) { + + # Initialize lower bounds and regular expression for stop + my @lowers = map { $_ + $p{lower} } ( 0 .. 2 ); + my $stop_regex = $self->regex( '*', $strand ); + + # Look for all the stops in our sequence using a regular expression. A + # lookahead is used to cope with the possibility of overlapping stop + # codons + + pos($$seq_ref) = $p{lower}; - $lowers[$frame] = $curUpper; + while ( $$seq_ref =~ /(?=stop_regex)/gx ) { + + # Get the location of the upper bound. Add 3 for the length of the + # stop codon if we are on the + strand. + my $upper = pos($$seq_ref) + ( $strand == 1 ? 3 : 0 ); + + # End the iteration if we are out of range + last if ( $upper > $p{upper} ); + + # Call our helper function + $self->_getORF( $strand, \@lowers, $upper, $p{lower}, \%ORF ); + } + + # Now evaluate for the last three ORFS + foreach my $i ( 0 .. 2 ) { + my $upper = $p{upper} - $i; + $self->_getORF( $strand, \@lowers, $upper, $p{lower}, \%ORF ); } + # NOTE: Perl's regular expression engine could be faster than code + # execution, so it may be faster to find ORFS using regular expression + # matching an entire ORF. + # m/(?=(^|$stop)((.{3})*)($stop|$))/g } - return [ $best_strand, $lower, $upper ]; + return \%ORF; +} + +# Helper function for getORF above. +sub _getORF { + my $self = shift; + my ( $strand, $lowers, $upper, $offset, $longest ) = @_; + + # Calculate the frame relative to the starting offset + my $frame = ( $upper - $offset ) % 3; + + # Compare if this is better than the longest ORF + $self->_compare_regions( + $longest, + { + strand => $strand, + lower => $lowers->[$frame], + upper => $upper + } + ); + + # Mark the lower bound for this frame + $lowers->[$frame] = $upper; } -=item getCDS() +=head2 getCDS -=item [$start, $stop] = $translator->getCDS($seqRef, $strand, $strict); + my $cds_ref = $translator->getCDS( $seq_ref ); + my $cds_ref = $translator->getCDS( $seq_ref, \%params ); -This will return the strand and boundaries of the longest -CDS. +This will return the strand and boundaries of the longest CDS. 0 1 2 3 4 5 6 7 8 9 10 A T G A A A T A A G >>>>> ***** <---------------> -Will return [1, 0, 9]. +Will return: -Strict controls how strictly getCDS functions. -There are 3 levels of strictness, enumerated 0, 1 and 2. 2 is the most strict, -and in that mode, a region will only be considered a CDS if both the start and -stop is found. In strict level 1, if a start is found, but no stop is present -before the end of the sequence, the CDS will run until the end of the sequence. -Strict level 0 assumes that start codon is present in each frame just before -the start of the molecule. + { + strand => 1, + lower => 0, + upper => 9 + } -Example: +It takes the following parameters: + + strand: 0, 1 or -1; default = 0 (meaning search both strands) + strict: 0, 1 or 2; default = 1 + sanitized: 0 or 1; default = 0 - $ref = $translator->getCDSs(\'ATGAAATAG'); - $ref = $translator->getCDSs(\'ATGAAATAG', -1); +Strict controls how strictly getCDS functions. There are 3 levels of +strictness, enumerated 0, 1 and 2. 2 is the most strict, and in that mode, a +region will only be considered a CDS if both the start and stop is found. In +strict level 1, if a start is found, but no stop is present before the end of +the sequence, the CDS will run until the end of the sequence. Strict level 0 +assumes that start codon is present in each frame just before the start of the +molecule. Level 1 is a pretty safe bet, so that is the default. -Output: +Example: - $ref = [$strand, $lower, $upper] + my $cds_ref = $translator->getCDS(\'ATGAAATAG'); + my $cds_ref = $translator->getCDS(\$seq, { strand => -1 } ); + my $cds_ref = $translator->getCDS(\$seq, { strict => 2 } ); =cut sub getCDS { + TRACE('getCDS called'); + my $self = shift; - my ( $seqRef, $strand, $strict ) - = validate_pos( @_, - { type => SCALARREF }, - { default => 0, - regex => qr/^[+-]?[01]$/ - }, - { default => 1, - regex => qr/^[012]$/ - } - ); - - DEBUG('getCDS called'); - - my ( $best_strand, $lower, $upper ) = ( 1, 0, 0 ); - - foreach my $cur_strand ( $strand == 0 ? ( -1, 1 ) : ($strand) ) { - my $lowerRegex = $self->regex( 'lower', $cur_strand ); - my $upperRegex = $self->regex( 'upper', $cur_strand ); - - ######################################## - # Initialize - my @lowers; - if ( $cur_strand == 1 ) { - @lowers = ( $strict != 0 ? map {undef} ( 0 .. 2 ) : ( 0 .. 2 ) ); - } - else { - @lowers = ( $strict == 2 - ? map {undef} ( 0 .. 2 ) - : ( 0 .. 2 ) - ); - } + my ( $seq_ref, @p ); + ( $seq_ref, $p[0] ) = validate_pos( + @_, + { type => Params::Validate::SCALARREF, }, + { type => Params::Validate::HASHREF, default => {} } + ); - ######################################## - # Similar to getORF, rather than - # using a regular expression to find - # entire regions, instead find - # individual starts and stops and react - # accordingly. It captures the starts - # and stops separately ($1 vs $2) so - # that it is easy to tell if a start or - # a stop was matched. - # - # If strict mode is at level 2, we don't - # tes is a newly added feature which t CDSs trailing off the end - # of the molecule - - my $regex = qr/(?=($lowerRegex)|($upperRegex))/; - $regex = qr/$regex|(?=.{0,2}$)/ unless ( $strict == 2 ); - - while ( $$seqRef =~ /$regex/g ) { - - my $position = pos $$seqRef; - my $frame = $position % 3; - - ######################################## - # If we match the lower regex we: - # - # In the case that we are on the '-' - # strand, that means we found a stop, - # so we update the lower bound. - # - # Otherwise, we are on the positive - # strand, meaning we have found the - # start, so only set the lower bound if - # it is not already set (don't want to - # overwrite the location of a previous - # start codon). - - if ( $1 - && ( ( $cur_strand eq '-' ) - || ( !defined $lowers[$frame] ) ) - ) - { - $lowers[$frame] = $position; - } + my %p = validate( + @p, + { + strand => { + default => 0, + regex => qr/^[+-]?1$/, + }, + lower => { + default => 0, + regex => qr/^[0-9]+$/, + type => Params::Validate::SCALAR, + callbacks => { + 'lower >= 0' => sub { $_[0] >= 0 }, + 'lower <= seq_length' => sub { $_[0] <= length($$seq_ref) } + } + }, + upper => { + default => length($$seq_ref), + regex => qr/^[0-9]+$/, + type => Params::Validate::SCALAR, + callbacks => { + 'upper >= 0' => sub { $_[0] >= 0 }, + 'upper <= seq_length' => sub { $_[0] <= length($$seq_ref) } + } + }, + strict => { + default => 1, + regex => qr/^[012]$/, - ######################################## - # If we don't match the lower regex: - # - # If this is the positive strand, that - # means that this is a valid stop - - # either a stop codon or the end of the - # string. Reset the lower bound in this - # case. - # - # On the negative strand, we only care - # if we matched a start. In that case, - # do the compute and update. - # - # Another option would be to mark where - # the start is, and only do the compute - # when we find a stop. + }, + sanitized => { default => $DEFAULT_SANITIZED } + } + ); - elsif ( ( $cur_strand == 1 ) || $2 ) { + return undef if ( $p{upper} < $p{lower} ); - # Move on if the lower is unset - next unless ( defined $lowers[$frame] ); + $seq_ref = cleanDNA($seq_ref) unless ( $p{sanitized} ); - $position += length $2 if ($2); + # Initialize the longest ORF. Length is -1. + my %CDS = ( + strand => 0, + lower => 0, + upper => -1 + ); - if ( $upper - $lower < $position - $lowers[$frame] ) { - $best_strand = $cur_strand; - $lower = $lowers[$frame]; - $upper = $position; - } + foreach my $strand ( $p{strand} == 0 ? ( -1, 1 ) : $p{strand} ) { + my $lower_regex = $self->regex( 'lower', $strand ); + my $upper_regex = $self->regex( 'upper', $strand ); - # Reset lower if we found a stop - undef $lowers[$frame] if ( $cur_strand == 1 ); - } - } - } + # Initialize lowers. On the + strand, we don't set the lower bounds + # unless strict is 0. On the - strand, we don't set the lower bounds if + # strict is 2. Otherwise, set the lower boudns to be the first bases. + my @lowers = + ( ( ( $strand == 1 ) && ( $p{strict} != 0 ) ) + || ( ( $strand == -1 ) && ( $p{strict} == 2 ) ) ) + ? (undef) x 3 + : map { $p{lower} + $_ } ( 0 .. 2 ); - return [ $best_strand, $lower, $upper ]; -} + # Similar to getORF, rather than using a regular expression to find + # entire coding regions, instead find individual starts and stops and + # react accordingly. + # The regular expression captures the starts and stops separately + # ($1 vs $2) so that it is easy to tell if a start or a stop was + # matched. -=item find() + pos($$seq_ref) = $p{lower}; -=item $positions = $translator->find( $seqRef, $type, $strand ) + while ( $$seq_ref =~ /(?=($lower_regex)|($upper_regex))/g ) { + my $position = pos $$seq_ref; + last if ( $position > $p{upper} ); -Find codons of given type (i.e. start or stop) in the sequence. Note, the -second two parameters are passed directly to regex, so please look there for -more information. Returns an arrayref containing all the locations of all those -codons. + my $frame = $position % 3; -=cut + # If the lower regex matches: + # + # In the case that it is on the '-' strand, that means a stop was + # found. CDSs always end on stops, so update the lower bound. + # + # Otherwise, it is on the positive strand, meaning a start was + # found. Internal start codons are allowed, so only set the lower + # bound if it is not already set. + if ($1) { + if ( ( $strand == -1 ) + || ( !defined $lowers[$frame] ) ) + + { + $lowers[$frame] = $position; + } + } -sub find { - my $self = shift; + # If the lower regex wasn't matched, the the upper one was. + # + # If this is the positive strand, that means that this is a stop + # codon. Compute the CDS, update if necessary, and reset the lower + # bound in this case. + # + # On the negative strand, that means that a start was matched. + # Compute the CDS, update if necessary, but don't reset the lower + # bound. - my @seqRef = splice @_, 0, 1; - my $seqRef = validate_pos( @seqRef, { type => SCALARREF } ); + else { + $position += 3; + last if ( $position > $p{upper} ); + + $self->_getCDS( $strand, \@lowers, $position, $p{lower}, + \%CDS ); + } + } - my $regex = $self->regex(@_); + # If strict mode is at level 2, we don't allow CDSs to trail off the + # end of the molecule. We also don't allow the end to trail off if we + # are on the - strand and strict isn't 0. - my @positions; + next + if ( ( $p{strict} == 2 ) + || ( ( $strand == -1 ) && ( $p{strict} != 0 ) ) ); - while ( $$seqRef =~ /(?=($regex))/g ) { - push @positions, pos $$seqRef; + foreach my $i ( 0 .. 2 ) { + my $upper = $p{upper} - $i; + $self->_getCDS( $strand, \@lowers, $upper, $p{lower}, \%CDS ); + } } - return \@positions; + return \%CDS; } -=item regex() +# Helper function for getORF above. +sub _getCDS { + my $self = shift; + my ( $strand, $lowers, $upper, $offset, $longest ) = @_; -=item $regex = $translator->regex( $type, $strand ) + # Calculate the frame relative to the starting offset + my $frame = ( $upper - $offset ) % 3; -Returns a regular expression of a certain type for a given strand. These are -'start', 'stop', 'lower' and 'upper.' Lower and upper match the lower and upper -end of a CDS for a given strand (i.e. on the positive strand, lower matches -the start, and upper matches the stop). + # Do nothing if lower bound wasn't defined + return unless ( defined $lowers->[$frame] ); -=cut + # Compare if this is better than the longest ORF + $self->_compare_regions( + $longest, + { + strand => $strand, + lower => $lowers->[$frame], + upper => $upper + } + ); -sub regex { - my $self = shift; - my ( $type, $strand ) - = validate_pos( @_, - { regex => qr/^(?:start|stop|lower|upper)$/ }, - { default => 1, - regex => qr/^[+-]?1$/ - } - ); - - my $prefix = $strand == 1 ? '' : 'rc_'; - - if ( $type eq 'lower' ) { $type = $strand == 1 ? 'start' : 'stop' } - elsif ( $type eq 'upper' ) { $type = $strand == -1 ? 'start' : 'stop' } - - unless ( defined $self->{"${prefix}${type}Regex"} ) { - my $regex = join '|', - ( $type eq 'start' - ? keys %{ $self->{"${prefix}starts"} } - : @{ $$self{"${prefix}reverse"}{'*'} } - ); - $self->{"${prefix}${type}Regex"} = qr/$regex/; - } + # Mark the lower bound for this frame + undef $lowers->[$frame] if ( $strand == 1 ); +} - return $self->{"${prefix}${type}Regex"}; +# If the current ORF is longer than the previously stored longest bounds, store +# the current ORF +sub _compare_regions { + my $self = shift; + my ( $longest, $current ) = @_; + %$longest = %$current + if ( $longest->{upper} - $longest->{lower} < + $current->{upper} - $current->{lower} ); } -=item nonstop +=head2 nonstop -=item $frames = $translator->nonstop( $seqRef, $strand ) + my $frames = $translator->nonstop( $seq_ref ); + my $frames = $translator->nonstop( $seq_ref, \%params ); -Returns the frames that contain no stop codons for the sequence. $strand is -optional and defaults to 0. Frames are 1, 2, 3, -1, -2, -3. +Returns the frames that contain no stop codons for the sequence. Valid +parameters are strand and sanitized. strand is defaults to 0. Frames are +numbered -3, -2, -1, 1, 2 and 3. - 3 ----> - 2 -----> - 1 ------> - ------- - -1 <------ - -2 <----- - -3 <---- + 3 ----> + 2 -----> + 1 ------> + ------- + -1 <------ + -2 <----- + -3 <---- Example: - $frames = $translator->nonstop(\'TACGTTGGTTAAGTT'); # [-1, -3, 2, 3] - $frames = $translator->nonstop(\'TACGTTGGTTAAGTT', 1); # [2, 3] - $frames = $translator->nonstop(\'TACGTTGGTTAAGTT', -1); # [-1, -3] + my $frames = $translator->nonstop(\'TACGTTGGTTAAGTT'); # [ 2, 3, -1, -3 ] + my $frames = $translator->nonstop(\$seq, { strand => 1 } ); # [ 2, 3 ] + my $frames = $translator->nonstop(\$seq, { strand => -1 } ); # [ -1, -3 ] =cut sub nonstop { + TRACE('nonstop called'); + my $self = shift; - my ( $seqRef, $strand ) - = validate_pos( @_, - { type => SCALARREF }, - { default => 0, - regex => qr/^[+-]?1$/ - } - ); + + my ( $seq_ref, @p ); + ( $seq_ref, $p[0] ) = validate_pos( + @_, + { type => Params::Validate::SCALARREF }, + { type => Params::Validate::HASHREF, default => {} } + ); + + my %p = validate( + @p, + { + strand => { + default => 0, + regex => qr/^[+-]?[01]$/, + type => Params::Validate::SCALAR + }, + sanitized => { default => $DEFAULT_SANITIZED } + } + ); + + $seq_ref = cleanDNA($seq_ref) unless ( $p{sanitized} ); my @frames; - foreach my $cur_strand ( $strand == 0 ? ( -1, 1 ) : ($strand) ) { - my $stop = $self->regex( 'stop', $cur_strand ); + foreach my $strand ( $p{strand} == 0 ? ( 1, -1 ) : $p{strand} ) { + my $stop = $self->regex( '*', $strand ); foreach my $frame ( 0 .. 2 ) { - my $regex = $cur_strand == 1 - ? qr/^.{$frame}(?:.{3})*$stop/ - : qr/$stop(?:.{3})*.{$frame}$/; + my $regex = + $strand == 1 + ? qr/^.{$frame}(?:.{3})*$stop/ + : qr/$stop(?:.{3})*.{$frame}$/; - push @frames, ( $frame + 1 ) * $cur_strand - unless ( $$seqRef =~ m/$regex/ ); + push @frames, ( $frame + 1 ) * $strand + unless ( $$seq_ref =~ m/$regex/ ); } } - + return \@frames; } 1; + +=head1 AUTHOR + +Kevin Galinsky, + +=head1 COPYRIGHT & LICENSE + +Copyright 2008-2009 J. Craig Venter Institute, all rights reserved. + +This program is free software; you can redistribute it and/or modify it +under the same terms as Perl itself. + +=cut diff --git a/t/00-load.t b/t/00-load.t index 0cf46a0..ecfb6c0 100644 --- a/t/00-load.t +++ b/t/00-load.t @@ -1,9 +1,10 @@ #!perl -use Test::More tests => 1; +use Test::More tests => 2; BEGIN { use_ok( 'JCVI::Translator' ); + use_ok( 'JCVI::Translator::Utils' ); } -diag( "Testing JCVI::Translator $JCVI::Translator::VERSION, Perl $], $^X" ); +diag( "Testing JCVI::Translator $JCVI::Translator::VERSION, Perl $], $^X" ); \ No newline at end of file diff --git a/t/01-translate.t b/t/01-translate.t new file mode 100644 index 0000000..f68254e --- /dev/null +++ b/t/01-translate.t @@ -0,0 +1,42 @@ +#!perl + +use Test::More 'no_plan'; + +use JCVI::Translator; + +my $seq = 'CTGATATCATGCATGCCATTCTCGACCGCTATGCGCCTCCTGTTCCTCGTGGGCCCAAAA'; + +my $translator = new JCVI::Translator(); + +is( ${ $translator->translate( \$seq ) }, + 'MISCMPFSTAMRLLFLVGPK', 'Translate frame 1' ); + +is( ${ $translator->translate( \$seq, { lower => 1 } ) }, + '*YHACHSRPLCASCSSWAQ', 'Translate frame 2' ); + +is( ${ $translator->translate( \$seq, { lower => 2 } ) }, + 'DIMHAILDRYAPPVPRGPK', 'Translate frame 3' ); + +is( ${ $translator->translate( \$seq, { strand => -1 } ) }, + 'FWAHEEQEAHSGREWHA*YQ', 'Translate frame -1' ); + +is( ${ $translator->translate( \$seq, { strand => -1, upper => 59 } ) }, + 'FGPTRNRRRIAVENGMHDI', 'Translate frame -2' ); + +is( ${ $translator->translate( \$seq, { strand => -1, upper => 58 } ) }, + 'MGPRGTGGA*RSRMACMIS', 'Translate frame -3' ); + +is( + ${ $translator->translate( \$seq, { partial => 1 } ) }, + 'LISCMPFSTAMRLLFLVGPK', + q{Translate 5' partial frame 1} +); + +is( + ${ + $translator->translate( \$seq, + { strand => -1, upper => 58, partial => 1 } ) + }, + 'LGPRGTGGA*RSRMACMIS', + q{Translate 5' partial frame -3} +); \ No newline at end of file diff --git a/t/02-translate_exons.t b/t/02-translate_exons.t new file mode 100644 index 0000000..d2ed94e --- /dev/null +++ b/t/02-translate_exons.t @@ -0,0 +1,120 @@ +#!perl + +use Test::More 'no_plan'; + +use JCVI::Translator; + +my $seq = 'CTGATATCATGCATGCCATTCTCGACCGCTATGCGCCTCCTGTTCCTCGTGGGCCCAAAA'; + +my $translator = new JCVI::Translator(); + +# Simple cases +is( ${ $translator->translate_exons( \$seq, [ [ 0, 60 ] ] ) }, + 'MISCMPFSTAMRLLFLVGPK', 'Translate frame 1' ); +is( + ${ + $translator->translate_exons( \$seq, [ [ 0, 60 ] ], { partial => 1 } ) + }, + 'LISCMPFSTAMRLLFLVGPK', + 'Translate frame 1 with partial flag' +); + +is( + ${ + $translator->translate_exons( \$seq, [ [ 0, 60 ] ], { strand => -1 } ) + }, + 'FWAHEEQEAHSGREWHA*YQ', + 'Translate frame -1' +); + +# Break in the middle +is( ${ $translator->translate_exons( \$seq, [ [ 0, 30 ], [ 30, 60 ] ] ) }, + 'MISCMPFSTAMRLLFLVGPK', 'Translate frame 1 with break' ); + +is( + ${ + $translator->translate_exons( + \$seq, + [ [ 0, 30 ], [ 30, 60 ] ], + { strand => -1 } + ) + }, + 'FWAHEEQEAHSGREWHA*YQ', + 'Translate frame -1 with break' +); + +# Small codons +is( + ${ + $translator->translate_exons( \$seq, + [ map { [ $_, $_ + 1 ] } ( 0 .. 59 ) ] ) + }, + 'MISCMPFSTAMRLLFLVGPK', + 'Translate frame 1 with small breaks' +); + +is( + ${ + $translator->translate_exons( + \$seq, + [ map { [ $_, $_ + 1 ] } ( 0 .. 59 ) ], + { strand => -1 } + ) + }, + 'FWAHEEQEAHSGREWHA*YQ', + 'Translate frame -1 with break' +); + +# Gaps +is( ${ $translator->translate_exons( \$seq, [ [ 0, 18 ], [ 21, 60 ] ] ) }, + 'MISCMPSTAMRLLFLVGPK', 'Translate frame 1 with gap' ); + +is( + ${ + $translator->translate_exons( + \$seq, + [ [ 0, 6 ], [ 9, 60 ] ], + { strand => -1 } + ) + }, + 'FWAHEEQEAHSGREWHAYQ', + 'Translate frame -1 with gap' +); + +# Traverse reading frames +is( ${ $translator->translate_exons( \$seq, [ [ 0, 18 ], [ 28, 60 ] ] ) }, + 'MISCMPLCASCSSWAQ', 'Translate with gap going from frame 1 to 2' ); + +is( + ${ + $translator->translate_exons( + \$seq, + [ [ 6, 21 ], [ 31, 58 ] ], + { strand => -1 } + ) + }, + 'MGPRGTGGAEWHA*', + 'Translate with gap going from frame -3 to -1' +); + +# Translate with random gaps +is( + ${ + $translator->translate_exons( \$seq, + [ [ 6, 23 ], [ 26, 32 ], [ 36, 54 ] ] ) + }, + 'SCMPFSAISCSSW', + 'Translate with random gaps on + strand' +); + +is( + ${ + $translator->translate_exons( + \$seq, + [ [ 6, 23 ], [ 26, 32 ], [ 36, 54 ] ], + { strand => -1 } + ) + }, + 'AHEEQEIAENGMH', + 'Translate with random gaps on - strand' +); diff --git a/t/03-translate6.t b/t/03-translate6.t new file mode 100644 index 0000000..5ccd911 --- /dev/null +++ b/t/03-translate6.t @@ -0,0 +1,10 @@ +#!perl + +use Test::More 'no_plan'; +use JCVI::DNATools qw(randomDNA); + +use JCVI::Translator; + +my $translator = new JCVI::Translator; + +ok( $translator->translate6( randomDNA() ), 'translate6 ran' ); diff --git a/t/04-custom.t b/t/04-custom.t new file mode 100644 index 0000000..bfccef0 --- /dev/null +++ b/t/04-custom.t @@ -0,0 +1,27 @@ +#!perl + +use Test::More 'no_plan'; + +use JCVI::Translator; + +my $translator; + +ok( + $translator = custom JCVI::Translator( + \'{ +name "Standard" , +name "SGC0" , +id 1 , +ncbieaa "FFLLSSSSYY**CC*WLLLLPPPPHHQQRRRRIIIMTTTTNNKKSSRRVVVVAAAADDEEGGGG", +sncbieaa "---M---------------M---------------M----------------------------" +-- Base1 TTTTTTTTTTTTTTTTCCCCCCCCCCCCCCCCAAAAAAAAAAAAAAAAGGGGGGGGGGGGGGGG +-- Base2 TTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGGTTTTCCCCAAAAGGGG +-- Base3 TCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAGTCAG +}' + ), + 'Custom translation table loaded' +); + +ok( + $translator->table_string(), 'Table string' +); \ No newline at end of file diff --git a/t/21-nonstop.t b/t/21-nonstop.t new file mode 100644 index 0000000..ea85f66 --- /dev/null +++ b/t/21-nonstop.t @@ -0,0 +1,47 @@ +#!perl + +use Test::More 'no_plan'; +use List::Compare::Functional 'get_symdiff'; + +use JCVI::Translator::Utils; + +my $utils = new JCVI::Translator::Utils; + +is( + scalar( + get_symdiff( + [ + $utils->nonstop( \'TACGTTGGTTAAGTT' ), + [ 2, 3, -1, -3 ] + ] + ) + ), + 0, + 'Nonstop on both strands' +); + +is( + scalar( + get_symdiff( + [ + $utils->nonstop( \'TACGTTGGTTAAGTT', { strand => 1 } ), + [ 2, 3 ] + ] + ) + ), + 0, + 'Nonstop on + strand' +); + +is( + scalar( + get_symdiff( + [ + $utils->nonstop( \'TACGTTGGTTAAGTT', { strand => -1 } ), + [ -1, -3 ] + ] + ) + ), + 0, + 'Nonstop on - strand' +); \ No newline at end of file diff --git a/t/22-getORF.t b/t/22-getORF.t new file mode 100644 index 0000000..801c768 --- /dev/null +++ b/t/22-getORF.t @@ -0,0 +1,10 @@ +#!perl + +use Test::More 'no_plan'; +use JCVI::DNATools qw(randomDNA); + +use JCVI::Translator::Utils; + +my $utils = new JCVI::Translator::Utils; + +ok( $utils->getORF( randomDNA() ), 'getORF ran' ); diff --git a/t/23-getCDS.t b/t/23-getCDS.t new file mode 100644 index 0000000..ae6be27 --- /dev/null +++ b/t/23-getCDS.t @@ -0,0 +1,10 @@ +#!perl + +use Test::More 'no_plan'; +use JCVI::DNATools qw(randomDNA); + +use JCVI::Translator::Utils; + +my $utils = new JCVI::Translator::Utils; + +ok( $utils->getCDS( randomDNA() ), 'getCDS ran' );