Skip to content

Commit

Permalink
Improve heuristics of translating ambiguous nucs
Browse files Browse the repository at this point in the history
Use a more principled approach, such that we now guarantee that ambiguous DNA
and RNA is translated to exactly the set of amino acids the genetic code
corresponds to.
  • Loading branch information
jakobnissen committed Sep 26, 2023
1 parent 1737991 commit 4839f27
Show file tree
Hide file tree
Showing 4 changed files with 63 additions and 40 deletions.
24 changes: 22 additions & 2 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,28 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [UNRELEASED]
### Added
* Improved error message when encoding LongDNA from byte-like objects

## [3.1.6]
* The heuristics for translating sequences with ambiguous symbols is now improved.
Now, `translate` does not rely on heuristics but uses an algorithm that always
returns exactly the right amino acid in the face of ambiguous nucleotides.

## [3.1.5]
* Attempting to translate a nucleotide sequence with gap symbols now throws an error (#278, see #277)

## [3.1.4]
* Migrate from SnoopPrecompile to PrecompileTools (#273)

## [3.1.3]
* Improve error when mis-encoding `LongDNA` from byte-like inputs (#267)
* Remove references to internal `Random.GLOBAL_RNG` (#265)

## [3.1.2]
* Fix bug in converting `LongSubSeq` to `LongSequence` (#261)

## [3.1.1]
* Add `iterate` method for `Alphabets` (#233)
* Add SnoopPrecompile workload and dependency on SnoopPrecompile (#257)

## [3.1.0]
### Added
Expand Down
2 changes: 1 addition & 1 deletion Project.toml
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
name = "BioSequences"
uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59"
authors = ["Sabrina Jaye Ward <[email protected]>", "Jakob Nissen <[email protected]>"]
version = "3.1.5"
version = "3.1.6"

[deps]
BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9"
Expand Down
75 changes: 38 additions & 37 deletions src/geneticcode.jl
Original file line number Diff line number Diff line change
Expand Up @@ -377,50 +377,51 @@ function translate!(aaseq::LongAA,
elseif iscertain(a) & iscertain(b) & iscertain(c)
aaseq[i] = code[unambiguous_codon(a, b, c)]
else
aa = try_translate_ambiguous_codon(code, a, b, c)
if aa === nothing
if allow_ambiguous_codons
aa = AA_X
else
error("codon ", a, b, c, " cannot be unambiguously translated")
end
end
aaseq[i] = aa
aaseq[i] = try_translate_ambiguous_codon(code, a, b, c, allow_ambiguous_codons)
end
end
alternative_start && !isempty(aaseq) && (@inbounds aaseq[1] = AA_M)
aaseq
end


function try_translate_ambiguous_codon(code::GeneticCode,
x::RNA,
y::RNA,
z::RNA)
@inbounds if !isambiguous(x) & !isambiguous(y)
# try to translate a codon `(x, y, RNA_N)`
aa_a = code[unambiguous_codon(x, y, RNA_A)]
aa_c = code[unambiguous_codon(x, y, RNA_C)]
aa_g = code[unambiguous_codon(x, y, RNA_G)]
aa_u = code[unambiguous_codon(x, y, RNA_U)]
if aa_a == aa_c == aa_g == aa_u
return aa_a
function try_translate_ambiguous_codon(
code::GeneticCode,
x::RNA,
y::RNA,
z::RNA,
allow_ambiguous::Bool
)::AminoAcid
((a, b, c), unambigs) = Iterators.peel(
Iterators.product(map(UnambiguousRNAs, (x, y, z))...)
)
aa = @inbounds code[unambiguous_codon(a, b, c)]
@inbounds for (a, b, c) in unambigs
aa_new = code[unambiguous_codon(a, b, c)]
aa_new == aa && continue
allow_ambiguous || error("codon ", a, b, c, " cannot be unambiguously translated")
aa = if aa_new in (AA_N, AA_D) && aa in (AA_N, AA_D, AA_B)
AA_B
elseif aa_new in (AA_I, AA_L) && aa in (AA_I, AA_L, AA_B)
AA_J
elseif aa_new in (AA_Q, AA_E) && aa in (AA_Q, AA_E, AA_Z)
AA_Z
else
AA_X
end
aa == AA_X && break
end
return aa
end

found::Union{AminoAcid, Nothing} = nothing
for (codon, aa) in code
# TODO: Make this more tidy - maybe reuse the decode method for DNAAlph{4}.
a = reinterpret(RNA, 0x1 << ((codon >> 4) & 0x3))
b = reinterpret(RNA, 0x1 << ((codon >> 2) & 0x3))
c = reinterpret(RNA, 0x1 << (codon & 0x3))
@inbounds if (iscompatible(x, a) & iscompatible(y, b) & iscompatible(z, c))
if found === nothing
found = aa
elseif aa != found
return nothing
end
end
end
return found
struct UnambiguousRNAs
x::RNA
end

Base.eltype(::Type{UnambiguousRNAs}) = RNA
Base.length(x::UnambiguousRNAs) = count_ones(reinterpret(UInt8, x.x))
function Base.iterate(x::UnambiguousRNAs, state=reinterpret(UInt8, x.x))
iszero(state) && return nothing
rna = reinterpret(RNA, 0x01 << (trailing_zeros(state) & 7))
(rna, state & (state - 0x01))
end

2 changes: 2 additions & 0 deletions test/translation.jl
Original file line number Diff line number Diff line change
Expand Up @@ -60,6 +60,8 @@
# ambiguous codons
@test translate(rna"YUGMGG") == aa"LR"
@test translate(rna"GAYGARGAM") == aa"DEX"
@test translate(rna"MUCGGG") == aa"JG"
@test translate(rna"AAASAAUUU") == aa"KZF"

# BioSequences{RNAAlphabet{2}}
@test translate(LongSequence{RNAAlphabet{2}}("AAAUUUGGGCCC")) == translate(rna"AAAUUUGGGCCC")
Expand Down

0 comments on commit 4839f27

Please sign in to comment.