diff --git a/CHANGELOG.md b/CHANGELOG.md index ce99715b..3d8eba29 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -5,8 +5,28 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/) and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html). ## [UNRELEASED] -### Added -* Improved error message when encoding LongDNA from byte-like objects + +## [3.1.6] +* The heuristics for translating sequences with ambiguous symbols is now improved. + Now, `translate` does not rely on heuristics but uses an algorithm that always + returns exactly the right amino acid in the face of ambiguous nucleotides. + +## [3.1.5] +* Attempting to translate a nucleotide sequence with gap symbols now throws an error (#278, see #277) + +## [3.1.4] +* Migrate from SnoopPrecompile to PrecompileTools (#273) + +## [3.1.3] +* Improve error when mis-encoding `LongDNA` from byte-like inputs (#267) +* Remove references to internal `Random.GLOBAL_RNG` (#265) + +## [3.1.2] +* Fix bug in converting `LongSubSeq` to `LongSequence` (#261) + +## [3.1.1] +* Add `iterate` method for `Alphabets` (#233) +* Add SnoopPrecompile workload and dependency on SnoopPrecompile (#257) ## [3.1.0] ### Added diff --git a/Project.toml b/Project.toml index 1851fa5e..c0570804 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "BioSequences" uuid = "7e6ae17a-c86d-528c-b3b9-7f778a29fe59" authors = ["Sabrina Jaye Ward ", "Jakob Nissen "] -version = "3.1.5" +version = "3.1.6" [deps] BioSymbols = "3c28c6f8-a34d-59c4-9654-267d177fcfa9" diff --git a/src/geneticcode.jl b/src/geneticcode.jl index 7fab9d42..de3da73a 100644 --- a/src/geneticcode.jl +++ b/src/geneticcode.jl @@ -377,50 +377,51 @@ function translate!(aaseq::LongAA, elseif iscertain(a) & iscertain(b) & iscertain(c) aaseq[i] = code[unambiguous_codon(a, b, c)] else - aa = try_translate_ambiguous_codon(code, a, b, c) - if aa === nothing - if allow_ambiguous_codons - aa = AA_X - else - error("codon ", a, b, c, " cannot be unambiguously translated") - end - end - aaseq[i] = aa + aaseq[i] = try_translate_ambiguous_codon(code, a, b, c, allow_ambiguous_codons) end end alternative_start && !isempty(aaseq) && (@inbounds aaseq[1] = AA_M) aaseq end - -function try_translate_ambiguous_codon(code::GeneticCode, - x::RNA, - y::RNA, - z::RNA) - @inbounds if !isambiguous(x) & !isambiguous(y) - # try to translate a codon `(x, y, RNA_N)` - aa_a = code[unambiguous_codon(x, y, RNA_A)] - aa_c = code[unambiguous_codon(x, y, RNA_C)] - aa_g = code[unambiguous_codon(x, y, RNA_G)] - aa_u = code[unambiguous_codon(x, y, RNA_U)] - if aa_a == aa_c == aa_g == aa_u - return aa_a +function try_translate_ambiguous_codon( + code::GeneticCode, + x::RNA, + y::RNA, + z::RNA, + allow_ambiguous::Bool +)::AminoAcid + ((a, b, c), unambigs) = Iterators.peel( + Iterators.product(map(UnambiguousRNAs, (x, y, z))...) + ) + aa = @inbounds code[unambiguous_codon(a, b, c)] + @inbounds for (a, b, c) in unambigs + aa_new = code[unambiguous_codon(a, b, c)] + aa_new == aa && continue + allow_ambiguous || error("codon ", a, b, c, " cannot be unambiguously translated") + aa = if aa_new in (AA_N, AA_D) && aa in (AA_N, AA_D, AA_B) + AA_B + elseif aa_new in (AA_I, AA_L) && aa in (AA_I, AA_L, AA_B) + AA_J + elseif aa_new in (AA_Q, AA_E) && aa in (AA_Q, AA_E, AA_Z) + AA_Z + else + AA_X end + aa == AA_X && break end + return aa +end - found::Union{AminoAcid, Nothing} = nothing - for (codon, aa) in code - # TODO: Make this more tidy - maybe reuse the decode method for DNAAlph{4}. - a = reinterpret(RNA, 0x1 << ((codon >> 4) & 0x3)) - b = reinterpret(RNA, 0x1 << ((codon >> 2) & 0x3)) - c = reinterpret(RNA, 0x1 << (codon & 0x3)) - @inbounds if (iscompatible(x, a) & iscompatible(y, b) & iscompatible(z, c)) - if found === nothing - found = aa - elseif aa != found - return nothing - end - end - end - return found +struct UnambiguousRNAs + x::RNA end + +Base.eltype(::Type{UnambiguousRNAs}) = RNA +Base.length(x::UnambiguousRNAs) = count_ones(reinterpret(UInt8, x.x)) +function Base.iterate(x::UnambiguousRNAs, state=reinterpret(UInt8, x.x)) + iszero(state) && return nothing + rna = reinterpret(RNA, 0x01 << (trailing_zeros(state) & 7)) + (rna, state & (state - 0x01)) +end + diff --git a/test/translation.jl b/test/translation.jl index fcf71a97..f4d744b4 100644 --- a/test/translation.jl +++ b/test/translation.jl @@ -60,6 +60,8 @@ # ambiguous codons @test translate(rna"YUGMGG") == aa"LR" @test translate(rna"GAYGARGAM") == aa"DEX" + @test translate(rna"MUCGGG") == aa"JG" + @test translate(rna"AAASAAUUU") == aa"KZF" # BioSequences{RNAAlphabet{2}} @test translate(LongSequence{RNAAlphabet{2}}("AAAUUUGGGCCC")) == translate(rna"AAAUUUGGGCCC")