From f7a1328a744e30840326943bcace9aa6590aba3c Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Wed, 13 Jul 2022 17:51:00 +0200 Subject: [PATCH] Add reverse translation (#20) Add reverse translation --- docs/src/translate.md | 41 +++++++++ src/Kmers.jl | 9 ++ src/kmer.jl | 2 + src/revtrans.jl | 198 ++++++++++++++++++++++++++++++++++++++++++ test/translation.jl | 156 +++++++++++++++++++++++++++++++++ 5 files changed, 406 insertions(+) create mode 100644 src/revtrans.jl diff --git a/docs/src/translate.md b/docs/src/translate.md index 3c0d92b..f523648 100644 --- a/docs/src/translate.md +++ b/docs/src/translate.md @@ -19,3 +19,44 @@ MGH* ``` For more information on translation and different genetic codes, see the documentation of BioSequences.jl. + +## Reverse translation +Reverse translation (or "revtrans", for short) refers to the mapping from amino acids back to the set of RNA codons that code for the given amino acid, under a given genetic code. +There is no known natural process of revtrans, but it can be useful to do _in silico_. + +In Kmers.jl, revtrans is done through the `reverse_translate` function. +This takes an amino acid sequence and produces a `Vector{CodonSet}`, where `CodonSet <: AbstractSet{RNACodon}`. +Alternatively, it takes an amino acid and produces a `CodonSet`. + +A reverse genetic code can optionally be specified as the second argument. +If not provided, it default to the reverse standard genetic code. + +### Example of reverse translation +```julia +julia> reverse_translate(AA_W) # default to standard genetic code +Kmers.CodonSet with 1 element: + UGG + +julia> code = ReverseGeneticCode(BioSequences.trematode_mitochondrial_genetic_code); + +julia> reverse_translate(AA_W, code) +Kmers.CodonSet with 2 elements: + UGA + UGG +``` + +### Important notes on reverse translation +* `AA_Gap` cannot be reverse translated. Attempting so throws an error +* In cells, `AA_O` and `AA_U` are encoded by dynamic overloading of the codons `UAG` and `UGA`, respectively. + Because these codons normally code for `AA_Term`, the forward genetic code returns `AA_Term` for these codons. + However, we can unambiguously reverse translate them, so these amino acids translate to codonsets with these + precise codons. +* Ambiguous amino acids translate to the union of the possible amino acids. For example, if `AA_L` translate to set `S1`, + and `AA_I` translate to `S2`, then `AA_J` translate to `union(S1, S2)`. + +```@docs +Kmers.CodonSet +Kmers.ReverseGeneticCoTranslatingde +reverse_translate +reverse_translate! +``` diff --git a/src/Kmers.jl b/src/Kmers.jl index 983b637..be1a2c5 100644 --- a/src/Kmers.jl +++ b/src/Kmers.jl @@ -116,6 +116,15 @@ export SpacedCanonicalKmers, fw_neighbors, bw_neighbors, + + # Immutable operators + push, + delete, + + # Translation + reverse_translate, + reverse_translate!, + ReverseGeneticCode, ### ### Sequence literals diff --git a/src/kmer.jl b/src/kmer.jl index c209b7e..62f108c 100644 --- a/src/kmer.jl +++ b/src/kmer.jl @@ -569,3 +569,5 @@ macro mer_str(seq) T = kmertype(DNAKmer{length(seq′)}) return T(seq′) end + +include("revtrans.jl") \ No newline at end of file diff --git a/src/revtrans.jl b/src/revtrans.jl new file mode 100644 index 0000000..4e8dc2e --- /dev/null +++ b/src/revtrans.jl @@ -0,0 +1,198 @@ +struct Unsafe end + +const N_AA = length(AminoAcidAlphabet()) + +""" + CodonSet <: AbstractSet{RNACodon} + +A small, immutable set of `RNACodon`. + +Create an empty set using `CodonSet()`, or from an iterable of `RNACodon` +using `CodonSet(itr)`. +Because `CodonSet` is immutable, use `push` instead of `push!`, and use +the non-mutating set operations `union`, `setdiff`, etc. + +# Examples +```jldoctest +julia> v = (mer"UAG"r, mer"GGA"r, mer"UUU"r); + +julia> Set(CodonSet(v)) == Set(v) +true + +julia> union(CodonSet(v), CodonSet([mer"GAG"r])) +Kmers.CodonSet with 4 elements: + GAG + GGA + UAG + UUU +``` + +See also: `push` +""" +struct CodonSet <: AbstractSet{RNACodon} + x::UInt64 + + CodonSet(x::UInt64, ::Unsafe) = new(x) +end +CodonSet() = CodonSet(UInt64(0), Unsafe()) +CodonSet(itr) = foldl(push, itr, init=CodonSet()) + +function Base.iterate(x::CodonSet, s::UInt64=x.x) + codon = RNACodon((trailing_zeros(s) % UInt64,)) + iszero(s) ? nothing : (codon, s & (s-1)) +end + +function push(s::CodonSet, x::RNACodon) + CodonSet(s.x | (UInt64(1) << (x.data[1] & 63)), Unsafe()) +end + +Base.length(x::CodonSet) = count_ones(x.x) +Base.in(c::RNACodon, s::CodonSet) = isodd(s.x >>> (c.data[1] & 63)) +delete(s::CodonSet, x::RNACodon) = setdiff(s, CodonSet((x,))) +Base.issubset(a::CodonSet, b::CodonSet) = isempty(setdiff(a, b)) +Base.filter(f, s::CodonSet) = CodonSet(Iterators.filter(f, s)) +Base.setdiff(a::CodonSet, b::Vararg{CodonSet}) = CodonSet(a.x & ~(union(b...).x), Unsafe()) + +for (name, f) in [(:union, |), (:intersect, &), (:symdiff, ⊻)] + @eval function Base.$(name)(a::CodonSet, b::Vararg{CodonSet}) + CodonSet(mapreduce(i -> i.x, $f, b, init=a.x), Unsafe()) + end +end + +""" + ReverseGeneticCode <: AbstractDict{AminoAcid, CodonSet} + +A mapping from an amino acid `aa` to the `CodonSet` of all codons that +translate to `aa`. Conceptually, the inverse of a `BioSequences.GeneticCode`. +Used by `reverse_translate`. + +`AA_Gap` cannot be translated. Ambiguous amino acids translate to the union +of what their constituent amino acids translate to. +Pyrrolysine and selenocysteine translate to `CodonSet` containing `UAG` and `UGA`, +respectively, whereas they are not translated to in most forward genetic codes. +For these reasons, a the mapping through `ReverseGeneticCode` is not exactly +inverse of the mapping through `GeneticCode` + +# Examples +```jldoctest +julia> code = ReverseGeneticCode(BioSequences.candidate_division_sr1_genetic_code); + +julia> code[AA_E] +Kmers.CodonSet with 2 elements: + GAA + GAG + +julia> code[AA_Gap] +ERROR: Cannot reverse translate element: - +[...] +``` + +See also: [`reverse_translate`](@ref) +""" +struct ReverseGeneticCode <: AbstractDict{AminoAcid, CodonSet} + name::String + sets::NTuple{N_AA-1, CodonSet} +end + +function ReverseGeneticCode(x::BioSequences.GeneticCode) + ind(aa::AminoAcid) = reinterpret(UInt8, aa) + 1 + + sets = fill(CodonSet(), N_AA-1) + x_set = CodonSet() + for i in Int64(0):Int64(63) + aa = x.tbl[i + 1] + codon = RNACodon((i % UInt64,)) + sets[ind(aa)] = push(sets[ind(aa)], codon) + if aa !== AA_Term + x_set = push(x_set, codon) + end + end + + # Ambiguous amino acids + for (n, (a, b)) in [(AA_B, (AA_D, AA_N)), (AA_J, (AA_I, AA_L)), (AA_Z, (AA_E, AA_Q))] + sets[ind(n)] = sets[ind(a)] ∪ sets[ind(b)] + end + + # AA_X codes for all amino acids, except AA_Term + sets[ind(AA_X)] = x_set + + # Pyrrolysine and selenocysteine are never part of the "forward" genetic + # code, but can be unambiguously resolved in the reverse genetic code. + sets[ind(AA_U)] = CodonSet((mer"UGA"r,)) + sets[ind(AA_O)] = CodonSet((mer"UAG"r,)) + + ReverseGeneticCode(x.name, Tuple(sets)) +end + +const rev_standard_genetic_code = ReverseGeneticCode( + BioSequences.standard_genetic_code +) + +function Base.getindex(s::ReverseGeneticCode, a::AminoAcid) + if reinterpret(UInt8, a) > (N_AA - 2) # cannot translate gap + error("Cannot reverse translate element: ", a) + end + @inbounds s.sets[reinterpret(UInt8, a) + 1] +end + +Base.length(c::ReverseGeneticCode) = length(c.sets) +function Base.iterate(c::ReverseGeneticCode, s=1) + s > length(c.sets) && return nothing + return (reinterpret(AminoAcid, (s-1)%UInt8) => c.sets[s], s+1) +end + +""" + reverse_translate!(v::Vector{CodonSet}, s::AASeq code=rev_standard_genetic_code) + +Reverse-translates `s` under the reverse genetic code `code`, putting the result in `v`. + +See also: [`reverse_translate`](@ref) +""" +function reverse_translate!( + v::Vector{CodonSet}, + seq::AASeq, + code=rev_standard_genetic_code +) + resize!(v, length(seq)) + @inbounds for i in eachindex(v) + v[i] = code[seq[i]] + end + v +end + +""" + reverse_translate(s::Union{AminoAcid, AASeq}, code=rev_standard_genetic_code) + +Reverse-translates sequence or amino acid `s` under `code::ReverseGeneticCode` +If `s` is an `AminoAcid`, return a `CodonSet`. +If `s` is an `AASeq`, return `Vector{CodonSet}`. + +# Examples +```jldoctest +julia> reverse_translate(AA_W) +Kmers.CodonSet with 1 element: + UGG + +julia> v = reverse_translate(aa"MMLVQ"); + +julia> typeof(v) +Vector{Kmers.CodonSet} + +julia> v[4] +Kmers.CodonSet with 4 elements: + GUA + GUC + GUG + GUU +[...] +``` + +See also: [`reverse_translate!`](@ref), [`ReverseGeneticCode`](@ref) +""" +function reverse_translate end + +function reverse_translate(seq::AASeq, code=rev_standard_genetic_code) + reverse_translate!(Vector{CodonSet}(undef, length(seq)), seq, code) +end + +reverse_translate(aa::AminoAcid, code=rev_standard_genetic_code) = code[aa] diff --git a/test/translation.jl b/test/translation.jl index 2463454..ccf34dc 100644 --- a/test/translation.jl +++ b/test/translation.jl @@ -41,3 +41,159 @@ end @test_throws MethodError translate(mer"ATG"aa) end # translation + +@testset "CodonSet" begin +CodonSet = Kmers.CodonSet + +SAMPLE_SOURCES = Any[ + [mer"UAG"r, mer"ACC"r, mer"ACC"r, mer"UGG"r], + RNACodon[], + [mer"AAA"r, mer"ACC"r, mer"AAA"r, mer"UCA"r, mer"UCC"r], + (i for i in (mer"AGC"r, mer"AGA"r, mer"UUU"r)), + (mer"AAC"r, mer"AGG"r), + (mer"UUG"r,), +] + +@testset "Construction and basics" begin + @test isempty(CodonSet()) + + # Constuct the sets and basic properties + for codons in SAMPLE_SOURCES + set = Set(codons) + codonset = CodonSet(codons) + @test issetequal(set, codonset) + @test length(codonset) == length(set) + end + + # Fails with non-codons + @test_throws MethodError CodonSet([(RNA_A, RNA_G)]) + @test_throws MethodError CodonSet((mer"UA"r,)) + @test_throws MethodError CodonSet([rna"AGG", rna"GGG"]) + @test_throws MethodError CodonSet([1,2,3]) +end + +SAMPLE_CODONSETS = map(CodonSet, SAMPLE_SOURCES) + +@testset "Iteration" begin + for things in SAMPLE_SOURCES + @test sort!(collect(CodonSet(things))) == sort!(collect(Set(things))) + end + + @test iterate(CodonSet()) === nothing + codonset = CodonSet((mer"UUU"r,)) + codon, state = iterate(codonset) + @test codon == mer"UUU"r + @test iterate(codonset, state) === nothing +end + +@testset "Membership" begin + codonset = CodonSet([mer"ACC"r, mer"UAG"r, mer"UUU"r]) + @test mer"ACC"r in codonset + @test mer"UAG"r in codonset + @test mer"UUU"r in codonset + @test !in(mer"GAA"r, codonset) + @test !in(mer"AAA"r, codonset) +end + +@testset "Modifying" begin + # Push + s1 = CodonSet([mer"GGA"r, mer"UGU"r]) + s2 = push(s1, mer"GGA"r) + @test s1 == s2 + s3 = push(s2, mer"GAG"r) + @test Set(s3) == Set([mer"GGA"r, mer"UGU"r, mer"GGA"r, mer"GAG"r]) + + # Delete + s4 = delete(s3, mer"GAG"r) + @test s2 == s4 + s5 = delete(s4, mer"UGU"r) + @test only(s5) == mer"GGA"r + s6 = delete(s5, mer"UUU"r) + @test s5 == s6 + s7 = delete(s6, mer"GGA"r) + @test isempty(s7) +end + +@testset "Set operations" begin + for c1 in SAMPLE_CODONSETS, c2 in SAMPLE_CODONSETS + s1, s2 = Set(c1), Set(c2) + for operation in [union, intersect, setdiff, symdiff] + @test Set(operation(c1, c2)) == operation(s1, s2) + end + @test issubset(c1, c2) == issubset(s1, s2) + end +end + +@testset "Filter" begin + predicates = [ + (i -> i[2] == RNA_G), + (i -> isodd(length(i))), # always true for codons + (i -> i[1] == i[3]), + (i -> i[2] != RNA_A) + ] + for codonset in SAMPLE_CODONSETS, predicate in predicates + @test Set(filter(predicate, codonset)) == filter(predicate, Set(codonset)) + end +end + +end # CodonSet + +@testset "Reverse translation" begin + CodonSet = Kmers.CodonSet + code2 = ReverseGeneticCode(BioSequences.trematode_mitochondrial_genetic_code) + for (rvcode, fwcode) in [ + (Kmers.rev_standard_genetic_code, BioSequences.standard_genetic_code), + (code2, BioSequences.trematode_mitochondrial_genetic_code) + ] + @test reverse_translate(aa"", rvcode) == CodonSet[] + observed = Dict{AminoAcid, CodonSet}() + for aa in symbols(AminoAcidAlphabet()) + # Gap cannot be reverse translated + aa in (AA_Gap,) && continue + observed[aa] = only(reverse_translate(LongAA([aa]), rvcode)) + end + + # Length and iteration of ReverseGeneticCode + @test length(rvcode) == length(symbols(AminoAcidAlphabet())) - 1 # all but AA_Gap + @test sort!(collect(rvcode), by=first) == sort!(collect(observed), by=first) + + flipped = Dict(v => k for (k, v) in observed) + for (codonset, aa) in flipped + # Ambigous AA are reverse translated to a set, each of which directly + # translate to a specific non-ambig AA, so this is not reversible + BioSequences.isambiguous(aa) && continue + # AA_O and AA_U are specifically in the reverse standard genetic code + # but not in the genetic code. + # This is because they can be unambiguously revtranslated, but not + # forward translated. + aa in (AA_O, AA_U) && continue + for codon in codonset + @test only(translate(LongRNA{2}(codon); code=fwcode)) == aa + end + end + + # Special AA + @test only(reverse_translate(aa"O", rvcode)) == CodonSet((mer"UAG"r,)) + @test only(reverse_translate(aa"U", rvcode)) == CodonSet((mer"UGA"r,)) + + # Ambiguous amino acids + for (ambig, elements) in [ + (AA_J, [AA_I, AA_L]), + (AA_Z, [AA_E, AA_Q]), + (AA_B, [AA_D, AA_N]), + (AA_X, [ + AA_A, AA_R, AA_N, AA_D, AA_C, AA_Q, AA_E, AA_G, AA_H, AA_I, AA_L, AA_K, + AA_M, AA_F, AA_P, AA_S, AA_T, AA_W, AA_Y, AA_V + ]) + ] + c1 = only(reverse_translate(LongAA([ambig]), rvcode)) + c2 = foldl(elements, init=CodonSet()) do old, aa + union(old, reverse_translate(aa, rvcode)) + end + @test c1 == c2 + end + + # Test error on gap + @test_throws Exception reverse_translate(aa"-") + end +end # reverse translation