Skip to content

Commit

Permalink
Improve error when mis-encoding LongDNA from byte-like inputs (BioJul…
Browse files Browse the repository at this point in the history
…ia#267)

This only applies to LongDNA encoded from byte-like inputs, since this uses a
fast path in BioSequences, where we have access to the whole sequence.
Other paths leading to the error come from direct calls to e.g.
convert(DNA, x) in BioSymbols, so the error cannot be improved much.
Old error looks like: "Cannot encode 0x20 to DNAAlphabet{4}()".
New error looks like: "Cannot encode byte 0x20 (char ' ') at index 3 to DNAAlphabet{4}()".
  • Loading branch information
jakobnissen authored Feb 17, 2023
1 parent 459ea2a commit 4a31474
Show file tree
Hide file tree
Showing 3 changed files with 23 additions and 2 deletions.
2 changes: 2 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,8 @@ The format is based on [Keep a Changelog](http://keepachangelog.com/en/1.0.0/)
and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.html).

## [UNRELEASED]
### Added
* Improved error message when encoding LongDNA from byte-like objects

## [3.1.0]
### Added
Expand Down
14 changes: 12 additions & 2 deletions src/longsequences/copying.jl
Original file line number Diff line number Diff line change
Expand Up @@ -152,9 +152,19 @@ end
# Slow path used once an invalid byte has been detected while encoding `src`:
# rescan the chunk of `src` starting at `soff` byte by byte to locate the
# offending byte, then throw an informative error.
#
# The error message reports the byte, its index in `src` and, for bytes in the
# ASCII ranges '\a':'\r' and ' ':'~', the corresponding character.
@noinline function throw_encode_error(A::Alphabet, src::AbstractArray{UInt8}, soff::Integer)
    # Scan as many symbols as fit in 64 bits for this alphabet.
    for i in 1:div(64, bits_per_symbol(A))
        index = soff + i - 1
        sym = src[index]
        # A set high bit in the result of `ascii_encode` marks an unencodable byte.
        if ascii_encode(A, sym) & 0x80 == 0x80
            # If the byte is a printable char or a common control char, also
            # display it. `repr` is used so that control characters (newline,
            # bell, carriage return, ...) are shown escaped, e.g. '\n', instead
            # of being written raw into the error message.
            repr_char = if sym in UInt8('\a'):UInt8('\r') || sym in UInt8(' '):UInt8('~')
                " (char $(repr(Char(sym))))"
            else
                ""
            end
            error("Cannot encode byte $(repr(sym))$(repr_char) at index $(index) to $A")
        end
    end
    # Only reached if no invalid byte was found in the scanned chunk, which
    # contradicts the reason this function was called.
    @assert false "Expected error in encoding"
end

@inline function encode_chunk(A::Alphabet, src::AbstractArray{UInt8}, soff::Integer, N::Integer)
Expand Down
9 changes: 9 additions & 0 deletions test/longsequences/basics.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,15 @@
@test LongSequence(SimpleSeq("AUCGU")) isa LongRNA{2}
@test LongSequence(SimpleSeq("AUCGU")) == LongRNA{2}("AUCGU")
LongDNA{4}(LongRNA{4}("AUCGUA")) == LongDNA{4}("ATCGTA")

# Displays a nice error on encoding failure when constructed from strings,
# substrings, and byte arrays
@static if VERSION >= v"1.8"
malformed = "ACWpNS"
@test_throws "Cannot encode byte $(repr(UInt8('p'))) (char 'p') at index 4 to BioSequences.DNAAlphabet{4}" LongDNA{4}(malformed)
malformed = "AGCUGUAGUCGGUAUAUAGGCGCGCUCGAUGAUGAUGCGUGCUGCUATDNANCUG"
@test_throws "Cannot encode byte $(repr(UInt8('T'))) (char 'T') at index $(length(malformed) - 7) to BioSequences.RNAAlphabet{2}" LongRNA{2}(malformed)
end
end

@testset "Copy sequence" begin
Expand Down

0 comments on commit 4a31474

Please sign in to comment.