From 965a59e798b8f7bd3c9fb06c33f05d338c2a787f Mon Sep 17 00:00:00 2001 From: Jakob Nybo Nissen Date: Mon, 8 Aug 2022 20:45:01 +0200 Subject: [PATCH] Add reader and writer string macros This commit implements the `@rdr_str` and `@wtr_str` macros, which autodetect the correct readers, writers and de/compressors to open a biological file based on the extensions of the path. The system is extensible to arbitrary biological formats, but the extensions of compression formats are hardcoded in this package. I also add a dubious overload to `Base.open`, such that the reader and writer macros can be used like so: ```julia open(rdr"foo.fna", wtr"bar.fq") do reader, writer ... end ``` --- Project.toml | 2 +- src/BioGenerics.jl | 2 + src/IO.jl | 111 ++++++++++++++++++++++++++++++++++++++++++++- 3 files changed, 113 insertions(+), 2 deletions(-) diff --git a/Project.toml b/Project.toml index f88c658..f630c11 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "BioGenerics" uuid = "47718e42-2ac5-11e9-14af-e5595289c2ea" authors = ["Ben J. Ward "] -version = "0.1.2" +version = "0.1.3" [deps] TranscodingStreams = "3bb67fe8-82b1-5028-8e26-92a6c54297fa" diff --git a/src/BioGenerics.jl b/src/BioGenerics.jl index 174e907..4e1803c 100644 --- a/src/BioGenerics.jl +++ b/src/BioGenerics.jl @@ -17,4 +17,6 @@ include("IO.jl") #include("RecordHelper.jl") include("Testing.jl") +using .IO: readertype, writertype, @rdr_str, @wtr_str + end # module BioGenerics diff --git a/src/IO.jl b/src/IO.jl index dcba684..b9b4be3 100644 --- a/src/IO.jl +++ b/src/IO.jl @@ -118,6 +118,115 @@ function Base.open(::Type{T}, filepath::AbstractString, args...; kwargs_...) whe return T(open(filepath, append ? "a" : "w"), args...; kwargs...) 
end -# +# We have this un-extendable function here because we expect +# to not be able to control compression-related code, whereas we might be able to get +# PRs to biological readers +# That's also why we return code here instead of objects - BioGenerics does not need +# to know what GzipDecompressorStream is, so we just return a symbol that could be anything, +# and let the module that used the macro resolve it. +function de_compressor_code(ending::Union{String, SubString{String}}, read::Bool) + # TODO: It would be nice to have a good specialized BGZIP implementation... + if in(ending, ("gzip", "gz", "bgzip")) + read ? quote GzipDecompressorStream end : quote GzipCompressorStream end + elseif ending == "xz" + read ? quote XzDecompressorStream end : quote XzCompressorStream end + elseif ending == "zst" + read ? quote ZstdDecompressorStream end : quote ZstdCompressorStream end + else + nothing + end +end + +""" + readertype(::Val{S}, arg)::T + +Determine the type of reader that opens extension named by `Symbol` `S`. +For example, `readertype(::Val{:fa}, arg) = FASTA.Reader`. +Should be extended by developers making new biological file format readers. + +The extra argument `arg` can be passed like so `rdr"path.ext"arg`, and defaults +to the empty string. This can be used to pass an additional argument that is specific +to the person implementing the reader. +""" +readertype(@nospecialize(v::Val{S}), arg) where S = error("Unknown biological file extension: \"$(string(S))\"") + +""" + writertype(::Val{S}, arg)::T + +Determine the type of writer that can write a file with an extension named by `Symbol` `S`. +For example, `writertype(::Val{:fa}, arg) = FASTA.Writer`. +Should be extended by developers making new biological file format writers. + +The extra argument `arg` can be passed like so `wtr"path.ext"arg`, and defaults +to the empty string. This can be used to pass an additional argument that is specific +to the person implementing the writer. 
+""" +writertype(@nospecialize(v::Val{S}), arg) where S = error("Unknown biological file extension: \"$(string(S))\"") + +# Like splitext, but removes the dot from the extension +function pure_ext(path::Union{String, SubString{String}}) + (path, ext) = splitext(path) + ext = (!isempty(ext) && first(ext) == '.') ? ext[2:end] : ext + String(path), String(ext) +end + +function resolve_reader(path::Union{String, SubString{String}}, arg::String) + code = quote open($(path); lock=false) end + (path, ext) = pure_ext(path) + while (wrapper = de_compressor_code(ext, true)) !== nothing + code = quote $(wrapper)($code) end + (path, ext) = pure_ext(path) + end + quote $(readertype(Val(Symbol(ext)), arg))($code) end +end + +function resolve_writer(path::Union{String, SubString{String}}, arg::String) + code = quote open($(path), "w"; lock=false) end + (path, ext) = pure_ext(path) + while (wrapper = de_compressor_code(ext, false)) !== nothing + code = quote $(wrapper)($code) end + (path, ext) = pure_ext(path) + end + quote $(writertype(Val(Symbol(ext)), arg))($code) end +end + +macro rdr_str(path, arg) + esc(resolve_reader(path, arg)) +end + +macro rdr_str(path) + esc(resolve_reader(path, "")) +end + +macro wtr_str(path, arg) + esc(resolve_writer(path, arg)) +end + +macro wtr_str(path) + esc(resolve_writer(path, "")) +end + +""" + Base.open(f, ios::Vararg{AbstractFormattedIO}) + +Execute `f(ios...)`, then `close` each io. +`close` is run even if `f(ios...)` throws an exception. + +# Examples +```julia +julia> open(rdr"path/to/seqs.fna") do reader + # do something with reader + end +``` +""" +function Base.open(f::Function, first::AbstractFormattedIO, rest::Vararg{AbstractFormattedIO}) + try + f(first, rest...) + finally + for i in (first, rest...) + close(i) + end + end +end end # module BioGenerics.IO