From 4ceba63f6a03a60634bd0d3728204b6c2962a67d Mon Sep 17 00:00:00 2001 From: Diego Javier Zea Date: Mon, 20 Feb 2023 14:30:18 +0100 Subject: [PATCH] Retry if the download of a GZip file fails --- NEWS.md | 8 +++++ Project.toml | 2 +- src/Utils/Read.jl | 68 +++++++++++++++++++++++++++----------- test/Utils/GeneralUtils.jl | 30 +++++++++++++++++ 4 files changed, 88 insertions(+), 20 deletions(-) diff --git a/NEWS.md b/NEWS.md index 10c8d940..c27fba03 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,13 @@ ## MIToS.jl Release Notes +### Changes from v2.11.0 to v2.11.1 + +* MIToS now checks the magic number of gzip files immediately after download. If + the gzip file does not have the correct header, MIToS will attempt to download + it again. In Julia versions below 1.2, it will retry the download once. In + Julia 1.2 or higher, it will retry the download five times, using an + ExponentialBackOff. + ### Changes from v2.10.0 to v2.11.0 * *[breaking change]* `getCA` returns `missing` if a `PDBResidue` has no CA atom diff --git a/Project.toml b/Project.toml index 9d1280f9..5d026269 100644 --- a/Project.toml +++ b/Project.toml @@ -1,6 +1,6 @@ name = "MIToS" uuid = "51bafb47-8a16-5ded-8b04-24ef4eede0b5" -version = "2.11.0" +version = "2.11.1" [deps] ArgParse = "c7e460c6-2fb9-53a9-8c5b-16f535851c63" diff --git a/src/Utils/Read.jl b/src/Utils/Read.jl index 5b169437..d1198586 100644 --- a/src/Utils/Read.jl +++ b/src/Utils/Read.jl @@ -3,6 +3,33 @@ import Base: read "`FileFormat` is used for write special `parse` (and `read`) methods on it." abstract type FileFormat end +"This function raises an error if a GZip file doesn't have the 0x1f8b magic number." +function _check_gzip_file(filename) + if endswith(filename, ".gz") + open(filename, "r") do fh + magic = read(fh, UInt16) + # 0x1f8b is the magic number for GZip files + # However, some files use 0x8b1f. + # For example, the file test/data/18gs.xml.gz uses 0x8b1f. + if magic != 0x1f8b && magic != 0x8b1f + throw(ErrorException("$filename is not a GZip file!")) + end + end + end + filename +end + +function _download_file(url::AbstractString, filename::AbstractString; + kargs...) + kargs = _modify_kargs_for_proxy(url; kargs...) + kargs_dict = Dict(kargs...) + headers = pop!(kargs_dict, "headers", Dict{String,String}()) + with_logger(ConsoleLogger(stderr, Logging.Warn)) do + HTTP.download(url, filename, headers; kargs_dict...) + end + _check_gzip_file(filename) +end + """ `download_file` uses **HTTP.jl** to download files from the web. It takes the file url as first argument and, optionally, a path to save it. @@ -22,18 +49,21 @@ julia> download_file("http://www.uniprot.org/uniprot/P69905.fasta","seq.fasta", ``` """ function download_file(url::AbstractString, filename::AbstractString; - kargs...) - kargs = _modify_kargs_for_proxy(url; kargs...) - kargs_dict = Dict(kargs...) - headers = pop!(kargs_dict, "headers", Dict{String,String}()) - with_logger(ConsoleLogger(stderr, Logging.Warn)) do - HTTP.download(url, filename, headers; kargs_dict...) + kargs...) + if VERSION >= v"1.2.0" + retry(_download_file, delays=ExponentialBackOff(n=5))(url, filename; kargs...) + else + retry(_download_file)(url, filename; kargs...) end end function download_file(url::AbstractString; - kargs...) - download_file(url, tempname(); kargs...) + kargs...) + name = tempname() + if endswith(url, ".gz") + name *= ".gz" + end + download_file(url, name; kargs...) end """ @@ -44,14 +74,14 @@ variables. function _modify_kargs_for_proxy(url; kargs...) if startswith(lowercase(url), "http://") proxy_env_var = "HTTPS_PROXY" - elseif startswith(lowercase(url),"https://") + elseif startswith(lowercase(url), "https://") proxy_env_var = "HTTPS_PROXY" else return kargs end if !(:proxy in keys(kargs)) && proxy_env_var in keys(ENV) kw = Dict() - for (k,v) in kargs + for (k, v) in kargs kw[k] = v end kw[:proxy] = ENV[proxy_env_var] @@ -62,7 +92,7 @@ end "Create an iterable object that will yield each line from a stream **or string**." lineiterator(string::String) = eachline(IOBuffer(string)) -lineiterator(stream::IO) = eachline(stream) +lineiterator(stream::IO) = eachline(stream) """ Returns the `filename`. @@ -82,9 +112,9 @@ isnotemptyfile(filename) = isfile(filename) && filesize(filename) > 0 # for using with download, since filename doesn't have file extension function _read(completename::AbstractString, - filename::AbstractString, - format::Type{T}, - args...; kargs...) where T <: FileFormat + filename::AbstractString, + format::Type{T}, + args...; kargs...) where {T<:FileFormat} check_file(filename) if endswith(completename, ".xml.gz") || endswith(completename, ".xml") document = parse_file(filename) @@ -113,11 +143,11 @@ the file is downloaded with `download` in a temporal file. Gzipped files should end on `.gz`. """ function read(completename::AbstractString, - format::Type{T}, - args...; kargs...) where T <: FileFormat - if startswith(completename, "http://") || - startswith(completename, "https://") || - startswith(completename, "ftp://") + format::Type{T}, + args...; kargs...) where {T<:FileFormat} + if startswith(completename, "http://") || + startswith(completename, "https://") || + startswith(completename, "ftp://") filename = download_file(completename, headers=Dict("Accept-Encoding" => "identity",)) try diff --git a/test/Utils/GeneralUtils.jl b/test/Utils/GeneralUtils.jl index 6ba303a8..2e09fa33 100644 --- a/test/Utils/GeneralUtils.jl +++ b/test/Utils/GeneralUtils.jl @@ -101,4 +101,34 @@ end end end end + + @testset "Test _check_gzip_file" begin + for file in readdir(DATA) + filename = joinpath(DATA, file) + if file != "2vqc.xml.gz" # is a decompressed file that has a wrong .gz extension + @test MIToS.Utils._check_gzip_file(filename) == filename + else + @test_throws ErrorException MIToS.Utils._check_gzip_file(filename) + end + end + end + + @testset "Download a gz file" begin + # Use https://www.rcsb.org/pdb/files/3NIR.pdb.gz to test downloading a gz file + # without a filename + filename = "" + try + filename = download_file("https://www.rcsb.org/pdb/files/3NIR.pdb.gz") + @test endswith(filename, ".gz") + @test MIToS.Utils._check_gzip_file(filename) == filename + finally + if isfile(filename) + rm(filename) + end + end + end end + + + +