From c8f50cff5c7cce4a455009be1221f020242ab8d5 Mon Sep 17 00:00:00 2001 From: David Widmann Date: Sat, 30 Apr 2022 01:24:19 +0200 Subject: [PATCH] Reapply #113 and prepare release of StatsFuns 1.0.0 (#142) * Use julia implementations for pdfs and some cdf-like functions (#113) * Use Julia implementations for (log)pdfs. Adjust Rmath tests to use testsets to give more informative errors. * Use Julia implementations for most cdf-like functions of the Chisq distribution. * Change Beta distribution to be using native implementations. * Switch binomial cdfs to use native implementations (but not the inverses) * Use native implementations for most cdf like gamma functions * Just use the gamma defitions for the chisq * Use the beta definitions for binomial * Add native implementions for Poisson's cdf-like functions * Rely on SpecialFunctions promotion handling in beta functions * Rely on gamma_inc_inv promotion in gammainv(c)cdf * Apply suggestions from code review * Add compat for HypergeometricFunctions * Handle negative arguments in Poisson (log)(c)cdf * Handle negative x in Gamma (log)(c)cdf * Handle arguments outside support for Beta and Fdist (log)(c)cdf * Fix Fdist quantile functions * Handle arguments out of support in Binomial (log)(c)cdf and apply the floor function to make the results correct for non-integer arguments * Avoid accidental promotion to Float64 in fdist(log)(c)cdf * Fix typo in "x" calculation for fdist(log)(c)cdf * Drop redundant low tolerance tests for Beta(1000, 2). We are now generally more accurate than Rmath so failure are because of Rmath. * Handle some non-finite edge cases in gamma and binomial * Only switch to log-version of betalogcdf when the result is tiny * Handle the degenerate cases in beta(log)(c)cdf when alpha is zero * Handle the degenerate case of the gamma distribution when k==0. Also, only calculate the log(c)cdf based on the hypergeometric function when p is zero or subnormal * Address review comments * Avoid rational return type in gamma(log)(c)cdf * Handle the case when alpha and beta are zero in the betacdf * Fix compat entries * Remove precompile statement * Update version number Co-authored-by: Andreas Noack --- .github/workflows/ci.yml | 5 ++- Project.toml | 10 +++-- src/StatsFuns.jl | 2 - src/distrs/beta.jl | 74 ++++++++++++++++++++++++++++--- src/distrs/binom.jl | 32 +++++++++++--- src/distrs/chisq.jl | 29 ++++--------- src/distrs/fdist.jl | 28 +++++++----- src/distrs/gamma.jl | 94 +++++++++++++++++++++++++++++++++++++--- src/distrs/pois.jl | 20 ++++++--- src/distrs/tdist.jl | 3 +- test/rmath.jl | 92 +++++++++++++++++++++++---------------- 11 files changed, 285 insertions(+), 104 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index 23241c2..537fe30 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -26,8 +26,9 @@ jobs: fail-fast: false matrix: version: - - '1.0' - - '1' + - '1.3' # oldest supported version + - '1.6' # LTS + - '1' # latest release os: - ubuntu-latest - macos-latest diff --git a/Project.toml b/Project.toml index 997e6f3..a34f2e8 100644 --- a/Project.toml +++ b/Project.toml @@ -1,9 +1,10 @@ name = "StatsFuns" uuid = "4c63d2b9-4356-54db-8cca-17b64c39e42c" -version = "0.9.18" +version = "1.0.0" [deps] ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4" +HypergeometricFunctions = "34004b35-14d8-5ef3-9330-4cdb6864b03a" InverseFunctions = "3587e190-3f89-42d0-90ee-14403ec27112" IrrationalConstants = "92d709cd-6900-40b7-9082-c6be49f344b6" LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688" @@ -13,13 +14,14 @@ SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" [compat] ChainRulesCore = "1" +HypergeometricFunctions = "0.3" InverseFunctions = "0.1" IrrationalConstants = "0.1" LogExpFunctions = "0.3.2" Reexport = "1" -Rmath = "0.4, 0.5, 0.6, 0.7" -SpecialFunctions = "0.8, 0.9, 0.10, 1.0, 2" -julia = "1" +Rmath = "0.6.1, 0.7" +SpecialFunctions = "1.8.4, 2.1.4" +julia = "1.3" [extras] ChainRulesTestUtils = "cdddcdb0-9152-4a09-a978-84456f9df70a" diff --git a/src/StatsFuns.jl b/src/StatsFuns.jl index 34b110c..c4359fd 100644 --- a/src/StatsFuns.jl +++ b/src/StatsFuns.jl @@ -1,5 +1,3 @@ -__precompile__(true) - module StatsFuns using Base: Math.@horner diff --git a/src/distrs/beta.jl b/src/distrs/beta.jl index 6328c4e..1e5670e 100644 --- a/src/distrs/beta.jl +++ b/src/distrs/beta.jl @@ -1,14 +1,17 @@ # functions related to beta distributions +using HypergeometricFunctions: _₂F₁ + # R implementations -# For pdf and logpdf we use the Julia implementation using .RFunctions: - betacdf, - betaccdf, - betalogcdf, - betalogccdf, - betainvcdf, - betainvccdf, + # betapdf, + # betalogpdf, + # betacdf, + # betaccdf, + # betalogcdf, + # betalogccdf, + # betainvcdf, + # betainvccdf, betainvlogcdf, betainvlogccdf @@ -22,3 +25,60 @@ function betalogpdf(α::T, β::T, x::T) where {T<:Real} val = xlogy(α - 1, y) + xlog1py(β - 1, -y) - logbeta(α, β) return x < 0 || x > 1 ? oftype(val, -Inf) : val end + +function betacdf(α::Real, β::Real, x::Real) + # Handle a degenerate case + if iszero(α) && β > 0 + return float(last(promote(α, β, x, x >= 0))) + end + + return first(beta_inc(α, β, clamp(x, 0, 1))) +end + +function betaccdf(α::Real, β::Real, x::Real) + # Handle a degenerate case + if iszero(α) && β > 0 + return float(last(promote(α, β, x, x < 0))) + end + + last(beta_inc(α, β, clamp(x, 0, 1))) +end + +# The log version is currently based on non-log version. When the cdf is very small we shift +# to an implementation based on the hypergeometric function ₂F₁ to avoid underflow. +function betalogcdf(α::T, β::T, x::T) where {T<:Real} + # Handle a degenerate case + if iszero(α) && β > 0 + return log(last(promote(x, x >= 0))) + end + + _x = clamp(x, 0, 1) + p, q = beta_inc(α, β, _x) + if p < floatmin(p) + # see https://dlmf.nist.gov/8.17#E7 + return -log(α) + xlogy(α, _x) + log(_₂F₁(promote(α, 1 - β, α + 1, _x)...)) - logbeta(α, β) + elseif p <= 0.7 + return log(p) + else + return log1p(-q) + end +end +betalogcdf(α::Real, β::Real, x::Real) = betalogcdf(promote(α, β, x)...) + +function betalogccdf(α::Real, β::Real, x::Real) + # Handle a degenerate case + if iszero(α) && β > 0 + return log(last(promote(α, β, x, x < 0))) + end + + p, q = beta_inc(α, β, clamp(x, 0, 1)) + if q < 0.7 + return log(q) + else + return log1p(-p) + end +end + +betainvcdf(α::Real, β::Real, p::Real) = first(beta_inc_inv(α, β, p)) + +betainvccdf(α::Real, β::Real, p::Real) = last(beta_inc_inv(β, α, p)) diff --git a/src/distrs/binom.jl b/src/distrs/binom.jl index 73f8185..5cb6c8a 100644 --- a/src/distrs/binom.jl +++ b/src/distrs/binom.jl @@ -1,18 +1,18 @@ # functions related to binomial distribution # R implementations -# For pdf and logpdf we use the Julia implementation using .RFunctions: - binomcdf, - binomccdf, - binomlogcdf, - binomlogccdf, + # binompdf, + # binomlogpdf, + # binomcdf, + # binomccdf, + # binomlogcdf, + # binomlogccdf, binominvcdf, binominvccdf, binominvlogcdf, binominvlogccdf - # Julia implementations binompdf(n::Real, p::Real, k::Real) = exp(binomlogpdf(n, p, k)) @@ -22,3 +22,23 @@ function binomlogpdf(n::T, p::T, k::T) where {T<:Real} val = min(0, betalogpdf(m + 1, n - m + 1, p) - log(n + 1)) return 0 <= k <= n && isinteger(k) ? val : oftype(val, -Inf) end + +for l in ("", "log"), compl in (false, true) + fbinom = Symbol(string("binom", l, ifelse(compl, "c", "" ), "cdf")) + fbeta = Symbol(string("beta" , l, ifelse(compl, "", "c"), "cdf")) + @eval function ($fbinom)(n::Real, p::Real, k::Real) + if isnan(k) + return last(promote(n, p, k)) + end + res = ($fbeta)(max(0, floor(k) + 1), max(0, n - floor(k)), p) + + # When p == 1 the betaccdf doesn't return the correct result + # so these cases have to be special cased + if isone(p) + newres = oftype(res, $compl ? k < n : k >= n) + return $(l === "" ? :newres : :(log(newres))) + else + return res + end + end +end diff --git a/src/distrs/chisq.jl b/src/distrs/chisq.jl index 467fa08..d249b2a 100644 --- a/src/distrs/chisq.jl +++ b/src/distrs/chisq.jl @@ -1,22 +1,11 @@ # functions related to chi-square distribution -# R implementations -# For pdf and logpdf we use the Julia implementation -using .RFunctions: - chisqcdf, - chisqccdf, - chisqlogcdf, - chisqlogccdf, - chisqinvcdf, - chisqinvccdf, - chisqinvlogcdf, - chisqinvlogccdf - -# Julia implementations -# promotion ensures that we do forward e.g. `chisqpdf(::Int, ::Float32)` to -# `gammapdf(::Float32, ::Int, ::Float32)` but not `gammapdf(::Float64, ::Int, ::Float32)` -chisqpdf(k::Real, x::Real) = chisqpdf(promote(k, x)...) -chisqpdf(k::T, x::T) where {T<:Real} = gammapdf(k / 2, 2, x) - -chisqlogpdf(k::Real, x::Real) = chisqlogpdf(promote(k, x)...) -chisqlogpdf(k::T, x::T) where {T<:Real} = gammalogpdf(k / 2, 2, x) +# Just use the Gamma definitions +for f in ("pdf", "logpdf", "cdf", "ccdf", "logcdf", "logccdf", "invcdf", "invccdf", "invlogcdf", "invlogccdf") + _chisqf = Symbol("chisq"*f) + _gammaf = Symbol("gamma"*f) + @eval begin + $(_chisqf)(k::Real, x::Real) = $(_chisqf)(promote(k, x)...) + $(_chisqf)(k::T, x::T) where {T<:Real} = $(_gammaf)(k/2, 2, x) + end +end diff --git a/src/distrs/fdist.jl b/src/distrs/fdist.jl index 1f11c16..17dfe17 100644 --- a/src/distrs/fdist.jl +++ b/src/distrs/fdist.jl @@ -1,17 +1,5 @@ # functions related to F distribution -# R implementations -# For pdf and logpdf we use the Julia implementation -using .RFunctions: - fdistcdf, - fdistccdf, - fdistlogcdf, - fdistlogccdf, - fdistinvcdf, - fdistinvccdf, - fdistinvlogcdf, - fdistinvlogccdf - # Julia implementations fdistpdf(ν1::Real, ν2::Real, x::Real) = exp(fdistlogpdf(ν1, ν2, x)) @@ -23,3 +11,19 @@ function fdistlogpdf(ν1::T, ν2::T, x::T) where {T<:Real} val = (xlogy(ν1, ν1ν2) + xlogy(ν1 - 2, y) - xlogy(ν1 + ν2, 1 + ν1ν2 * y)) / 2 - logbeta(ν1 / 2, ν2 / 2) return x < 0 ? oftype(val, -Inf) : val end + +for f in ("cdf", "ccdf", "logcdf", "logccdf") + ff = Symbol("fdist"*f) + bf = Symbol("beta"*f) + @eval $ff(ν1::T, ν2::T, x::T) where {T<:Real} = $bf(ν1/2, ν2/2, inv(1 + ν2/(ν1*max(0, x)))) + @eval $ff(ν1::Real, ν2::Real, x::Real) = $ff(promote(ν1, ν2, x)...) +end +for f in ("invcdf", "invccdf", "invlogcdf", "invlogccdf") + ff = Symbol("fdist"*f) + bf = Symbol("beta"*f) + @eval function $ff(ν1::T, ν2::T, y::T) where {T<:Real} + x = $bf(ν1/2, ν2/2, y) + return x/(1 - x)*ν2/ν1 + end + @eval $ff(ν1::Real, ν2::Real, y::Real) = $ff(promote(ν1, ν2, y)...) +end diff --git a/src/distrs/gamma.jl b/src/distrs/gamma.jl index 15b3975..cd1dc0c 100644 --- a/src/distrs/gamma.jl +++ b/src/distrs/gamma.jl @@ -1,14 +1,17 @@ # functions related to gamma distribution +using HypergeometricFunctions: drummond1F1 + # R implementations -# For pdf and logpdf we use the Julia implementation using .RFunctions: - gammacdf, - gammaccdf, - gammalogcdf, - gammalogccdf, - gammainvcdf, - gammainvccdf, + # gammapdf, + # gammalogpdf, + # gammacdf, + # gammaccdf, + # gammalogcdf, + # gammalogccdf, + # gammainvcdf, + # gammainvccdf, gammainvlogcdf, gammainvlogccdf @@ -22,3 +25,80 @@ function gammalogpdf(k::T, θ::T, x::T) where {T<:Real} val = -loggamma(k) + xlogy(k - 1, xθ) - log(θ) - xθ return x < 0 ? oftype(val, -Inf) : val end + +function gammacdf(k::T, θ::T, x::T) where {T<:Real} + # Handle the degenerate case + if iszero(k) + return float(last(promote(x, x >= 0))) + end + return first(gamma_inc(k, max(0, x)/θ)) +end +gammacdf(k::Real, θ::Real, x::Real) = gammacdf(map(float, promote(k, θ, x))...) + +function gammaccdf(k::T, θ::T, x::T) where {T<:Real} + # Handle the degenerate case + if iszero(k) + return float(last(promote(x, x < 0))) + end + return last(gamma_inc(k, max(0, x)/θ)) +end +gammaccdf(k::Real, θ::Real, x::Real) = gammaccdf(map(float, promote(k, θ, x))...) + +gammalogcdf(k::Real, θ::Real, x::Real) = _gammalogcdf(map(float, promote(k, θ, x))...) + +# Implemented via the non-log version. For tiny values, we recompute the result with +# loggamma. In that situation, there is little risk of significant cancellation. +function _gammalogcdf(k::Float64, θ::Float64, x::Float64) + # Handle the degenerate case + if iszero(k) + return log(x >= 0) + end + + xdθ = max(0, x)/θ + l, u = gamma_inc(k, xdθ) + if l < floatmin(Float64) && isfinite(k) && isfinite(xdθ) + return -log(k) + k*log(xdθ) - xdθ + log(drummond1F1(1.0, 1 + k, xdθ)) - loggamma(k) + elseif l < 0.7 + return log(l) + else + return log1p(-u) + end +end +function _gammalogcdf(k::T, θ::T, x::T) where {T<:Union{Float16,Float32}} + return T(_gammalogcdf(Float64(k), Float64(θ), Float64(x))) +end + +gammalogccdf(k::Real, θ::Real, x::Real) = _gammalogccdf(map(float, promote(k, θ, x))...) + +# Implemented via the non-log version. For tiny values, we recompute the result with +# loggamma. In that situation, there is little risk of significant cancellation. +function _gammalogccdf(k::Float64, θ::Float64, x::Float64) + # Handle the degenerate case + if iszero(k) + return log(x < 0) + end + + xdθ = max(0, x)/θ + l, u = gamma_inc(k, xdθ) + if u < floatmin(Float64) + return loggamma(k, xdθ) - loggamma(k) + elseif u < 0.7 + return log(u) + else + return log1p(-l) + end +end + +function _gammalogccdf(k::T, θ::T, x::T) where {T<:Union{Float16,Float32}} + return T(_gammalogccdf(Float64(k), Float64(θ), Float64(x))) +end + +function gammainvcdf(k::Real, θ::Real, p::Real) + _k, _θ, _p = promote(k, θ, p) + return _θ*gamma_inc_inv(_k, _p, 1 - _p) +end + +function gammainvccdf(k::Real, θ::Real, p::Real) + _k, _θ, _p = promote(k, θ, p) + return _θ*gamma_inc_inv(_k, 1 - _p, _p) +end diff --git a/src/distrs/pois.jl b/src/distrs/pois.jl index ed6192b..0928fae 100644 --- a/src/distrs/pois.jl +++ b/src/distrs/pois.jl @@ -1,12 +1,13 @@ # functions related to Poisson distribution # R implementations -# For pdf and logpdf we use the Julia implementation using .RFunctions: - poiscdf, - poisccdf, - poislogcdf, - poislogccdf, + # poispdf, + # poislogpdf, + # poiscdf, + # poisccdf, + # poislogcdf, + # poislogccdf, poisinvcdf, poisinvccdf, poisinvlogcdf, @@ -20,3 +21,12 @@ function poislogpdf(λ::T, x::T) where {T <: Real} val = xlogy(x, λ) - λ - loggamma(x + 1) return x >= 0 && isinteger(x) ? val : oftype(val, -Inf) end + +# Just use the Gamma definitions +poiscdf(λ::Real, x::Real) = gammaccdf(max(0, floor(x + 1)), 1, λ) + +poisccdf(λ::Real, x::Real) = gammacdf(max(0, floor(x + 1)), 1, λ) + +poislogcdf(λ::Real, x::Real) = gammalogccdf(max(0, floor(x + 1)), 1, λ) + +poislogccdf(λ::Real, x::Real) = gammalogcdf(max(0, floor(x + 1)), 1, λ) diff --git a/src/distrs/tdist.jl b/src/distrs/tdist.jl index 4544efa..e6a46a3 100644 --- a/src/distrs/tdist.jl +++ b/src/distrs/tdist.jl @@ -1,8 +1,9 @@ # functions related to student's T distribution # R implementations -# For pdf and logpdf we use the Julia implementation using .RFunctions: + # tdistpdf, + # tdistlogpdf, tdistcdf, tdistccdf, tdistlogcdf, diff --git a/test/rmath.jl b/test/rmath.jl index 809896e..0f17011 100644 --- a/test/rmath.jl +++ b/test/rmath.jl @@ -79,47 +79,66 @@ function rmathcomp(basename, params, X::AbstractArray, rtol) end rmath_rand = has_rand ? get_rmathfun(rand) : nothing - for x in X - if has_pdf + if has_pdf + @testset "pdf with x=$x" for x in X check_rmath(pdf, stats_pdf, rmath_pdf, params, "x", x, true, rtol) + end + @testset "logpdf with x=$x" for x in X check_rmath(logpdf, stats_logpdf, rmath_logpdf, params, "x", x, false, rtol) end + end + @testset "cdf with x=$x" for x in X check_rmath(cdf, stats_cdf, rmath_cdf, params, "x", x, true, rtol) + end + @testset "ccdf with x=$x" for x in X check_rmath(ccdf, stats_ccdf, rmath_ccdf, params, "x", x, true, rtol) + end + @testset "logcdf with x=$x" for x in X check_rmath(logcdf, stats_logcdf, rmath_logcdf, params, "x", x, false, rtol) + end + @testset "logccdf with x=$x" for x in X check_rmath(logccdf, stats_logccdf, rmath_logccdf, params, "x", x, false, rtol) + end - p = rmath_cdf(params..., x) - cp = rmath_ccdf(params..., x) - lp = rmath_logcdf(params..., x) - lcp = rmath_logccdf(params..., x) + p = rmath_cdf.(params..., X) + cp = rmath_ccdf.(params..., X) + lp = rmath_logcdf.(params..., X) + lcp = rmath_logccdf.(params..., X) + @testset "invcdf with q=$_p" for _p in p check_rmath(invcdf, stats_invcdf, rmath_invcdf, - params, "q", p, false, rtol) + params, "q", _p, false, rtol) + end + @testset "invccdf with q=$_p" for _p in cp check_rmath(invccdf, stats_invccdf, rmath_invccdf, - params, "q", cp, false, rtol) + params, "q", _p, false, rtol) + end + @testset "invlogcdf with log(q)=$_p" for _p in lp check_rmath(invlogcdf, stats_invlogcdf, rmath_invlogcdf, - params, "lq", lp, false, rtol) + params, "lq", _p, false, rtol) + end + @testset "invlogccdf with log(q)=$_p" for _p in lcp check_rmath(invlogccdf, stats_invlogccdf, rmath_invlogccdf, - params, "lq", lcp, false, rtol) + params, "lq", _p, false, rtol) + end - # make sure that rand works - if has_rand - rmath_rand(params...) - end + # make sure that rand works + if has_rand + rmath_rand(params...) end end function rmathcomp_tests(basename, configs) - println("\ttesting $basename ...") - for (params, data) in configs - rmathcomp(basename, params, data) + @testset "$basename" begin + @testset "params: $params" for (params, data) in configs + rmathcomp(basename, params, data) + end end end @@ -139,27 +158,24 @@ end ((10, 2), [0, 1]), ((10, 2), 0//1:1//100:1//1), ]) - - # We test the following extreme parameters separately since - # a slightly larger tolerance is needed. - # - # For `betapdf(1000, 2, 0.58)`: - # StatsFuns: 1.9419987107407202e-231 - # Rmath: 1.941998710740941e-231 - # Mathematica: 1.941998710742487e-231 - # For `betapdf(1000, 2, 0.68)`: - # StatsFuns: 1.5205049885199752e-162 - # Rmath: 1.5205049885200616e-162 - # Mathematica: 1.520504988521358e-162 - # - # With older SpecialFunctions + Julia versions a slightly larger tolerance is needed - @testset "Beta(1000, 2)" begin - if VERSION < v"1.3" - rmathcomp("beta", (1000, 2), 0.0:0.01:1.0, 1e-12) - else - rmathcomp("beta", (1000, 2), setdiff(0.0:0.01:1.0, (0.58, 0.68))) - rmathcomp("beta", (1000, 2), [0.58, 0.68], 1e-12) - end + # It is not possible to maintain a rtol of 1e-14 for the cdf when there is such a large difference + # in the magnitude of the parameters. At least not with the current parameters. Furthermore, it + # seems that Rmath is actually less accurate than we are but since we are comparing against Rmath + # we have to use rtol=1e-12 although we are probably only off by around 1e-13. + @testset "param: (1000, 2)" begin + rmathcomp( + "beta", + (1000, 2), + # We have to drop the 0.48 value since the R quantile function fails while we succeed. + # It's tested separate below. + setdiff(collect(0.0:0.01:1.0), 0.48), + 1e-12) + # Test p=0.48 separately since R fails. (It's pretty slow, though, caused by the cdf being 9.0797754e-317) + @test betainvcdf(1000, 2, betacdf(1000, 2, 0.48)) ≈ 0.48 + end + @testset "$(f)(0, 0, $x) should error" for f in (betacdf, betaccdf, betalogcdf, betalogccdf), + x in (0.0, 0.5, 1.0) + @test_throws DomainError f(0.0, 0.0, x) end rmathcomp_tests("binom", [