diff --git a/Project.toml b/Project.toml index 42390a6de..8292bb705 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "StatsBase" uuid = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91" authors = ["JuliaStats"] -version = "0.33.12" +version = "0.33.13" [deps] DataAPI = "9a962f9c-6df0-11e9-0e5d-c546b8b5ee8a" diff --git a/src/common.jl b/src/common.jl index 36c128daa..17ee3a890 100644 --- a/src/common.jl +++ b/src/common.jl @@ -23,9 +23,9 @@ const RealFP = Union{Float32, Float64} # A convenient typealias for deprecating default corrected Bool const DepBool = Union{Bool, Nothing} -function depcheck(fname::Symbol, b::DepBool) - if b == nothing - msg = "$fname will default to corrected=true in the future. Use corrected=false for previous behaviour." +function depcheck(fname::Symbol, varname::Symbol, b::DepBool) + if b === nothing + msg = "$fname will default to $varname=true in the future. Use $varname=false for previous behaviour." Base.depwarn(msg, fname) false else diff --git a/src/cov.jl b/src/cov.jl index a77cd5086..b4b472f83 100644 --- a/src/cov.jl +++ b/src/cov.jl @@ -92,11 +92,11 @@ _scattermatm(x::DenseMatrix, wv::AbstractWeights, mean, dims::Int) = ## weighted cov covm(x::DenseMatrix, mean, w::AbstractWeights, dims::Int=1; corrected::DepBool=nothing) = - rmul!(scattermat(x, w, mean=mean, dims=dims), varcorrection(w, depcheck(:covm, corrected))) + rmul!(scattermat(x, w, mean=mean, dims=dims), varcorrection(w, depcheck(:covm, :corrected, corrected))) cov(x::DenseMatrix, w::AbstractWeights, dims::Int=1; corrected::DepBool=nothing) = - covm(x, mean(x, w, dims=dims), w, dims; corrected=depcheck(:cov, corrected)) + covm(x, mean(x, w, dims=dims), w, dims; corrected=depcheck(:cov, :corrected, corrected)) function corm(x::DenseMatrix, mean, w::AbstractWeights, vardim::Int=1) c = covm(x, mean, w, vardim; corrected=false) @@ -120,7 +120,7 @@ end function mean_and_cov(x::DenseMatrix, wv::AbstractWeights, dims::Int=1; corrected::DepBool=nothing) m = mean(x, wv, dims=dims) - return m, cov(x, wv, dims; corrected=depcheck(:mean_and_cov, corrected)) + return m, cov(x, wv, dims; corrected=depcheck(:mean_and_cov, :corrected, corrected)) end """ diff --git a/src/moments.jl b/src/moments.jl index 765626742..a527a0501 100644 --- a/src/moments.jl +++ b/src/moments.jl @@ -19,7 +19,7 @@ the population variance is computed by replacing * `Weights`: `ArgumentError` (bias correction not supported) """ varm(v::RealArray, w::AbstractWeights, m::Real; corrected::DepBool=nothing) = - _moment2(v, w, m; corrected=depcheck(:varm, corrected)) + _moment2(v, w, m; corrected=depcheck(:varm, :corrected, corrected)) """ var(x::AbstractArray, w::AbstractWeights, [dim]; mean=nothing, corrected=false) @@ -40,7 +40,7 @@ replacing ``\\frac{1}{\\sum{w}}`` with a factor dependent on the type of weights """ function var(v::RealArray, w::AbstractWeights; mean=nothing, corrected::DepBool=nothing) - corrected = depcheck(:var, corrected) + corrected = depcheck(:var, :corrected, corrected) if mean == nothing varm(v, w, Statistics.mean(v, w); corrected=corrected) @@ -53,14 +53,14 @@ end function varm!(R::AbstractArray, A::RealArray, w::AbstractWeights, M::RealArray, dim::Int; corrected::DepBool=nothing) - corrected = depcheck(:varm!, corrected) + corrected = depcheck(:varm!, :corrected, corrected) rmul!(_wsum_centralize!(R, abs2, A, convert(Vector, w), M, dim, true), varcorrection(w, corrected)) end function var!(R::AbstractArray, A::RealArray, w::AbstractWeights, dims::Int; mean=nothing, corrected::DepBool=nothing) - corrected = depcheck(:var!, corrected) + corrected = depcheck(:var!, :corrected, corrected) if mean == 0 varm!(R, A, w, Base.reducedim_initarray(A, dims, 0, eltype(R)), dims; @@ -84,14 +84,14 @@ end function varm(A::RealArray, w::AbstractWeights, M::RealArray, dim::Int; corrected::DepBool=nothing) - corrected = depcheck(:varm, corrected) + corrected = depcheck(:varm, :corrected, corrected) varm!(similar(A, Float64, Base.reduced_indices(axes(A), dim)), A, w, M, dim; corrected=corrected) end function var(A::RealArray, w::AbstractWeights, dim::Int; mean=nothing, corrected::DepBool=nothing) - corrected = depcheck(:var, corrected) + corrected = depcheck(:var, :corrected, corrected) var!(similar(A, Float64, Base.reduced_indices(axes(A), dim)), A, w, dim; mean=mean, corrected=corrected) end @@ -115,7 +115,7 @@ dependent on the type of weights used: * `Weights`: `ArgumentError` (bias correction not supported) """ stdm(v::RealArray, w::AbstractWeights, m::Real; corrected::DepBool=nothing) = - sqrt(varm(v, w, m, corrected=depcheck(:stdm, corrected))) + sqrt(varm(v, w, m, corrected=depcheck(:stdm, :corrected, corrected))) """ std(x::AbstractArray, w::AbstractWeights, [dim]; mean=nothing, corrected=false) @@ -136,18 +136,18 @@ weights used: * `Weights`: `ArgumentError` (bias correction not supported) """ std(v::RealArray, w::AbstractWeights; mean=nothing, corrected::DepBool=nothing) = - sqrt.(var(v, w; mean=mean, corrected=depcheck(:std, corrected))) + sqrt.(var(v, w; mean=mean, corrected=depcheck(:std, :corrected, corrected))) stdm(v::RealArray, m::RealArray, dim::Int; corrected::DepBool=nothing) = - sqrt!(varm(v, m, dims=dim, corrected=depcheck(:stdm, corrected))) + sqrt!(varm(v, m, dims=dim, corrected=depcheck(:stdm, :corrected, corrected))) stdm(v::RealArray, w::AbstractWeights, m::RealArray, dim::Int; corrected::DepBool=nothing) = - sqrt.(varm(v, w, m, dim; corrected=depcheck(:stdm, corrected))) + sqrt.(varm(v, w, m, dim; corrected=depcheck(:stdm, :corrected, corrected))) std(v::RealArray, w::AbstractWeights, dim::Int; mean=nothing, corrected::DepBool=nothing) = - sqrt.(var(v, w, dim; mean=mean, corrected=depcheck(:std, corrected))) + sqrt.(var(v, w, dim; mean=mean, corrected=depcheck(:std, :corrected, corrected))) ##### Fused statistics """ @@ -183,12 +183,12 @@ end function mean_and_var(x::RealArray, w::AbstractWeights; corrected::DepBool=nothing) m = mean(x, w) - v = varm(x, w, m; corrected=depcheck(:mean_and_var, corrected)) + v = varm(x, w, m; corrected=depcheck(:mean_and_var, :corrected, corrected)) m, v end function mean_and_std(x::RealArray, w::AbstractWeights; corrected::DepBool=nothing) m = mean(x, w) - s = stdm(x, w, m; corrected=depcheck(:mean_and_std, corrected)) + s = stdm(x, w, m; corrected=depcheck(:mean_and_std, :corrected, corrected)) m, s end @@ -208,13 +208,13 @@ end function mean_and_var(x::RealArray, w::AbstractWeights, dims::Int; corrected::DepBool=nothing) m = mean(x, w, dims=dims) - v = varm(x, w, m, dims; corrected=depcheck(:mean_and_var, corrected)) + v = varm(x, w, m, dims; corrected=depcheck(:mean_and_var, :corrected, corrected)) m, v end function mean_and_std(x::RealArray, w::AbstractWeights, dims::Int; corrected::DepBool=nothing) m = mean(x, w, dims=dims) - s = stdm(x, w, m, dims; corrected=depcheck(:mean_and_std, corrected)) + s = stdm(x, w, m, dims; corrected=depcheck(:mean_and_std, :corrected, corrected)) m, s end diff --git a/src/weights.jl b/src/weights.jl index 2a9f4ad62..0d03317fc 100644 --- a/src/weights.jl +++ b/src/weights.jl @@ -205,16 +205,22 @@ pweights(vs::RealArray) = ProbabilityWeights(vec(vs)) end """ - eweights(t::AbstractVector{<:Integer}, λ::Real) - eweights(t::AbstractVector{T}, r::StepRange{T}, λ::Real) where T - eweights(n::Integer, λ::Real) + eweights(t::AbstractVector{<:Integer}, λ::Real; scale=false) + eweights(t::AbstractVector{T}, r::StepRange{T}, λ::Real; scale=false) where T + eweights(n::Integer, λ::Real; scale=false) Construct a [`Weights`](@ref) vector which assigns exponentially decreasing weights to past -observations, which in this case corresponds to larger integer values `i` in `t`. -If an integer `n` is provided, weights are generated for values from 1 to `n` -(equivalent to `t = 1:n`). +observations (larger integer values `i` in `t`). +The integer value `n` represents the number of past observations to consider. +`n` defaults to `maximum(t) - minimum(t) + 1` if only `t` is passed in +and the elements are integers, and to `length(r)` if a superset range `r` is also passed in. +If `n` is explicitly passed instead of `t`, `t` defaults to `1:n`. -For each element `i` in `t` the weight value is computed as: +If `scale` is `true` then for each element `i` in `t` the weight value is computed as: + +``(1 - λ)^{n - i}`` + +If `scale` is `false` then each value is computed as: ``λ (1 - λ)^{1 - i}`` @@ -222,42 +228,59 @@ For each element `i` in `t` the weight value is computed as: - `t::AbstractVector`: temporal indices or timestamps - `r::StepRange`: a larger range to use when constructing weights from a subset of timestamps -- `n::Integer`: if provided instead of `t`, temporal indices are taken to be `1:n` +- `n::Integer`: the number of past events to consider - `λ::Real`: a smoothing factor or rate parameter such that ``0 < λ ≤ 1``. As this value approaches 0, the resulting weights will be almost equal, while values closer to 1 will put greater weight on the tail elements of the vector. +# Keyword arguments + +- `scale::Bool`: Return the weights scaled to between 0 and 1 (default: false) + # Examples ```julia-repl -julia> eweights(1:10, 0.3) +julia> eweights(1:10, 0.3; scale=true) 10-element Weights{Float64,Float64,Array{Float64,1}}: - 0.3 - 0.42857142857142855 - 0.6122448979591837 - 0.8746355685131197 - 1.249479383590171 - 1.7849705479859588 - 2.549957925694227 - 3.642797036706039 - 5.203995766722913 - 7.434279666747019 + 0.04035360699999998 + 0.05764800999999997 + 0.08235429999999996 + 0.11764899999999996 + 0.16806999999999994 + 0.24009999999999995 + 0.3429999999999999 + 0.48999999999999994 + 0.7 + 1.0 ``` -""" -function eweights(t::AbstractVector{T}, λ::Real) where T<:Integer +# Links +- https://en.wikipedia.org/wiki/Moving_average#Exponential_moving_average +- https://en.wikipedia.org/wiki/Exponential_smoothing +""" +function eweights(t::AbstractVector{<:Integer}, λ::Real; kwargs...) + isempty(t) && return Weights(copy(t), 0) + (lo, hi) = extrema(t) + return _eweights(t, λ, hi - lo + 1; kwargs...) +end + +eweights(n::Integer, λ::Real; kwargs...) = _eweights(1:n, λ, n; kwargs...) +eweights(t::AbstractVector, r::AbstractRange, λ::Real; kwargs...) = + _eweights(something.(indexin(t, r)), λ, length(r); kwargs...) + +function _eweights(t::AbstractVector{<:Integer}, λ::Real, n::Integer; scale::DepBool=nothing) 0 < λ <= 1 || throw(ArgumentError("Smoothing factor must be between 0 and 1")) + f = depcheck(:eweights, :scale, scale) ? _scaled_eweight : _unscaled_eweight w0 = map(t) do i i > 0 || throw(ArgumentError("Time indices must be non-zero positive integers")) - λ * (1 - λ)^(1 - i) + f(i, λ, n) end s = sum(w0) Weights(w0, s) end -eweights(n::Integer, λ::Real) = eweights(1:n, λ) -eweights(t::AbstractVector, r::AbstractRange, λ::Real) = - eweights(something.(indexin(t, r)), λ) +_unscaled_eweight(i, λ, n) = λ * (1 - λ)^(1 - i) +_scaled_eweight(i, λ, n) = (1 - λ)^(n - i) # NOTE: no variance correction is implemented for exponential weights @@ -310,7 +333,7 @@ julia> uweights(3) 1 1 1 - + julia> uweights(Float64, 3) 3-element UnitWeights{Float64}: 1.0 diff --git a/test/weights.jl b/test/weights.jl index 7735e04f7..61d68320a 100644 --- a/test/weights.jl +++ b/test/weights.jl @@ -502,53 +502,62 @@ end end @testset "Exponential Weights" begin + λ = 0.2 @testset "Usage" begin - θ = 5.25 - λ = 1 - exp(-1 / θ) # simple conversion for the more common/readable method - v = [λ*(1-λ)^(1-i) for i = 1:4] + v = [(1 - λ) ^ (4 - i) for i = 1:4] w = Weights(v) - @test round.(w, digits=4) == [0.1734, 0.2098, 0.2539, 0.3071] + @test round.(w, digits=4) == [0.512, 0.64, 0.8, 1.0] @testset "basic" begin - @test eweights(1:4, λ) ≈ w + @test eweights(1:4, λ; scale=true) ≈ w end @testset "1:n" begin - @test eweights(4, λ) ≈ w + @test eweights(4, λ; scale=true) ≈ w end @testset "indexin" begin - v = [λ*(1-λ)^(1-i) for i = 1:10] + v = [(1 - λ) ^ (10 - i) for i = 1:10] # Test that we should be able to skip indices easily - @test eweights([1, 3, 5, 7], 1:10, λ) ≈ Weights(v[[1, 3, 5, 7]]) + @test eweights([1, 3, 5, 7], 1:10, λ; scale=true) ≈ Weights(v[[1, 3, 5, 7]]) # This should also work with actual time types t1 = DateTime(2019, 1, 1, 1) tx = t1 + Hour(7) - tn = DateTime(2019, 1, 2, 1) + tn = DateTime(2019, 1, 1, 10) - @test eweights(t1:Hour(2):tx, t1:Hour(1):tn, λ) ≈ Weights(v[[1, 3, 5, 7]]) + @test eweights(t1:Hour(2):tx, t1:Hour(1):tn, λ; scale=true) ≈ Weights(v[[1, 3, 5, 7]]) end end @testset "Empty" begin - @test eweights(0, 0.3) == Weights(Float64[]) - @test eweights(1:0, 0.3) == Weights(Float64[]) - @test eweights(Int[], 1:10, 0.4) == Weights(Float64[]) + @test eweights(0, 0.3; scale=true) == Weights(Float64[]) + @test eweights(1:0, 0.3; scale=true) == Weights(Float64[]) + @test eweights(Int[], 1:10, 0.4; scale=true) == Weights(Float64[]) end @testset "Failure Conditions" begin # λ > 1.0 - @test_throws ArgumentError eweights(1, 1.1) + @test_throws ArgumentError eweights(1, 1.1; scale=true) # time indices are not all positive non-zero integers - @test_throws ArgumentError eweights([0, 1, 2, 3], 0.3) + @test_throws ArgumentError eweights([0, 1, 2, 3], 0.3; scale=true) # Passing in an array of bools will work because Bool <: Integer, # but any `false` values will trigger the same argument error as 0.0 - @test_throws ArgumentError eweights([true, false, true, true], 0.3) + @test_throws ArgumentError eweights([true, false, true, true], 0.3; scale=true) + end + + @testset "scale=false" begin + v = [λ * (1 - λ)^(1 - i) for i = 1:4] + w = Weights(v) + + @test round.(w, digits=4) == [0.2, 0.25, 0.3125, 0.3906] + + wv = eweights(1:10, λ; scale=false) + @test eweights(1:10, λ; scale=true) ≈ wv / maximum(wv) end end