From de83c3298fbb2ba2f17b8e9fad9a694f6cf0607c Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Mon, 11 Sep 2023 02:45:53 -0500 Subject: [PATCH 1/5] use StatsModels.[g]vif --- Project.toml | 4 ++-- src/MixedModelsExtras.jl | 8 +++++--- test/vif.jl | 6 +++--- 3 files changed, 10 insertions(+), 8 deletions(-) diff --git a/Project.toml b/Project.toml index 777dee9..138fd9b 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "MixedModelsExtras" uuid = "781a26e1-49f4-409a-8f4c-c3159d78c17e" authors = ["Phillip Alday and contributors"] -version = "1.1.0" +version = "2.0.0" [deps] LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" @@ -14,7 +14,7 @@ Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] MixedModels = "3, 4" StatsBase = "0.33, 0.34" -StatsModels = "0.6.28, 0.7" +StatsModels = "0.7.3" Tables = "1" julia = "1.6" diff --git a/src/MixedModelsExtras.jl b/src/MixedModelsExtras.jl index a66dc21..680e97e 100644 --- a/src/MixedModelsExtras.jl +++ b/src/MixedModelsExtras.jl @@ -7,15 +7,17 @@ using StatsBase using StatsModels using Tables +using StatsModels: termnames, vif, gvif +export termnames, gvif, vif + +StatsModels.termnames(::RandomEffectsTerm) = String[] + include("icc.jl") export icc include("r2.jl") export r², r2, adjr², adjr2 -include("vif.jl") -export termnames, gvif, vif - include("remef.jl") export partial_fitted diff --git a/test/vif.jl b/test/vif.jl index 1df3f65..34fc03b 100644 --- a/test/vif.jl +++ b/test/vif.jl @@ -18,7 +18,7 @@ progress = false fm2 = fit(MixedModel, @formula(reaction ~ 1 + days * days^2 + (1 | subj)), dataset(:sleepstudy); progress) - ae_int = ArgumentError("VIF only defined for models with an intercept term") + ae_int = ArgumentError("VIF is only defined for models with an intercept term") ae_nterm = ArgumentError("VIF not meaningful for models with only one non-intercept term") @test_throws ae_int vif(fm0) @test_throws ae_nterm vif(fm1) @@ -46,7 +46,7 @@ end duncan = rdataset("car", "Duncan") lm1 = lm(@formula(Prestige ~ 1 + Income + Education), duncan) - @test termnames(lm1) == coefnames(lm1) + @test termnames(lm1)[2] == coefnames(lm1) vif_lm1 = vif(lm1) # values here taken from car @@ -54,7 +54,7 @@ end @test isapprox(vif_lm1, gvif(lm1)) lm2 = lm(@formula(Prestige ~ 1 + Income + Education + Type), duncan) - @test termnames(lm2) == ["(Intercept)", "Income", "Education", "Type"] + @test termnames(lm2)[2] == ["(Intercept)", "Income", "Education", "Type"] @test isapprox(gvif(lm2), [2.209178, 5.297584, 5.098592]; atol=1e-5) @test isapprox(gvif(lm2; scale=true), [1.486330, 2.301648, 1.502666]; atol=1e-5) From ca347284e4ff51ef2e7e812ca457274d515c03ee Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 13 Sep 2023 04:30:26 -0500 Subject: [PATCH 2/5] compat --- Project.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/Project.toml b/Project.toml index 138fd9b..d075330 100644 --- a/Project.toml +++ b/Project.toml @@ -12,7 +12,7 @@ StatsModels = "3eaba693-59b7-5ba5-a881-562e759f1c8d" Tables = "bd369af6-aec1-5ad0-b16a-f7cc5008161c" [compat] -MixedModels = "3, 4" +MixedModels = "4" StatsBase = "0.33, 0.34" StatsModels = "0.7.3" Tables = "1" From eff372f7e012956f10e15b032dd8749468f07217 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 13 Sep 2023 04:33:14 -0500 Subject: [PATCH 3/5] julia compat floor --- .github/workflows/ci.yml | 2 +- Project.toml | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index d7125fa..fd0c6bc 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -17,7 +17,7 @@ jobs: runs-on: ${{ matrix.os }} strategy: matrix: - version: [1.6, 1] + version: [1.8, 1] arch: [x64] os: [ubuntu-20.04] # macos-10.15, windows-2019 steps: diff --git a/Project.toml b/Project.toml index d075330..3db052b 100644 --- a/Project.toml +++ b/Project.toml @@ -16,7 +16,7 @@ MixedModels = "4" StatsBase = "0.33, 0.34" StatsModels = "0.7.3" Tables = "1" -julia = "1.6" +julia = "1.8" [extras] Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40" From 2fb1518ebab7a33a1a4cc6cfe1a8049fa7e62cb0 Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 13 Sep 2023 09:36:48 +0000 Subject: [PATCH 4/5] Update src/MixedModelsExtras.jl Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- src/MixedModelsExtras.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/MixedModelsExtras.jl b/src/MixedModelsExtras.jl index 680e97e..476e874 100644 --- a/src/MixedModelsExtras.jl +++ b/src/MixedModelsExtras.jl @@ -10,7 +10,7 @@ using Tables using StatsModels: termnames, vif, gvif export termnames, gvif, vif -StatsModels.termnames(::RandomEffectsTerm) = String[] +StatsModels.termnames(::RandomEffectsTerm) = String[] include("icc.jl") export icc From d037e207a812b8a0b0d32a15961945c8452eac8e Mon Sep 17 00:00:00 2001 From: Phillip Alday Date: Wed, 13 Sep 2023 05:23:02 -0500 Subject: [PATCH 5/5] remove unused file --- src/vif.jl | 143 ----------------------------------------------------- 1 file changed, 143 deletions(-) delete mode 100644 src/vif.jl diff --git a/src/vif.jl b/src/vif.jl deleted file mode 100644 index 87643e8..0000000 --- a/src/vif.jl +++ /dev/null @@ -1,143 +0,0 @@ -_rename_intercept(s) = strip(s) == "1" ? "(Intercept)" : s - -""" - termnames(model) - -Return the names associated with (fixed effects) terms in a model. - -For models with only continuous predictors, this is the same as -`coefnames(model)`. -For models with categorical predictors, the returned names reflect -the categorical predictor and not the coefficients resulting from -the choice of contrast coding. -""" -termnames(model) = _rename_intercept.(string.(_terms(model))) - -_terms(model) = collect(formula(model).rhs.terms) -_terms(model::MixedModel) = collect(formula(model).rhs[1].terms) - -""" - vif(m::RegressionModel) - -Compute the variance inflation factor (VIF). - -Returns a vector of inflation factors computed for each coefficient except -for the intercept. -In other words, the corresponding coefficients are `coefnames(m)[2:end]`. - -The [variance inflation factor (VIF)](https://en.wikipedia.org/wiki/Variance_inflation_factor) measures -the increase in the variance of a parameter's estimate in a model with multiple parameters relative to -the variance of a parameter's estimate in a model containing only that parameter. - -See also [`coefnames`](@ref), [`gvif`](@ref). - -!!! warning - This method will fail if there is (numerically) perfect multicollinearity, - i.e. rank deficiency (in the fixed effects). In that case though, the VIF - isn't particularly informative anyway. -""" -function vif(m::RegressionModel) - mm = Statistics.cov2cor!(vcov(m), stderror(m)) - all(==(1), view(modelmatrix(m), :, 1)) || - throw(ArgumentError("VIF only defined for models with an intercept term")) - mm = @view mm[2:end, 2:end] - size(mm, 2) > 1 || - throw(ArgumentError("VIF not meaningful for models with only one non-intercept term")) - # NB: The correlation matrix is positive definite and hence invertible - # unless there is perfect rank deficiency, hence the warning. - # NB: The determinate technique for GVIF could also be applied - # columnwise (instead of Term-wise) here, but it wouldn't really - # be any more efficient because this is a Symmetric matrix and computing - # all those determinants has its cost. The determinants also hint at - # how you could show equivalency, if you remember that inversion is solving - # a linear system and that Cramer's rule -- which uses determinants -- - # can also a linear system - # so we want diag(inv(mm)) but directly computing inverses is bad. - # that said, these are typically small-ish matrices and this is Simple. - return diag(inv(mm)) -end - -""" - gvif(m::RegressionModel; scale=false) - -Compute the generalized variance inflation factor (GVIF). - -If `scale=true`, then each GVIF is scaled by the degrees of freedom -for (number of coefficients associated with) the predictor: ``GVIF^(1 / (2*df))`` - -Returns a vector of inflation factors computed for each term except -for the intercept. -In other words, the corresponding coefficients are `termnames(m)[2:end]`. - -The [generalized variance inflation factor (VIF)](https://doi.org/10.2307/2290467) -measures the increase in the variance of a (group of) parameter's estimate in a model -with multiple parameters relative to the variance of a parameter's estimate in a -model containing only that parameter. For continuous, numerical predictors, the GVIF -is the same as the VIF, but for categorical predictors, the GVIF provides a single -number for the entire group of contrast-coded coefficients associated with a categorical -predictor. - -See also [`termnames`](@ref), [`vif`](@ref). - -!!! warning - This method will fail if there is (numerically) perfect multicollinearity, - i.e. rank deficiency (in the fixed effects). In that case though, the VIF - isn't particularly informative anyway. - -## References - -Fox, J., & Monette, G. (1992). Generalized Collinearity Diagnostics. -Journal of the American Statistical Association, 87(417), 178. doi:10.2307/2290467 -""" -function gvif(m::RegressionModel; scale=false) - mm = StatsBase.cov2cor!(vcov(m), stderror(m)) - - StatsModels.hasintercept(formula(m)) || - throw(ArgumentError("GVIF only defined for models with an intercept term")) - mm = @view mm[2:end, 2:end] - size(mm, 2) > 1 || - throw(ArgumentError("GVIF not meaningful for models with only one non-intercept term")) - - tn = @view termnames(m)[2:end] - trms = @view _terms(m)[2:end] - - df = width.(trms) - vals = zeros(eltype(mm), length(tn)) - # julia> @benchmark gvif($fm2) # logdet(mm) - # BenchmarkTools.Trial: 10000 samples with 1 evaluation. - # Range (min … max): 28.776 μs … 12.991 ms ┊ GC (min … max): 0.00% … 99.34% - # Time (median): 32.320 μs ┊ GC (median): 0.00% - # Time (mean ± σ): 33.939 μs ± 129.624 μs ┊ GC (mean ± σ): 3.80% ± 0.99% - - # ▂▆▇▇▃▂▄▇█▆▅▄▂▁ - # ▂▃▆██████████████▇▆▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▁▁▁▁▂▂▂▂▂▂▂▂▂▂ ▄ - # 28.8 μs Histogram: frequency by time 48.3 μs < - # Memory estimate: 16.19 KiB, allocs estimate: 303. - # julia> @benchmark gvif($fm2) # logdet(cholesky(Symmetric(mm))) - # BenchmarkTools.Trial: 10000 samples with 1 evaluation. - # Range (min … max): 30.091 μs … 13.029 ms ┊ GC (min … max): 0.00% … 99.31% - # Time (median): 33.279 μs ┊ GC (median): 0.00% - # Time (mean ± σ): 35.016 μs ± 129.993 μs ┊ GC (mean ± σ): 3.70% ± 0.99% - - # ▆█▆▄▃▆▇▆▆▄▄▁ - # ▂▃▇█████████████▆▅▄▄▃▃▃▃▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▂▁▁▁▁▂▂▁▂▂▂▂▂▂▂▁▂▂▂▂▂ ▄ - # 30.1 μs Histogram: frequency by time 50.1 μs < - - # Memory estimate: 16.22 KiB, allocs estimate: 304. - logdetmm = logdet(mm) - acc = 0 - for idx in axes(vals, 1) - wt = df[idx] - trm_only = [acc < i <= (acc + wt) for i in axes(mm, 2)] - trm_excl = .!trm_only - vals[idx] = exp(logdet(view(mm, trm_only, trm_only)) + - logdet(view(mm, trm_excl, trm_excl)) - - logdetmm) - acc += wt - end - - if scale - vals .= vals .^ (1 ./ (2 .* df)) - end - return vals -end