diff --git a/.gitattributes b/.gitattributes index ec7bc9a..03d92dd 100644 --- a/.gitattributes +++ b/.gitattributes @@ -1 +1,5 @@ *.arrow filter=lfs diff=lfs merge=lfs -text +*.Rds filter=lfs diff=lfs merge=lfs -text +*.RDs filter=lfs diff=lfs merge=lfs -text +*.RDS filter=lfs diff=lfs merge=lfs -text +*.rds filter=lfs diff=lfs merge=lfs -text diff --git a/Makefile b/Makefile index c32174c..aa4f439 100644 --- a/Makefile +++ b/Makefile @@ -8,6 +8,7 @@ cleanup: clean: cleanup rm -rf .quarto/_freeze rm -rf _freeze/ + rm -rf _build/ publish: quarto publish gh-pages diff --git a/Project.toml b/Project.toml index 7bc207a..4b39ba7 100644 --- a/Project.toml +++ b/Project.toml @@ -28,6 +28,7 @@ MixedModels = "ff71e718-51f3-5ec2-a782-8ffcbfa3c316" MixedModelsDatasets = "7e9fb7ac-9f67-43bf-b2c8-96ba0796cbb6" MixedModelsExtras = "781a26e1-49f4-409a-8f4c-c3159d78c17e" MixedModelsMakie = "b12ae82c-6730-437f-aff9-d2c38332a376" +MixedModelsSerialization = "b32ace64-3998-4ca6-afd0-a0db4a0482b2" MixedModelsSim = "d5ae56c5-23ca-4a1f-b505-9fc4796fc1fe" PooledArrays = "2dfb63ee-cc39-5dd5-95bd-886bf059d720" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" @@ -46,7 +47,7 @@ ZipFile = "a5390f91-8eb1-5f08-bee0-b1d1ffed6cea" [compat] AlgebraOfGraphics = "0.8" Arrow = "2" -BoxCox = "0.3" +BoxCox = "0.3.3" CSV = "0.10" CairoMakie = "0.12" Chain = "0.5,0.6" @@ -59,6 +60,7 @@ MacroTools = "0.5" MixedModels = "4.22" MixedModelsExtras = "1, 2" MixedModelsMakie = "0.4" +MixedModelsSerialization = "0.1" MixedModelsSim = "0.2.7" RCall = "0.13, 0.14" Random = "1" diff --git a/_quarto.yml b/_quarto.yml index 70321cb..e119040 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -37,6 +37,7 @@ project: - singularity.qmd - sleepstudy.qmd - sleepstudy_speed.qmd + - transformation.qmd - useful_packages.qmd # editor_options: @@ -93,10 +94,11 @@ website: - largescaledesigned.qmd - mrk17.qmd - partial_within.qmd - - section: "Contrast coding" + - section: "Contrast coding and transformations" contents: - 
contrasts_fggk21.qmd - contrasts_kwdyz11.qmd + - transformation.qmd - glmm.qmd - section: "Bootstrap and profiling" contents: diff --git a/_typos.toml b/_typos.toml index 8a02277..05de200 100644 --- a/_typos.toml +++ b/_typos.toml @@ -13,6 +13,8 @@ missings = "missings" Lik = "Lik" missings = "missings" represention = "representation" +GAMMs = "GAMMs" + [type.qmd] extend-glob = ["*.qmd"] diff --git a/contrasts_fggk21.qmd b/contrasts_fggk21.qmd index 8736e79..5b7aa60 100644 --- a/contrasts_fggk21.qmd +++ b/contrasts_fggk21.qmd @@ -179,9 +179,9 @@ m_ovi_SeqDiff_2 = let end ``` -The difference between older and younger childrend is larger for `Star_r` than for `Run` (0.2473). `S20_r` did not differ significantly from `Star_r` (-0.0377) and `SLJ` (-0.0113) The largest difference in developmental gain was between `BPT` and `SLJ` (0.3355). +The difference between older and younger children is larger for `Star_r` than for `Run` (0.2473). `S20_r` did not differ significantly from `Star_r` (-0.0377) and `SLJ` (-0.0113) The largest difference in developmental gain was between `BPT` and `SLJ` (0.3355). -P**lease note that standard errors of this LMM are anti-conservative because the LMM is missing a lot of information in the RES (e..g., contrast-related VCs snd CPs for `Child`, `School`, and `Cohort`.** +**Please note that standard errors of this LMM are anti-conservative because the LMM is missing a lot of information in the RES (e..g., contrast-related VCs snd CPs for `Child`, `School`, and `Cohort`.** Next we add the main effect of `Sex` and its interaction with the four test contrasts. 
@@ -268,9 +268,9 @@ contr3 = Dict( :Test => HypothesisCoding( [ -1 -1 -1 -1 +4 - -1 +1 0 0 0 - 0 -1 +1 0 0 - 0 0 -1 +1 0 + -1 +1 0 0 0 + 0 -1 +1 0 0 + 0 0 -1 +1 0 ]; levels=["Run", "Star_r", "S20_r", "SLJ", "BPT"], labels=["BPT-other", "Star-End", "S20-Star", "SLJ-S20"], @@ -294,10 +294,10 @@ contr1b = Dict( :Sex => EffectsCoding(; levels=["Girls", "Boys"]), :Test => HypothesisCoding( [ - -1 +1 0 0 0 - 0 -1 +1 0 0 - 0 0 -1 +1 0 - 0 0 0 -1 +1 + -1 +1 0 0 0 + 0 -1 +1 0 0 + 0 0 -1 +1 0 + 0 0 0 -1 +1 ]; levels=["Run", "Star_r", "S20_r", "SLJ", "BPT"], labels=["Star-Run", "S20-Star", "SLJ-S20", "BPT-SLJ"], @@ -375,10 +375,10 @@ contr4 = Dict( :Sex => EffectsCoding(; levels=["Girls", "Boys"]), :Test => HypothesisCoding( [ - -1 0 0 0 +1 + -1 0 0 0 +1 -3 +2 +2 +2 -3 - 0 +2 -1 -1 0 - 0 0 +1 -1 0 + 0 +2 -1 -1 0 + 0 0 +1 -1 0 ]; levels=["Run", "Star_r", "S20_r", "SLJ", "BPT"], labels=["c5.1", "c234.15", "c2.34", "c3.4"], @@ -407,10 +407,10 @@ contr4b = merge( :Sex => EffectsCoding(; levels=["Girls", "Boys"]), :Test => HypothesisCoding( [ - 0.49 -0.04 0.20 0.03 -0.85 - 0.70 -0.56 -0.21 -0.13 0.37 - 0.31 0.68 -0.56 -0.35 0.00 - 0.04 0.08 0.61 -0.78 0.13 + 0.49 -0.04 0.20 0.03 -0.85 + 0.70 -0.56 -0.21 -0.13 0.37 + 0.31 0.68 -0.56 -0.35 0.00 + 0.04 0.08 0.61 -0.78 0.13 ]; levels=["Run", "Star_r", "S20_r", "SLJ", "BPT"], labels=["c5.1", "c234.15", "c12.34", "c3.4"], diff --git a/contrasts_kwdyz11.qmd b/contrasts_kwdyz11.qmd index c68928b..3c07811 100644 --- a/contrasts_kwdyz11.qmd +++ b/contrasts_kwdyz11.qmd @@ -1,12 +1,17 @@ --- title: Contrast Coding of Visual Attention Effects engine: julia -julia: +julia: exeflags: ["--project", "--threads=auto"] + +fig-format: png --- + ```{julia} #| code-fold: true +using AlgebraOfGraphics +using CairoMakie using Chain using DataFrames using MixedModels @@ -14,20 +19,23 @@ using SMLP2024: dataset using StatsBase using StatsModels -const progress=false +CairoMakie.activate!(; type="png") + +progress = false ``` # A word of caution 
{#sec-caution} - +Many researchers have pointed out that contrasts should be "tested instead of, rather than as a supplement to, the ordinary 'omnibus' F test" (Hays, 1973, p. 601). + For a (quasi-)experimental set of data, there is (or should be) a clear _a priori_ theoretical commitment to specific hypotheses about differences between factor levels and how these differences enter in interactions with other factors. This specification should be used in the first LMM and reported, irrespective of the outcome. If alternative theories lead to alternative _a priori_ contrast specifications, both analyses are justified. If the observed means render the specification completely irrelevant, the comparisons originally planned could still be reported in a Supplement). -In this script, we are working through a large number of different contrasts for the same data. The purpose is to introduce both the preprogrammed (“canned”) and the general options to specify hypotheses about main effects and interactions. Obviously, we do not endorse generating a plot of the means and specifying the contrasts accordingly. This is known as the [Texas sharpshooter](https://www.bayesianspectacles.org/origin-of-the-texas-sharpshooter/) fallacy. The link leads to an illustration and brief historical account by Wagenmakers (2018). +In this script, we are working through a large number of different contrasts for the same data. The purpose is to introduce both the preprogrammed ("canned") and the general options to specify hypotheses about main effects and interactions. Obviously, we do not endorse generating a plot of the means and specifying the contrasts accordingly. This is known as the [Texas sharpshooter](https://www.bayesianspectacles.org/origin-of-the-texas-sharpshooter/) fallacy. The link leads to an illustration and brief historical account by Wagenmakers (2018). 
Irrespective of how results turn out, there is nothing wrong with specifying a set of post-hoc contrasts to gain a better understanding of what the data are trying to tell us. Of course, in an article or report about the study, the _a priori_ and post-hoc nature of contrast specifications must be made clear. Some kind of alpha-level adjustment (e.g., Bonferroni) may be called for, too. And, of course, there are grey zones. -There is quite a bit of statistical literature on contrasts. Two “local” references are @Brehm2022 and @Schad2020. +There is quite a bit of statistical literature on contrasts. Two "local" references are @Brehm2022 and @Schad2020. -For further readings see “Further Readings” in @Schad2020. +For further readings see "Further Readings" in @Schad2020. # Example data {#sec-data} @@ -78,6 +86,7 @@ Controlling the ordering of levels for contrasts: ## SeqDiffCoding The `SeqDiffCoding` contrast corresponds to `MASS::contr.sdif()` in R. +The assignment of random factors such as `Subj` to `Grouping()` is necessary when the sample size is very large. We recommend to include it always, but in this tutorial we do so only in the first example. ```{julia} m1 = let levels = ["val", "sod", "dos", "dod"] @@ -88,9 +97,19 @@ m1 = let levels = ["val", "sod", "dos", "dod"] end ``` + +What does the intercept represent? + +```{julia} +mean(dat1.rt) +mean(cellmeans.rt_mean) +``` + +Grand Mean is mean of condition means. + ## HypothesisCoding -`HypothesisCoding` is the most general option available. We can implement all "canned" contrasts ourselves. The next example reproduces the test statistcs from `SeqDiffCoding` - with a minor modification illustrating the flexibility of going beyond the default version. +`HypothesisCoding` is the most general option available. We can implement all "canned" contrasts ourselves. 
The next example reproduces the test statistics from `SeqDiffCoding` - with a minor modification illustrating the flexibility of going beyond the default version. ```{julia} m1b = let levels = ["val", "sod", "dos", "dod"] @@ -146,66 +165,74 @@ m2b = let levels = ["val", "sod", "dos", "dod"] end ``` -We can simply relevel the factor or move the column with -1s for a different base. +We can simply move the column with -1s for a different base. ```{julia} m2c = let levels = ["val", "sod", "dos", "dod"] contrasts = Dict( :CTR => HypothesisCoding( [ - -1/2 1/2 0 0 - -1/2 0 1/2 0 - -1/2 0 0 1/2 + 1 -1 0 0 + 0 -1 1 0 + 0 -1 0 1 ]; levels, - labels=levels[2:end], + labels=["val", "dos", "dod"], ) ) fit(MixedModel, form, dat1; contrasts, progress) end ``` -We can simply relevel the factor or move the column with -1s for a different base. +We can simply relevel the factor with a different base. +```{julia} +m2d = let levels = ["sod", "val", "dos", "dod"] + contrasts = Dict( + :CTR => HypothesisCoding( + [ + -1 1 0 0 + -1 0 1 0 + -1 0 0 1 + ]; + levels, + labels=levels[2:end], + ) + ) + fit(MixedModel, form, dat1; contrasts) +end +``` ## EffectsCoding -This contrast corresponds almost to `contr.sum()` in R. +`EffectsCoding` estimates the difference between the Grand Mean and three of the four levels. The difference of the fourth levels can be computed from the Grand Mean and these three differences. ```{julia} -m3 = let - contrasts = Dict(:CTR => EffectsCoding(; base="dod")) +m3 = let levels = ["val", "sod", "dos", "dod"] + contrasts = Dict(:CTR => EffectsCoding(; levels, base="val")) fit(MixedModel, form, dat1; contrasts, progress) end ``` -The “almost” qualification refers to the fact that `contr.sum()` uses the last factor levels as default base level; `EffectsCoding` uses the first level. +This contrast corresponds almost to `contr.sum()` in R. 
The "almost" qualification refers to the fact that `EffectsCoding` uses the first level as default base; `contr.sum()` uses the last factor level. ```{julia} -m3b = let levels = [ "dod", "val", "sod", "dos"] - contrasts = Dict( - :CTR => HypothesisCoding( - [ - -1/4 3/4 -1/4 -1/4 - -1/4 -1/4 3/4 -1/4 - -1/4 -1/4 -1/4 3/4 - ]; - levels, - labels=levels[2:end], - ) - ) +m3b = let levels = ["val", "sod", "dos", "dod"] + contrasts = Dict(:CTR => EffectsCoding(; levels, base = "dod")) fit(MixedModel, form, dat1; contrasts, progress) end ``` +How could we achieve the default result with HypothesisCoding? + ```{julia} -m3c = let levels = [ "dod", "val", "sod", "dos"] +m3c = let levels = ["val", "sod", "dos", "dod"] contrasts = Dict( :CTR => HypothesisCoding( [ - -1/2 3/2 -1/2 -1/2 - -1/2 -1/2 3/2 -1/2 - -1/2 -1/2 -1/2 3/2 + -1/4 3/4 -1/4 -1/4 # b - GM = b - (a+b+c+d)/4 => -1/4*a + 3/4*b - 1/4*c - 1/4*d + -1/4 -1/4 3/4 -1/4 # c - GM = c - (a+b+c+d)/4 => -1/4*a - 1/4*b + 3/4*c - 1/4*d + -1/4 -1/4 -1/4 3/4 # d - GM = d - (a+b+c+d)/4 => -1/4*a - 1/4*b - 1/4*c + 3/4*d ]; levels, labels=levels[2:end], @@ -218,19 +245,39 @@ end ## HelmertCoding -`HelmertCoding` codes each level as the difference from the average of the lower levels. With the default order of `CTR` levels we get the following test statistics. These contrasts are othogonal. +`HelmertCoding` codes each level as the difference from the average of the lower levels. With the default order of `CTR` levels we get the following test statistics. These contrasts are orthogonal. 
```{julia} -m4 = let - contrasts = Dict(:CTR => HelmertCoding()) +m4 = let levels = ["val", "sod", "dos", "dod"] + contrasts = Dict(:CTR => HelmertCoding(; levels)) fit(MixedModel, form, dat1; contrasts, progress) end ``` ```sh -+ HeC1: (2 - 1)/2 # (391 - 358)/2 -+ HeC2: (3 - (2+1)/2)/3 # (405 - (391 + 358)/2)/3 -+ HeC3: (4 - (3+2+1)/3)/4 # (402 - (405 + 391 + 358)/3)/4 ++ HeC1: (b - a)/2 # (391 - 358)/2 = 16.5 ++ HeC2: (c - (b+a)/2)/3 # (405 - (391 + 358)/2)/3 = 10.17 ++ HeC3: (d - (c+b+a)/3)/4 # (402 - (405 + 391 + 358)/3)/4 = 4.33 +``` + +We can reconstruct the estimates, but they are scaled by the number of levels involved. With `HypothesisCoding` we can estimate the "unscaled" differences. Also the labeling of the contrasts is not as informative as they could be. + +```{julia} +m4b = let levels = ["val", "sod", "dos", "dod"] + contrasts = Dict( + :CTR => HypothesisCoding( + [ + -1 1 0 0 + -1/2 -1/2 1 0 + -1/3 -1/3 -1/3 1 + + ]; + levels, + labels= ["2-1", "3-21", "4-321"] + ) + ) + fit(MixedModel, form, dat1; contrasts, progress) +end ``` ## Reverse HelmertCoding @@ -238,18 +285,70 @@ end `Reverse HelmertCoding` codes each level as the difference from the average of the higher levels. To estimate these effects we simply reverse the order of factor levels. Of course, the contrasts are also orthogonal. ```{julia} -m4b = let levels = reverse(StatsModels.levels(dat1.CTR)) +m4c = let levels = reverse(StatsModels.levels(dat1.CTR)) contrasts = Dict(:CTR => HelmertCoding(; levels)) fit(MixedModel, form, dat1; contrasts, progress) end ``` ```sh -+ HeC1:(3 - 4)/2 # (405 - 402)/2 -+ HeC2:(2 - (3+4)/2)/3 # (391 - (405 + 402)/2)/3 -+ HeC3:(1 - (2+3+4)/3/4 # (356 -(391 + 405 + 402)/3)/4 ++ HeC1:(c - d)/2 # (405 - 402)/2 = 1.5 ++ HeC2:(b - (c+d)/2)/3 # (391 - (405 + 402)/2)/3 = -4.17 ++ HeC3:(a - (b+c+d)/3/4 # (356 -(391 + 405 + 402)/3)/4 = -10.83 +``` + +... and the unscaled-by-number-of-levels estimates. 
+ +```{julia} +m4d = let levels = ["val", "sod", "dos", "dod"] + contrasts = Dict( + :CTR => HypothesisCoding( + [ + 0 0 1 -1 + 0 1 -1/2 -1/2 + 1 -1/3 -1/3 -1/3 + ]; + levels, + labels= ["3-4", "2-34", "1-234"] + ) + ) + fit(MixedModel, form, dat1; contrasts) +end ``` +# Other orthogonal contrasts + +For factors with more than four levels there are many options for specifying orthogonal contrasts as long as one proceeds in a top-down strictly hierarchical fashion. + +Suppose you have a factor with seven levels and let's ignore shifting columns. +In this case, you have six options for the first contrast, that is 6 vs. 1, 5 vs.2 , 4 vs. 3, 3 vs. 4, 2 vs. 5, and 1 vs. 6 levels. +Then, you specify orthogonal contrasts for partitions with more than 2 elements and so on. +That is, you don't specify a contrast that crosses an earlier partition line. + +In the following example, after an initial 4 vs 3 partitioning of levels, we specify `AnovaCoding` for the left and `HelmertCoding` for the right partition. + +```{julia} +contrasts = Dict( + :CTR => HypothesisCoding( + [ + -1/4 -1/4 -1/4 -1/4 +1/3 +1/3 +1/3 + -1/2 -1/2 +1/2 +1/2 0 0 0 + -1/2 +1/2 -1/2 +1/2 0 0 0 + +1/2 -1/2 -1/2 +1/2 0 0 0 + 0 0 0 0 -1 +1 0 + 0 0 0 0 -1/2 -1/2 1 + ]; + levels=["A1", "A2", "A3", "A4", "A5", "A6", "A7"], + labels=["c567.1234", "B", "C", "BxC", "c6.5", "c6.56"], + ), +); +``` + +There are two rules that hold for all orthogonal contrasts: + + 1. The weights within rows sum to zero. + 2. For all pairs of rows, the sum of the products of weights in the same columns sums to zero. + ## Anova Coding Factorial designs (i.e., lab experiments) are traditionally analyzed with analysis of variance. The test statistics of main effects and interactions are based on an orthogonal set of contrasts. 
@@ -300,10 +399,6 @@ Going beyond the four level factor; it is also helpful to see the corresponding A2 -1 +1 A2 +1 -1 ``` -### A(2) x B(2) x C(3) - -TO BE DONE - ## Nested coding Nested contrasts are often specified as follow up as post-hoc tests for ANOVA interactions. They are orthogonal. We specify them with `HypothesisCoding`. @@ -313,7 +408,8 @@ The following contrast specification returns an estimate for the main effect of In a figure With A on the x-axis and the levels of B shown as two lines, the second contrast tests whether A1-B1 is different from A1-B2 and the third contrast tests whether A2-B1 is different from A2-B2. ```{julia} -m8 = let levels = ["val", "sod", "dos", "dod"] + +m6 = let levels = ["val", "sod", "dos", "dod"] contrasts = Dict( :CTR => HypothesisCoding( [ @@ -329,41 +425,75 @@ m8 = let levels = ["val", "sod", "dos", "dod"] end ``` -The three contrasts for one main effect and two nested contrasts are orthogonal. -There is no test of the interaction (parallelism). +The three contrasts for one main effect and two nested contrasts are orthogonal. There is no test of the interaction (parallelism). -# Other orthogonal contrasts +# An Example for a complex example: Group(2) x A(2) x B(2) -For factors with more than four levels there are many options for specifying orthogonal contrasts as long as one proceeds in a top-down strictly hiearchical fashion. +Three factors: -Suppose you have a factor with seven levels and let's ignore shifting columns. -In this case, you have six options for the first contrast, that is 6 vs. 1, 5 vs.2 , 4 vs. 3, 3 vs. 4, 2 vs. 5, and 1 vs. 6 levels. -Then, you specify orthogonal contrasts for partitions with more than 2 elements and so on. -That is, you don't specify a contrast that crosses an earlier partition line. 
++ G(roup): G1, G2 - between subjects ++ A: A1, A2, - within subjects ++ B: B1, B2, B3 - within subjects -In the following example, after an initial 4 vs 3 partitioning of levels, we specify `AnovaCoding` for the left and `HelmertCoding` for the right partition. +2 x 3 = 6 measures / subject ```{julia} -contrasts = Dict( - :CTR => HypothesisCoding( - [ - -1/4 -1/4 -1/4 -1/4 +1/3 +1/3 +1/3 - -1/2 -1/2 +1/2 +1/2 0 0 0 - -1/2 +1/2 -1/2 +1/2 0 0 0 - +1/2 -1/2 -1/2 +1/2 0 0 0 - 0 0 0 0 -1 +1 0 - 0 0 0 0 -1/2 -1/2 1 - ]; - levels=["A1", "A2", "A3", "A4", "A5", "A6", "A7"], - labels=["c567.1234", "B", "C", "BxC", "c6.5", "c6.56"], - ), -); +dat2 = dataset(:exp_2x2x3) ``` -There are two rules that hold for all orthogonal contrasts: +We select an LMM supported by the data. + +```{julia} +cntrst2 = Dict( + :Group => SeqDiffCoding(; levels=["G1", "G2"]), + :A => SeqDiffCoding(; levels=["A1", "A2"]), + :B => SeqDiffCoding(; levels=["B1", "B2", "B3"]), + ) + +f6_cpx = @formula dv ~ 1 + Group*A*B + (1 + A+B | Subj); +m6_cpx = fit(MixedModel, f6_cpx, dat2; contrasts=cntrst2) +issingular(m6_cpx) + +f6_zcp = @formula dv ~ 1 + Group*A*B + zerocorr(1 + A+B | Subj); +m6_zcp = fit(MixedModel, f6_zcp, dat2; contrasts=cntrst2) +issingular(m6_zcp) + +f6_ovi = @formula dv ~ 1 + Group*A*B + (1 | Subj); +m6_ovi = fit(MixedModel, f6_ovi, dat2; contrasts=cntrst2) +``` + +```{julia} +lrtest(m6_ovi, m6_zcp, m6_cpx) +``` + + +There is a significant interaction between A and the first contrast of B (i.e., B2 - B1). The interaction is not significant for A and the second contrast of B (i.e., B3 - B2). This implies that the left pair of lines in the following figure is statistically not parallel and that we do not have sufficient evidence that the right pair of lines is not parallel. + +``` +─────────────────────────────────────────────────────────────────── + Coef. Std. 
Error z Pr(>|z|) +─────────────────────────────────────────────────────────────────── +A: A2 & B: B2 3.53363 0.924789 3.82 0.0001 +A: A2 & B: B3 0.523243 0.950202 0.55 0.5819 +─────────────────────────────────────────────────────────────────── +``` + +```{julia} +using Chain +tbl1 = @chain DataFrame(dat2) begin + groupby(_, [:Subj, :A, :B]) + combine(_, nrow => :n, :dv => mean => :dv) + groupby(_, [:A, :B]) + combine(_, + :dv => mean => :dv_M, + :dv => std => :dv_SD, + :dv => sem => :dv_SE) +end + +fig1 = data(tbl1) * mapping(:B, :dv_M; color=:A) * (visual(Lines) + visual(Scatter)) +draw(fig1) +``` - 1. The weights within rows sum to zero. - 2. For all pairs of rows, the sum of the products of weights in the same columns sums to zero. # Appendix: Summary (Dave Kleinschmidt) diff --git a/kkl15.qmd b/kkl15.qmd index 31b8b40..008b0be 100644 --- a/kkl15.qmd +++ b/kkl15.qmd @@ -15,9 +15,9 @@ We specify three contrasts for the four-level factor CTR that are derived from s This comparison is of interest because a few years after the publication of @Kliegl2011, the theoretically critical correlation parameter (CP) between the spatial effect and the attraction effect was determined as the source of a non-singular LMM in that paper. The present study served the purpose to estimate this parameter with a larger sample and a wider variety of experimental conditions. -Here we also include two additional experimental manipulations of target size and orientation of cue rectangle. A similar analysis was reported in the parsimonious mixed-model paper [@Bates2015]; it was also used in a paper of GAMMs [@Baayen2017]. Data and R scripts of those analyses are also available in [R-package RePsychLing](https://github.com/dmbates/RePsychLing/tree/master/data/). +Here we also include two additional experimental manipulations of target size and orientation of cue rectangle. 
A similar analysis was reported in the parsimonious mixed-model paper [@Bates2015]; it was also used in a paper of GAMEMs [@Baayen2017]. Data and R scripts of those analyses are also available in [R-package RePsychLing](https://github.com/dmbates/RePsychLing/tree/master/data/). -The analysis is based on log-transformed reaction times `lrt`, indicated by a _boxcox()_ check of model residuals. +The analysis is based on reaction times `rt` to maintain compatibility with @Kliegl2011. In this vignette we focus on the reduction of model complexity. And we start with a quote: @@ -65,7 +65,7 @@ describe(dat_subj) ```{julia} #| code-fold: true -#| fig-cap: Comparative boxplots of mean log response time by subject under different conditions +#| fig-cap: Comparative boxplots of mean response time by subject under different conditions #| label: fig-bxpltsubjcond boxplot( dat_subj.CTR.refs, @@ -87,39 +87,9 @@ boxplot( ) ``` -Mean of log reaction times for four cue-target relations. Targets appeared at (a) the cued position (valid) in a rectangle, (b) in the same rectangle cue, but at its other end, (c) on the second rectangle, but at a corresponding horizontal/vertical physical distance, or (d) at the other end of the second rectangle, that is $\sqrt{2}$ of horizontal/vertical distance diagonally across from the cue, that is also at larger physical distance compared to (c). +Mean of reaction times for four cue-target relations. Targets appeared at (a) the cued position (valid) in a rectangle, (b) in the same rectangle cue, but at its other end, (c) on the second rectangle, but at a corresponding horizontal/vertical physical distance, or (d) at the other end of the second rectangle, that is $\sqrt{2}$ of horizontal/vertical distance diagonally across from the cue, that is also at larger physical distance compared to (c). 
-We remove the outlier subject and replot, but we model the data points in `dat` and check whether this subject appears as an outlier in the caterpillar plot of conditional modes. - -```{julia} -#| code-fold: true -#| fig-cap: 'Comparative boxplots of mean log response time by subject under different conditions without outlier' -#| label: fig-bxpltsubjcond2 -let dat_subj = filter(r -> r.rt_m < 510, dat_subj) - boxplot( - dat_subj.CTR.refs, - dat_subj.lrt_m; - orientation=:horizontal, - show_notch=true, - axis=(; - yticks=( - 1:4, - [ - "valid cue", - "same obj/diff pos", - "diff obj/same pos", - "diff obj/diff pos", - ] - ) - ), - figure=(; resolution=(800, 300)), - ) -end -``` - -# Setup of linear mixed model - -## Contrasts +# Contrasts ```{julia} contrasts = Dict( @@ -129,54 +99,19 @@ contrasts = Dict( ) ``` -```{julia} -m_max_rt = let - form = @formula rt ~ 1 + CTR * size * cardinal + - (1 + CTR * size * cardinal | Subj) - fit(MixedModel, form, dat; contrasts) -end -``` - -```{julia} -m_cpx_rt = let - form = @formula rt ~ 1 + CTR * size * cardinal + - (1 + CTR + size + cardinal | Subj) - fit(MixedModel, form, dat; contrasts) -end -``` - -## Box-Cox - -```{julia} -#| eval: false -bc1 = fit(BoxCoxTransformation, m_max_rt) -``` - -```{julia} -bc2 = fit(BoxCoxTransformation, m_cpx_rt) -``` - - -```{julia} -#| eval: false -boxcoxplot(bc2; conf_level=0.95) -``` - -Clear evidence for skew. Traditionally, we used log transforms for reaction times. even stronger than log. We stay with log for now. Could try `1/sqrt(rt)`. - # Maximum LMM -This is the maximum LMM for the design. +This is the maximum LMM for the design; `size` is a between-subject factor, +ignoring other information such as trial number, age and gender of subjects. 
```{julia} m_max = let - form = @formula log(rt) ~ 1 + CTR * size * cardinal + - (1 + CTR * size * cardinal | Subj) + form = @formula rt ~ 1 + CTR * cardinal * size + + (1 + CTR * cardinal | Subj) fit(MixedModel, form, dat; contrasts) -end +end; ``` - ```{julia} issingular(m_max) ``` @@ -189,18 +124,20 @@ only(MixedModels.PCA(m_max)) VarCorr(m_max) ``` +The LMM `m_max` is overparameterized but it is not immediately apparent why. + # Reduction strategy 1 ## Zero-correlation parameter LMM (1) -Force CPs to zero. +Force CPs to zero. _Reduction strategy 1_ is more suited for reducing model w/o theoretical expectations about CPs. The better reduction strategy for the present experiment with an _a priori_ interest in CPs is described as _Reduction strategy 2_. ```{julia} m_zcp1 = let - form = @formula log(rt) ~ 1 + CTR * size * cardinal + - zerocorr(1 + CTR * size * cardinal | Subj) + form = @formula rt ~ 1 + CTR * cardinal * size + + zerocorr(1 + CTR * cardinal | Subj) fit(MixedModel, form, dat; contrasts) -end +end; ``` ```{julia} @@ -215,16 +152,18 @@ only(MixedModels.PCA(m_zcp1)) VarCorr(m_zcp1) ``` +The LMM `m_zcp1` is also overparameterized, but now there is clear evidence for absence of evidence for the VC of one of the interactions and the other two interaction-based VCs are also very small. + ## Reduced zcp LMM -Take out VC for interactions. +Take out VCs for interactions. ```{julia} m_zcp1_rdc = let - form = @formula log(rt) ~ 1 + CTR * size * cardinal + - zerocorr(1 + CTR + size + cardinal | Subj) + form = @formula rt ~ 1 + CTR * cardinal * size + + zerocorr(1 + CTR + cardinal | Subj) fit(MixedModel, form, dat; contrasts) -end +end; ``` ```{julia} @@ -239,27 +178,7 @@ only(MixedModels.PCA(m_zcp1_rdc)) VarCorr(m_zcp1_rdc) ``` -## Model comparison 1 -Let's compare the three models. 
- -```{julia} -gof_summary = let - nms = [:m_zcp1_rdc, :m_zcp1, :m_max] - mods = eval.(nms) - lrt = MixedModels.likelihoodratiotest(m_zcp1_rdc, m_zcp1, m_max) - DataFrame(; - name = nms, - dof=dof.(mods), - deviance=round.(deviance.(mods), digits=0), - AIC=round.(aic.(mods),digits=0), - AICc=round.(aicc.(mods),digits=0), - BIC=round.(bic.(mods),digits=0), - χ²=vcat(:., round.(lrt.tests.deviancediff, digits=0)), - χ²_dof=vcat(:., round.(lrt.tests.dofdiff, digits=0)), - pvalue=vcat(:., round.(lrt.tests.pvalues, digits=3)) - ) -end -``` +LMM `m_zcp_rdc` is ok . We add in CPs. ## Parsimonious LMM (1) @@ -267,10 +186,10 @@ Extend zcp-reduced LMM with CPs ```{julia} m_prm1 = let - form = @formula log(rt) ~ 1 + CTR * size * cardinal + - (1 + CTR + size + cardinal | Subj) + form = @formula rt ~ 1 + CTR * cardinal * size + + (1 + CTR + cardinal | Subj) fit(MixedModel, form, dat; contrasts) -end +end; ``` ```{julia} @@ -281,19 +200,21 @@ issingular(m_prm1) only(MixedModels.PCA(m_prm1)) ``` +LMM `m_zcp_rdc` is ok . We add in CPs. + ```{julia} VarCorr(m_prm1) ``` -We note that the critical correlation parameter between spatial (`sod`) and attraction (`dod`) is now estimated at .60 -- not that close to the 1.0 boundary that caused singularity in @Kliegl2011. +We note that the critical correlation parameter between spatial (`sod`) and attraction (`dod`) is now estimated at .54 -- not that close to the 1.0 boundary that caused singularity in @Kliegl2011. -## Model comparison 2 +## Model comparison 1 ```{julia} gof_summary = let nms = [:m_zcp1_rdc, :m_prm1, :m_max] mods = eval.(nms) - lrt = MixedModels.likelihoodratiotest(m_prm1, m_zcp1, m_max) + lrt = MixedModels.likelihoodratiotest(m_zcp1_rdc, m_prm1, m_max) DataFrame(; name = nms, dof=dof.(mods), @@ -308,37 +229,61 @@ gof_summary = let end ``` +AIC prefers LMM `m_prm1` over `m_zcp1_rdc`; BIC LMM `m_zcp1_rdc`. +As the CPs were one reason for conducting this experiment, AIC is the criterion of choice. 
+ # Reduction strategy 2 ## Complex LMM -Take out interaction VCs. +Relative to LMM `m_max`, first we take out interaction VCs and associated CPs, because these VCs are very small. This is the same as LMM `m_prm1` above. ```{julia} m_cpx = let - form = @formula log(rt) ~ 1 + CTR * size * cardinal + - (1 + CTR + size + cardinal | Subj) + form = @formula rt ~ 1 + CTR * cardinal * size + + (1 + CTR + cardinal | Subj) fit(MixedModel, form, dat; contrasts) -end +end; ``` ## Zero-correlation parameter LMM (2) -Take out interaction VCs. +Now we check the significance of ensemble of CPs. ```{julia} m_zcp2 = let - form = @formula log(rt) ~ 1 + CTR * size * cardinal + - zerocorr(1 + CTR + size + cardinal | Subj) + form = @formula rt ~ 1 + CTR * cardinal * size + + zerocorr(1 + CTR + cardinal | Subj) fit(MixedModel, form, dat; contrasts) -end +end; +``` + + +```{julia} +VarCorr(m_zcp2) ``` -## Model comparison 3 +## Parsimonious LMM (2) + +The cardinal-related CPs are quite small. Do we need them? + +```{julia} +m_prm2 = let + form = @formula(rt ~ 1 + CTR * cardinal * size + + (1 + CTR | Subj) + (0 + cardinal | Subj)) + fit(MixedModel, form, dat; contrasts) +end; +``` + +```{julia} +VarCorr(m_prm2) +``` + +## Model comparison 2 ```{julia} gof_summary = let - nms = [:m_zcp2, :m_cpx, :m_max] + nms = [:m_zcp2, :m_prm2, :m_cpx, :m_max] mods = eval.(nms) - lrt = MixedModels.likelihoodratiotest(m_zcp2, m_cpx, m_max) + lrt = MixedModels.likelihoodratiotest(m_zcp2, m_prm2, m_cpx, m_max) DataFrame(; name = nms, dof=dof.(mods), @@ -353,25 +298,10 @@ gof_summary = let end ``` -# Check LMM for untransformed reaction times - -We see the model is singular. - -```{julia} -issingular(m_cpx_rt) -``` - -```{julia} -MixedModels.PCA(m_cpx_rt) -``` - -# Other checks - -```{julia} -m_prm1.θ -m_prm1.lowerbd -m_prm1.λ -``` +The cardinal-related CPs could be removed w/o loss of goodness of fit. +However, there is no harm in keeping them in the LMM. 
+The data support both LMM `m_prm2` and `m_cpx` (same as: `m_prm1`). +We keep the slightly more complex LMM `m_cpx` (`m_prm1`). # Diagnostic plots of LMM residuals @@ -427,29 +357,6 @@ qqnorm( ) ``` -## Observed and theoretical normal distribution -******We****** can see this in this plot. -Overall, it does not look too bad. - -```{julia} -#| code-fold: true -#| label: fig-stdresidm1dens -#| fig-cap: ' Kernel density plot of the standardized residuals for model m1 versus a standard normal' -CairoMakie.activate!(; type="svg") -let - n = nrow(dat) - dat_rz = (; - value=vcat(residuals(m_prm1) ./ std(residuals(m_prm1)), randn(n)), - curve=repeat(["residual", "normal"]; inner=n), - ) - draw( - data(dat_rz) * - mapping(:value; color=:curve) * - density(; bandwidth=0.1); - ) -end -``` - # Conditional modes ## Caterpillar plot @@ -464,7 +371,6 @@ caterpillar!(Figure(; resolution=(800, 1200)), cm1; orderby=2) ## Shrinkage plot -### Log-transformed reaction times (LMM `m_prm1`) ```{julia} #| code-fold: true @@ -602,10 +508,7 @@ draw( Three CPs stand out positively, the correlation between GM and the spatial effect, GM and attraction effect, and the correlation between spatial and attraction effects. The second CP was positive, but not significant in the first study. The third CP replicates a CP that was judged questionable in script `kwdyz11.jl`. - -The three remaining CPs are not well defined for log-transformed reaction times; they only fit noise and should be removed. -It is also possible that fitting the complex experimental design (including target size and rectangle orientation) will lead to more acceptable estimates. -The corresponding plot based on LMM `m1_rt` for raw reaction times still shows them with very wide distributions, but acceptable. +The three remaining CPs are not well defined for reaction times. 
# References diff --git a/live_coding_sessions/power_analysis.jl b/live_coding_sessions/power_analysis.jl new file mode 100644 index 0000000..bd801ab --- /dev/null +++ b/live_coding_sessions/power_analysis.jl @@ -0,0 +1,101 @@ +using DataFrames +using MixedModels +using MixedModelsSim +using Random +using Statistics +using SMLP2024: dataset + +fm1 = fit(MixedModel, + @formula(reaction ~ 1 + days + (1 + days|subj)), + dataset(:sleepstudy)) + +parametricbootstrap(MersenneTwister(42), 1000, fm1) + +slpsim = DataFrame(dataset(:sleepstudy); copycols=true) + +slpsim[:, :reaction] .= 0.0 + +slpsimmod = LinearMixedModel(@formula(reaction ~ 1 + days + (1 + days|subj)), slpsim) + +# coefnames(slpsimmod) +simulate!(MersenneTwister(42), slpsimmod; β=[500, 50], σ=250) +response(slpsimmod) +fit!(slpsimmod) + +slpsimpow = parametricbootstrap(MersenneTwister(42), 1000, slpsimmod; β=[500, 50], σ=250) + +combine(groupby(DataFrame(slpsimpow.coefpvalues), :coefname), + :p => (p -> mean(<(0.05), p)) => :power) + +# now let's do random effects! +# these are expressed _relative_ to the residual standard deviation +# so we devide by 250, which is what we set as the residual standard deviation +# TODO: add a named argument "relative=true" +subj_re = create_re(100/250, 20/250) +update!(slpsimmod; subj=subj_re) + +simulate!(MersenneTwister(42), slpsimmod; β=[500, 50], σ=250, θ=slpsimmod.θ) +fit!(slpsimmod) + +slpsimpow = parametricbootstrap(MersenneTwister(42), 1000, slpsimmod; β=[500, 50], σ=250) + +# could also use `power_table` (defined in MixedModelsSim) +combine(groupby(DataFrame(slpsimpow.coefpvalues), :coefname), + :p => (p -> mean(<(0.05), p)) => :power) + + +# big questions: +# where do we get these numbers from? + +# holding the RE and residual and FE-intercept constant, what is the smallest +# days-effect that I could detect with the given sample size? 
+ +to_consider = [10, 20, 30, 50] +# initialize an empty array of dataframes +considerations = DataFrame[] +for eff_size in to_consider + slpsimpow = parametricbootstrap(MersenneTwister(42), 1000, slpsimmod; + β=[500, eff_size], σ=250) + power_at_size = combine(groupby(DataFrame(slpsimpow.coefpvalues), :coefname), + :p => (p -> mean(<(0.05), p)) => :power) + power_at_size[:, :eff_size] .= eff_size + push!(considerations, power_at_size) +end + +# put it all together in one dataframe +power = reduce(vcat, considerations) + +# for a given sample size, now we know what size effects we can detect +# what happens if we have an effect size and what to know what sample size we need? + +# for this study, there are no items +# and the number of observations per subject is given by the design +# so we only have one n to worry about +days = 0:9 +rng = MersenneTwister(42) +subj_re = create_re(100/250, 20/250) + +considerations = DataFrame[] +for n_subj in [10, 20, 30, 40] + subj_ids = "S" .* lpad.(string.(1:n_subj), 3, '0') + + subj_data = DataFrame[] + for subj in subj_ids + sdat = DataFrame(; subj, days) + push!(subj_data, sdat) + end + simdat = reduce(vcat, subj_data) + # initialize the response with normally distributed noise + simdat[:, :reaction] .= randn(nrow(simdat)) + + slpsimmod = LinearMixedModel(@formula(reaction ~ 1 + days + (1 + days|subj)), simdat) + + update!(slpsimmod; subj=subj_re) + slpsimpow = parametricbootstrap(rng, 1000, slpsimmod; β=[500, 10], σ=250) + power_at_size = combine(groupby(DataFrame(slpsimpow.coefpvalues), :coefname), + :p => (p -> mean(<(0.05), p)) => :power) + power_at_size[:, :n_subj] .= n_subj + push!(considerations, power_at_size) +end + +power = reduce(vcat, considerations) diff --git a/live_coding_sessions/simulating_predicting_saving.jl b/live_coding_sessions/simulating_predicting_saving.jl new file mode 100644 index 0000000..cb7bf88 --- /dev/null +++ b/live_coding_sessions/simulating_predicting_saving.jl @@ -0,0 +1,106 @@ +using 
MixedModels +using SMLP2024: dataset +using DataFrames +using MixedModelsSim +using Random + +##### +##### Saving and loading +##### + +fm1 = fit(MixedModel, + @formula(reaction ~ 1 + days + (1+days|subj)), + dataset(:sleepstudy)) + +saveoptsum("mymodel.json", fm1) + +fm_new_session = LinearMixedModel(@formula(reaction ~ 1 + days + (1+days|subj)), + dataset(:sleepstudy)) + +restoreoptsum!(fm_new_session, "mymodel.json") + +# the Serialization stdlib can also be used here, but it's not guaranteed +# to be compatible across Julia versions + +# this can be used with the Effects package +# and for printing the model summary, but +# does not store the model matrices and can't be used for +# e.g. "fitted" or condVar +# https://juliamixedmodels.github.io/MixedModelsSerialization.jl/stable/api/ +using MixedModelsSerialization +fm1_summary = MixedModelSummary(fm1) +save_summary("mymodelsummary.jld2", fm1_summary) + +item_btwn = Dict(:freq => ["low", "high"]) +subj_btwn = Dict(:age => ["young", "old"], :l1 => ["German", "English", "Dutch"]) +df = DataFrame(simdat_crossed(MersenneTwister(12321), 6, 2; item_btwn, subj_btwn)) +rename!(df, :dv => :rt) + +boot = parametricbootstrap(MersenneTwister(10), 1000, fm1) +savereplicates("bootstrap.arrow", boot) +# does not modify the original model but still requires it +# to get all the metadata +boot_restored = restorereplicates("bootstrap.arrow", fm1) + +# things we don't necessarily recommend but are often requested +using MixedModelsExtras + +# predict() + +# linear mixed models +slp = DataFrame(dataset(:sleepstudy); copycols=true) +slp[1:10, :subj] .= "new guy" +predict(fm1, slp) # same as predict(fm1, slp; new_re_levels=:missing) +predict(fm1, slp; new_re_levels=:population) + +# kb07 = dataset(:kb07) + +# glmm +gm1 = fit(MixedModel, + @formula(use ~ 1 + age + abs2(age) + livch + urban + (1|dist)), + dataset(:contra), + Bernoulli()) + +# default +predict(gm1, dataset(:contra); type=:response) +predict(gm1, dataset(:contra); 
type=:linpred) + +using Effects +contra = dataset(:contra) +design = Dict(:age => -13:0.1:13, + :livch => unique(contra.livch), + :urban => unique(contra.urban)) + +eff = effects(design, gm1; + invlink=AutoInvLink(), + eff_col="use", + level=0.95) +using CairoMakie +using AlgebraOfGraphics +plt = data(eff) * + mapping(:age, :use; color=:livch, layout=:urban) * + (visual(Lines) + + mapping(; lower=:lower, upper=:upper) * visual(LinesFill)) +draw(plt) + +# Pipes +# weird syntax for built in +plt |> draw + +using Statistics +using Chain +@chain eff begin + groupby([:livch]) + combine(:use => mean => :use) +end + +# equivalent to +@chain eff begin + groupby(_, [:livch]) + combine(_, :use => mean => :use) +end + +@macroexpand @chain eff begin + groupby([:livch]) + combine(:use => mean => :use) +end diff --git a/live_coding_sessions/transformation.jl b/live_coding_sessions/transformation.jl new file mode 100644 index 0000000..765b9e2 --- /dev/null +++ b/live_coding_sessions/transformation.jl @@ -0,0 +1,53 @@ +using BoxCox +using DataFrames +using CairoMakie +using MixedModels +using MixedModelsMakie +using SMLP2024: dataset + +kb07 = dataset(:kb07) + +mkb07 = fit(MixedModel, + @formula(rt_raw ~ 1 + spkr * prec * load + + (1 + spkr + prec + load | item) + + (1 + spkr + prec + load | subj)), + kb07) + +plot_diagnostics(model) = plot_diagnostics!(Figure(), model) +function plot_diagnostics!(f, model) + ax = Axis(f[1,1]; aspect=1, title="QQPlot", + xlabel="theoretical", ylabel="observed") + qqnorm!(ax, model) + ax = Axis(f[1, 2]; aspect=1, title="Residual PDF", + xlabel="residual value", ylabel="density") + density!(ax, residuals(model)) + + alpha = 0.3 + ax = Axis(f[2, 1]; aspect=1, title="Fitted vs Observed", + xlabel="fitted", ylabel="observed") + scatter!(ax, fitted(model), response(model); alpha) + ablines!(ax, 0, 1; linestyle=:dash, color=:black) + + ax = Axis(f[2,2]; aspect=1, title="Residuals vs Fitted", + xlabel="fitted", ylabel="residuals") + scatter!(ax, 
fitted(model), residuals(model); alpha) + hlines!(ax, 0; linestyle=:dash, color=:black) + return f +end + +plot_diagnostics!(Figure(;size=(700,700)), mkb07) +bckb07 = fit(BoxCoxTransformation, mkb07) + +mkb07_bc = fit(MixedModel, + @formula(bckb07(rt_raw) ~ 1 + spkr * prec * load + + (1 + spkr + prec + load | item) + + (1 + spkr + prec + load | subj)), + kb07) +plot_diagnostics!(Figure(;size=(700,700)), mkb07_bc) + +mkb07_bcalt = fit(MixedModel, + @formula(1 / sqrt(rt_raw) ~ 1 + spkr * prec * load + + (1 + spkr + prec + load | item) + + (1 + spkr + prec + load | subj)), + kb07) +plot_diagnostics!(Figure(;size=(700,700)), mkb07_bcalt) diff --git a/references.bib b/references.bib index d36a673..cbe1af8 100644 --- a/references.bib +++ b/references.bib @@ -200,4 +200,21 @@ @Article{Brehm2022 doi = {10.1016/j.jml.2022.104334}, } + +@article{YeoJohnson2000, + title = {A new family of power transformations to improve normality or symmetry}, + volume = {87}, + issn = {0006-3444, 1464-3510}, + url = {https://academic.oup.com/biomet/article-lookup/doi/10.1093/biomet/87.4.954}, + doi = {10.1093/biomet/87.4.954}, + language = {en}, + number = {4}, + urldate = {2024-08-08}, + journal = {Biometrika}, + author = {Yeo, In-Kwon and Johnson, Richard A.}, + month = dec, + year = {2000}, + pages = {954--959}, +} + @Comment{jabref-meta: databaseType:bibtex;} diff --git a/src/datasets.jl b/src/datasets.jl index 83654d4..3e27d7c 100644 --- a/src/datasets.jl +++ b/src/datasets.jl @@ -2,7 +2,7 @@ _file(x) = joinpath(CACHE[], string(x, ".arrow")) clear_scratchspaces!() = Scratch.clear_scratchspaces!(@__MODULE__) -const datasets = +const DATASETS = CSV.read( IOBuffer( """ @@ -21,6 +21,7 @@ fggk21_Child,c2fmn,1,61c91e00336e6f804e9f6b86986ebb4a14561cc4908b3a21cb27c113d2b fggk21_Score,7fqx3,1,99d73ee705aaf5f4ee696eadbba992d0113ba6f467ce337a62a63853e4617400 kkl15,p8cea,2,90d7bb137c8613d7a15c8597c461aee7c7cb0f0989a07c80fc93e1fbe2e5c156 
kwdyz11,4cv52,3,2fa23aa8aa25e1adb10183c8d29646ae0d19d6baef9d711c9906f7fa1b225571 +exp_2x2x3,za9gs,1,cb09684b7373492e849c83f20a071b97f986123677134ac2ddb9ec0dcb32e503 """ ), Table; @@ -29,25 +30,34 @@ kwdyz11,4cv52,3,2fa23aa8aa25e1adb10183c8d29646ae0d19d6baef9d711c9906f7fa1b225571 ) if @isdefined(_cacheddatasets) - empty!(_cacheddatasets) # start from an empty cache in case datasets has changed + empty!(_cacheddatasets) # start from an empty cache in case DATASETS has changed else const _cacheddatasets = Dict{Symbol, Arrow.Table}() end +""" + datasets() + +Return a vector of the names of datasets available for use in [`dataset`](@ref). +""" +function datasets() + return sort!(vcat(SMLP2024.DATASETS.dsname, MixedModelsDatasets.datasets())) +end + """ dataset(name::Union(Symbol, AbstractString)) Return as an `Arrow.Table` the dataset named `name`. Available dataset names, their versions, the filenames on the osf.io site and an SHA2 checksum of their contents -are in the table `datasets`. +are in the table `DATASETS`. The files are cached in the scratchspace for this package. The name of this directory is the value of `CACHE[]`. """ function dataset(nm::AbstractString) return get!(_cacheddatasets, Symbol(nm)) do # retrieve from cache if available, otherwise - # check for nm in datasets table first so MMDS can be overridden - rows = filter(==(nm) ∘ getproperty(:dsname), datasets) + # check for nm in DATASETS table first so MMDS can be overridden + rows = filter(==(nm) ∘ getproperty(:dsname), DATASETS) if isempty(rows) nm in MMDS || error("Dataset '$nm' is not available") MixedModelsDatasets.dataset(nm) @@ -58,10 +68,12 @@ function dataset(nm::AbstractString) if ismissing(row.filename) load_quiver() # special-case `ratings` and `movies` else + @info "Downloading dataset..." 
Downloads.download( string("https://osf.io/", row.filename, "/download?version=", row.version), fnm, ) + @info "done" end end if row.sha2 ≠ bytes2hex(open(sha2_256, fnm)) diff --git a/transformation.qmd b/transformation.qmd new file mode 100644 index 0000000..99e9f09 --- /dev/null +++ b/transformation.qmd @@ -0,0 +1,210 @@ +--- +title: "Transformations of the predictors and the response" +engine: julia +author: Phillip Alday +julia: + exeflags: ["--project", "--threads=auto"] +--- + +## Predictors + +When dealing with categorical variables, the choice of contrast coding impacts the interpretation of the coefficients of the fitted model but does not impact the predictions made by the model nor its general goodness of it. +If we apply _linear_ transformations to our predictors, then we see a similar pattern for continuous variables. + +For example, in a model with `age` (in years) as a predictor, the untransformed variable yields a model where the intercept corresponds to `age = 0`, i.e. a newborn. +For a typical experiment with young adult participants, this presents a few challenges in interpretation: + +- newborns are widely outside the range of the observed data, so it seems problematic *prima facie* to interpret the estimated results for a value so far outside the range of the observed data +- we *know* that newborns and young adults are widely different and that the effect of age across childhood on most psychological and biological phenomena is not linear. For example, children do not grow at a constant rate from birth until adulthood. + +Beyond _centering_ a variable so that the center reflects an interpretable hypothesis, we may also want to _scale_ a variable to move towards more easily interpretable units. For example, it is common to express things in terms of standard deviations instead of raw units -- combined with centering, this yields _$z$-scoring_ . 
+
+In addition to placing some variables on a more interpretable scale, $z$-scoring can be used across all continuous predictors to place them all on a single, common scale.
+The advantage of a shared scale across all continuous predictors is that the magnitudes of coefficient estimates are directly comparable.
+The disadvantage is that the natural units are lost, especially when the natural units are directly interpretable (e.g. milliseconds, grams, etc.).
+
+::: {.callout-note collapse="true" title="Nonlinear transformations"}
+There are also other possible nonlinear transformations, such as the logarithm or various polynomials, but we will leave this alone. Nonlinear transformations change the predictions of the model (in addition to changing the interpretation of the associated coefficients) and should be appropriately motivated by the data and theory.
+:::
+
+In other words, from an interpretability standpoint, many continuous variables require just as much attention to their "coding" as categorical variables do.
+
+::: {.callout-note collapse="true" title="Scaling can also help numerical aspects of model fitting"}
+
+From a practical perspective, linear transformations of the predictors may also make model fitting easier.
+In an abstract mathematical sense, the scale of the variables does not matter, but computers and hence our software exist in a less idealized realm.
+In an intuitive sense, we can think of rounding error -- if we are dealing with quantities on widely different scales, then the quantities on the larger scale will tend to dominate the quantities on the smaller scale.
+This is why many guides on how to deal with convergence issues suggest scaling your variables. 
+ +::: + +In Julia, the package [`StandardizedPredictors.jl`](https://beacon-biosignals.github.io/StandardizedPredictors.jl/v1/) takes advantage of this parallel between linear transformations and contrast coding and allows you to specify centering, scaling and $z$-transformations as part of the contrast specification. + +We'll also be using the [`Effects.jl`](https://beacon-biosignals.github.io/Effects.jl/v1.3/) package to demonstrate that these transformation do not change the model predictions. + +```{julia} +using DataFrames +using Effects +using MixedModels +using StandardizedPredictors +using SMLP2024: dataset +``` + +```{julia} +slp = fit(MixedModel, + @formula(reaction ~ 1 + days + (1 + days |subj)), + dataset(:sleepstudy)) +``` + +```{julia} +days_centered = fit(MixedModel, + @formula(reaction ~ 1 + days + (1 + days |subj)), + dataset(:sleepstudy); + contrasts=Dict(:days => Center())) +``` + +If we look at the log-likelihood, AIC, BIC, etc. of these two models, we see that they are the same: + +```{julia} +mods = [slp, days_centered] +DataFrame(; model=["original", "centered"], + AIC=aic.(mods), + AICc=aicc.(mods), + BIC=bic.(mods), + logLik=loglikelihood.(mods)) +``` + + +We can also see that models make identical predictions. +The Effects package will compute predictions and estimated errors at a predefined grid. +For more complicated models, we can also use the package to compute "typical" values, such as the mean, median or mode, for variables that we wish to ignore. +We don't need to worry about that right now, since we only have one non-intercept predictor. + +```{julia} +# a fully crossed grid is computed from the elements of `design`. +# this is similar to how `expand.grid` works in R. +design = Dict(:days => [1, 4, 9]) +effects(design, slp; level=0.95) +``` + +```{julia} +effects(design, days_centered; level=0.95) +``` + +If this sounds like `effects` or `emmeans` in R, that's because there is a large overlap. 
+
+## Response
+
+In addition to transforming the predictors, we can also consider transforming the response (dependent variable).
+There are many different common possibilities -- the log, the inverse/reciprocal, or even the square root -- and it can be difficult to choose an appropriate one.
+For non-negative response (e.g., reaction time in many experiments), @Box1964 figured out a generalization that subsumes all of these possibilities:
+
+$$
+\begin{cases}
+\frac{y^{\lambda} - 1}{\lambda} &\quad \lambda \neq 0 \\
+\log y &\quad \lambda = 0
+\end{cases}
+$$
+
+Our task is thus finding the appropriate $\lambda$ such that the _conditional distribution_ is as normal as possible.
+In other words, we need to find $\lambda$ that results in _residuals_ that are as normal as possible.
+I've emphasized _conditional distribution_ and _residuals_ because that's where the normality assumption actually lies in the linear (mixed) model.
+The assumption is **not** that the response `y`, i.e. the unconditional distribution, is normally distributed, but rather that the residuals are normally distributed.
+In other words, we can only check the quality of a given $\lambda$ by fitting a model to the transformed response.
+Fortunately, [`BoxCox.jl`](https://palday.github.io/BoxCox.jl/v0.3.3/) makes this easy.
+
+The `fit` function takes two arguments:
+- the transformation to be fit (i.e. `BoxCoxTransformation`)
+- the model fit to the original data
+
+```{julia}
+using BoxCox
+bc = fit(BoxCoxTransformation, slp)
+```
+
+::: {.callout-note}
+For large models, fitting the `BoxCoxTransformation` can take a while because a mixed model must be repeatedly fit after each intermediate transformation. 
+::: + +Although we receive a single "best" value (approximately -1.0747) from the fitting process, it is worthwhile to look at the profile likelihood plot for the transformation: + +```{julia} +# we need a plotting backend loaded before we can use plotting functionality +# from BoxCox.jl +using CairoMakie +boxcoxplot(bc; conf_level=0.95) +``` + +Here we see that -1 is nearly as good. Moreover, time$^{-1}$ has a natural interpretation as _speed_. +In other words, we can model reaction speed instead of reaction time. +Then instead of seeing whether participants take longer to respond with each passing day, we can see whether their speed increases or decreases. +In both cases, we're looking at whether they respond _faster_ or _slower_ and even the terminology _fast_ and _slow_ suggests that speed is easily interpretable. + +If we recall the definition of the Box-Cox transformation from above: +$$ +\begin{cases} +\frac{y^{\lambda} - 1}{\lambda} &\quad \lambda \neq 0 \\ +\log y &\quad \lambda = 0 +\end{cases} +$$ + +then we see that there is a normalizing denominator that flips the sign when ``\lambda < 0``. +If we use the full Box-Cox formula, then the sign of the effect in our transformed and untransformed model remains the same. +While useful at times, speed has a natural interpretation and so we instead use the power relation, which is the actual key component, without normalization. + +Because `reaction` is stored in milliseconds, we use `1000 / reaction` instead of `1 / reaction` so that our speed units are responses per second. + +```{julia} +model_bc = fit(MixedModel, + @formula(1000 / reaction ~ 1 + days + (1 + days | subj)), + dataset(:sleepstudy)) +``` + +For our original model on the untransformed scale, the intercept was approximately 250, which means that the average response time was about 250 milliseconds. 
+For the model on the speed scale, we have an intercept of approximately 4, which means that the average response speed is about 4 responses per second, which implies that the average response time is 250 milliseconds.
+In other words, our new results are compatible with our previous estimates.
+
+This example also makes something else clear: much like transformations of the predictors, transforming the response **changes the hypothesis being tested**.
+While it is relatively easy to re-formulate hypotheses about reaction time into hypotheses about speed, it can be harder to re-formulate other hypotheses.
+For example, a log transformation of the response changes the hypotheses on the original scale from _additive_ effects to _multiplicative effects_.
+As a very simple example, consider two observations `y1 = 100` and `y2 = 1000`.
+On the original scale, `y2 = 10 * y1`.
+But on the $\log_{10}$ scale, `log10(y2) = 1 + log10(y1)`.
+In other words: I recommend keeping interpretability of the model in mind before blindly chasing perfectly fulfilling all model assumptions.
+
+There are two other little tricks that `BoxCox.jl` has to offer.
+First, the fitted transformation will work just like a function:
+```{julia}
+bc(1000)
+```
+
+```{julia}
+bc.(response(slp))
+```
+
+Second, the decades since the publication of @Box1964 have seen many proposed extensions to handle data that may not be strictly positive.
+One such proposal from @YeoJohnson2000 is also implemented in BoxCox.jl. 
+The definition of the transformation is:
+
+$$
+\begin{cases} ((y_i + 1)^\lambda - 1)/\lambda & \text{if }\lambda \neq 0, y_i \geq 0 \\
+             \log(y_i + 1) & \text{if }\lambda = 0, y_i \geq 0 \\
+             -((-y_i + 1)^{(2-\lambda)} - 1) / (2 - \lambda) & \text{if }\lambda \neq 2, y_i < 0 \\
+             -\log(-y_i + 1) & \text{if }\lambda = 2, y_i < 0
+\end{cases}
+$$
+
+and we can fit it in BoxCox.jl with
+
+```{julia}
+yj = fit(YeoJohnsonTransformation, slp)
+```
+
+```{julia}
+f = boxcoxplot(yj; conf_level=0.95)
+f[0, :] = Label(f, "Yeo-Johnson"; tellwidth=false)
+f
+```
+
+::: {#refs}
+:::