---
title: "R: `mgcv` as a general framework for (most) Generalized Models"
author: "Asger Svenning"
date: "2024-10-20"
output:
  rmdformats::downcute:
    use_bookdown: true
    self_contained: true
    thumbnails: false
    lightbox: true
    gallery: true
    code_folding: show
    out_width: 90%
editor_options:
  chunk_output_type: console
---
<style>
div.question-box {
  width: 75%;
  margin: 10px auto;
  padding: 5px;
  border: 3px solid var(--code-block-background);
  border-radius: 15px;
  background: var(--success-color);
  color: var(--text-color);
  font-size: medium;
  p {
    padding: 10px;
    margin: 0;
    font-style: italic;
    color: color-mix(in hwb, var(--text-color), var(--page-background) 15%);
  }
  p:first-child:before {
    content: "Question: ";
    font-style: normal;
    font-weight: bold;
  }
  p ~ p:before {
    content: "Answer: ";
    font-style: normal;
    font-weight: bold;
  }
}
.page-content h1 {
  margin-top: 3rem;
}
</style>
```{r}
#| label: setup
#| include: false
Sys.setlocale(category="LC_TIME", locale="English_United States.1252")
Sys.setenv(LANG="en")
knitr::opts_chunk$set(
  echo = TRUE,
  cache = FALSE,
  fig.width = 10,
  fig.height = 10,
  fig.align = "center",
  out.width = "90%",
  dev = "png",
  dpi = 200
)
library(tidyverse)
library(mgcv)
library(gratia)
library(kableExtra)
library(patchwork)
source("helpers/predict.R")
source("helpers/statistics.R")
source("helpers/format_table.R")
theme_set(
  theme_bw() +
    theme(
      text = element_text(family = "serif"),
      strip.text = element_text(hjust = 0.5, size = 14, face = "bold"),
      strip.text.y.right = element_text(angle = 0),
      strip.text.y.left = element_text(angle = 0),
      title = element_text(hjust = 0.5, size = 14, face = "bold"),
      plot.title = element_text(hjust = 0.5, size = 16, face = "plain"),
      legend.title = element_text(hjust = 0.5),
      legend.text = element_text(size = 12),
      panel.border = element_rect(color = "black", linewidth = 1)
    )
)
rm_titles <- theme(
  axis.title = element_blank(),
  plot.title = element_blank(),
  plot.subtitle = element_blank()
)
syn_gam_rsq <- "N/A"
syn_bam_rsq <- "N/A"
```
**REFERENCES**
This tutorial was developed using the following resources:
- <https://peerj.com/articles/6876/>
- <https://fromthebottomoftheheap.net/2021/02/02/random-effects-in-gams/>
- <https://lter.github.io/lterdatasampler/articles/hbr_maples_vignette.html>
# Introduction
Welcome to the `mgcv` skill training session! In this tutorial, we will explore how to use the R `mgcv` package as a unified framework for modeling various types of generalized models, including Generalized Linear Models (GLMs), Linear Mixed Models (LMMs), Generalized Linear Mixed Models (GLMMs), Generalized Additive Models (GAMs), and Generalized Additive Mixed Models (GAMMs).
By the end of the session, you will have gained practical experience in:
- Selecting the appropriate model type (GLM, LMM, GAM, GLMM, GAMM) and family (e.g., `ziP`, `betar`, `gaulss`).
- Fitting and parameterizing models using the `mgcv` package.
- Visualizing key diagnostics, including residual plots, QQ plots, and fitted values.
- Optimizing model performance for larger datasets using `bam`.
## Sections Overview
This tutorial is divided into the following sections:
1. **Introduction to Generalized Models and Smoothing Functions**\
We will start by discussing the range of models that can be fitted using `mgcv`, including GLMs, LMMs, GLMMs, GAMs, and GAMMs. You will also learn about the available families and smoothing functions in `mgcv`.
2. **Fitting Generalized Linear Models (GLM)**\
In this section, we will fit basic GLMs to datasets using different families, such as Poisson for count data and binomial for binary data.
3. **Fitting Linear Mixed Models (LMM) and Generalized Linear Mixed Models (GLMM)**\
This section covers fitting LMMs and GLMMs to handle grouped data and random effects.
4. **Fitting Generalized Additive Models (GAM) and Generalized Additive Mixed Models (GAMM)**\
We will extend GLMs and GLMMs by incorporating smoothing terms for continuous predictors, resulting in GAMs and GAMMs.
5. **Optimizing for Large Datasets Using `bam`**\
For large datasets, the `bam` function in `mgcv` provides computational efficiency. This section will focus on optimizing model fitting for larger datasets.
## Introduction to Generalized Models and Smoothing Functions
`mgcv` provides a unified framework for modeling a wide range of statistical models, including:
- **Generalized Linear Models (GLMs)**: A generalization of linear regression, allowing for different types of response distributions (e.g., Poisson, binomial).
- **Linear Mixed Models (LMMs)**: Extends linear models by incorporating random effects to account for grouped data.
- **Generalized Linear Mixed Models (GLMMs)**: Extends GLMs by including random effects.
- **Generalized Additive Models (GAMs)**: Extends GLMs by allowing nonlinear relationships through smoothing functions.
- **Generalized Additive Mixed Models (GAMMs)**: Combines GAMs and GLMMs, allowing for both smoothing and random effects.
Common families in `mgcv` include:
- **Gaussian**: For continuous, normally distributed data.
- **Poisson**: For count data.
- **Binomial**: For binary or proportion data.
- **Zero-inflated Poisson (`ziP`)**: For count data with excess zeros.
- **Beta regression (`betar`)**: For data bounded between 0 and 1.
- **Gaussian location-scale (`gaulss`)**: For modeling the mean and variance simultaneously.
- **Zero inflated (hurdle) Poisson location-scale (`ziplss`)**: For modelling count and occurrence simultaneously (e.g. presence/absence and abundance).
Smoothing functions available in `mgcv` include (a short sketch of their use follows the list):
- **Thin-plate splines (`bs = "tp"`)**: A flexible, isotropic smoothing function.
- **Cubic regression splines (`bs = "cr"`/`"cs"`)**: Penalized/shrinkage cubic regression splines.
- **Random effects (`bs = "re"`)**: For modeling random effects in Linear, Generalized Linear, and Generalized Additive Mixed Models (LMM/GLMM/GAMM).
- **Factor smooth interactions (`bs = "fs"`)**: For estimating random additive effects in GAMMs.
- **Gaussian process (`bs = "gp"`)**: For interpolating autocorrelated data, also sometimes known as kriging in GIS settings.
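To make the family and basis (`bs`) codes above concrete, here is a minimal, hedged sketch of how they appear in a `gam()` call. The variables (`y`, `x`, `lon`, `lat`), the factor `g`, and the data frame `data` are placeholders, not part of any dataset used in this tutorial:
```{r basis_examples, eval=F}
# Thin-plate spline smooth of x with a Poisson response
gam(y ~ s(x, bs = "tp"), data = data, family = poisson(), method = "REML")
# Cubic regression spline smooth of x plus a random intercept for the factor g
gam(y ~ s(x, bs = "cr") + s(g, bs = "re"), data = data, method = "REML")
# Factor smooth interaction: a separate penalized smooth of x for each level of g
gam(y ~ s(x, g, bs = "fs"), data = data, method = "REML")
# Gaussian process smooth over spatial coordinates (kriging-style interpolation)
gam(y ~ s(lon, lat, bs = "gp"), data = data, method = "REML")
```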
## A few technical notes
### Optimization method:
When using `gam` or `bam` in `mgcv`, it is important to specify the `method` argument. The default is `method = "GCV.Cp"`; however, unless you need to compare models with different parametric (fixed) terms, it is recommended to use `method = "REML"` with `gam` and `method = "fREML"` with `bam`:
```{r optim_method, eval=F}
# Not recommended: falls back to the default method = "GCV.Cp"
gam(y ~ x, data = data)
# Recommended: explicit REML smoothness selection
gam(y ~ x, data = data, method = "REML")
```
### Specifying multiple smoothing functions
Sometimes we might want to specify multiple smoothing functions (bases) for a single term; an example will come up later in this tutorial where we want a random cyclical smooth effect. `mgcv` doesn't contain a built-in random cyclical smooth basis, but it does contain a random smooth basis and a cyclic smooth basis, and these can be combined by specifying both in the `bs` argument to `te()`:
```{r multiple_smooth, eval=F}
gam(y ~ te(f, x, bs=c("fs", "cc")), data = data, method = "REML")
```
### Specifying "smooth" interactions
When specifying smooth interactions in `mgcv`, an interaction *can* often be specified simply by listing multiple variables inside a single `s()` term. However, this can lead to issues with model identifiability and interpretation. It is often better to separate the main and interaction effects using `ti()`. The `ti()` function essentially plays the role of the `:` operator in standard R formulas, but for smooth terms (including random effects):
```{r smooth_interactions, eval=F}
# Standard
gam(y ~ s(x, y), data = data, method = "REML")
# Better
gam(y ~ s(x) + s(y) + ti(x, y), data = data, method = "REML")
```
# Fitting Generalized Linear Models
Fitting models with `mgcv` mirrors the base `R` syntax for LMs and GLMs. To illustrate this, we will fit a simple linear model and a binomial GLM to the `iris` dataset.
```{r iris_mod}
#| eval: FALSE # Remove me!! #
iris_data <- iris %>%
as_tibble %>%
mutate(
Species = factor(Species)
)
# Fit a linear model with Sepal.Length as response
# and Sepal.Width and Species as predictors
iris_lm <- gam(
Sepal.Length ~ >>>FILL_THIS<<<,
data = iris_data,
method = "REML"
)
# Fit a binomial GLM with Species == "virginica" as response
# and Sepal.Width as predictor
iris_glm <- gam(
I(Species == "virginica") ~ >>>FILL_THIS<<<,
data = iris_data,
family = "binomial",
method = "REML"
)
```
All standard GLM families such as `gaussian`, `poisson`, `binomial`, `quasipoisson` and `quasibinomial` can also be used with `gam` by specifying the `family` argument.
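As a quick, self-contained illustration (using simulated data, not one of the tutorial's datasets), the sketch below fits the same Poisson model with `glm()` and with `gam()`; since a `gam()` without smooth terms is just a GLM, the coefficient estimates should agree closely:
```{r glm_gam_equivalence}
set.seed(1)
# Simulated count data with log(E[y]) = 1 + 2x
toy <- tibble(x = runif(200), y = rpois(200, lambda = exp(1 + 2 * x)))
toy_glm <- glm(y ~ x, data = toy, family = poisson())
toy_gam <- gam(y ~ x, data = toy, family = poisson(), method = "REML")
# The parametric coefficient estimates should (nearly) coincide
cbind(glm = coef(toy_glm), gam = coef(toy_gam))
```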
## Model diagnostics with `mgcv` and `gratia`
Unfortunately, the built-in diagnostic tools in `mgcv` are not ideal. However, the `gratia` package provides a set of functions for diagnostics and predictions that are more user-friendly and integrate with the `tidyverse` and `ggplot2` ecosystem.
`draw` draws the marginal effects of the model terms. *(By default it only plots the smooth terms, like `plot`, but setting `parametric=TRUE` will also plot the parametric (fixed) terms.)*
```{r draw_iris_mod, fig.height=5}
#| eval: FALSE # Remove me!! #
# Draw the marginal effects of the terms
draw(iris_lm, parametric=T)
```
::: question-box
How would you interpret the output figures from `draw` for the linear model? Is the effect of `Sepal.Width` positive or negative? What do you think the `Sepal.Length` value of `virginica` is at a `Sepal.Width` of 3.5?
Write your answer here!
:::
`appraise` provides the standard diagnostic plots: QQ-plot, residuals vs. linear predictor, residual histogram, and observed vs. fitted values. *(Equivalent to `gam.check` in `mgcv` and `plot` for `lm` and `glm`.)*
```{r appraise_iris_mod}
#| eval: FALSE # Remove me!! #
# Provide the standard diagnostic plots
appraise(iris_lm)
```
::: question-box
How would you interpret the output figures from `appraise` for the linear model? Are there any issues with the model fit?
Write your answer here!
:::
## Predicting and plotting with `mgcv`, `gratia` and `ggplot2`
`fitted_values`/`add_fitted_samples` add the fitted values to a `tibble` (`data.frame`).
```{r predict_iris_mod}
#| eval: FALSE # Remove me!! #
# Add the fitted values to the data
fd1 <- fitted_values(iris_lm, iris_data)
fd2 <- add_fitted_samples(iris_data, iris_lm, n = 100)
```
```{r}
#| class.source: 'fold-hide'
#| eval: FALSE # Remove me!! #
fd1 %>%
head %>%
format_table(
caption = "Result of `fitted_values(lm_iris, iris_data)`"
)
fd2 %>%
head %>%
format_table(
caption = "Result of `add_fitted_samples(iris_data, lm_iris, n = 100)`"
)
```
```{r}
#| class.source: 'fold-hide'
#| fig.height: 5
#| fig.width: 6
#| fig.cap: "Fitted values and samples for `lm_iris`"
#| eval: FALSE # Remove me!! #
fd1 %>%
ggplot(aes(x = Sepal.Width, color = Species)) +
geom_point(aes(y = Sepal.Length)) +
geom_line(aes(y = .fitted), linewidth = 1, show.legend = F) +
geom_ribbon(
aes(ymin = .lower_ci, ymax = .upper_ci),
alpha = 0.15,
linewidth = 0,
show.legend = F
) +
geom_line(
data = fd2,
aes(y = .fitted, group = paste0(Species, .draw)),
linewidth = 0.5,
alpha = 0.15,
show.legend = F
) +
coord_cartesian(expand = F)
```
Finally, `overview` provides a tidy summary of the fixed (parametric), smooth, and random effects. *(Here we will use a modified version called `mod_overview`, which will hopefully become part of `gratia`.)*
```{r}
#| eval: FALSE # Remove me!! #
tab_iris_lm <- iris_lm %>%
mod_overview(accuracy = 0.00001, parametric_effect_sizes = T, stars = T) %>%
select(!c(k, edf)) # only relevant when the model includes smooth terms
tab_iris_lm %>%
format_table(
caption = "Linear model on `iris` summary",
align = "rcccccc"
)
```
::: question-box
Could you see yourself using `mgcv` for simple linear models in the future? Why or why not?
Write your answer here!
:::
# Fitting Generalized Linear Mixed Models
`mgcv` can also fit mixed models, including Linear Mixed Models (LMMs) and Generalized Linear Mixed Models (GLMMs). To illustrate this, we will fit a simple LMM and GLMM to the `hbr_maples` dataset from `lterdatasampler`.
```{r hbr_maples}
maple_data <- lterdatasampler::hbr_maples %>%
mutate(
year = factor(year),
elevation = elevation %>%
as.character %>%
replace_na("High") %>%
factor(c("Low", "Mid", "High"))
)
```
The dataset contains stem and leaf measurement data for Sugar Maple saplings from 12 transects in 2003/2004. In this tutorial we will focus on modeling the relationship between stem dry mass (`stem_dry_mass`) and stem length (`stem_length`). If we plot the data, we can see a linear relationship, but with variation between transects and years.
```{r hbr_maples_plot}
#| class.source: 'fold-hide'
#| message: FALSE
#| warning: FALSE
maple_data %>%
ggplot(aes(stem_dry_mass, stem_length, color=transect, shape=year, linetype=year)) +
geom_point() +
geom_smooth(method = "gam", se = F) +
geom_smooth(aes(group=1), method = "lm", color="black", linetype="solid")
```
::: question-box
What do you think the relationship between `stem_dry_mass` and `stem_length` is? How would you typically model this type of data?
Write your answer here!
:::
Here we will fit a model like: $$
\begin{align*}
\text{stem_length} = \beta\; &+ \\
\text{stem_dry_mass}\;&+ \\
\gamma_{year}\; &+ \\
\gamma_{transect}\; &+ \\
\gamma_{year} \times \gamma_{transect}\; &+ \\
\text{stem_dry_mass} \times \gamma_{year}\; &+ \\
\text{stem_dry_mass} \times \gamma_{transect}\; &+ \\
\text{stem_dry_mass} \times \gamma_{year} \times \gamma_{transect}\; &+ \\
\epsilon\; &
\end{align*}
$$
Where $\gamma_{transect}$ and $\gamma_{year}$ are random effects for transect and year, respectively. We can fit this model using the `gam` function in `mgcv`.
```{r hbr_maples_mod}
#| eval: FALSE # Remove me!! #
maple_lmm <- gam(
stem_length ~
stem_dry_mass +
s(year, bs="re") +
s(transect, bs="re") +
ti(year, transect, bs="re") +
ti(year, >>>FILL_THIS<<<, bs="re") +
ti(transect, stem_dry_mass, bs="re") +
ti(>>>FILL_THIS<<<, transect, stem_dry_mass, bs="re"),
data = maple_data,
family = "gaussian",
method = "REML"
)
```
::: question-box
Walk through each term in the model formula. What does each term represent and how does it contribute to the model? Focus on the difference between a "naked" term, a term wrapped in `s(...)` and `ti(...)`.
Write your answer here!
:::
::: question-box
What is the purpose of the "bs" argument and what does `bs="re"` do?
Write your answer here!
:::
Let's verify the model fit:
```{r hbr_maples_appraise}
#| eval: FALSE # Remove me!! #
>>>FILL_THIS<<<
```
This isn't perfect, but I'll leave it as an exercise to the reader to improve the model fit. *(Hint: try different terms and families)*
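As one hedged illustration of the hint (not the intended solution), the sketch below swaps in a Gamma family with a log link, which can be a reasonable choice for a strictly positive response such as `stem_length`. The random-effect structure is simplified to intercepts only for brevity; the interaction terms from above can be added back in:
```{r hbr_maples_alternative, eval=F}
# One possible direction for the exercise: Gamma family with a log link,
# keeping only random intercepts for year and transect
maple_lmm_gamma <- gam(
  stem_length ~
    stem_dry_mass +
    s(year, bs = "re") +
    s(transect, bs = "re"),
  data = maple_data,
  family = Gamma(link = "log"),
  method = "REML"
)
appraise(maple_lmm_gamma)
```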
::: question-box
How would you interpret the output figures from `appraise` for the LMM model? What is the main issue with the model fit and what effect does it have on the interpretation of the model?
Write your answer here!
:::
We can then inspect the results:
```{r}
#| class.source: 'fold-hide'
#| eval: FALSE # Remove me!! #
tab_maple_lmm <- >>>FILL_THIS<<<
tab_maple_lmm %>%
format_table(
caption = "LMM on `lterdatasampler::hbr_maples` summary",
align = "rcccccc"
)
```
::: question-box
How would you interpret the output of `mod_overview` for the LMM model? What is the effect of `stem_dry_mass` on `stem_length`, and does it depend on the year, transect or both?
Write your answer here!
:::
And again, we will plot the fitted values using `fitted_values` and `ggplot2`. However, this time we want to visualize the main effect of `stem_dry_mass` without the random effects, as well as the full model with the random effects. We do this using the `exclude` argument in `fitted_values`; the names of the terms to exclude can be found in the output of `mod_overview`.
```{r hbr_maples_predict}
#| eval: FALSE # Remove me!! #
maple_lmm_pred_main <- fitted_values(
maple_lmm,
exclude=c(
"s(year)",
"s(transect)",
"ti(year,transect)",
"ti(year,stem_dry_mass)",
"ti(transect,stem_dry_mass)",
"ti(year,transect,stem_dry_mass)"
)
) %>%
bind_cols(select(maple_data, !any_of(names(.))))
maple_lmm_pred_all <- fitted_values(maple_lmm) %>%
bind_cols(select(maple_data, !any_of(names(.))))
```
```{r hbr_maples_plot_pred}
#| class.source: 'fold-hide'
#| eval: FALSE # Remove me!! #
maple_lmm_pred_main %>%
ggplot(aes(stem_dry_mass, .fitted, ymin=.lower_ci, ymax=.upper_ci)) +
geom_point(aes(y=stem_length, color=transect)) +
geom_line(
data = maple_lmm_pred_all,
aes(color = transect, linetype=year),
) +
geom_line() +
geom_ribbon(alpha=0.2) +
coord_cartesian(expand=F) +
labs(y = "Stem Length", x = "Stem Dry Mass")
```
::: question-box
What is your immediate impression of the model fit from the plot? Does this visualization change your interpretation of the diagnostic plots from earlier?
Write your answer here!
:::
# Fitting Generalized Additive Models
This section will cover the core functionality of `mgcv` (and `gratia`) - fitting Generalized Additive Models (GAMs). GAMs are a generalization of GLMs that allow for nonlinear relationships between predictors and the response variable. One of the major strengths of `mgcv` is that it allows for the inclusion of smooth terms, while not being prone to overfitting due to the use of penalization.
To explore the functionality of `mgcv` we will use the `knz_bison` dataset from `lterdatasampler`. The dataset contains weight and age measurements for male and female bison from the Konza Prairie Biological Station.
```{r}
#| class.source: 'fold-hide'
head_knz_bison <- lterdatasampler::knz_bison %>%
head
head_knz_bison %>%
format_table(
caption = "First few rows of `knz_bison`",
)
```
Unfortunately, the age of the animals in the supplied data has been rounded, leaving very few unique age values. This is very problematic for modelling in general, and especially for GAMs. To get around this we will assume that the weight gain of an animal between measurements can be approximated by a linear interpolation of the weight measurements. We then sample the same number of observations from the original dataset, but at uniformly distributed continuous ages between the minimum and maximum age of each animal, and use the interpolated weights as the response variable.
```{r knz_bison_interpolate}
#| warning: FALSE
bison_orig <- lterdatasampler::knz_bison %>%
filter(!is.na(animal_weight) & !is.na(animal_yob)) %>%
group_by(animal_code) %>%
filter(n() > 1) %>%
ungroup %>%
mutate(
animal_age = rec_year - animal_yob,
across(c(data_code, animal_code, animal_sex, rec_month, rec_year), factor),
)
bison_data <- bison_orig %>%
group_by(animal_code) %>%
reframe(
across(c(data_code, animal_yob, animal_sex), first),
inter_data = approx(animal_age, animal_weight, xout=sort(runif(n(), min(animal_age), max(animal_age)))) %>%
set_names(c("animal_age", "animal_weight")) %>%
as_tibble
) %>%
unnest(inter_data) %>%
mutate(
rec_year = floor(animal_yob + animal_age),
)
```
This doesn't change the shape of the relationship, but it does make for a better modelling exercise.
```{r knz_bison_plot}
#| class.source: 'fold-hide'
#| fig.height: 5
shared_elements <- list(
geom_point(size = 1, alpha = 0.25, shape = 16),
scale_color_brewer(palette = "Set1"),
coord_cartesian(expand = F),
labs(x = "Age", y = "Weight (kg)", color = "Sex"),
guides(
color = guide_legend(override.aes = list(size = 5, alpha = 1))
)
)
orig_plt <- bison_orig %>%
ggplot(aes(animal_age, animal_weight, color=animal_sex)) +
shared_elements +
labs(title = "Original data")
interp_plt <- bison_data %>%
ggplot(aes(animal_age, animal_weight, color=animal_sex)) +
shared_elements +
labs(title = "Sample-interpolated data")
wrap_plots(
orig_plt, interp_plt,
nrow = 1,
guides = "collect",
axes = "collect"
) &
theme(
legend.position = "bottom"
)
```
::: question-box
How would you go about handling this type of data in your own research? Do you think the interpolation is a good approach; why or why not?
Write your answer here!
:::
We can then fit a GAM to the interpolated data using the `gam`/`bam` function in `mgcv`. Here we will use a smooth term for the animal age with a thin-plate spline basis and a factor smooth interaction term between the animal age and the year of recording, since we expect there may be some differences between years, but we are only interested in the general pattern. *Note: `k` was chosen using `gam.check`, while `family` was chosen using `appraise`.*
```{r bison_gam}
#| warning: FALSE
#| eval: FALSE # Remove me!! #
bison_gam <- bam(
animal_weight ~
s(>>>FILL_THIS<<<, bs="ts", by=animal_sex, k=50) +
ti(>>>FILL_THIS<<<, bs="fs", by=animal_sex),
data = bison_data,
family = scat(link = "log"),
method = "fREML",
discrete = T
)
```
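Since the note above mentions that `k` was chosen with `gam.check`, here is a minimal sketch of that step, to be run once `bison_gam` has been filled in and fitted. In the printed basis-dimension table, an `edf` close to `k'` combined with a low p-value suggests increasing `k`:
```{r bison_k_check, eval=F}
# Residual checks plus the k' / edf / k-index / p-value table
gam.check(bison_gam)
# Just the basis-dimension table
k.check(bison_gam)
```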
::: question-box
What is the purpose of the `by` argument in the smooth term? How does it affect the model fit?
Write your answer here!
:::
::: question-box
Have you used the `scat` (scaled t) family before? *(See `?family.mgcv` and `?scat` for further information)*
Write your answer here!
:::
As usual, first we should inspect the diagnostic plots using `appraise`.
```{r bison_appraise}
#| fig.height: 7
#| eval: FALSE # Remove me!! #
>>>FILL_THIS<<<
```
::: question-box
Are there any issues with the model fit? How would you interpret the diagnostic plots from `appraise`?
Write your answer here!
:::
We can quickly visualise the shape of the relationships of interest using the `draw` function from `gratia`. Here we are only really interested in the effect of the animal age, so we specify `select` to only include the smooth terms involving the animal age.
```{r bison_draw}
#| fig.height: 5
#| eval: FALSE # Remove me!! #
bison_gam %>%
draw(
select = c("s(animal_age):animal_sexF", "s(animal_age):animal_sexM"),
axes = "collect",
rug = F
)
```
::: question-box
What does "partial effect" mean in the context of the `draw` function? Can you easily extract the shape of the age-weight relationship and the differences between sexes from the plot?
Write your answer here!
:::
For better control over the visualization, we can use the `fitted_values` function to generate predictions for a dataset with evenly spaced ages and both sexes. Again we use the `exclude` argument to generate predictions without the random effects (in this case a random smooth term).
```{r bison_predict, fig.height=8}
#| warning: FALSE
#| eval: FALSE # Remove me!! #
pred_bison_data <- data_slice(
bison_gam,
animal_age = evenly(animal_age, 100, upper=24),
animal_sex = unique(animal_sex)
)
bison_inference <- bison_gam %>%
fitted_values(
pred_bison_data,
exclude = c(
"ti(rec_year,animal_yob,animal_age):animal_sexF",
"ti(rec_year,animal_yob,animal_age):animal_sexM"
)
)
```
```{r}
#| class.source: 'fold-hide'
#| eval: FALSE # Remove me!! #
head_bison_inference <- bison_inference %>%
head
head_bison_inference %>%
format_table(
caption = "Fitted values for `lterdatasampler::knz_bison`"
)
```
We can then use `ggplot2` (or whatever else you would like) to plot the fitted values, including the 95% confidence intervals.
```{r bison_predict_plt}
#| class.source: 'fold-hide'
#| warning: FALSE
#| fig.height: 5
#| eval: FALSE # Remove me!! #
bison_inference %>%
mutate(
.upper_ci = pmin(.upper_ci, 1.5 * max(bison_data$animal_weight))
) %>%
ggplot(
aes(
x = animal_age,
y = .fitted,
color = animal_sex,
fill = animal_sex,
group = animal_sex
)
) +
geom_point(
aes(y = animal_weight),
data = bison_data,
shape = 21,
size = 1,
alpha = 0.15
) +
geom_line(
size = 1,
show.legend = F
) +
geom_ribbon(
aes(ymin = .lower_ci, ymax = .upper_ci),
alpha = 0.15,
color = NA,
fill = "black",
show.legend = F
) +
scale_color_brewer(palette = "Set1") +
scale_fill_brewer(palette = "Set1") +
coord_cartesian(expand = F) +
labs(x = "Age", y = "Weight (kg)", fill = "Sex") +
guides(
color = guide_none(),
fill = guide_legend(
override.aes = list(
color = "black",
size = 7,
stroke = 1,
alpha = 1
)
)
)
```
::: question-box
Does this type of visualization help you understand the model fit better? What are the main differences to the `draw` function?
Write your answer here!
:::
As we can see, the model is able to capture the pattern extremely well, but as with all models, it is important to remember that extrapolation can be dangerous.
# Fitting Generalized Additive Mixed Models
Generalized additive mixed models (GAMMs) can take two forms: a GAM with linear random effects, or a GAM with smooth random effects. Both can be fitted using the `s` term in `mgcv`, where `bs = "re"` is used for linear random effects and `bs = "fs"` is used for smooth random effects; a schematic sketch of both forms follows below. Here we will use the `bird_move` dataset of bird abundance by latitude from `gratia` to illustrate the latter.
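Before turning to the data, here is that schematic sketch, using hypothetical variables `y`, `x` and a grouping factor `g` (not part of `bird_move`):
```{r gamm_forms, eval=F}
# GAM with *linear* random effects: random intercept for g plus a random slope of x within g
gam(y ~ s(x) + s(g, bs = "re") + ti(g, x, bs = "re"),
    data = data, method = "REML")
# GAM with *smooth* random effects: a separate penalized smooth of x for each level of g
gam(y ~ s(x) + s(x, g, bs = "fs"),
    data = data, method = "REML")
```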
```{r bird_move}
#| eval: FALSE # Remove me!! #
bird_data <- bird_move %>%
filter(count > 0) %>%
group_by(species) %>%
mutate(
# Calculate the weight of each observation
weight = >>>FILL_THIS<<<,
weight = >>>FILL_THIS<<<
) %>%
ungroup
```
The `bird_move` dataset contains weekly counts by latitude for 6 different species.
```{r bird_move_plot}
#| class.source: 'fold-hide'
#| eval: FALSE # Remove me!! #
bird_data %>%
ggplot(aes(week, latitude, z=count)) +
stat_summary_2d(bins=c(15, 5)) +
scale_fill_viridis_c(option = "A", trans = "log10", limits = c(0.1, 25)) +
facet_wrap(~species) +
coord_cartesian(expand = F) +
theme(
panel.background = element_rect(fill = "black"),
panel.grid = element_blank()
)
```
As we can see, the species share a common pattern of migrating north during summer and south during winter, but with slightly different timings.
::: question-box
How should we model the migration pattern? What is the response and predictor variable(s)?
Write your answer here!
:::
::: question-box
Which, if any, smooth term(s) is/are appropriate?
Write your answer here!
:::
::: question-box
How should we parametrize the model?
Write your answer here!
:::
For the first question, we can use `latitude` as the response variable and `week` as the predictor. We can then add `species` as an interacting random smooth term with `week` to model the differences in migration patterns between species.
For the second question, if we consider that migration patterns are cyclical, it is logical to use a cyclic smooth term for the week variable. `mgcv` contains two cyclic smooth bases: `cc` for cyclic cubic regression splines and `cp` for cyclic P-splines.
Lastly, for the third question, we can parametrize the model in two ways. The first is to include a single smooth term with both `species` and `week`; however, this makes it hard to disentangle the marginal (main) effect of `week` from the interaction effect of `species` and `week`. The second is to include three terms: a main effect of `week`, a random effect of `species`, and a smooth random interaction term between `species` and `week`. This allows us to separate the main and interaction effects.
```{r bird_mod}
#| eval: FALSE # Remove me!! #
gam_bird_mod_simple <- gam(
# `k` is set so the two models have approximately the same number of EDFs
latitude ~
te(species, week, bs=c("fs", "cc"), k=5),
data = bird_data,
weights = weight,
family = "gaussian",
method = "REML"
)
# Hint: The model should use three terms,
# where one of them is a ti() (interaction) term
gam_bird_mod <- gam(
latitude ~ >>>FILL_THIS<<<,
data = bird_data,
weights = weight,
family = "gaussian",
method = "REML"
)
```
::: question-box
What is the difference between the two models? What is the difference between `te` and `ti`? Which model do you think is more appropriate?
:::
Let us first compare the diagnostic plots:
```{r bird_appraise}
#| class.source: 'fold-hide'
#| fig.height: 5
#| eval: FALSE # Remove me!! #
list(
grid::textGrob("gam_bird_mod_simple", x=0.5, y=0.5, gp=grid::gpar(fontsize=20, fontface="bold")),
grid::textGrob("gam_bird_mod", x=0.5, y=0.5, gp=grid::gpar(fontsize=20, fontface="bold")),
appraise(gam_bird_mod_simple) & rm_titles,
appraise(gam_bird_mod) & rm_titles
) %>%
lapply(patchwork::wrap_elements) %>%
patchwork::wrap_plots(nrow = 2, byrow = T, heights = c(0.1, 1))
```
::: question-box
Which, if any, differences do you see between the two models? Which model do you now think is more appropriate?
Write your answer here!
:::
Next, we can inspect the results using the `mod_overview` function:
```{r}
#| class.source: 'fold-hide'
#| results: "asis"
#| eval: FALSE # Remove me!! #
tab_gam_bird_mod_simple <- gam_bird_mod_simple %>%
mod_overview(
parametric_effect_sizes = T,
stars = T
)
tab_gam_bird_mod_simple %>%
format_table(
caption = "GAMM on `gratia::bird_move` with combined main and interaction effect",
align = "rcccccc"
)
```
```{r}
#| class.source: 'fold-hide'
#| results: "asis"
#| eval: FALSE # Remove me!! #
tab_gam_bird_mod <- gam_bird_mod %>%
mod_overview(
parametric_effect_sizes = T,
stars = T
)
tab_gam_bird_mod %>%
format_table(
caption = "GAMM on `gratia::bird_move` with separate main and interaction effects",
align = "rcccccc"
)
```
::: question-box
What is the main difference between the outputs for the two models? Which model do you now think is more appropriate?
Write your answer here!
:::
::: question-box
Which inferences can we draw from the model summary of `gam_bird_mod`?
Write your answer here!
:::
Another benefit of the second model is that we can easily visualize the main and interaction effects. Here we use the `data_slice` function from `gratia` to generate a prediction dataset with evenly spaced weeks and all species. We then use `fitted_values` with the `exclude` argument to predict both with and without the effect of `species`, i.e. with and without the random effects.
```{r bird_predict}
#| eval: FALSE # Remove me!! #
prediction_data <- data_slice(
gam_bird_mod,
week = >>>FILL_THIS<<<,
species = >>>FILL_THIS<<<
)
predictions <- >>>FILL_THIS<<<
main_effect_predictions <- fitted_values(
gam_bird_mod,
prediction_data,
exclude=c(>>>FILL_THIS<<<, >>>FILL_THIS<<<)
) %>%
select(.fitted, .lower_ci, .upper_ci)
predictions <- predictions %>%
mutate(
main = main_effect_predictions
) %>%
unnest(main, names_sep="_")
```
We can then plot the results:
```{r bird_predict_plt}
#| class.source: 'fold-hide'
#| eval: FALSE # Remove me!! #
predictions %>%
ggplot(aes(x=lubridate::date_decimal(2000 + week/52))) +
geom_point(
data = bird_data,
aes(y=latitude, fill=species, size=count),
shape = 21,
color = "black",
show.legend = F
) +
geom_line(aes(y=main_.fitted), linewidth=2) +
geom_ribbon(aes(ymin=main_.lower_ci, ymax=main_.upper_ci), alpha=0.25) +
geom_line(aes(y=.fitted, color=species), show.legend=F, linewidth=0.5, linetype="dashed") +
geom_ribbon(aes(ymin=.lower_ci, ymax=.upper_ci, fill=species), alpha=0.1, key_glyph=draw_key_point) +
scale_color_brewer(palette = "Dark2", guide="none") +
scale_fill_brewer(palette = "Dark2") +
scale_x_datetime(expand = expansion(), date_breaks = "1 month", date_labels = "%B") +
scale_y_continuous(expand = expansion()) +
coord_polar() +
labs(x = "Date", y = "Latitude", color = "Species", fill = "Species") +
guides(
fill = guide_legend(
override.aes = list(
alpha = 1,
size = 8,
stroke = 1,
shape = 21,
color = "black"
)
)
)
```
::: question-box
What do you think of the model fit? Does it capture the main and species-specific migration patterns well?
Write your answer here!
:::
::: question-box