From d87a5ffc15b127fca7db50e0861a2a8bfa5e56c7 Mon Sep 17 00:00:00 2001
From: Marc Baguelin
Date: Tue, 19 Nov 2024 12:06:40 +0100
Subject: [PATCH 01/17] First attempt at flu pandemic example, add model and data, but no fitting yet
---
 2009-flu-uk.qmd                | 183 +++++++++++++++++++++++++++++++++
 _quarto.yml                    |   1 +
 data/andre_estimates_21_02.txt |  38 +++++++
 3 files changed, 222 insertions(+)
 create mode 100644 2009-flu-uk.qmd
 create mode 100644 data/andre_estimates_21_02.txt

diff --git a/2009-flu-uk.qmd b/2009-flu-uk.qmd
new file mode 100644
index 0000000..56d8144
--- /dev/null
+++ b/2009-flu-uk.qmd
@@ -0,0 +1,183 @@
# Modelling the 2009 UK flu pandemic

```{r}
#| include: false
source("common.R")
```

```{r, echo=FALSE}
r_output <- function(path, highlight = NULL) {
  if (is.null(highlight)) {
    prefix <- "```r"
  } else {
    prefix <- sprintf('```{.r code-line-numbers="%s"}', highlight)
  }
  writeLines(c(prefix, readLines(path), "```"))
}
set.seed(1) # always the same
knitr::knit_hooks$set(small_margins = function(before, options, envir) {
  if (before) {
    par(mar = c(4, 4, .1, .1))
  }
})
```

This chapter explores the use of `odin` and `monty` to model the dynamics of the 2009 A/H1N1 influenza pandemic in England and Wales, incorporating changes in social contacts during holiday periods. We demonstrate how to construct, fit, and analyse a compartmental model to infer key epidemiological parameters.

## Overview of the Pipeline

This chapter illustrates how to:

1. Construct an SEIR model with time-varying transmission using `odin`.
2. Implement an observation model and derive a likelihood based on observations using `odin`.
3. Run Bayesian inference using Markov chain Monte Carlo (MCMC) with `monty`.
4. Analyse and interpret MCMC results to estimate parameters such as:
   - The basic reproduction number $R_0$.
   - The ascertainment probability, i.e., the proportion of symptomatic infections reported as cases.

```{r}
library(odin2)
library(dust2)
```

## Data Preparation

The dataset consists of weekly case estimates, stratified by age group. For simplicity, we aggregate the data into a single time series for the analysis.

## Loading data

The data are the weekly counts of A/H1N1 pandemic cases from June 2009, initially broken down into seven age groups. The data file is taken from the supplementary material of [@endo_introduction_2019] and can be found [here](https://github.com/akira-endo/Intro-PMCMC/blob/master/assets/andre_estimates_21_02.txt).

We will fit our model to the aggregate of all cases, so we need to load the data, sum over the age groups, and then build a dataframe (a table where each row is an observation of certain variables in the columns) with 'Days' and 'Cases' columns. We use the `dplyr` package, a popular R package for data handling, with its '%>%' ('pipe') operator, but other methods are possible.

```{r}
library(dplyr)

# Read the data and sum the seven age-group columns into a single series
original_data <- read.csv(file = "data/andre_estimates_21_02.txt", sep = "\t") %>% rowSums()

# Group all age groups
data <- data.frame(Cases = original_data) %>%
  mutate(Days = seq(7, by = 7, length.out = n())) %>%
  dplyr::select(Days, Cases)
```

```{r, echo=FALSE}
q_n <- 1
```

## Plotting the data

You can plot the data and see the two waves of infections and the presence of holiday periods.
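Before plotting, it is worth a quick sanity check of the aggregated series; a minimal inspection (nothing model-specific) might look like this:

```{r}
# Quick checks on the aggregated weekly series built above
head(data)        # first rows: Days and Cases
range(data$Days)  # observation window, in days since the start
sum(data$Cases)   # total reported cases over the epidemic
```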
```{r}
# Plot the data
plot(data$Days, data$Cases, pch = 19, col = "red",
     xlab = "Days since start of epidemic",
     ylab = "Official cases count")

# Add the holiday periods - summer holidays and half-term
# Holiday calendar assuming 30% contact reduction during the summer and 15%
# during half-term
hol_t <- c(0, 38, 85, 140, 147)
hol_v <- c(1, .70, 1, .85, 1)
# Summer holiday block
rect(xleft = hol_t[2], ybottom = 0, xright = hol_t[3],
     ytop = max(data$Cases) * 1.2, col = "#aaaabb33", border = NA)
# Half-term holiday block
rect(xleft = hol_t[4], ybottom = 0, xright = hol_t[5],
     ytop = max(data$Cases) * 1.2, col = "#aaaabb33", border = NA)
```

## Understanding the model

The SEIR model divides the population into four compartments:

- **S**usceptible: Individuals at risk of infection.
- **E**xposed: Infected individuals who are not yet infectious.
- **I**nfectious: Actively transmitting the disease.
- **R**ecovered: Immune individuals.

The model incorporates time-varying transmission rates to account for changes in contact patterns during holidays. Using `odin`, we define this system of differential equations in the next section.

## Compiling the model

The model needs to be transpiled (i.e. translated from one programming language to another) in order to create some C++ code. That code is then compiled and loaded to create a "fast" model that can be called from the R environment.

```{r, echo = TRUE}
seir <- odin2::odin({
# initial conditions
initial(S) <- (1 - 2 * alpha) * N
initial(E) <- alpha*N
initial(I) <- alpha*N
initial(R) <- 0
initial(incidence, zero_every = 7) <- 0

# equations
deriv(S) <- - hol * beta * S * I / N
deriv(E) <- hol * beta * S * I / N - gamma * E
deriv(I) <- gamma * E - sigma * I
deriv(R) <- sigma * I
deriv(incidence) <- gamma * E

# parameter values
R_0 <- parameter(1.5)
L <- 1
D <- 1.25
alpha <- parameter(1e-4) # initial proportion
N <- 55000000

# convert parameters
hol <- interpolate(h_times, h_values, "constant")
h_times <- parameter()
h_values <- parameter()
dim(h_times) <- parameter(rank=1)
dim(h_values) <- length(h_times)
gamma <- 1 / L
sigma <- 1 / D
beta <- R_0 * sigma})
```

## Running the model

Once the model is compiled, it is possible to generate one (or more!) instances of your model by using `dust_system_create()` with your model generator.

```{r}
#mod <- seir$new(h_times = hol_t, h_values = hol_v)
mod <- dust_system_create(seir,
                          pars = list(h_times = hol_t, h_values = hol_v))
```

```{r}
# A vector with parameter values
# First parameter -> proportion of the population initially infected
# Second parameter -> R_0
# Third parameter -> proportion of infections detected
par0 <- c(5e-5, 1.3, 0.1)

# This vector should be passed as a list
dust_system_update_pars(sys = mod,
                        pars = list(alpha=par0[1],
                                    R_0=par0[2],
                                    h_times = hol_t,
                                    h_values= hol_v))
```

Then the model can be run by specifying the time points between which to integrate the ODEs. Note that the first point (here 0) sets the initial time of the integration. Once the time vector `t` is defined, we can run the model using `dust_system_simulate()`.

```{r}
t <- c(0,data$Days)
dust_system_set_state_initial(mod)
y <- dust_system_simulate(mod, t)
```

Let's plot the modelled weekly incidence over time.

```{r}
plot(t, dust_unpack_state(mod, y)$incidence, type = "l", col = "red", lwd = 2)
```

It looks a bit like the actual epidemic; a good sign, but let's compare our model with the data.

# Part 2. 
Observation model, likelihood and prior distributions + +We now have a dataset and a working model, all loaded in our R environment. We need to link the output of this model with the data in order to derive a likelihood function. As we are working within a Bayesian framework we also need to define the prior distribution for our parameters. + +## Cumulative incidence and observed cases + +The fundamental component of our model if the 'I' compartment tracking continuously the number of infectious people in our population (an infection prevalence). However, what we observe is a fraction of the cumulative incidence, more precisely the number of new cases in a period of 7 days (a week). In our `odin` model we track the cumulative incidence, and we have simulated it using 7 days intervals. We can thus write a simple function giving the cumulative incidence between two days: diff --git a/_quarto.yml b/_quarto.yml index 7b55dd7..0b04d93 100644 --- a/_quarto.yml +++ b/_quarto.yml @@ -25,6 +25,7 @@ book: chapters: - inference.qmd - differentiability.qmd + - 2009-flu-uk.qmd - references.qmd appendices: - installation.qmd diff --git a/data/andre_estimates_21_02.txt b/data/andre_estimates_21_02.txt new file mode 100644 index 0000000..8fd1157 --- /dev/null +++ b/data/andre_estimates_21_02.txt @@ -0,0 +1,38 @@ +1 3 30 42 42 9 2 +4 13 133 161 184 47 8 +16 55 850 479 464 159 20 +58 374 3806 1617 1475 373 42 +138 619 6748 2909 2391 630 83 +245 1302 9965 5220 5199 1524 185 +1125 5128 32217 17649 15897 4680 531 +1325 5223 22269 18436 17795 4595 492 +220 2721 9253 11989 13468 2622 207 +105 1401 4765 8232 8709 1670 116 +42 506 1935 4087 4143 1050 66 +22 247 987 2136 2129 841 48 +12 123 534 1315 1450 637 49 +7 164 927 2256 2040 658 32 +10 205 2122 2676 2236 694 75 +18 269 3575 2456 2431 1045 49 +29 401 5294 3431 3261 1544 71 +33 806 8567 6521 6265 1993 95 +74 1094 10662 8251 7726 2735 148 +151 2168 20452 11872 11388 3946 233 +258 4110 30913 14966 16874 6583 270 +363 4255 20205 12929 19952 8612 500 +262 2922 13160 11259 16817 7636 488 +227 2973 16423 8477 13376 6799 413 +240 2670 14900 5416 9326 5321 290 +226 1864 8405 3271 5400 3092 210 +215 1300 4819 2172 4067 2025 139 +154 930 3756 1748 3216 1784 138 +105 644 2363 1313 2430 1457 123 +69 366 1018 727 1351 1031 69 +39 163 378 529 1066 1100 82 +44 156 434 715 1472 1283 78 +28 99 325 421 888 690 36 +26 117 325 308 628 246 13 +23 97 307 202 394 66 3 +14 56 201 150 307 25 1 +10 41 102 75 156 7 0 +7 16 37 34 90 0 0 \ No newline at end of file From c82ea8c6c1bedcc307b6e5d8d72d0e6e1807ede3 Mon Sep 17 00:00:00 2001 From: Marc Baguelin Date: Tue, 19 Nov 2024 14:04:21 +0100 Subject: [PATCH 02/17] Intent adding comparison model --- 2009-flu-uk.qmd | 16 +++++++++++++++- 1 file changed, 15 insertions(+), 1 deletion(-) diff --git a/2009-flu-uk.qmd b/2009-flu-uk.qmd index 56d8144..4b386ad 100644 --- a/2009-flu-uk.qmd +++ b/2009-flu-uk.qmd @@ -131,7 +131,12 @@ dim(h_times) <- parameter(rank=1) dim(h_values) <- length(h_times) gamma <- 1 / L sigma <- 1 / D -beta <- R_0 * sigma}) +beta <- R_0 * sigma + +# observation model +cases <- data() +cases ~ Poisson(incidence) +}) ``` ## Running the model @@ -181,3 +186,12 @@ We now have a dataset and a working model, all loaded in our R environment. We n ## Cumulative incidence and observed cases The fundamental component of our model if the 'I' compartment tracking continuously the number of infectious people in our population (an infection prevalence). 
However, what we observe is a fraction of the cumulative incidence, more precisely the number of new cases in a period of 7 days (a week). In our `odin` model we track the cumulative incidence, and we have simulated it using 7 days intervals. We can thus write a simple function giving the cumulative incidence between two days: + +```{r} +data$time <- data$Days +filter <- dust_unfilter_create(seir, data = data, time_start = 0) +dust_likelihood_run(filter, list(alpha=par0[1], + R_0=par0[2], + h_times = hol_t, + h_values= hol_v)) +``` From bc0d93fecdfd9a2f6d405feabd3b06a9de4eb909 Mon Sep 17 00:00:00 2001 From: Marc Baguelin Date: Mon, 25 Nov 2024 11:21:59 +0000 Subject: [PATCH 03/17] First attempt at UK pandemic pipeline --- 2009-flu-uk.qmd | 83 +++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 81 insertions(+), 2 deletions(-) diff --git a/2009-flu-uk.qmd b/2009-flu-uk.qmd index 8cd52f9..3bfc0f1 100644 --- a/2009-flu-uk.qmd +++ b/2009-flu-uk.qmd @@ -38,6 +38,7 @@ This chapter illustrates how to: ```{r} library(odin2) library(dust2) +library(monty) ``` ## Data Preparation @@ -130,8 +131,9 @@ sigma <- 1 / D beta <- R_0 * sigma # observation model -cases <- data() -cases ~ Poisson(incidence) +rho <- parameter(0.1) +Cases <- data() +Cases ~ Poisson(rho*incidence) }) ``` @@ -175,6 +177,11 @@ plot(t, dust_unpack_state(mod, y)$incidence, type = "l", col = "red", lwd = 2) It looks a bit like the actual epidemic; a good sign, but let compare our model with the data. +```{r} +plot(t, dust_unpack_state(mod, y)$incidence, type = "l", col = "red", lwd = 2) +points(data$Days, data$Cases) +``` + # Part 2. Observation model, likelihood and prior distributions We now have a dataset and a working model, all loaded in our R environment. We need to link the output of this model with the data in order to derive a likelihood function. As we are working within a Bayesian framework we also need to define the prior distribution for our parameters. @@ -183,6 +190,10 @@ We now have a dataset and a working model, all loaded in our R environment. We n The fundamental component of our model if the 'I' compartment tracking continuously the number of infectious people in our population (an infection prevalence). However, what we observe is a fraction of the cumulative incidence, more precisely the number of new cases in a period of 7 days (a week). In our `odin` model we track the cumulative incidence, and we have simulated it using 7 days intervals. We can thus write a simple function giving the cumulative incidence between two days: +## Setting up the likelihood + +The likelihood is a function that takes a parameter as argument and return the probability density that our model (transmission model + observation) generates exactly the observed data. This number is usually very small as it is the product of each individual observation density given the model. As opposed to the example in the lecture, we thus always work using the log-density values rather than the density. Products then become sums and divisions become differences. + ```{r} data$time <- data$Days filter <- dust_unfilter_create(seir, data = data, time_start = 0) @@ -191,3 +202,71 @@ dust_likelihood_run(filter, list(alpha=par0[1], h_times = hol_t, h_values= hol_v)) ``` + +This is the likelihood associated with the dust model written using odin. We know need to move this into the monty statistical world to be able to integrate this into our pipeline. 
For this we use the `packer` concept, which allows us to define a one-to-one translation between the two worlds.

```{r}
packer <- monty_packer(c("alpha", "R_0", "rho"),
                       fixed = list(h_times = hol_t,h_values= hol_v))
```

This defines a pair of functions that allow us to 'pack' (turn a named list of parameters into a numeric vector) and 'unpack' (turn such a vector back into a named list) our parameters.

Now building the `monty` model for the likelihood is very easy:

```{r}
likelihood <- dust_likelihood_monty(filter, packer)
```

## Setting up the prior distribution

We similarly need a log-prior function. Note that the prior also serves to exclude impossible values, such as proportions below 0 or above 1: these have a prior density of 0 (i.e. they are impossible) and therefore a log-density of minus infinity.

The other ingredient we need is thus a prior, which we can construct with `monty_dsl` as before; note that its parameter names match those of the likelihood, as they should.

```{r}
prior <- monty_dsl({
  alpha ~ Beta(a = 4e-4, b = 2)
  R_0 ~ Gamma(2, 0.7)
  rho ~ Uniform(0, 1)
})
```

## Setting up the posterior

Our posterior is proportional to the product of the likelihood and prior or, on the log scale, to the sum of their logs:

```{r}
posterior <- likelihood + prior
```

Next, we define a sampler; we'll start with a random walk with a fairly arbitrary diagonal proposal matrix:

```{r}
vcv <- diag(c(2e-8,2e-4,1e-4))
sampler <- monty_sampler_random_walk(vcv)
```

We start this off using explicit initial conditions:

```{r}
#| cache: true
init_values <- packer$pack(list(alpha=par0[1],
                                R_0=par0[2],
                                rho=0.1,
                                h_times = hol_t,
                                h_values= hol_v))
samples <- monty_sample(posterior, sampler, 10000, initial = init_values,
                        n_chains = 3)
```

::: {.callout-note}
We have used explicit initial conditions here, which might not be what you want in all situations. It might be better to sample from the prior, but we have not yet implemented support for trying a few points from the prior until one with finite density is found, which is really needed here.
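A hand-rolled version of that idea might look like the sketch below — illustrative only (this is not `monty` functionality), and it assumes direct sampling from the prior and density evaluation behave as shown later in this chapter:

```{r}
#| eval: false
# Sketch: draw candidate starting points from the prior until one has
# finite posterior log-density (hand-rolled, not part of monty itself)
rng <- monty_rng$new()
repeat {
  start <- monty_model_direct_sample(prior, rng)
  if (is.finite(monty_model_density(posterior, start))) break
}
```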
+::: + +Here the log posterior density of our three chains over time, showing a rapid improvement in the posterior probability density followed by what might be reasonable (but not great) mixing: + +```{r} +matplot(samples$density, type = "l", lty = 1, + xlab = "Sample", ylab = "Log posterior probability density") +``` From d19f845c1eab353cd73aacdeee76a795d9886d30 Mon Sep 17 00:00:00 2001 From: Marc Baguelin Date: Mon, 25 Nov 2024 23:09:30 +0000 Subject: [PATCH 04/17] *testing values --- 2009-flu-uk.qmd | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/2009-flu-uk.qmd b/2009-flu-uk.qmd index 3bfc0f1..d977c8d 100644 --- a/2009-flu-uk.qmd +++ b/2009-flu-uk.qmd @@ -243,7 +243,7 @@ posterior <- likelihood + prior Next, we define a sampler; we'll start with a random walk with a fairly arbitrary diagonal proposal matrix: ```{r} -vcv <- diag(c(2e-8,2e-4,1e-4)) +vcv <- diag(c(2e-8,2e-4,1e-4)*0.1) sampler <- monty_sampler_random_walk(vcv) ``` @@ -256,6 +256,7 @@ init_values <- packer$pack(list(alpha=par0[1], rho=0.1, h_times = hol_t, h_values= hol_v)) +init_values <- c(3e-5,1.35,0.05) samples <- monty_sample(posterior, sampler, 10000, initial = init_values, n_chains = 3) ``` @@ -270,3 +271,12 @@ Here the log posterior density of our three chains over time, showing a rapid im matplot(samples$density, type = "l", lty = 1, xlab = "Sample", ylab = "Log posterior probability density") ``` +```{r} +samples_df <- posterior::as_draws_df(samples) +posterior::summarise_draws(samples_df) +``` + +```{r} +bayesplot::mcmc_pairs(samples_df) +``` + From 8336a8bef37f1e770695f91a4e62893c430c21b4 Mon Sep 17 00:00:00 2001 From: Marc Baguelin Date: Mon, 25 Nov 2024 23:10:49 +0000 Subject: [PATCH 05/17] Testing values --- 2009-flu-uk.qmd | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/2009-flu-uk.qmd b/2009-flu-uk.qmd index d977c8d..6bcc454 100644 --- a/2009-flu-uk.qmd +++ b/2009-flu-uk.qmd @@ -243,7 +243,8 @@ posterior <- likelihood + prior Next, we define a sampler; we'll start with a random walk with a fairly arbitrary diagonal proposal matrix: ```{r} -vcv <- diag(c(2e-8,2e-4,1e-4)*0.1) +vcv <- diag(c(5e-10,5e-5,1e-5))*0.0001 +#vcv <- matrix(c(4.3e-12,-1.2e-8,3.6e-9,-1.2e-8,4.9e-5,-1e-5,3.6e-9,-1e-5,1.1e-5), nrow = 3)*0.1 sampler <- monty_sampler_random_walk(vcv) ``` @@ -256,8 +257,8 @@ init_values <- packer$pack(list(alpha=par0[1], rho=0.1, h_times = hol_t, h_values= hol_v)) -init_values <- c(3e-5,1.35,0.05) -samples <- monty_sample(posterior, sampler, 10000, initial = init_values, +init_values <- c(3e-5,1.35,0.04) +samples <- monty_sample(posterior, sampler, 100000, initial = init_values, n_chains = 3) ``` From dfa97c6c94ef1f9ee829dcf8500b747d60ff6ec2 Mon Sep 17 00:00:00 2001 From: Marc Baguelin Date: Tue, 26 Nov 2024 05:25:44 +0000 Subject: [PATCH 06/17] Update value for mcmc --- 2009-flu-uk.qmd | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/2009-flu-uk.qmd b/2009-flu-uk.qmd index 6bcc454..c183dca 100644 --- a/2009-flu-uk.qmd +++ b/2009-flu-uk.qmd @@ -243,8 +243,8 @@ posterior <- likelihood + prior Next, we define a sampler; we'll start with a random walk with a fairly arbitrary diagonal proposal matrix: ```{r} -vcv <- diag(c(5e-10,5e-5,1e-5))*0.0001 -#vcv <- matrix(c(4.3e-12,-1.2e-8,3.6e-9,-1.2e-8,4.9e-5,-1e-5,3.6e-9,-1e-5,1.1e-5), nrow = 3)*0.1 +#vcv <- diag(c(5e-10,5e-5,1e-5))*0.0001 +vcv <- matrix(c(2.5e-14,-1.04e-10,5.6e-12,-1.04e-10,4.9e-07,-2.5e-8,5.6e-12,-2.5e-8,3.5e-9), nrow = 3) sampler <- monty_sampler_random_walk(vcv) ``` 
@@ -257,8 +257,8 @@ init_values <- packer$pack(list(alpha=par0[1], rho=0.1, h_times = hol_t, h_values= hol_v)) -init_values <- c(3e-5,1.35,0.04) -samples <- monty_sample(posterior, sampler, 100000, initial = init_values, +init_values <- c(3.05e-5,1.354,0.04) +samples <- monty_sample(posterior, sampler, 10000, initial = init_values, n_chains = 3) ``` @@ -271,6 +271,7 @@ Here the log posterior density of our three chains over time, showing a rapid im ```{r} matplot(samples$density, type = "l", lty = 1, xlab = "Sample", ylab = "Log posterior probability density") +length(unique(samples$density))/length(samples$density) ``` ```{r} samples_df <- posterior::as_draws_df(samples) From c2e45c7e07d9c9864ca2ebdfa4a437de7db23181 Mon Sep 17 00:00:00 2001 From: Marc Baguelin Date: Tue, 26 Nov 2024 12:03:17 +0000 Subject: [PATCH 07/17] Add adaptive sampler --- 2009-flu-uk.qmd | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/2009-flu-uk.qmd b/2009-flu-uk.qmd index c183dca..3a1c787 100644 --- a/2009-flu-uk.qmd +++ b/2009-flu-uk.qmd @@ -243,9 +243,8 @@ posterior <- likelihood + prior Next, we define a sampler; we'll start with a random walk with a fairly arbitrary diagonal proposal matrix: ```{r} -#vcv <- diag(c(5e-10,5e-5,1e-5))*0.0001 -vcv <- matrix(c(2.5e-14,-1.04e-10,5.6e-12,-1.04e-10,4.9e-07,-2.5e-8,5.6e-12,-2.5e-8,3.5e-9), nrow = 3) -sampler <- monty_sampler_random_walk(vcv) +vcv <- diag(c(5e-10,5e-5,1e-5)) +sampler <- monty_sampler_adaptive(vcv, initial_vcv_weight = 100) ``` We start this off, using explicit initial conditions From 3c9649338886a4d7ae008563880fa187fd636261 Mon Sep 17 00:00:00 2001 From: Marc Baguelin Date: Tue, 26 Nov 2024 14:54:32 +0000 Subject: [PATCH 08/17] *Pull 1000 parameters and simulate from them --- 2009-flu-uk.qmd | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/2009-flu-uk.qmd b/2009-flu-uk.qmd index 3a1c787..cc342b0 100644 --- a/2009-flu-uk.qmd +++ b/2009-flu-uk.qmd @@ -281,3 +281,23 @@ posterior::summarise_draws(samples_df) bayesplot::mcmc_pairs(samples_df) ``` +Finally we can get an idea of the fit of the samples by looking at 100 trajectories from the samples. + +```{r} +n_groups <- 1000 +i_rand <- sample(dim(samples$pars)[2],n_groups) +list_par <- lapply(i_rand, FUN = function(i) packer$unpack(samples$pars[,i,sample(3,1)])) + +mod_mult <- dust_system_create(seir, + pars = list_par, n_groups = n_groups) + +t <- c(0,data$Days) +dust_system_set_state_initial(mod_mult) +y <- dust_system_simulate(mod_mult, t) +y <- dust_unpack_state(mod_mult, y) + +matplot(t, t(y$incidence), type = "l", lty = 1, col = "#00000044", + xlab = "Time", ylab = "Infected population") +points(data$Days, data$Cases/mean(samples$pars["rho",,]), pch=19, col="red") +``` + From 283b86cecd2847cdf7d565eda2522b4e0ad04465 Mon Sep 17 00:00:00 2001 From: Marc Baguelin Date: Wed, 27 Nov 2024 12:01:37 +0000 Subject: [PATCH 09/17] Add example of direct sampling from the prior --- 2009-flu-uk.qmd | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/2009-flu-uk.qmd b/2009-flu-uk.qmd index cc342b0..fc20f04 100644 --- a/2009-flu-uk.qmd +++ b/2009-flu-uk.qmd @@ -232,6 +232,20 @@ prior <- monty_dsl({ }) ``` +Let's inspect the our prior `monty` model +```{r} +prior +``` + +As it has been built using the monty DSL, it comes with gradient calculation, direct sampling (so no need to use one of the Monte Carlo algorithm) and accepts multiple parameters. 
+ +Let's try to generate samples and check that they match our prior distribution: +```{r} +r <- monty_rng$new(n_streams = 3) +monty_model_direct_sample(prior, r) +``` + + ## Setting up the posterior Our posterior is the product of the likelihood and prior, or the sum of their logs: From 656f860fd3ddba86c8ed0d764471fd3ef17c11e2 Mon Sep 17 00:00:00 2001 From: Marc Baguelin Date: Wed, 27 Nov 2024 12:08:22 +0000 Subject: [PATCH 10/17] More readible output format for direct samples --- 2009-flu-uk.qmd | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/2009-flu-uk.qmd b/2009-flu-uk.qmd index fc20f04..fdfe563 100644 --- a/2009-flu-uk.qmd +++ b/2009-flu-uk.qmd @@ -241,8 +241,9 @@ As it has been built using the monty DSL, it comes with gradient calculation, di Let's try to generate samples and check that they match our prior distribution: ```{r} -r <- monty_rng$new(n_streams = 3) -monty_model_direct_sample(prior, r) +n_streams <- 10 +r <- monty_rng$new(n_streams = n_streams) +prior_samples <- matrix(monty_model_direct_sample(prior, r), nrow = n_streams) ``` From 457c1e481aeda4a447b5a0c8444b698dc61190ef Mon Sep 17 00:00:00 2001 From: Marc Baguelin Date: Wed, 27 Nov 2024 14:26:23 +0000 Subject: [PATCH 11/17] * Replace Poisson observation model by Negative Binomial * Fit overdispersion as part of the fitting pipeline --- 2009-flu-uk.qmd | 25 +++++++++++++++++-------- 1 file changed, 17 insertions(+), 8 deletions(-) diff --git a/2009-flu-uk.qmd b/2009-flu-uk.qmd index fdfe563..60cc09a 100644 --- a/2009-flu-uk.qmd +++ b/2009-flu-uk.qmd @@ -101,8 +101,8 @@ The model needs to be transpiled (e.g. translated from one programming language seir <- odin2::odin({ # initial conditions initial(S) <- (1 - 2 * alpha) * N -initial(E) <- alpha*N -initial(I) <- alpha*N +initial(E) <- alpha * N +initial(I) <- alpha * N initial(R) <- 0 initial(incidence, zero_every = 7) <- 0 @@ -132,8 +132,9 @@ beta <- R_0 * sigma # observation model rho <- parameter(0.1) +eta <- parameter(10) Cases <- data() -Cases ~ Poisson(rho*incidence) +Cases ~ NegativeBinomial(size = eta, mu = rho*incidence) }) ``` @@ -152,7 +153,7 @@ mod <- dust_system_create(seir, # First parameter -> proportion of population initially infected # Second argument -> R_0 # Third argument -> Proportion of infection detected -par0 <- c(5e-5, 1.3, 0.1) +par0 <- c(5e-5, 1.3, 0.1, 10) # This vector should be passed as a list dust_system_update_pars(sys = mod, @@ -206,7 +207,7 @@ dust_likelihood_run(filter, list(alpha=par0[1], This is the likelihood associated with the dust model written using odin. We know need to move this into the monty statistical world to be able to integrate this into our pipeline. For this use the `packer` concept that allows us to definne a one-to-one translation between the two worlds. 
```{r} -packer <- monty_packer(c("alpha", "R_0", "rho"), +packer <- monty_packer(c("alpha", "R_0", "rho", "eta"), fixed = list(h_times = hol_t,h_values= hol_v)) ``` @@ -229,6 +230,7 @@ prior <- monty_dsl({ alpha ~ Beta(a = 4e-4, b = 2) R_0 ~ Gamma(2, 0.7) rho ~ Uniform(0, 1) + eta ~ Exponential(mean = 1000) }) ``` @@ -241,11 +243,17 @@ As it has been built using the monty DSL, it comes with gradient calculation, di Let's try to generate samples and check that they match our prior distribution: ```{r} -n_streams <- 10 +n_streams <- 10000 r <- monty_rng$new(n_streams = n_streams) prior_samples <- matrix(monty_model_direct_sample(prior, r), nrow = n_streams) +colnames(prior_samples) <- prior$parameters ``` +```{r} +bayesplot::mcmc_pairs(prior_samples) +``` + + ## Setting up the posterior @@ -258,7 +266,7 @@ posterior <- likelihood + prior Next, we define a sampler; we'll start with a random walk with a fairly arbitrary diagonal proposal matrix: ```{r} -vcv <- diag(c(5e-10,5e-5,1e-5)) +vcv <- diag(c(5e-10,5e-5,1e-5,1)) sampler <- monty_sampler_adaptive(vcv, initial_vcv_weight = 100) ``` @@ -269,9 +277,10 @@ We start this off, using explicit initial conditions init_values <- packer$pack(list(alpha=par0[1], R_0=par0[2], rho=0.1, + eta=10, h_times = hol_t, h_values= hol_v)) -init_values <- c(3.05e-5,1.354,0.04) +init_values <- c(3.05e-5,1.354,0.04, 10) samples <- monty_sample(posterior, sampler, 10000, initial = init_values, n_chains = 3) ``` From 60c8fa374ec2888900af7ed44faaf301f63be70a Mon Sep 17 00:00:00 2001 From: Marc Baguelin Date: Wed, 27 Nov 2024 18:41:02 +0000 Subject: [PATCH 12/17] Restructuring and adding some text --- 2009-flu-uk.qmd | 113 +++++++++++++++++++++++++++++++++++++----------- 1 file changed, 88 insertions(+), 25 deletions(-) diff --git a/2009-flu-uk.qmd b/2009-flu-uk.qmd index 60cc09a..952b275 100644 --- a/2009-flu-uk.qmd +++ b/2009-flu-uk.qmd @@ -21,17 +21,26 @@ knitr::knit_hooks$set(small_margins = function(before, options, envir) { } }) ``` +## Introduction -This chapter explores the use of `odin` and `monty` to model the dynamics of the 2009 A/H1N1 influenza pandemic in England and Wales incorporating changes in social contacts during holiday periods. We demonstrate how to construct, fit, and analyse a compartmental model to infer key epidemiological parameters. +### Objective of this chapter -## Overview of the Pipeline +The 2009 A/H1N1 influenza pandemic posed a significant public health challenge in England and Wales, requiring rapid and evidence-based decision-making to mitigate its impact. This chapter explores the use of `odin` and `monty` to model the dynamics of the 2009 A/H1N1 influenza pandemic in England and Wales incorporating changes in social contacts during holiday periods. We demonstrate how to construct, fit, and analyse a compartmental model to infer key epidemiological parameters. + +The chapter is structured to guide the reader through the entire modelling pipeline, from data preparation and visualisation to model construction, parameter inference, and validation. Along the way, we highlight the integration of real-world data, discuss challenges in linking models to observed cases, and showcase the use of Bayesian methods for robust parameter estimation. + +### Overview of the pipeline This chapter illustrates how to: 1. Construct an SEIR model with time-varying transmission using `odin`. 2. Implement an observation model, derive a likelihood based on observations using `odin`. 3. 
Run Bayesian inference using Markov chain Monte Carlo (MCMC) with `monty`. -4. Analyse and interpret MCMC results to estimate parameters such as: +4. Analyse and interpret MCMC results. + +### Key parameters of interest + + - The basic reproduction number $R_0$. - The ascertainment probability, i.e., the proportion of symptomatic infections reported as cases. @@ -41,13 +50,13 @@ library(dust2) library(monty) ``` -## Data Preparation +## Data preparation and visualisation -The dataset consists of weekly case estimates, stratified by age group. For simplicity, we aggregate the data into a single time series for the analysis. +Data is the number of weekly cases of A/H1N1 pandemic cases from June 2009, and initially broken in seven age-groups. For simplicity, we aggregate the data into a single time series for the analysis. -## Loading data +### Loading data -Data is the number of weekly cases of A/H1N1 pandemic cases from June 2009, and initially broken in seven age-groups. The data file is taken from the supplement material in [@endo_introduction_2019] and can be found [here](https://github.com/akira-endo/Intro-PMCMC/blob/master/assets/andre_estimates_21_02.txt). +The data file is taken from the supplement material in [@endo_introduction_2019] and can be found [here](https://github.com/akira-endo/Intro-PMCMC/blob/master/assets/andre_estimates_21_02.txt). We will fit our model to all cases aggregated so we need to load our data, add the age-group, and then build a dataframe (a table where each rows is an observations of certains variables in the columns) with 'Days' and 'cases'. We use the `dplyr` package, a popular R package for data handling, with the '%>%' ('pipe') operator but other methods are possible. @@ -63,7 +72,7 @@ data <- data.frame(Cases = original_data) %>% dplyr::select(Days, Cases) ``` -## Plotting the data +### Plotting the data You can plot the data and see the two waves of infections and the presence of holiday periods. ```{r} @@ -81,21 +90,79 @@ rect(xleft = hol_t[2],ybottom = 0, xright = hol_t[3], ytop = max(data$Cases)*1.2 #half-term holiday block rect(xleft = hol_t[4],ybottom = 0, xright = hol_t[5], ytop = max(data$Cases)*1.2, col = "#aaaabb33", border = NA) ``` +## Model design -## Understanding the model +### Transmission model -The SEIR model divides the population into four compartments: +Similarly to the SIR model, we saw already, the SEIR model divides the population into four compartments: - **S**usceptible: Individuals at risk of infection. - **E**xposed: Infected individuals who are not yet infectious. - **I**nfectious: Actively transmitting the disease. - **R**ecovered: Immune individuals. -The model incorporates time-varying transmission rates to account for changes in contact patterns during holidays. Using `odin`, we define this system of differential equations: +Additionally, the model incorporates time-varying transmission rates to account for changes in contact patterns during holidays. 
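The time variation enters as a piecewise-constant multiplier on the transmission rate. As a quick illustration — using the holiday calendar `hol_t`/`hol_v` defined with the data plot, and mirroring the `interpolate(..., "constant")` call in the `odin` code later in this chapter — the factor can be sketched in base R:

```{r}
# Piecewise-constant holiday factor h(t): 1 in term time, 0.70 over the
# summer, 0.85 during half-term (values taken from hol_t / hol_v above)
h <- stepfun(hol_t[-1], hol_v)
curve(h(x), from = 0, to = max(data$Days), n = 1001,
      xlab = "Days", ylab = "h(t)")
```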
+ +The SEIR model is governed by the following system of ordinary differential equations: + +$$ +\begin{aligned} + \frac{dS}{dt} &= - \beta(t) \frac{S I}{N} \\ + \frac{dE}{dt} &= \beta(t) \frac{S I}{N} - \gamma E \\ + \frac{dI}{dt} &= \gamma - \sigma I \\ + \frac{dR}{dt} &= \sigma I +\end{aligned} +$$ + +The parameters for the transmission model are defined as follows: +$$ +\begin{aligned} + R_0 &\text{ (basic reproduction number): } R_0 = 1.5, \\ + \gamma &\text{ (rate of transition from exposed to infectious): } \gamma = \frac{1}{L}, \\ + \sigma &\text{ (recovery rate): } \sigma = \frac{1}{D}, \\ + \beta &\text{ (transmission rate): } \beta = R_0 \cdot \sigma, \\ + N &\text{ (total population): } N = 55,000,000, \\ + \text{hol} &\text{ (time-varying factor): interpolated from } h_{\text{times}} \text{ and } h_{\text{values}}. +\end{aligned} +$$ + +The initial conditions are: +$$ +\begin{aligned} + S(0) &= (1 - 2 \alpha) N \\ + E(0) &= \alpha N \\ + I(0) &= \alpha N \\ + R(0) &= 0 \\ +\end{aligned} +$$ + +### Observation model + +The fundamental component of our transmission model is the 'I' compartment tracking continuously the number of infectious people in our population (an infection prevalence). However, what we observe through our surveillance system is a fraction of the cumulative incidence, more precisely an estimate of the number of new cases in a period of 7 days (a week). + +In our `odin` model we can track the weekly cumulative incidence $Z(t)$ by: -## Compiling the model +1. adding a differential equation integrating the instantaneous incidence: +$$\frac{dZ}{dt} = \gamma E$$ -The model needs to be transpiled (e.g. translated from one programming language to another) in order to create some C++ code. That code is then compiled and loaded to create a "fast" model that can be called from the R environment. +2. reseting $Z(t) = 0$ at the beginning of each week. + +Then we get for each $t$ that is a multiple of 7: + +$$Z(t) = \int_{t-7}^{t} \gamma E$$ +i.e. the cumulative incidence over a week. + +Finally, the observation model is defined as: +$$ +\text{Cases(t)} \sim \text{NegativeBinomial}(\mu = \rho Z(t), \text{size} = \eta), +$$ +where \(\rho\) is the ascertainment probability, and $\eta$ is the dispersion parameter. + +### Odin code of the model + +Below is the `odin` code for our model - the structure matches closely the mathematical formulation. Note the `Cases <- data()` lines telling that `Cases` are observations. The model will compile and work even when it is not linked with any dataset. However providing data is essentiel to compute a likelihood with `dust` or derive a `monty` statistical model out of it. + +This step takes time. The model needs to be transpiled (e.g. translated from one programming language to another) in order to create some C++ code and then that code is compiled and loaded in the R environnment. After this, a "fast" model can be called from the R environment and used to simulate scenarios or perform inference. ```{r, echo = TRUE} seir <- odin2::odin({ @@ -138,7 +205,7 @@ Cases ~ NegativeBinomial(size = eta, mu = rho*incidence) }) ``` -## Running the model +### Running the model Once the model is compiled, it is possible to generate one (or more!) instance of your model by using the `dust_system_create()` and your model generator. @@ -183,15 +250,11 @@ plot(t, dust_unpack_state(mod, y)$incidence, type = "l", col = "red", lwd = 2) points(data$Days, data$Cases) ``` -# Part 2. 
Observation model, likelihood and prior distributions +## The MCMC pipeline We now have a dataset and a working model, all loaded in our R environment. We need to link the output of this model with the data in order to derive a likelihood function. As we are working within a Bayesian framework we also need to define the prior distribution for our parameters. -## Cumulative incidence and observed cases - -The fundamental component of our model if the 'I' compartment tracking continuously the number of infectious people in our population (an infection prevalence). However, what we observe is a fraction of the cumulative incidence, more precisely the number of new cases in a period of 7 days (a week). In our `odin` model we track the cumulative incidence, and we have simulated it using 7 days intervals. We can thus write a simple function giving the cumulative incidence between two days: - -## Setting up the likelihood +### Setting up the likelihood The likelihood is a function that takes a parameter as argument and return the probability density that our model (transmission model + observation) generates exactly the observed data. This number is usually very small as it is the product of each individual observation density given the model. As opposed to the example in the lecture, we thus always work using the log-density values rather than the density. Products then become sums and divisions become differences. @@ -219,7 +282,7 @@ Now building the `monty` model for the likelihood of the model is very easy: likelihood <- dust_likelihood_monty(filter, packer) ``` -## Setting up the prior distribution +### Setting up the prior distribution We similarly set up a log_prior function. Note that we use the function to exclude impossible values such as proportion below 0 or above 1. This is to avoid to get logdensity of -infinity given that these values would have a density value of 0 (=impossible). @@ -253,9 +316,7 @@ colnames(prior_samples) <- prior$parameters bayesplot::mcmc_pairs(prior_samples) ``` - - -## Setting up the posterior +### Setting up the posterior Our posterior is the product of the likelihood and prior, or the sum of their logs: @@ -263,6 +324,8 @@ Our posterior is the product of the likelihood and prior, or the sum of their lo posterior <- likelihood + prior ``` +### Chosing the sampler + Next, we define a sampler; we'll start with a random walk with a fairly arbitrary diagonal proposal matrix: ```{r} @@ -320,7 +383,7 @@ dust_system_set_state_initial(mod_mult) y <- dust_system_simulate(mod_mult, t) y <- dust_unpack_state(mod_mult, y) -matplot(t, t(y$incidence), type = "l", lty = 1, col = "#00000044", +matplot(t, t(y$incidence), type = "l", lty = 1, col = "#00008822", xlab = "Time", ylab = "Infected population") points(data$Days, data$Cases/mean(samples$pars["rho",,]), pch=19, col="red") ``` From caf6cbb366aa3cd553c66be5e5d489e7f0e214eb Mon Sep 17 00:00:00 2001 From: Marc Baguelin Date: Wed, 27 Nov 2024 23:51:01 +0000 Subject: [PATCH 13/17] Amend text --- 2009-flu-uk.qmd | 66 ++++++++++++++++++++++++++++++++++++------------- 1 file changed, 49 insertions(+), 17 deletions(-) diff --git a/2009-flu-uk.qmd b/2009-flu-uk.qmd index 952b275..299ebcb 100644 --- a/2009-flu-uk.qmd +++ b/2009-flu-uk.qmd @@ -75,21 +75,44 @@ data <- data.frame(Cases = original_data) %>% ### Plotting the data You can plot the data and see the two waves of infections and the presence of holiday periods. 
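One presentational detail in the code below: the translucent holiday shading can be written either with `adjustcolor()` or directly as an 8-digit hex colour, where the last two hex digits encode the alpha channel; the two forms are interchangeable:

```{r}
# "skyblue" is "#87CEEB"; appending two hex digits sets the transparency
adjustcolor("skyblue", alpha.f = 0.3)  # semi-transparent skyblue
"#87CEEB77"                            # same colour, alpha = 0x77/255
```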
+ ```{r} -#plot the data -plot(data$Day, data$Cases, pch = 19, col = "red", - xlab="Days since start of epidemics", - ylab="Official cases count") - -# Add the holiday periods - summer holidays and half-term -# Holiday calendar assuming 30% contact reduction during summer and 15% during half term -hol_t <- c(0,38,85,140,147) -hol_v <- c(1,.70,1,.85,1) -#summer holiday block -rect(xleft = hol_t[2],ybottom = 0, xright = hol_t[3], ytop = max(data$Cases)*1.2, col = "#aaaabb33", border = NA) -#half-term holiday block -rect(xleft = hol_t[4],ybottom = 0, xright = hol_t[5], ytop = max(data$Cases)*1.2, col = "#aaaabb33", border = NA) +# Set up the plot +plot(data$Day, data$Cases, pch = 19, col = "red", + xlab = "Days since start of epidemics", + ylab = "Official cases count", + main = "Cases over Time with Holiday Periods", + ylim = c(0, max(data$Cases) * 1.3), # Add padding to the y-axis + xlim = c(min(data$Day), max(data$Day))) + +# Add the holiday periods +hol_t <- c(0, 38, 85, 140, 147) # Holiday timings +hol_v <- c(1, 0.70, 1, 0.85, 1) # Contact reduction + +# Summer holidays +rect(xleft = hol_t[2], ybottom = 0, xright = hol_t[3], + ytop = max(data$Cases) * 1.2, + col = adjustcolor("skyblue", alpha.f = 0.3), border = NA) +text(x = mean(hol_t[2:3]), y = max(data$Cases) * 1.25, + labels = "Summer Holiday", col = "blue", cex = 0.8) + +# Half-term holiday +rect(xleft = hol_t[4], ybottom = 0, xright = hol_t[5], + ytop = max(data$Cases) * 1.2, + col = adjustcolor("lightgreen", alpha.f = 0.3), border = NA) +text(x = mean(hol_t[4:5]), y = max(data$Cases) * 1.25, + labels = "Half-Term", col = "darkgreen", cex = 0.8) + +# Legend +legend("topright", legend = c("Cases", "Summer Holiday", "Half-Term"), + pch = c(19, NA, NA), col = c("red", "skyblue", "lightgreen"), + pt.bg = c(NA, "skyblue", "lightgreen"), + lty = c(NA, 1, 1), lwd = c(NA, 10, 10), + bg = "white", inset = 0.02, bty = "n") + ``` + + ## Model design ### Transmission model @@ -106,14 +129,24 @@ Additionally, the model incorporates time-varying transmission rates to account The SEIR model is governed by the following system of ordinary differential equations: $$ +\left\{ \begin{aligned} \frac{dS}{dt} &= - \beta(t) \frac{S I}{N} \\ \frac{dE}{dt} &= \beta(t) \frac{S I}{N} - \gamma E \\ \frac{dI}{dt} &= \gamma - \sigma I \\ \frac{dR}{dt} &= \sigma I \end{aligned} +\right. $$ +where: + +- $\beta(t)$ is the time-dependent transmission rate, representing the rate at which susceptible individuals become exposed through contact with infectious individuals. +- $\gamma$ is the rate at which exposed individuals transition to the infectious stage (the inverse of the incubation period). +- $\sigma$ is the recovery rate, indicating the rate at which infectious individuals recover or are removed from the infectious population. +- $N$ is the total population size, assumed to remain constant. +- $S(t)$, $E(t)$, $I(t)$, and $R(t)$ represent the number of susceptible, exposed, infectious, and recovered individuals, respectively, at time $t$. + The parameters for the transmission model are defined as follows: $$ \begin{aligned} @@ -149,14 +182,14 @@ $$\frac{dZ}{dt} = \gamma E$$ Then we get for each $t$ that is a multiple of 7: -$$Z(t) = \int_{t-7}^{t} \gamma E$$ +$$Z(t) = \int_{t-7}^{t} \gamma E(s) \ ds$$ i.e. the cumulative incidence over a week. 
Finally, the observation model is defined as: $$ \text{Cases(t)} \sim \text{NegativeBinomial}(\mu = \rho Z(t), \text{size} = \eta), $$ -where \(\rho\) is the ascertainment probability, and $\eta$ is the dispersion parameter. +where $\rho$ is the ascertainment probability, and $\eta$ is the dispersion parameter. ### Odin code of the model @@ -210,7 +243,6 @@ Cases ~ NegativeBinomial(size = eta, mu = rho*incidence) Once the model is compiled, it is possible to generate one (or more!) instance of your model by using the `dust_system_create()` and your model generator. ```{r} -#mod <- seir$new(h_times = hol_t, h_values = hol_v) mod <- dust_system_create(seir, pars = list(h_times = hol_t, h_values = hol_v)) ``` @@ -271,7 +303,7 @@ This is the likelihood associated with the dust model written using odin. We kno ```{r} packer <- monty_packer(c("alpha", "R_0", "rho", "eta"), - fixed = list(h_times = hol_t,h_values= hol_v)) + fixed = list(h_times = hol_t, h_values= hol_v)) ``` This defines two functions that allows to 'pack' (define) and 'unpack' (define) our parameters. From 13fef496e130e178dc71099a995b039333587ed3 Mon Sep 17 00:00:00 2001 From: Marc Baguelin Date: Fri, 29 Nov 2024 11:13:21 +0000 Subject: [PATCH 14/17] Add text and restructure --- 2009-flu-uk.qmd | 72 +++++++++++++++++++++++++++++-------------------- 1 file changed, 43 insertions(+), 29 deletions(-) diff --git a/2009-flu-uk.qmd b/2009-flu-uk.qmd index 299ebcb..9847194 100644 --- a/2009-flu-uk.qmd +++ b/2009-flu-uk.qmd @@ -33,22 +33,18 @@ The chapter is structured to guide the reader through the entire modelling pipel This chapter illustrates how to: -1. Construct an SEIR model with time-varying transmission using `odin`. -2. Implement an observation model, derive a likelihood based on observations using `odin`. -3. Run Bayesian inference using Markov chain Monte Carlo (MCMC) with `monty`. +1. Construct an SEIR transmission model with time-varying contact rate using `odin`. +2. Implement a (probabilistic) observation model in `odin` +3. Derive a likelihood based on observations using `dust` and convert it to `monty`. +4. Built a prior distribution for you model parameters using the `monty` DSL. +3. Run Bayesian inference using adaptive Markov chain Monte Carlo (MCMC) with `monty`. 4. Analyse and interpret MCMC results. ### Key parameters of interest - - - The basic reproduction number $R_0$. - - The ascertainment probability, i.e., the proportion of symptomatic infections reported as cases. - -```{r} -library(odin2) -library(dust2) -library(monty) -``` +We are interested in particular in two epidemiological parameters: + - The basic reproduction number $R_0$ of the novel influenza strain; + - The ascertainment probability, i.e., the proportion of total infections reported as cases. ## Data preparation and visualisation @@ -147,19 +143,13 @@ where: - $N$ is the total population size, assumed to remain constant. - $S(t)$, $E(t)$, $I(t)$, and $R(t)$ represent the number of susceptible, exposed, infectious, and recovered individuals, respectively, at time $t$. 
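As a quick cross-check of this system, independent of `odin`, here is a minimal fixed-step (forward Euler) integration sketch, using a constant $\beta$ and the illustrative parameter values from this chapter; it ignores the holiday effect and the incidence counter:

```{r}
# Forward-Euler sketch of the SEIR equations above (constant beta).
# Illustrative only; the compiled odin model is what we actually use.
seir_step <- function(state, beta, gamma, sigma, N, dt) {
  with(as.list(state), {
    infections <- beta * S * I / N
    c(S = S - dt * infections,
      E = E + dt * (infections - gamma * E),
      I = I + dt * (gamma * E - sigma * I),
      R = R + dt * sigma * I)
  })
}
state <- c(S = (1 - 2e-4) * 55e6, E = 1e-4 * 55e6, I = 1e-4 * 55e6, R = 0)
for (i in seq_len(1000)) {
  state <- seir_step(state, beta = 1.5 / 1.25, gamma = 1, sigma = 1 / 1.25,
                     N = 55e6, dt = 0.1)
}
round(state)  # compartment sizes after 100 days
```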
-The parameters for the transmission model are defined as follows: -$$ -\begin{aligned} - R_0 &\text{ (basic reproduction number): } R_0 = 1.5, \\ - \gamma &\text{ (rate of transition from exposed to infectious): } \gamma = \frac{1}{L}, \\ - \sigma &\text{ (recovery rate): } \sigma = \frac{1}{D}, \\ - \beta &\text{ (transmission rate): } \beta = R_0 \cdot \sigma, \\ - N &\text{ (total population): } N = 55,000,000, \\ - \text{hol} &\text{ (time-varying factor): interpolated from } h_{\text{times}} \text{ and } h_{\text{values}}. -\end{aligned} -$$ +We assume that the contact rate is constant outside of the holiday period, and that it drops by 30% during the summer and 15% during the half-term. We have thus: + +$$\beta(t) = R_{0}*\sigma * h(t)$$ + +with $h(t)$ a piece-wise constant function equal to 0.7 during the summer, 0.85 during the half-term and 1 otherwise. -The initial conditions are: +To seed the model we assume that a small proportion of the population is infected - equally shared between the E and I compartments. The initial conditions are thus: $$ \begin{aligned} S(0) &= (1 - 2 \alpha) N \\ @@ -178,21 +168,45 @@ In our `odin` model we can track the weekly cumulative incidence $Z(t)$ by: 1. adding a differential equation integrating the instantaneous incidence: $$\frac{dZ}{dt} = \gamma E$$ -2. reseting $Z(t) = 0$ at the beginning of each week. +2. reseting $Z(t) = 0$ at the beginning of each week to only account for the new cases from the current week. -Then we get for each $t$ that is a multiple of 7: +Then we get for each $t$ that is a multiple of 7 (i.e. when we change week): $$Z(t) = \int_{t-7}^{t} \gamma E(s) \ ds$$ i.e. the cumulative incidence over a week. -Finally, the observation model is defined as: +Finally, we need to account for the imperfection of the surveillance system, assuming that only a fraction $rho$ of the infections are captured by the surveillance system and accounting for potential overdispersion we define the following observation model using a Negative-Binomial distribution with mean $\mu = \rho Z(t)$ and size $\eta$: $$ \text{Cases(t)} \sim \text{NegativeBinomial}(\mu = \rho Z(t), \text{size} = \eta), $$ where $\rho$ is the ascertainment probability, and $\eta$ is the dispersion parameter. +::: {.callout-note} +Even if the surveillance system was perfect we would expect a difference between the number of *symptomatic* cases and the number of infections as a proportion of flu cases are *asymptomatic* or 'pauci-symptomatic'. +::: + +### Summary of model parameters + +The parameters for the transmission model are defined as follows: +$$ +\begin{aligned} + R_0 &\text{ (basic reproduction number): } R_0 = 1.5, \\ + \gamma &\text{ (rate of transition from exposed to infectious): } \gamma = \frac{1}{L}, \\ + \sigma &\text{ (recovery rate): } \sigma = \frac{1}{D}, \\ + \beta &\text{ (transmission rate): } \beta = R_0 \cdot \sigma, \\ + N &\text{ (total population): } N = 55,000,000, \\ + \text{hol} &\text{ (time-varying factor): interpolated from } h_{\text{times}} \text{ and } h_{\text{values}}. +\end{aligned} +$$ + ### Odin code of the model +```{r} +library(odin2) +library(dust2) +library(monty) +``` + Below is the `odin` code for our model - the structure matches closely the mathematical formulation. Note the `Cases <- data()` lines telling that `Cases` are observations. The model will compile and work even when it is not linked with any dataset. 
However providing data is essentiel to compute a likelihood with `dust` or derive a `monty` statistical model out of it. This step takes time. The model needs to be transpiled (e.g. translated from one programming language to another) in order to create some C++ code and then that code is compiled and loaded in the R environnment. After this, a "fast" model can be called from the R environment and used to simulate scenarios or perform inference. @@ -275,10 +289,10 @@ Let's plot the I compartment over time. plot(t, dust_unpack_state(mod, y)$incidence, type = "l", col = "red", lwd = 2) ``` -It looks a bit like the actual epidemic; a good sign, but let compare our model with the data. +It looks a bit like the actual epidemic; a good sign, but let's compare our model with the data. ```{r} -plot(t, dust_unpack_state(mod, y)$incidence, type = "l", col = "red", lwd = 2) +plot(t, dust_unpack_state(mod, y)$incidence*par0[3], type = "l", col = "red", lwd = 2) points(data$Days, data$Cases) ``` From e7c14cae076b6e568534864e57db7ceb04dfcfa2 Mon Sep 17 00:00:00 2001 From: Marc Baguelin Date: Sun, 1 Dec 2024 20:40:38 +0000 Subject: [PATCH 15/17] More amendments --- 2009-flu-uk.qmd | 16 +++++++++------- 1 file changed, 9 insertions(+), 7 deletions(-) diff --git a/2009-flu-uk.qmd b/2009-flu-uk.qmd index 9847194..8124473 100644 --- a/2009-flu-uk.qmd +++ b/2009-flu-uk.qmd @@ -23,11 +23,11 @@ knitr::knit_hooks$set(small_margins = function(before, options, envir) { ``` ## Introduction -### Objective of this chapter - The 2009 A/H1N1 influenza pandemic posed a significant public health challenge in England and Wales, requiring rapid and evidence-based decision-making to mitigate its impact. This chapter explores the use of `odin` and `monty` to model the dynamics of the 2009 A/H1N1 influenza pandemic in England and Wales incorporating changes in social contacts during holiday periods. We demonstrate how to construct, fit, and analyse a compartmental model to infer key epidemiological parameters. -The chapter is structured to guide the reader through the entire modelling pipeline, from data preparation and visualisation to model construction, parameter inference, and validation. Along the way, we highlight the integration of real-world data, discuss challenges in linking models to observed cases, and showcase the use of Bayesian methods for robust parameter estimation. +### Objective of this chapter + +The chapter is structured to guide the reader through an entire modelling pipeline, from data preparation and visualisation to model construction, parameter inference, and validation. Along the way, we highlight the integration of real-world data, discuss challenges in linking models to observed cases, and showcase the use of Bayesian methods for robust parameter estimation. ### Overview of the pipeline @@ -43,6 +43,7 @@ This chapter illustrates how to: ### Key parameters of interest We are interested in particular in two epidemiological parameters: + - The basic reproduction number $R_0$ of the novel influenza strain; - The ascertainment probability, i.e., the proportion of total infections reported as cases. 
@@ -87,8 +88,9 @@ hol_v <- c(1, 0.70, 1, 0.85, 1) # Contact reduction # Summer holidays rect(xleft = hol_t[2], ybottom = 0, xright = hol_t[3], - ytop = max(data$Cases) * 1.2, - col = adjustcolor("skyblue", alpha.f = 0.3), border = NA) + ytop = max(data$Cases) * 1.2, + col = "#87CEEB77", border = NA) + #col = adjustcolor("skyblue", alpha.f = 0.3), border = NA) text(x = mean(hol_t[2:3]), y = max(data$Cases) * 1.25, labels = "Summer Holiday", col = "blue", cex = 0.8) @@ -101,8 +103,8 @@ text(x = mean(hol_t[4:5]), y = max(data$Cases) * 1.25, # Legend legend("topright", legend = c("Cases", "Summer Holiday", "Half-Term"), - pch = c(19, NA, NA), col = c("red", "skyblue", "lightgreen"), - pt.bg = c(NA, "skyblue", "lightgreen"), + pch = c(19, NA, NA), col = c("red", "#87CEEB77", "lightgreen"), + fill = c(NA, "skyblue", "lightgreen"), lty = c(NA, 1, 1), lwd = c(NA, 10, 10), bg = "white", inset = 0.02, bty = "n") From a8097eb18cba245efc0fc2b86cdb724bd6c4fec4 Mon Sep 17 00:00:00 2001 From: Marc Baguelin Date: Mon, 2 Dec 2024 11:02:55 +0000 Subject: [PATCH 16/17] Some amendment and add reference --- 2009-flu-uk.qmd | 47 +++++++++++++++++++++++------------------------ references.bib | 14 ++++++++++++++ 2 files changed, 37 insertions(+), 24 deletions(-) diff --git a/2009-flu-uk.qmd b/2009-flu-uk.qmd index 8124473..eb3042d 100644 --- a/2009-flu-uk.qmd +++ b/2009-flu-uk.qmd @@ -33,12 +33,12 @@ The chapter is structured to guide the reader through an entire modelling pipeli This chapter illustrates how to: -1. Construct an SEIR transmission model with time-varying contact rate using `odin`. +1. Construct an SEIR transmission model with time-varying contact rate using `odin` 2. Implement a (probabilistic) observation model in `odin` -3. Derive a likelihood based on observations using `dust` and convert it to `monty`. -4. Built a prior distribution for you model parameters using the `monty` DSL. -3. Run Bayesian inference using adaptive Markov chain Monte Carlo (MCMC) with `monty`. -4. Analyse and interpret MCMC results. +3. Derive a likelihood based on observations using `dust` and convert it to `monty` +4. Built a prior distribution for the model parameters using the `monty` DSL +3. Run Bayesian inference using adaptive Markov chain Monte Carlo (MCMC) with `monty` +4. Analyse and interpret the MCMC results. ### Key parameters of interest @@ -49,7 +49,7 @@ We are interested in particular in two epidemiological parameters: ## Data preparation and visualisation -Data is the number of weekly cases of A/H1N1 pandemic cases from June 2009, and initially broken in seven age-groups. For simplicity, we aggregate the data into a single time series for the analysis. +Data is the number of weekly cases of A/H1N1 pandemic cases from June 2009, and initially broken in seven age-groups. For simplicity, we aggregate the data into a single time series for this analysis. 
### Loading data @@ -97,14 +97,13 @@ text(x = mean(hol_t[2:3]), y = max(data$Cases) * 1.25, # Half-term holiday rect(xleft = hol_t[4], ybottom = 0, xright = hol_t[5], ytop = max(data$Cases) * 1.2, - col = adjustcolor("lightgreen", alpha.f = 0.3), border = NA) + col = "#90EE9077", border = NA) text(x = mean(hol_t[4:5]), y = max(data$Cases) * 1.25, labels = "Half-Term", col = "darkgreen", cex = 0.8) # Legend legend("topright", legend = c("Cases", "Summer Holiday", "Half-Term"), - pch = c(19, NA, NA), col = c("red", "#87CEEB77", "lightgreen"), - fill = c(NA, "skyblue", "lightgreen"), + pch = c(19, NA, NA), col = c("red", "#87CEEB77", "#90EE9077"), lty = c(NA, 1, 1), lwd = c(NA, 10, 10), bg = "white", inset = 0.02, bty = "n") @@ -177,29 +176,27 @@ Then we get for each $t$ that is a multiple of 7 (i.e. when we change week): $$Z(t) = \int_{t-7}^{t} \gamma E(s) \ ds$$ i.e. the cumulative incidence over a week. -Finally, we need to account for the imperfection of the surveillance system, assuming that only a fraction $rho$ of the infections are captured by the surveillance system and accounting for potential overdispersion we define the following observation model using a Negative-Binomial distribution with mean $\mu = \rho Z(t)$ and size $\eta$: +Finally, we need to account for the imperfection of the surveillance system, assuming that only a fraction $\rho$ of the infections are captured by the surveillance system and accounting for potential overdispersion we define the following observation model using a Negative-Binomial distribution with mean $\mu = \rho Z(t)$ and size $\eta$: $$ \text{Cases(t)} \sim \text{NegativeBinomial}(\mu = \rho Z(t), \text{size} = \eta), $$ where $\rho$ is the ascertainment probability, and $\eta$ is the dispersion parameter. ::: {.callout-note} -Even if the surveillance system was perfect we would expect a difference between the number of *symptomatic* cases and the number of infections as a proportion of flu cases are *asymptomatic* or 'pauci-symptomatic'. +Even if the surveillance system was perfect we would expect a difference between the number of *symptomatic* cases and the number of infections as a proportion of flu cases are *asymptomatic* or 'pauci-symptomatic' (showing very few symptoms). ::: ### Summary of model parameters -The parameters for the transmission model are defined as follows: -$$ -\begin{aligned} - R_0 &\text{ (basic reproduction number): } R_0 = 1.5, \\ - \gamma &\text{ (rate of transition from exposed to infectious): } \gamma = \frac{1}{L}, \\ - \sigma &\text{ (recovery rate): } \sigma = \frac{1}{D}, \\ - \beta &\text{ (transmission rate): } \beta = R_0 \cdot \sigma, \\ - N &\text{ (total population): } N = 55,000,000, \\ - \text{hol} &\text{ (time-varying factor): interpolated from } h_{\text{times}} \text{ and } h_{\text{values}}. -\end{aligned} -$$ +A summary of the parameters of our model: + +- $R_{0}$, the basic reproduction number, +- $\gamma$, the inverse of the mean incubation period set to 1 day, +- $\sigma$, the recovery rate, the inverse of the mean recovery time assumed to be 1.25 days, +- $h(t)$, the reduction of contact due to holidays, assumed piecewise constant, +- $\rho$, the ascertainment probability i.e. the proportion of infections we expect to ascert as cases, +- $\eta$, the size parameter of our Negative Binomial observation model, +- $N$ the total size of the population in England and Wales in 2009. 
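To build some intuition for the overdispersion parameter $\eta$: a negative binomial with mean $\mu$ and size $\eta$ has variance $\mu + \mu^2/\eta$, so smaller $\eta$ means far more week-to-week variability than the Poisson model it replaced. A quick numerical illustration:

```{r}
# Variance of NegBin(mu, size = eta) versus Poisson(mu): mu + mu^2 / eta
mu <- 1000
eta <- c(10, 100, 1000)
rbind(poisson_var = rep(mu, length(eta)), negbin_var = mu + mu^2 / eta)
```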
### Odin code of the model @@ -209,9 +206,9 @@ library(dust2) library(monty) ``` -Below is the `odin` code for our model - the structure matches closely the mathematical formulation. Note the `Cases <- data()` lines telling that `Cases` are observations. The model will compile and work even when it is not linked with any dataset. However providing data is essentiel to compute a likelihood with `dust` or derive a `monty` statistical model out of it. +Below is the `odin` code for our model - the structure matches closely the mathematical formulation we derived in the previous section. Note the `Cases <- data()` lines telling that `Cases` are observations though the model will compile and work even if not linked with any dataset. However providing data is essential to compute a likelihood with `dust` or derive a `monty` likelihood statistical model of our system. -This step takes time. The model needs to be transpiled (e.g. translated from one programming language to another) in order to create some C++ code and then that code is compiled and loaded in the R environnment. After this, a "fast" model can be called from the R environment and used to simulate scenarios or perform inference. +The compilation step takes time. The model needs to be first transpiled (e.g. translated from one programming language to another) in order to create some C++ code and then that code is compiled and loaded in the R environnment. After this, a "fast" model can be called from the R environment and used to simulate scenarios or perform inference. ```{r, echo = TRUE} seir <- odin2::odin({ @@ -298,6 +295,8 @@ plot(t, dust_unpack_state(mod, y)$incidence*par0[3], type = "l", col = "red", lw points(data$Days, data$Cases) ``` +We seem to be overpredicted the expected number of cases by a factor two or so, but the MCMC pipeline should allow us to infer the distribution of the parameters consistent with the observations. + ## The MCMC pipeline We now have a dataset and a working model, all loaded in our R environment. We need to link the output of this model with the data in order to derive a likelihood function. As we are working within a Bayesian framework we also need to define the prior distribution for our parameters. diff --git a/references.bib b/references.bib index e07eb38..5fff51e 100644 --- a/references.bib +++ b/references.bib @@ -15,3 +15,17 @@ @article{knuth84 pages = {97–111}, numpages = {15} } + +@article{endo_introduction_2019, + title = {Introduction to particle {Markov}-chain {Monte} {Carlo} for disease dynamics modellers}, + volume = {29}, + urldate = {2021-08-27}, + journal = {Epidemics}, + author = {Endo, Akira and van Leeuwen, Edwin and Baguelin, Marc}, + month = dec, + year = {2019}, + note = {Publisher: Elsevier}, + keywords = {Hidden Markov process, Particle filter, Particle Markov-chain Monte Carlo, Sequential Monte Carlo, State-space models}, + pages = {100363}, + file = {PDF:/Users/mbagueli/Zotero/storage/Y3IHN6XR/Endo, van Leeuwen, Baguelin - 2019 - Introduction to particle Markov-chain Monte Carlo for disease dynamics modellers.pdf:application/pdf}, +} From 5cc5f4cdce30f85080725031a924a7da911905b1 Mon Sep 17 00:00:00 2001 From: Marc Baguelin Date: Mon, 2 Dec 2024 12:27:04 +0000 Subject: [PATCH 17/17] add gridExtra to imports in DESCRIPTION --- DESCRIPTION | 1 + 1 file changed, 1 insertion(+) diff --git a/DESCRIPTION b/DESCRIPTION index be18c19..9f1688d 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -14,6 +14,7 @@ Imports: devtools, dust2, fs, + gridExtra, knitr, monty, odin2,