diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths index ce85c5d..7c559bd 100644 --- a/.Rproj.user/shared/notebooks/paths +++ b/.Rproj.user/shared/notebooks/paths @@ -1,4 +1,5 @@ -C:/Users/David/Documents/course_pou/01-Introduction.Rmd="6061682F" -C:/Users/David/Documents/course_pou/01-Introduction2.Rmd="CAE3CA5C" -C:/Users/David/Documents/course_pou/NOTES.txt="3DE046A6" -C:/Users/David/Documents/course_pou/_bookdown.yml="C482C268" +D:/phd/course_pou/14-bootstrap.Rmd="F85B76FF" +D:/phd/course_pou/15-maximum_likelihood.Rmd="5A6AD4DF" +D:/phd/course_pou/_bookdown.yml="07C8BE57" +D:/phd/course_pou/neural_networks.R="4FB0F4BE" +D:/phd/course_pou/supervised_learning.R="45E91390" diff --git a/14-bootstrap.Rmd b/14-bootstrap.Rmd index 6fd36b4..8be3af7 100644 --- a/14-bootstrap.Rmd +++ b/14-bootstrap.Rmd @@ -379,3 +379,232 @@ bootstrap(x, T_max) # In general, bootstrap will fail when estimating the CI for the maximum. ``` + +```{exercise, name = "Practical - and fictional - coverage interval comparison"} +In this exercise, we investigate how different kinds of CI's behave as we vary the number of measurements. + +The story behind the data: it's 2025 and we've discovered that Slovenia has rich deposits of a rare mineral called Moustachium, which can be used to accelerate moustache growth. This mineral is highly sought, so the government has decided to contract two different companies to provide information on where to begin mining. Both companies investigated mining sites in each statistical region and gave their best estimate of the average Moustachium concentration in tonnes per square kilometer. The Data Science team has been called to estimate the uncertainty in these estimates and help avoid mining in the wrong region. + +Generate synthetic data with the script below: + + set.seed(0) + + library(comprehenr) + + regions <- c("pomurska", "podravska", "koroska", "savinjska", "zasavska", "posavska", "JV Slovenija", "primorsko-notranjska", "osrednjeslovenska", "gorenjska", "goriska", "obalno-kraska") + region_rates <- seq(1.3, 2.3, length.out=length(regions)) + region_rates <- region_rates[sample.int(length(regions), length(regions))] + + make_dataset <- function(n_contractors) { + measurements <- matrix(nrow=length(regions), ncol=n_contractors) + for (i in 1:length(regions)) { + measurements[i,] <- rgamma(n_contractors, 5.0, region_rates[i]) + } + + df <- data.frame(measurements) + row.names(df) <- regions + names(df) <- to_vec(for(i in 1:n_contractors) paste("Contractor", i)) + return(df) + } + + set.seed(0) + df_2025 <- make_dataset(2) + + set.seed(0) + df_2027 <- make_dataset(10) + + set.seed(0) + df_2028 <- make_dataset(100) + + set.seed(0) + df_2029 <- make_dataset(1000) + + saveRDS(df_2025, file="moustachium_2025.Rda") + saveRDS(df_2027, file="moustachium_2027.Rda") + saveRDS(df_2028, file="moustachium_2028.Rda") + saveRDS(df_2029, file="moustachium_2029.Rda") + + a. Estimate the average concentration for different regions. + + b. Estimate the average concentration uncertainty using 95% CI's (asymptotic normality with biased and unbiased standard error, standard bootstrap CI, bootstrap percentile CI). + + c. Visualize uncertainties with a histogram and discuss the best location to start mining. + + d. The year is 2027 and the government has decided to contract 10 companies. Rerun the code with new measurements and discuss how CI's change. + + e. Technological advancements in robotics have enabled site surveys on a massive scale. Repeat the last point for 100 surveyor robots in 2028 and 1000 surveyor robots in 2029. +``` +
Exercise 14.1 Ideally, a \(1-\alpha\) CI would have \(1-\alpha\) coverage. That is, say a 95% CI should, in the long run, contain the true value of the parameter 95% of the time. In practice, it is impossible to assess the coverage of our CI method, because we rarely know the true parameter. In simulation, however, we can. Let’s assess the coverage of bootstrap percentile intervals.
+Exercise 14.1 Ideally, a \(1-\alpha\) CI would have \(1-\alpha\) coverage. That is, say a 95% CI should, in the long run, contain the true value of the parameter 95% of the time. In practice, it is impossible to assess the coverage of our CI method, because we rarely know the true parameter. In simulation, however, we can. Let’s assess the coverage of bootstrap percentile intervals.
Pick a univariate distribution with readily available mean and one that you can easily sample from.
Draw \(n = 30\) random samples from the chosen distribution and use the bootstrap (with large enough m) and percentile CI method to construct 95% CI. Repeat the process many times and count how many times the CI contains the true mean. That is, compute the actual coverage probability (don’t forget to include the standard error of the coverage probability!). What can you observe?
library(boot)
-set.seed(0)
-nit <- 1000 # Repeat the process "many times"
-alpha <- 0.05 # CI parameter
-nboot <- 100 # m parameter for bootstrap ("large enough m")
-# f: change this to 200 or 5.
-nsample <- 30 # n = 30 random samples from the chosen distribution. Comment out BCa code if it breaks.
-covers <- matrix(nrow = nit, ncol = 3)
-covers_BCa <- matrix(nrow = nit, ncol = 3)
-covers_asymp_norm <- matrix(nrow = nit, ncol = 3)
-
-isin <- function (x, lower, upper) {
- (x > lower) & (x < upper)
-}
-
-for (j in 1:nit) { # Repeating many times
- # a: pick a univariate distribution - standard normal
- x1 <- rnorm(nsample)
-
- # c: one or two different distributions - beta and poisson
- x2 <- rbeta(nsample, 1, 2)
- x3 <- rpois(nsample, 5)
-
- X1 <- matrix(data = NA, nrow = nsample, ncol = nboot)
- X2 <- matrix(data = NA, nrow = nsample, ncol = nboot)
- X3 <- matrix(data = NA, nrow = nsample, ncol = nboot)
- for (i in 1:nboot) {
- X1[ ,i] <- sample(x1, nsample, replace = T)
- X2[ ,i] <- sample(x2, nsample, T)
- X3[ ,i] <- sample(x3, nsample, T)
- }
- X1_func <- apply(X1, 2, mean)
- X2_func <- apply(X2, 2, mean)
- X3_func <- apply(X3, 2, mean)
- X1_quant <- quantile(X1_func, probs = c(alpha / 2, 1 - alpha / 2))
- X2_quant <- quantile(X2_func, probs = c(alpha / 2, 1 - alpha / 2))
- X3_quant <- quantile(X3_func, probs = c(alpha / 2, 1 - alpha / 2))
- covers[j,1] <- (0 > X1_quant[1]) & (0 < X1_quant[2])
- covers[j,2] <- ((1 / 3) > X2_quant[1]) & ((1 / 3) < X2_quant[2])
- covers[j,3] <- (5 > X3_quant[1]) & (5 < X3_quant[2])
-
- mf <- function (x, i) return(mean(x[i]))
- bootX1 <- boot(x1, statistic = mf, R = nboot)
- bootX2 <- boot(x2, statistic = mf, R = nboot)
- bootX3 <- boot(x3, statistic = mf, R = nboot)
-
- X1_quant_BCa <- boot.ci(bootX1, type = "bca")$bca
- X2_quant_BCa <- boot.ci(bootX2, type = "bca")$bca
- X3_quant_BCa <- boot.ci(bootX3, type = "bca")$bca
-
- covers_BCa[j,1] <- (0 > X1_quant_BCa[4]) & (0 < X1_quant_BCa[5])
- covers_BCa[j,2] <- ((1 / 3) > X2_quant_BCa[4]) & ((1 / 3) < X2_quant_BCa[5])
- covers_BCa[j,3] <- (5 > X3_quant_BCa[4]) & (5 < X3_quant_BCa[5])
-
- # e: estimate mean and standard error
- # sample mean:
- x1_bar <- mean(x1)
- x2_bar <- mean(x2)
- x3_bar <- mean(x3)
-
- # standard error (of the sample mean) estimate: sample standard deviation / sqrt(n)
- x1_bar_SE <- sd(x1) / sqrt(nsample)
- x2_bar_SE <- sd(x2) / sqrt(nsample)
- x3_bar_SE <- sd(x3) / sqrt(nsample)
-
- covers_asymp_norm[j,1] <- isin(0, x1_bar - 1.96 * x1_bar_SE, x1_bar + 1.96 * x1_bar_SE)
- covers_asymp_norm[j,2] <- isin(1/3, x2_bar - 1.96 * x2_bar_SE, x2_bar + 1.96 * x2_bar_SE)
- covers_asymp_norm[j,3] <- isin(5, x3_bar - 1.96 * x3_bar_SE, x3_bar + 1.96 * x3_bar_SE)
-
-}
-apply(covers, 2, mean)
library(boot)
+set.seed(0)
+<- 1000 # Repeat the process "many times"
+ nit <- 0.05 # CI parameter
+ alpha <- 100 # m parameter for bootstrap ("large enough m")
+ nboot # f: change this to 200 or 5.
+<- 30 # n = 30 random samples from the chosen distribution. Comment out BCa code if it breaks.
+ nsample <- matrix(nrow = nit, ncol = 3)
+ covers <- matrix(nrow = nit, ncol = 3)
+ covers_BCa <- matrix(nrow = nit, ncol = 3)
+ covers_asymp_norm
+<- function (x, lower, upper) {
+ isin > lower) & (x < upper)
+ (x
+ }
+for (j in 1:nit) { # Repeating many times
+# a: pick a univariate distribution - standard normal
+ <- rnorm(nsample)
+ x1
+ # c: one or two different distributions - beta and poisson
+ <- rbeta(nsample, 1, 2)
+ x2 <- rpois(nsample, 5)
+ x3
+ <- matrix(data = NA, nrow = nsample, ncol = nboot)
+ X1 <- matrix(data = NA, nrow = nsample, ncol = nboot)
+ X2 <- matrix(data = NA, nrow = nsample, ncol = nboot)
+ X3 for (i in 1:nboot) {
+ <- sample(x1, nsample, replace = T)
+ X1[ ,i] <- sample(x2, nsample, T)
+ X2[ ,i] <- sample(x3, nsample, T)
+ X3[ ,i]
+ }<- apply(X1, 2, mean)
+ X1_func <- apply(X2, 2, mean)
+ X2_func <- apply(X3, 2, mean)
+ X3_func <- quantile(X1_func, probs = c(alpha / 2, 1 - alpha / 2))
+ X1_quant <- quantile(X2_func, probs = c(alpha / 2, 1 - alpha / 2))
+ X2_quant <- quantile(X3_func, probs = c(alpha / 2, 1 - alpha / 2))
+ X3_quant 1] <- (0 > X1_quant[1]) & (0 < X1_quant[2])
+ covers[j,2] <- ((1 / 3) > X2_quant[1]) & ((1 / 3) < X2_quant[2])
+ covers[j,3] <- (5 > X3_quant[1]) & (5 < X3_quant[2])
+ covers[j,
+<- function (x, i) return(mean(x[i]))
+ mf <- boot(x1, statistic = mf, R = nboot)
+ bootX1 <- boot(x2, statistic = mf, R = nboot)
+ bootX2 <- boot(x3, statistic = mf, R = nboot)
+ bootX3
+<- boot.ci(bootX1, type = "bca")$bca
+ X1_quant_BCa <- boot.ci(bootX2, type = "bca")$bca
+ X2_quant_BCa <- boot.ci(bootX3, type = "bca")$bca
+ X3_quant_BCa
+ 1] <- (0 > X1_quant_BCa[4]) & (0 < X1_quant_BCa[5])
+ covers_BCa[j,2] <- ((1 / 3) > X2_quant_BCa[4]) & ((1 / 3) < X2_quant_BCa[5])
+ covers_BCa[j,3] <- (5 > X3_quant_BCa[4]) & (5 < X3_quant_BCa[5])
+ covers_BCa[j,
+ # e: estimate mean and standard error
+ # sample mean:
+ <- mean(x1)
+ x1_bar <- mean(x2)
+ x2_bar <- mean(x3)
+ x3_bar
+ # standard error (of the sample mean) estimate: sample standard deviation / sqrt(n)
+ <- sd(x1) / sqrt(nsample)
+ x1_bar_SE <- sd(x2) / sqrt(nsample)
+ x2_bar_SE <- sd(x3) / sqrt(nsample)
+ x3_bar_SE
+ 1] <- isin(0, x1_bar - 1.96 * x1_bar_SE, x1_bar + 1.96 * x1_bar_SE)
+ covers_asymp_norm[j,2] <- isin(1/3, x2_bar - 1.96 * x2_bar_SE, x2_bar + 1.96 * x2_bar_SE)
+ covers_asymp_norm[j,3] <- isin(5, x3_bar - 1.96 * x3_bar_SE, x3_bar + 1.96 * x3_bar_SE)
+ covers_asymp_norm[j,
+
+ }apply(covers, 2, mean)
## [1] 0.918 0.925 0.905
-
+apply(covers, 2, sd) / sqrt(nit)
## [1] 0.008680516 0.008333333 0.009276910
-
+apply(covers_BCa, 2, mean)
## [1] 0.927 0.944 0.927
-
+apply(covers_BCa, 2, sd) / sqrt(nit)
## [1] 0.008230355 0.007274401 0.008230355
-
+apply(covers_asymp_norm, 2, mean)
## [1] 0.939 0.937 0.930
-
+apply(covers_asymp_norm, 2, sd) / sqrt(nit)
## [1] 0.007572076 0.007687008 0.008072494
Exercise 14.2
+ Exercise 14.2
You are given a sample of independent observations from a process of interest: