diff --git a/.Rproj.user/shared/notebooks/paths b/.Rproj.user/shared/notebooks/paths index ce85c5d..7c559bd 100644 --- a/.Rproj.user/shared/notebooks/paths +++ b/.Rproj.user/shared/notebooks/paths @@ -1,4 +1,5 @@ -C:/Users/David/Documents/course_pou/01-Introduction.Rmd="6061682F" -C:/Users/David/Documents/course_pou/01-Introduction2.Rmd="CAE3CA5C" -C:/Users/David/Documents/course_pou/NOTES.txt="3DE046A6" -C:/Users/David/Documents/course_pou/_bookdown.yml="C482C268" +D:/phd/course_pou/14-bootstrap.Rmd="F85B76FF" +D:/phd/course_pou/15-maximum_likelihood.Rmd="5A6AD4DF" +D:/phd/course_pou/_bookdown.yml="07C8BE57" +D:/phd/course_pou/neural_networks.R="4FB0F4BE" +D:/phd/course_pou/supervised_learning.R="45E91390" diff --git a/14-bootstrap.Rmd b/14-bootstrap.Rmd index 6fd36b4..8be3af7 100644 --- a/14-bootstrap.Rmd +++ b/14-bootstrap.Rmd @@ -379,3 +379,232 @@ bootstrap(x, T_max) # In general, bootstrap will fail when estimating the CI for the maximum. ``` + +```{exercise, name = "Practical - and fictional - coverage interval comparison"} +In this exercise, we investigate how different kinds of CI's behave as we vary the number of measurements. + +The story behind the data: it's 2025 and we've discovered that Slovenia has rich deposits of a rare mineral called Moustachium, which can be used to accelerate moustache growth. This mineral is highly sought, so the government has decided to contract two different companies to provide information on where to begin mining. Both companies investigated mining sites in each statistical region and gave their best estimate of the average Moustachium concentration in tonnes per square kilometer. The Data Science team has been called to estimate the uncertainty in these estimates and help avoid mining in the wrong region. + +Generate synthetic data with the script below: + + set.seed(0) + + library(comprehenr) + + regions <- c("pomurska", "podravska", "koroska", "savinjska", "zasavska", "posavska", "JV Slovenija", "primorsko-notranjska", "osrednjeslovenska", "gorenjska", "goriska", "obalno-kraska") + region_rates <- seq(1.3, 2.3, length.out=length(regions)) + region_rates <- region_rates[sample.int(length(regions), length(regions))] + + make_dataset <- function(n_contractors) { + measurements <- matrix(nrow=length(regions), ncol=n_contractors) + for (i in 1:length(regions)) { + measurements[i,] <- rgamma(n_contractors, 5.0, region_rates[i]) + } + + df <- data.frame(measurements) + row.names(df) <- regions + names(df) <- to_vec(for(i in 1:n_contractors) paste("Contractor", i)) + return(df) + } + + set.seed(0) + df_2025 <- make_dataset(2) + + set.seed(0) + df_2027 <- make_dataset(10) + + set.seed(0) + df_2028 <- make_dataset(100) + + set.seed(0) + df_2029 <- make_dataset(1000) + + saveRDS(df_2025, file="moustachium_2025.Rda") + saveRDS(df_2027, file="moustachium_2027.Rda") + saveRDS(df_2028, file="moustachium_2028.Rda") + saveRDS(df_2029, file="moustachium_2029.Rda") + + a. Estimate the average concentration for different regions. + + b. Estimate the average concentration uncertainty using 95% CI's (asymptotic normality with biased and unbiased standard error, standard bootstrap CI, bootstrap percentile CI). + + c. Visualize uncertainties with a histogram and discuss the best location to start mining. + + d. The year is 2027 and the government has decided to contract 10 companies. Rerun the code with new measurements and discuss how CI's change. + + e. Technological advancements in robotics have enabled site surveys on a massive scale. 
Repeat the last point for 100 surveyor robots in 2028 and 1000 surveyor robots in 2029. +``` +
+```{r, echo = togs, message = FALSE, eval = togs, warning=FALSE} +library(ggplot2) +library(dplyr) +library(data.table) + +set.seed(0) + +input_dataset_path = "moustachium_2025.Rda" # Change this for points d and e +output_plot_path = "moustachium_2025.pdf" # Change this for points d and e + +df <- readRDS(input_dataset_path) # Data comes from here +n_contractors <- ncol(df) +results_df <- data.frame(region=row.names(df)) # Store CI bounds here + +# 1. average concentration for different mining sites +results_df$average_concetration <- rowMeans(df) + +# CI for the mean based on asymptotic normality (biased SE estimate) +biased_SE <- sqrt(apply(df, 1, function(vec) {sum((vec - mean(vec))^2) / length(vec)}) / n_contractors) +results_df$ci95.an.biased_var.low <- results_df$average_concetration - 1.96 * biased_SE +results_df$ci95.an.biased_var.high <- results_df$average_concetration + 1.96 * biased_SE + +# CI for the mean based on asymptotic normality (unbiased SE estimate) +unbiased_SE <- sqrt(apply(df, 1, var) / n_contractors) +results_df$ci95.an.unbiased_var.low <- results_df$average_concetration - 1.96 * unbiased_SE +results_df$ci95.an.unbiased_var.high <- results_df$average_concetration + 1.96 * unbiased_SE + +# Standard bootstrap CI with 1000 samples +bootstrap_variance <- function(data, n_samples) { + # n_samples is m in pseudocode + output <- numeric(n_samples) + for (i in 1:n_samples) { + index <- sample(1:length(data), length(data), rep = TRUE) + resampled_data <- data[index] + output[i] <- mean(resampled_data) + } + return(var(output)) +} + +bootstrap_1000_sd <- sqrt(apply(df, 1, function(vec){bootstrap_variance(vec, 1000)})) +results_df$ci95.bootstrap.standard.1000.low <- results_df$average_concetration - 1.96 * bootstrap_1000_sd +results_df$ci95.bootstrap.standard.1000.high <- results_df$average_concetration + 1.96 * bootstrap_1000_sd + +# Bootstrap percentile CI with 1000 samples + +bootstrap_quantile <- function(data, functional, n_samples, probs) { + # n_samples is m in pseudocode + output <- numeric(n_samples) + for (i in 1:n_samples) { + index <- sample(1:length(data), length(data), rep = TRUE) + resampled_data <- data[index] + output[i] <- functional(resampled_data) + } + return(quantile(output, probs=probs)) +} + +results_df$ci95.bootstrap.percentile.1000.low <- apply(df, 1, function(vec){bootstrap_quantile(vec, mean, 1000, 0.025)}) +results_df$ci95.bootstrap.percentile.1000.high <- apply(df, 1, function(vec){bootstrap_quantile(vec, mean, 1000, 0.975)}) + +results_df + +# Visualization: we use a bar chart with uncertainty bands + +plot_moustachium_per_region <- function(region_names, average, ci_low, ci_high) { + df_visualization <- data.frame(region=region_names, average=average, low=ci_low, high=ci_high) + ggplot(df_visualization, aes(x=region, y=average)) + geom_bar(stat="identity") +} + +mask <- endsWith(colnames(results_df), "low") +mask[c(1, 2)] <- T +results_df_low <- results_df[, mask] +colnames(results_df_low) <- gsub('.low','', colnames(results_df_low)) + +mask <- endsWith(colnames(results_df), "high") +mask[c(1, 2)] <- T +results_df_high <- results_df[, mask] +colnames(results_df_high) <- gsub('.high','', colnames(results_df_high)) + +long_results_df_low <- melt(setDT(results_df_low), id.vars=c("region", "average_concetration")) +names(long_results_df_low) <- c("region", "average_concentration", "variable", "low") + +long_results_df_high <- melt(setDT(results_df_high), id.vars=c("region", "average_concetration")) +names(long_results_df_high) <- c("region", 
"average_concentration", "variable", "high") + +long_results_df <- merge(long_results_df_low, long_results_df_high, by=c("region", "variable", "average_concentration"), all=T) + +moustachium_plot <- ggplot(long_results_df, aes(x=region, y=average_concentration)) + + geom_bar(stat="identity", position="dodge", alpha=0.2) + + geom_errorbar(aes(ymin=low, ymax=high, color=variable), width=0.2, position=position_dodge(0.9)) + + scale_x_discrete(guide = guide_axis(angle = 45)) + + ylim(-1, 8) + +# ggsave(plot=moustachium_plot, width=12, height=8, dpi=300, filename=output_plot_path) +moustachium_plot + +# Visualization: we can also use a map. Circle size denotes concentration in region, low transparency denotes high uncertainty. + +library(maps) + +map_data_slo <- map_data('world')[map_data('world')$region == "Slovenia",] + +map_df <- long_results_df[long_results_df$variable == "ci95.an.biased_var", ] + +# VERY approximate longitudes and latitudes for different regions. +map_df$long <- rep(0, nrow(map_df)) +map_df$lat <- rep(0, nrow(map_df)) + +map_df[map_df$region == "gorenjska"]$long <- 14.2 +map_df[map_df$region == "gorenjska"]$lat <- 46.3 + +map_df[map_df$region == "goriska"]$long <- 13.85 +map_df[map_df$region == "goriska"]$lat <- 46.0 + +map_df[map_df$region == "obalno-kraska"]$long <- 13.9 +map_df[map_df$region == "obalno-kraska"]$lat <- 45.65 + +map_df[map_df$region == "osrednjeslovenska"]$long <- 14.5 +map_df[map_df$region == "osrednjeslovenska"]$lat <- 46. + +map_df[map_df$region == "primorsko-notranjska"]$long <- 14.3 +map_df[map_df$region == "primorsko-notranjska"]$lat <- 45.7 + +map_df[map_df$region == "zasavska"]$long <- 15 +map_df[map_df$region == "zasavska"]$lat <- 46.1 + +map_df[map_df$region == "savinjska"]$long <- 15.2 +map_df[map_df$region == "savinjska"]$lat <- 46.25 + +map_df[map_df$region == "posavska"]$long <- 15.4 +map_df[map_df$region == "posavska"]$lat <- 46 + +map_df[map_df$region == "koroska"]$long <- 15.1 +map_df[map_df$region == "koroska"]$lat <- 46.5 + +map_df[map_df$region == "podravska"]$long <- 15.7 +map_df[map_df$region == "podravska"]$lat <- 46.45 + +map_df[map_df$region == "pomurska"]$long <- 16.2 +map_df[map_df$region == "pomurska"]$lat <- 46.65 + +map_df[map_df$region == "JV Slovenija"]$long <- 15. +map_df[map_df$region == "JV Slovenija"]$lat <- 45.7 + +map_df$ci_size <- (map_df$high - map_df$low) +map_df$ci_y <- map_df$lat - 0.05 +map_df$ci_label <- sprintf("(%.2f, %.2f)", map_df$low, map_df$high) +map_df$avg_label <- sprintf("%.2f", map_df$average_concentration) + +country_plot <- ggplot() + + # First layer: worldwide map + geom_polygon(data = map_data("world"), + aes(x=long, y=lat, group = group), + color = '#9c9c9c', fill = '#f3f3f3') + + # Second layer: Country map + geom_polygon( + data = map_data_slo, + aes(x=long, y=lat, group = group), + color='darkgreen', + fill='green', + alpha=0.2 + ) + + geom_point(data=map_df, aes(x=long, y=lat, fill=region, size=average_concentration, alpha=ci_size), color="black", pch=21) + + geom_text(data=map_df, aes(x=long, y=ci_y, label=ci_label), size=3) + + geom_text(data=map_df, aes(x=long, y=lat, label=avg_label), size=3) + + scale_size_continuous(range = c(3, 12), trans = "exp") + + scale_alpha_continuous(range = c(0.15, 0.75), trans = "reverse") + + ggtitle("Estimated average Moustachium concentration with 95% CI") + + coord_cartesian(xlim=c(13.2, 16.7), ylim=c(45.4, 47.)) + +# ggsave(plot=country_plot, width=18, height=12, dpi=300, filename="country.pdf") +country_plot +``` +
diff --git a/15-maximum_likelihood.Rmd b/15-maximum_likelihood.Rmd index 22daf82..caa5797 100644 --- a/15-maximum_likelihood.Rmd +++ b/15-maximum_likelihood.Rmd @@ -351,6 +351,108 @@ my_pca$rotation ``` +```{exercise, name = "Classification"} +Let $D = \{(x_i, y_i)\}_{i=1}^n$ be a dataset of feature vectors and their corresponding integer class labels. We wish to classify feature vectors into their correct classes. + +a. Choose a suitable probability distribution $P_\theta(Y|X)$ and write its log likelihood $\ell$. +b. Choose a differentiable function $f_\phi$ that predicts parameters $\theta$ from a feature vector, i.e.\ $f_\phi(x_i) = \theta_i$. +c. Load the _iris_ dataset with `data(iris)` and split it into train and test subsets. +d. Use gradient descent to find parameters $\phi$ that minimize the negative log likelihood on the _iris_ dataset (equivalently: maximize the log likelihood). Reminder: gradient descent is the iterative optimization procedure $\phi_{t+1} = \phi_t - \eta \nabla_\phi L(\phi_t)$, where $L$ is the objective being minimized (here the negative log likelihood $-\ell$). Try $\eta = 0.01$ and run the optimization for 30 steps. Compute the gradient with `numDeriv::grad`. +e. Print the classification accuracy for the train and test subsets. +``` +
+```{solution, echo = togs} +a. We pick the categorical distribution. +b. Categorical distribution parameters are class probabilities that sum to 1. If there are $m$ classes, we can pick any differentiable function that takes as input a vector of features and predicts a vector of size $m$ whose elements are real numbers. We can then use a softmax transformation to map the predicted vector into one with non-negative entries that sum to 1. For simplicity, we can pick a linear transformation with $\phi = (W, b)$, followed by softmax: + \begin{align*} + f_\phi(x) &= \textrm{softmax}(W^T x + b), \\ + \textrm{softmax}(u)_i &= \frac{\exp(u_i)}{\sum_{j=1}^m \exp(u_j)}, + \end{align*} +where $W \in \mathbb{R}^{d\times m}, b \in \mathbb{R}^m$ and $d$ is the number of features. +``` +```{r, echo = togs, message = FALSE, eval = togs, warning=FALSE} +data(iris) +head(iris) + +# Model: +# y ~ Categorical(softmax(weights * features + bias)) +# Want to maximize the (log) likelihood of y w.r.t. weights and bias. +# Need gradient of log likelihood w.r.t. weights and bias. +# Proceed by gradient descent on negative log likelihood. + +weights <- matrix(data=rnorm(4 * 3), nrow=4, ncol=3) +bias <- matrix(data=rnorm(3), nrow=1, ncol=3) + +model <- function(features, weights, bias) { + # Linear scores: one row per observation, one column per class. + # sweep() adds the per-class bias to every row. + return(sweep(features %*% weights, 2, bias, "+")) +} + +softmax <- function(v) { + return(exp(v) / sum(exp(v))) +} + +categorical_mass <- function(targets, probs) { + # targets: matrix of size (n_data, n_classes) whose rows are one-hot vectors + # probs: matrix of size (n_data, n_classes) whose rows are class probabilities + apply(probs * targets, 1, sum) +} + +predict_probs <- function(features, model, parameters) { + # parameters is a 5 x 3 matrix: the first four rows are the weights, the last row is the bias + weights <- parameters[1:4, ] + bias <- parameters[5, ] + u <- model(features, weights, bias) + t(apply(u, 1, softmax))  # one row of class probabilities per observation +} + +accuracy <- function(features, targets, model, parameters) { + probs <- predict_probs(features, model, parameters) + argmax_mat <- t(apply(probs, 1, function(v) {v == max(v)})) + correct_predictions <- apply(argmax_mat * targets, 1, sum) + return(mean(correct_predictions)) +} + +neg_log_lik <- function(features, targets, model, parameters) { + probs <- predict_probs(features, model, parameters) + -sum(log(categorical_mass(targets, probs))) +} + +grad_neg_log_lik <- function(features, targets, model, parameters){ + numDeriv::grad(function(par){neg_log_lik(features, targets, model, par)}, parameters) +} + +gradient_descent <- function(initial_parameters, features, targets, step_size = 0.01, n_steps = 30) { + parameters <- initial_parameters + for (i in 1:n_steps) { + print(sprintf("[%d] loss: %.4f, accuracy: %.2f", i, neg_log_lik(features, targets, model, parameters), accuracy(features, targets, model, parameters))) + parameters <- parameters - step_size * grad_neg_log_lik(features, targets, model, parameters) + } + return(parameters) +} + + +x <- as.matrix(subset(iris, select=-c(Species))) +y <- matrix(nrow=nrow(iris), ncol=3) +y[, 1] <- iris$Species == "setosa" +y[, 2] <- iris$Species == "versicolor" +y[, 3] <- iris$Species == "virginica" + +# Take an equal number of representatives for every class for the training and test subsets +# Note: code is written so that shuffling does not matter during optimization +x_train <- x[c(1:35, 51:85, 101:135), ] +y_train <- y[c(1:35, 51:85, 101:135), ] + +x_test <- x[-c(1:35, 51:85, 101:135), ] +y_test <- y[-c(1:35, 51:85, 101:135), ] + +set.seed(0) +optimized_parameters <- 
gradient_descent(rbind(weights, bias), x_train, y_train) + +accuracy(x_train, y_train, model, optimized_parameters) +accuracy(x_test, y_test, model, optimized_parameters) +``` +
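+The `softmax` above divides by the sum of raw exponentials, which can overflow for large scores. A common remedy, shown here only as an optional drop-in sketch, is to subtract the maximum before exponentiating; softmax is invariant to adding a constant to its input, so the probabilities are unchanged.
+
+```{r, echo = togs, message = FALSE, eval = togs, warning=FALSE}
+# Numerically stable softmax: shift by the maximum before exponentiating.
+softmax_stable <- function(v) {
+  z <- v - max(v)
+  exp(z) / sum(exp(z))
+}
+
+# The two versions agree on moderate inputs; the stable one also handles
+# very large scores without producing NaN.
+all.equal(softmax(c(1, 2, 3)), softmax_stable(c(1, 2, 3)))
+softmax_stable(c(1000, 1001, 1002))
+```
+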
+ ## Fisher information ```{exercise} @@ -507,6 +609,84 @@ ggplot() + ``` +```{exercise} + + +Find the unit Fisher information matrix for the univariate normal distribution. +``` +
+```{solution, echo = togs} + + +The normal density is +\begin{equation*} + p(x; \mu, \sigma) = \frac{1}{\sqrt{2\pi \sigma^2}} \exp\left(-0.5 \frac{(x-\mu)^2}{\sigma^2}\right). +\end{equation*} + +Its logarithm is +\begin{equation*} + \log p(x; \mu, \sigma) = -0.5\log(2\pi) - \log \sigma - 0.5 \frac{(x-\mu)^2}{\sigma^2}. +\end{equation*} + +The second-order partial derivatives of the log density are +\begin{align*} + \frac{\partial^2}{\partial \mu^2} \log p(x; \mu, \sigma) &= -\frac{1}{\sigma^2}, \\ + \frac{\partial^2}{\partial \mu \partial \sigma} \log p(x; \mu, \sigma) &= -\frac{2(x-\mu)}{\sigma^3}, \\ + \frac{\partial^2}{\partial \sigma^2} \log p(x; \mu, \sigma) &= \frac{1}{\sigma^2} - \frac{3(x-\mu)^2}{\sigma^4}. +\end{align*} + +The unit Fisher information matrix is then +\begin{align*} +I(\mu, \sigma) = + - E\left[ + \begin{bmatrix} + -\frac{1}{\sigma^2} & -\frac{2(x-\mu)}{\sigma^3} \\ + -\frac{2(x-\mu)}{\sigma^3} & \frac{1}{\sigma^2} - \frac{3(x-\mu)^2}{\sigma^4} + \end{bmatrix} + \right] = + \begin{bmatrix} + \frac{1}{\sigma^2} & 0 \\ + 0 & \frac{2}{\sigma^2} + \end{bmatrix}, +\end{align*} +where we used the fact that $E[X - \mu] = 0$ and $E[(X - \mu)^2] = \sigma^2$. + +``` +
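+As a quick numerical sanity check, we can approximate $I(\mu, \sigma)$ by averaging the negative Hessian of the log density over a simulated sample. The chunk below is only a sketch: the sample size and the values $\mu = 0$, $\sigma = 2$ are arbitrary choices, and it assumes the `numDeriv` package (already used in the classification exercise) is available.
+
+```{r, echo = togs, message = FALSE, eval = togs, warning=FALSE}
+# Monte Carlo check of the unit Fisher information for N(mu, sigma).
+# The entries should be close to 1 / sigma^2 = 0.25 and 2 / sigma^2 = 0.5.
+set.seed(0)
+mu <- 0
+sigma <- 2
+x <- rnorm(1000, mu, sigma)
+
+log_density <- function(par, x) {
+  dnorm(x, mean = par[1], sd = par[2], log = TRUE)
+}
+
+# Average the negative Hessian of the log density over the sample.
+hessians <- sapply(x, function(xi) {
+  numDeriv::hessian(function(par) log_density(par, xi), c(mu, sigma))
+})
+-matrix(rowMeans(hessians), nrow = 2)
+```
+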
+ +```{exercise} +Find the unit Fisher information for the binomial distribution with fixed $n$. +``` +
+```{solution, echo = togs} + +The binomial mass is +\begin{equation*} + P(X = k; n, p) = \binom{n}{k}p^k(1-p)^{n-k}. +\end{equation*} + +Its logarithm is +\begin{equation*} + \log P(X = k; n, p) = \log \binom{n}{k} + k\log p + (n-k)\log(1-p). +\end{equation*} + +The first and second partial derivatives with respect to $p$ are +\begin{align*} + \frac{\partial}{\partial p} \log P(X = k; n, p) &= \frac{k}{p} - \frac{n-k}{1-p}, \\ + \frac{\partial^2}{\partial p^2} \log P(X = k; n, p) &= -\frac{k}{p^2} - \frac{n-k}{(1-p)^2}. +\end{align*} + +The unit Fisher information is +\begin{align*} +I(p) = + - E\left[ \frac{\partial^2}{\partial p^2} \log P(X = k; n, p) \right] = + \frac{E[k]}{p^2} + \frac{n - E[k]}{(1-p)^2} = + \frac{n}{p} + \frac{n}{1-p} = + \frac{n}{p(1-p)}, +\end{align*} +where we used the fact that $E[k] = np$. + +``` +
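+This result can also be checked by simulation: since the score has mean zero, the Fisher information equals the variance of the score. The sketch below uses the arbitrary values $n = 15$ and $p = 0.2$.
+
+```{r, echo = togs, message = FALSE, eval = togs, warning=FALSE}
+# Monte Carlo check of the binomial unit Fisher information.
+set.seed(0)
+n <- 15
+p <- 0.2
+k <- rbinom(100000, size = n, prob = p)
+
+# Score: d/dp log P(X = k; n, p). Its variance is the Fisher information.
+score <- k / p - (n - k) / (1 - p)
+var(score)        # simulated
+n / (p * (1 - p)) # theoretical n / (p(1 - p))
+```
+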
+ ## The German tank problem ```{exercise, name = "The German tank problem"} During WWII the allied intelligence were faced with an important problem of estimating the total production of certain German tanks, such as the Panther. What turned out to be a successful approach was to estimate the maximum from the serial numbers of the small sample of captured or destroyed tanks (describe the statistical model used). diff --git a/_bookdown.yml b/_bookdown.yml index 074641f..e7f4531 100644 --- a/_bookdown.yml +++ b/_bookdown.yml @@ -3,5 +3,5 @@ chapter_name: "Chapter " output_dir: docs rmd_files: ["index.Rmd", "01-Introduction.Rmd", "02-uncountable_probability_spaces.Rmd", "03-conditional_probability.Rmd", "04-random_variables.Rmd", "05-multiple_random_variables.Rmd", "06-integration.Rmd", "07-expected_value.Rmd", "08-multivariate_random_variables.Rmd", "09-alternative_representations.Rmd", "10-concentration_inequalities.Rmd", "11-convergence_of_random_variables.Rmd", "12-limit_theorems.Rmd", "13-estimation_basics.Rmd", "14-bootstrap.Rmd", "15-maximum_likelihood.Rmd", "16-null_hypothesis_significance_tests.Rmd", "17-Bayesian_inference.Rmd","18-distributions_intuition.Rmd", "Appendix.Rmd", "A1-R_programming_language.Rmd", "A2-probability_distributions2.Rmd", "References.Rmd"] new_session: no - +delete_merged_file: true diff --git a/docs/404.html b/docs/404.html index 908fd67..25397a0 100644 --- a/docs/404.html +++ b/docs/404.html @@ -6,7 +6,7 @@ Page not found | Principles of Uncertainty – exercises - + @@ -23,7 +23,7 @@ - + diff --git a/docs/bookdown-pou_files/figure-html/unnamed-chunk-13-1.png b/docs/bookdown-pou_files/figure-html/unnamed-chunk-13-1.png index 1c7dc8b..111b79e 100644 Binary files a/docs/bookdown-pou_files/figure-html/unnamed-chunk-13-1.png and b/docs/bookdown-pou_files/figure-html/unnamed-chunk-13-1.png differ diff --git a/docs/bookdown-pou_files/figure-html/unnamed-chunk-13-2.png b/docs/bookdown-pou_files/figure-html/unnamed-chunk-13-2.png new file mode 100644 index 0000000..bb0eb40 Binary files /dev/null and b/docs/bookdown-pou_files/figure-html/unnamed-chunk-13-2.png differ diff --git a/docs/bookdown-pou_files/figure-html/unnamed-chunk-14-1.png b/docs/bookdown-pou_files/figure-html/unnamed-chunk-14-1.png index 39af057..047f9da 100644 Binary files a/docs/bookdown-pou_files/figure-html/unnamed-chunk-14-1.png and b/docs/bookdown-pou_files/figure-html/unnamed-chunk-14-1.png differ diff --git a/docs/bookdown-pou_files/figure-html/unnamed-chunk-23-1.png b/docs/bookdown-pou_files/figure-html/unnamed-chunk-23-1.png index 8d3d0b5..6e21371 100644 Binary files a/docs/bookdown-pou_files/figure-html/unnamed-chunk-23-1.png and b/docs/bookdown-pou_files/figure-html/unnamed-chunk-23-1.png differ diff --git a/docs/bookdown-pou_files/figure-html/unnamed-chunk-24-1.png b/docs/bookdown-pou_files/figure-html/unnamed-chunk-24-1.png index 1ee82b6..6e21371 100644 Binary files a/docs/bookdown-pou_files/figure-html/unnamed-chunk-24-1.png and b/docs/bookdown-pou_files/figure-html/unnamed-chunk-24-1.png differ diff --git a/docs/bookdown-pou_files/figure-html/unnamed-chunk-26-1.png b/docs/bookdown-pou_files/figure-html/unnamed-chunk-26-1.png index 1283656..0df7ee9 100644 Binary files a/docs/bookdown-pou_files/figure-html/unnamed-chunk-26-1.png and b/docs/bookdown-pou_files/figure-html/unnamed-chunk-26-1.png differ diff --git a/docs/bookdown-pou_files/figure-html/unnamed-chunk-27-1.png b/docs/bookdown-pou_files/figure-html/unnamed-chunk-27-1.png index 6067ca9..0df7ee9 100644 Binary files 
a/docs/bookdown-pou_files/figure-html/unnamed-chunk-27-1.png and b/docs/bookdown-pou_files/figure-html/unnamed-chunk-27-1.png differ diff --git a/docs/bookdown-pou_files/figure-html/unnamed-chunk-28-1.png b/docs/bookdown-pou_files/figure-html/unnamed-chunk-28-1.png index e7fa4a2..0df7ee9 100644 Binary files a/docs/bookdown-pou_files/figure-html/unnamed-chunk-28-1.png and b/docs/bookdown-pou_files/figure-html/unnamed-chunk-28-1.png differ diff --git a/docs/bookdown-pou_files/figure-html/unnamed-chunk-30-1.png b/docs/bookdown-pou_files/figure-html/unnamed-chunk-30-1.png index dc9818e..0df7ee9 100644 Binary files a/docs/bookdown-pou_files/figure-html/unnamed-chunk-30-1.png and b/docs/bookdown-pou_files/figure-html/unnamed-chunk-30-1.png differ diff --git a/docs/bookdown-pou_files/figure-html/unnamed-chunk-31-1.png b/docs/bookdown-pou_files/figure-html/unnamed-chunk-31-1.png index c5574a0..0df7ee9 100644 Binary files a/docs/bookdown-pou_files/figure-html/unnamed-chunk-31-1.png and b/docs/bookdown-pou_files/figure-html/unnamed-chunk-31-1.png differ diff --git a/docs/bookdown-pou_files/figure-html/unnamed-chunk-6-1.png b/docs/bookdown-pou_files/figure-html/unnamed-chunk-6-1.png index 777ea43..b5bc9e6 100644 Binary files a/docs/bookdown-pou_files/figure-html/unnamed-chunk-6-1.png and b/docs/bookdown-pou_files/figure-html/unnamed-chunk-6-1.png differ diff --git a/docs/bookdown-pou_files/figure-html/unnamed-chunk-9-1.png b/docs/bookdown-pou_files/figure-html/unnamed-chunk-9-1.png index ed5abff..3e02f67 100644 Binary files a/docs/bookdown-pou_files/figure-html/unnamed-chunk-9-1.png and b/docs/bookdown-pou_files/figure-html/unnamed-chunk-9-1.png differ diff --git a/docs/bookdown-pou_files/figure-html/unnamed-chunk-9-2.png b/docs/bookdown-pou_files/figure-html/unnamed-chunk-9-2.png index 808921d..6ecc76d 100644 Binary files a/docs/bookdown-pou_files/figure-html/unnamed-chunk-9-2.png and b/docs/bookdown-pou_files/figure-html/unnamed-chunk-9-2.png differ diff --git a/docs/boot.html b/docs/boot.html index 0a6b41e..b0e77d4 100644 --- a/docs/boot.html +++ b/docs/boot.html @@ -6,7 +6,7 @@ Chapter 14 Bootstrap | Principles of Uncertainty – exercises - + @@ -23,7 +23,7 @@ - + @@ -327,7 +327,7 @@

Chapter 14 Bootstrap -

Exercise 14.1 Ideally, a \(1-\alpha\) CI would have \(1-\alpha\) coverage. That is, say a 95% CI should, in the long run, contain the true value of the parameter 95% of the time. In practice, it is impossible to assess the coverage of our CI method, because we rarely know the true parameter. In simulation, however, we can. Let’s assess the coverage of bootstrap percentile intervals.

+

Exercise 14.1 Ideally, a \(1-\alpha\) CI would have \(1-\alpha\) coverage. That is, say a 95% CI should, in the long run, contain the true value of the parameter 95% of the time. In practice, it is impossible to assess the coverage of our CI method, because we rarely know the true parameter. In simulation, however, we can. Let’s assess the coverage of bootstrap percentile intervals.

  1. Pick a univariate distribution with readily available mean and one that you can easily sample from.

  2. Draw \(n = 30\) random samples from the chosen distribution and use the bootstrap (with large enough m) and percentile CI method to construct 95% CI. Repeat the process many times and count how many times the CI contains the true mean. That is, compute the actual coverage probability (don’t forget to include the standard error of the coverage probability!). What can you observe?

  3. @@ -338,91 +338,91 @@

    Chapter 14 Bootstrap -
    library(boot)
    -set.seed(0)
    -nit   <- 1000  # Repeat the process "many times"
    -alpha <- 0.05  # CI parameter
    -nboot <- 100   # m parameter for bootstrap ("large enough m")
    -# f: change this to 200 or 5.
    -nsample <- 30  # n = 30 random samples from the chosen distribution. Comment out BCa code if it breaks.
    -covers     <- matrix(nrow = nit, ncol = 3)
    -covers_BCa <- matrix(nrow = nit, ncol = 3)
    -covers_asymp_norm <- matrix(nrow = nit, ncol = 3)
    -
    -isin <- function (x, lower, upper) {
    -  (x > lower) & (x < upper)
    -}
    -
    -for (j in 1:nit) {  # Repeating many times
    -  # a: pick a univariate distribution - standard normal
    -  x1 <- rnorm(nsample)
    -  
    -  # c: one or two different distributions - beta and poisson
    -  x2 <- rbeta(nsample, 1, 2)
    -  x3 <- rpois(nsample, 5)
    -  
    -  X1 <- matrix(data = NA, nrow = nsample, ncol = nboot)
    -  X2 <- matrix(data = NA, nrow = nsample, ncol = nboot)
    -  X3 <- matrix(data = NA, nrow = nsample, ncol = nboot)
    -  for (i in 1:nboot) {
    -    X1[ ,i] <- sample(x1, nsample, replace = T)
    -    X2[ ,i] <- sample(x2, nsample, T)
    -    X3[ ,i] <- sample(x3, nsample, T)
    -  }
    -  X1_func <- apply(X1, 2, mean)
    -  X2_func <- apply(X2, 2, mean)
    -  X3_func <- apply(X3, 2, mean)
    -  X1_quant <- quantile(X1_func, probs = c(alpha / 2, 1 - alpha / 2))
    -  X2_quant <- quantile(X2_func, probs = c(alpha / 2, 1 - alpha / 2))
    -  X3_quant <- quantile(X3_func, probs = c(alpha / 2, 1 - alpha / 2))
    -  covers[j,1] <- (0 > X1_quant[1]) & (0 < X1_quant[2])
    -  covers[j,2] <- ((1 / 3) > X2_quant[1]) & ((1 / 3) < X2_quant[2])
    -  covers[j,3] <- (5 > X3_quant[1]) & (5 < X3_quant[2])
    -
    -  mf     <- function (x, i) return(mean(x[i]))
    -  bootX1 <- boot(x1, statistic = mf, R = nboot)
    -  bootX2 <- boot(x2, statistic = mf, R = nboot)
    -  bootX3 <- boot(x3, statistic = mf, R = nboot)
    -
    -  X1_quant_BCa <- boot.ci(bootX1, type = "bca")$bca
    -  X2_quant_BCa <- boot.ci(bootX2, type = "bca")$bca
    -  X3_quant_BCa <- boot.ci(bootX3, type = "bca")$bca
    -  
    -  covers_BCa[j,1] <- (0 > X1_quant_BCa[4]) & (0 < X1_quant_BCa[5])
    -  covers_BCa[j,2] <- ((1 / 3) > X2_quant_BCa[4]) & ((1 / 3) < X2_quant_BCa[5])
    -  covers_BCa[j,3] <- (5 > X3_quant_BCa[4]) & (5 < X3_quant_BCa[5])
    -  
    -  # e: estimate mean and standard error
    -  # sample mean:
    -  x1_bar <- mean(x1)
    -  x2_bar <- mean(x2)
    -  x3_bar <- mean(x3)
    -  
    -  # standard error (of the sample mean) estimate: sample standard deviation / sqrt(n)
    -  x1_bar_SE <- sd(x1) / sqrt(nsample)
    -  x2_bar_SE <- sd(x2) / sqrt(nsample)
    -  x3_bar_SE <- sd(x3) / sqrt(nsample)
    -  
    -  covers_asymp_norm[j,1] <- isin(0, x1_bar - 1.96 * x1_bar_SE, x1_bar + 1.96 * x1_bar_SE)
    -  covers_asymp_norm[j,2] <- isin(1/3, x2_bar - 1.96 * x2_bar_SE, x2_bar + 1.96 * x2_bar_SE)
    -  covers_asymp_norm[j,3] <- isin(5, x3_bar - 1.96 * x3_bar_SE, x3_bar + 1.96 * x3_bar_SE)
    -
    -}
    -apply(covers, 2, mean)
    +
    library(boot)
    +set.seed(0)
    +nit   <- 1000  # Repeat the process "many times"
    +alpha <- 0.05  # CI parameter
    +nboot <- 100   # m parameter for bootstrap ("large enough m")
    +# f: change this to 200 or 5.
    +nsample <- 30  # n = 30 random samples from the chosen distribution. Comment out BCa code if it breaks.
    +covers     <- matrix(nrow = nit, ncol = 3)
    +covers_BCa <- matrix(nrow = nit, ncol = 3)
    +covers_asymp_norm <- matrix(nrow = nit, ncol = 3)
    +
    +isin <- function (x, lower, upper) {
    +  (x > lower) & (x < upper)
    +}
    +
    +for (j in 1:nit) {  # Repeating many times
    +  # a: pick a univariate distribution - standard normal
    +  x1 <- rnorm(nsample)
    +  
    +  # c: one or two different distributions - beta and poisson
    +  x2 <- rbeta(nsample, 1, 2)
    +  x3 <- rpois(nsample, 5)
    +  
    +  X1 <- matrix(data = NA, nrow = nsample, ncol = nboot)
    +  X2 <- matrix(data = NA, nrow = nsample, ncol = nboot)
    +  X3 <- matrix(data = NA, nrow = nsample, ncol = nboot)
    +  for (i in 1:nboot) {
    +    X1[ ,i] <- sample(x1, nsample, replace = T)
    +    X2[ ,i] <- sample(x2, nsample, T)
    +    X3[ ,i] <- sample(x3, nsample, T)
    +  }
    +  X1_func <- apply(X1, 2, mean)
    +  X2_func <- apply(X2, 2, mean)
    +  X3_func <- apply(X3, 2, mean)
    +  X1_quant <- quantile(X1_func, probs = c(alpha / 2, 1 - alpha / 2))
    +  X2_quant <- quantile(X2_func, probs = c(alpha / 2, 1 - alpha / 2))
    +  X3_quant <- quantile(X3_func, probs = c(alpha / 2, 1 - alpha / 2))
    +  covers[j,1] <- (0 > X1_quant[1]) & (0 < X1_quant[2])
    +  covers[j,2] <- ((1 / 3) > X2_quant[1]) & ((1 / 3) < X2_quant[2])
    +  covers[j,3] <- (5 > X3_quant[1]) & (5 < X3_quant[2])
    +
    +  mf     <- function (x, i) return(mean(x[i]))
    +  bootX1 <- boot(x1, statistic = mf, R = nboot)
    +  bootX2 <- boot(x2, statistic = mf, R = nboot)
    +  bootX3 <- boot(x3, statistic = mf, R = nboot)
    +
    +  X1_quant_BCa <- boot.ci(bootX1, type = "bca")$bca
    +  X2_quant_BCa <- boot.ci(bootX2, type = "bca")$bca
    +  X3_quant_BCa <- boot.ci(bootX3, type = "bca")$bca
    +  
    +  covers_BCa[j,1] <- (0 > X1_quant_BCa[4]) & (0 < X1_quant_BCa[5])
    +  covers_BCa[j,2] <- ((1 / 3) > X2_quant_BCa[4]) & ((1 / 3) < X2_quant_BCa[5])
    +  covers_BCa[j,3] <- (5 > X3_quant_BCa[4]) & (5 < X3_quant_BCa[5])
    +  
    +  # e: estimate mean and standard error
    +  # sample mean:
    +  x1_bar <- mean(x1)
    +  x2_bar <- mean(x2)
    +  x3_bar <- mean(x3)
    +  
    +  # standard error (of the sample mean) estimate: sample standard deviation / sqrt(n)
    +  x1_bar_SE <- sd(x1) / sqrt(nsample)
    +  x2_bar_SE <- sd(x2) / sqrt(nsample)
    +  x3_bar_SE <- sd(x3) / sqrt(nsample)
    +  
    +  covers_asymp_norm[j,1] <- isin(0, x1_bar - 1.96 * x1_bar_SE, x1_bar + 1.96 * x1_bar_SE)
    +  covers_asymp_norm[j,2] <- isin(1/3, x2_bar - 1.96 * x2_bar_SE, x2_bar + 1.96 * x2_bar_SE)
    +  covers_asymp_norm[j,3] <- isin(5, x3_bar - 1.96 * x3_bar_SE, x3_bar + 1.96 * x3_bar_SE)
    +
    +}
    +apply(covers, 2, mean)
    ## [1] 0.918 0.925 0.905
    -
    apply(covers, 2, sd) / sqrt(nit)
    +
    apply(covers, 2, sd) / sqrt(nit)
    ## [1] 0.008680516 0.008333333 0.009276910
    -
    apply(covers_BCa, 2, mean)
    +
    apply(covers_BCa, 2, mean)
    ## [1] 0.927 0.944 0.927
    -
    apply(covers_BCa, 2, sd) / sqrt(nit)
    +
    apply(covers_BCa, 2, sd) / sqrt(nit)
    ## [1] 0.008230355 0.007274401 0.008230355
    -
    apply(covers_asymp_norm, 2, mean)
    +
    apply(covers_asymp_norm, 2, mean)
    ## [1] 0.939 0.937 0.930
    -
    apply(covers_asymp_norm, 2, sd) / sqrt(nit)
    +
    apply(covers_asymp_norm, 2, sd) / sqrt(nit)
    ## [1] 0.007572076 0.007687008 0.008072494
    -

    Exercise 14.2 +

    Exercise 14.2 You are given a sample of independent observations from a process of interest:

    @@ -460,41 +460,41 @@

    Chapter 14 Bootstrap -
    # a
    -x <- c(7, 2, 4, 6, 4, 5, 9, 10)
    -n <- length(x)
    -mu <- mean(x)
    -
    -SE <- sqrt(mean((x - mu)^2)) / sqrt(n)
    -SE
    +
    # a
    +x <- c(7, 2, 4, 6, 4, 5, 9, 10)
    +n <- length(x)
    +mu <- mean(x)
    +
    +SE <- sqrt(mean((x - mu)^2)) / sqrt(n)
    +SE
    ## [1] 0.8915839
    -
    z <- qnorm(1 - 0.05 / 2)
    -c(mu - z * SE, mu + z * SE)
    +
    z <- qnorm(1 - 0.05 / 2)
    +c(mu - z * SE, mu + z * SE)
    ## [1] 4.127528 7.622472
    -
    # b
    -SE <- sd(x) / sqrt(n)
    -SE
    +
    # b
    +SE <- sd(x) / sqrt(n)
    +SE
    ## [1] 0.9531433
    -
    c(mu - z * SE, mu + z * SE)
    +
    c(mu - z * SE, mu + z * SE)
    ## [1] 4.006873 7.743127
    -
    # c
    -set.seed(0)
    -
    -m  <- 1000
    -T_mean <- function(x) {mean(x)}
    -
    -est_boot <- array(NA, m)
    -for (i in 1:m) {
    -  x_boot <- x[sample(1:n, n, rep = T)]
    -  est_boot[i] <- T_mean(x_boot)
    -}
    -
    -quantile(est_boot, p = c(0.025, 0.975))
    +
    # c
    +set.seed(0)
    +
    +m  <- 1000
    +T_mean <- function(x) {mean(x)}
    +
    +est_boot <- array(NA, m)
    +for (i in 1:m) {
    +  x_boot <- x[sample(1:n, n, rep = T)]
    +  est_boot[i] <- T_mean(x_boot)
    +}
    +
    +quantile(est_boot, p = c(0.025, 0.975))
    ##  2.5% 97.5% 
     ## 4.250 7.625
    -

    Exercise 14.3 +

    Exercise 14.3 We are given a sample of 10 independent paired (bivariate) observations:

    @@ -564,59 +564,59 @@

    Chapter 14 Bootstrap -
    x <- c(1.26, -0.33,  1.33,  1.27,  0.41, -1.54, -0.93, -0.29, -0.01,  2.40)
    -y <- c(2.64,  0.33,  0.48,  0.06, -0.88, -2.14, -2.21,  0.95,  0.83,  1.45)
    -
    -# a
    -cor(x, y)
    +
    x <- c(1.26, -0.33,  1.33,  1.27,  0.41, -1.54, -0.93, -0.29, -0.01,  2.40)
    +y <- c(2.64,  0.33,  0.48,  0.06, -0.88, -2.14, -2.21,  0.95,  0.83,  1.45)
    +
    +# a
    +cor(x, y)
    ## [1] 0.6991247
    -
    # b
    -res <- cor.test(x, y)
    -res$conf.int[1:2]
    +
    # b
    +res <- cor.test(x, y)
    +res$conf.int[1:2]
    ## [1] 0.1241458 0.9226238
    -
    # c
    -set.seed(0)
    -m  <- 1000
    -n  <- length(x) 
    -T_cor <- function(x, y) {cor(x, y)}
    -
    -est_boot <- array(NA, m)
    -for (i in 1:m) {
    -  idx <- sample(1:n, n, rep = T) # !!! important to use same indices to keep dependency between x and y
    -  est_boot[i] <- T_cor(x[idx], y[idx])
    -}
    -
    -quantile(est_boot, p = c(0.025, 0.975))
    +
    # c
    +set.seed(0)
    +m  <- 1000
    +n  <- length(x) 
    +T_cor <- function(x, y) {cor(x, y)}
    +
    +est_boot <- array(NA, m)
    +for (i in 1:m) {
    +  idx <- sample(1:n, n, rep = T) # !!! important to use same indices to keep dependency between x and y
    +  est_boot[i] <- T_cor(x[idx], y[idx])
    +}
    +
    +quantile(est_boot, p = c(0.025, 0.975))
    ##      2.5%     97.5% 
     ## 0.2565537 0.9057664
    -
    # d
    -# Yes, but the bootstrap CI is more narrow.
    -
    -# e
    -# We just use the functions for Kendall/Spearman coefficients instead:
    -T_kendall <- function(x, y) {cor(x, y, method = "kendall")}
    -T_spearman <- function(x, y) {cor(x, y, method = "spearman")}
    -
    -# Put this in a function that returns the CI
    -bootstrap_95_ci <- function(x, y, t, m = 1000) {
    -  n <- length(x)
    -  est_boot <- array(NA, m)
    -  for (i in 1:m) {
    -    idx <- sample(1:n, n, rep = T) # !!! important to use same indices to keep dependency between x and y
    -    est_boot[i] <- t(x[idx], y[idx])
    -  }
    -  quantile(est_boot, p = c(0.025, 0.975))
    -}
    -
    -bootstrap_95_ci(x, y, T_kendall)
    +
    # d
    +# Yes, but the bootstrap CI is more narrow.
    +
    +# e
    +# We just use the functions for Kendall/Spearman coefficients instead:
    +T_kendall <- function(x, y) {cor(x, y, method = "kendall")}
    +T_spearman <- function(x, y) {cor(x, y, method = "spearman")}
    +
    +# Put this in a function that returns the CI
    +bootstrap_95_ci <- function(x, y, t, m = 1000) {
    +  n <- length(x)
    +  est_boot <- array(NA, m)
    +  for (i in 1:m) {
    +    idx <- sample(1:n, n, rep = T) # !!! important to use same indices to keep dependency between x and y
    +    est_boot[i] <- t(x[idx], y[idx])
    +  }
    +  quantile(est_boot, p = c(0.025, 0.975))
    +}
    +
    +bootstrap_95_ci(x, y, T_kendall)
    ##        2.5%       97.5% 
     ## -0.08108108  0.78378378
    -
    bootstrap_95_ci(x, y, T_spearman)
    +
    bootstrap_95_ci(x, y, T_spearman)
    ##       2.5%      97.5% 
     ## -0.1701115  0.8867925
    -

    Exercise 14.4 +

    Exercise 14.4 In this problem we will illustrate the use of the nonparametric bootstrap for estimating CIs of regression model coefficients.

      @@ -626,15 +626,15 @@

      Chapter 14 Bootstrap -
      # a
      -data(longley)
      -
      -# b
      -res <- lm(Employed ~ . , longley)
      -tmp <- data.frame(summary(res)$coefficients[,1:2])
      -tmp$LB <- tmp[,1] - 1.96 * tmp[,2]
      -tmp$UB <- tmp[,1] + 1.96 * tmp[,2]
      -tmp
      +
      # a
      +data(longley)
      +
      +# b
      +res <- lm(Employed ~ . , longley)
      +tmp <- data.frame(summary(res)$coefficients[,1:2])
      +tmp$LB <- tmp[,1] - 1.96 * tmp[,2]
      +tmp$UB <- tmp[,1] + 1.96 * tmp[,2]
      +tmp
      ##                   Estimate   Std..Error            LB            UB
       ## (Intercept)  -3.482259e+03 8.904204e+02 -5.227483e+03 -1.737035e+03
       ## GNP.deflator  1.506187e-02 8.491493e-02 -1.513714e-01  1.814951e-01
      @@ -643,37 +643,37 @@ 

      Chapter 14 Bootstrap
      # c
      -set.seed(0)
      -m <- 100
      -n <- nrow(longley)
      -T_coef <- function(x) {
      -  lm(Employed ~ . , x)$coefficients
      -}
      -
      -est_boot <- array(NA, c(m, ncol(longley)))
      -for (i in 1:m) {
      -  idx <- sample(1:n, n, rep = T)
      -  est_boot[i,] <- T_coef(longley[idx,])
      -}
      -
      -SE <- apply(est_boot, 2, sd)
      -SE

    +
    # c
    +set.seed(0)
    +m <- 100
    +n <- nrow(longley)
    +T_coef <- function(x) {
    +  lm(Employed ~ . , x)$coefficients
    +}
    +
    +est_boot <- array(NA, c(m, ncol(longley)))
    +for (i in 1:m) {
    +  idx <- sample(1:n, n, rep = T)
    +  est_boot[i,] <- T_coef(longley[idx,])
    +}
    +
    +SE <- apply(est_boot, 2, sd)
    +SE
    ## [1] 1.826011e+03 1.605981e-01 5.693746e-02 8.204892e-03 3.802225e-03
     ## [6] 3.907527e-01 9.414436e-01
    -
    # Show the standard errors around coefficients
    -library(ggplot2)
    -library(reshape2)
    -df <- data.frame(index = 1:7, bootstrap_SE = SE, lm_SE = tmp$Std..Error)
    -melted_df <- melt(df[2:nrow(df), ], id.vars = "index")  # Ignore bias which has a really large magnitude
    -ggplot(melted_df, aes(x = index, y = value, fill = variable)) +
    -  geom_bar(stat="identity", position="dodge") +
    -  xlab("Coefficient") +
    -  ylab("Standard error") # + scale_y_continuous(trans = "log") # If you want to also plot bias
    -

    +
    # Show the standard errors around coefficients
    +library(ggplot2)
    +library(reshape2)
    +df <- data.frame(index = 1:7, bootstrap_SE = SE, lm_SE = tmp$Std..Error)
    +melted_df <- melt(df[2:nrow(df), ], id.vars = "index")  # Ignore bias which has a really large magnitude
    +ggplot(melted_df, aes(x = index, y = value, fill = variable)) +
    +  geom_bar(stat="identity", position="dodge") +
    +  xlab("Coefficient") +
    +  ylab("Standard error") # + scale_y_continuous(trans = "log") # If you want to also plot bias
    +

    -

    Exercise 14.5 +

    Exercise 14.5 This exercise shows a shortcoming of the bootstrap method when using the plug in estimator for the maximum.

      @@ -683,38 +683,310 @@

      Chapter 14 Bootstrap -
      # bootstrap CI for maximum
      -
      -alpha <- 0.05
      -T_max <- function(x) {max(x)}  # Equal to T_max = max
      -bootstrap <- function(x, t, m = 1000) {
      -  n <- length(x)
      -  values <- rep(0, m)
      -  for (i in 1:m) {
      -    values[i] <- t(sample(x, n, replace = T))
      -  }
      -  quantile(values, probs = c(alpha / 2, 1 - alpha / 2))
      -}
      -
      -# a
      -# Meaningless, as the normal distribution can yield arbitrarily large values.
      -x <- rnorm(100)
      -bootstrap(x, T_max)
      +
      # bootstrap CI for maximum
      +
      +alpha <- 0.05
      +T_max <- function(x) {max(x)}  # Equal to T_max = max
      +bootstrap <- function(x, t, m = 1000) {
      +  n <- length(x)
      +  values <- rep(0, m)
      +  for (i in 1:m) {
      +    values[i] <- t(sample(x, n, replace = T))
      +  }
      +  quantile(values, probs = c(alpha / 2, 1 - alpha / 2))
      +}
      +
      +# a
      +# Meaningless, as the normal distribution can yield arbitrarily large values.
      +x <- rnorm(100)
      +bootstrap(x, T_max)
      ##     2.5%    97.5% 
       ## 1.819425 2.961743
      -
      # b
      -x <- rbinom(100, size = 15, prob = 0.2) # min = 0, max = 15
      -bootstrap(x, T_max)
      +
      # b
      +x <- rbinom(100, size = 15, prob = 0.2) # min = 0, max = 15
      +bootstrap(x, T_max)
      ##  2.5% 97.5% 
       ##     6     7
      -
      # c
      -x <- rbinom(100, size = 15, prob = 0.9) # min = 0, max = 15
      -bootstrap(x, T_max)
      +
      # c
      +x <- rbinom(100, size = 15, prob = 0.9) # min = 0, max = 15
      +bootstrap(x, T_max)
      ##  2.5% 97.5% 
       ##    15    15
      -
      # Observation: to estimate the maximum, we need sufficient probability mass near the maximum value the distribution can yield.
      -# Using bootstrap is pointless when there is too little mass near the true maximum.
      -# In general, bootstrap will fail when estimating the CI for the maximum.
      +
      # Observation: to estimate the maximum, we need sufficient probability mass near the maximum value the distribution can yield.
      +# Using bootstrap is pointless when there is too little mass near the true maximum.
      +# In general, bootstrap will fail when estimating the CI for the maximum.
      +

    +
    +

    Exercise 14.6 (Practical - and fictional - coverage interval comparison) In this exercise, we investigate how different kinds of CI’s behave as we vary the number of measurements.

    +

    The story behind the data: it’s 2025 and we’ve discovered that Slovenia has rich deposits of a rare mineral called Moustachium, which can be used to accelerate moustache growth. This mineral is highly sought, so the government has decided to contract two different companies to provide information on where to begin mining. Both companies investigated mining sites in each statistical region and gave their best estimate of the average Moustachium concentration in tonnes per square kilometer. The Data Science team has been called to estimate the uncertainty in these estimates and help avoid mining in the wrong region.

    +

    Generate synthetic data with the script below:

    +
    set.seed(0)
    +
    +library(comprehenr)
    +
    +regions <- c("pomurska", "podravska", "koroska", "savinjska", "zasavska", "posavska", "JV Slovenija", "primorsko-notranjska", "osrednjeslovenska", "gorenjska", "goriska", "obalno-kraska")
    +region_rates <- seq(1.3, 2.3, length.out=length(regions))
    +region_rates <- region_rates[sample.int(length(regions), length(regions))]
    +
    +make_dataset <- function(n_contractors) {
    +    measurements <- matrix(nrow=length(regions), ncol=n_contractors)
    +    for (i in 1:length(regions)) {
    +        measurements[i,] <- rgamma(n_contractors, 5.0, region_rates[i])
    +    }
    +
    +    df <- data.frame(measurements)
    +    row.names(df) <- regions
    +    names(df) <- to_vec(for(i in 1:n_contractors) paste("Contractor", i))
    +    return(df)
    +}
    +
    +set.seed(0)
    +df_2025 <- make_dataset(2)
    +
    +set.seed(0)
    +df_2027 <- make_dataset(10)
    +
    +set.seed(0)
    +df_2028 <- make_dataset(100)
    +
    +set.seed(0)
    +df_2029 <- make_dataset(1000)
    +
    +saveRDS(df_2025, file="moustachium_2025.Rda")
    +saveRDS(df_2027, file="moustachium_2027.Rda")
    +saveRDS(df_2028, file="moustachium_2028.Rda")
    +saveRDS(df_2029, file="moustachium_2029.Rda")
    +
      +
    1. Estimate the average concentration for different regions.

    2. +
    3. Estimate the average concentration uncertainty using 95% CI’s (asymptotic normality with biased and unbiased standard error, standard bootstrap CI, bootstrap percentile CI).

    4. +
    5. Visualize uncertainties with a histogram and discuss the best location to start mining.

    6. +
    7. The year is 2027 and the government has decided to contract 10 companies. Rerun the code with new measurements and discuss how CI’s change.

    8. +
    9. Technological advancements in robotics have enabled site surveys on a massive scale. Repeat the last point for 100 surveyor robots in 2028 and 1000 surveyor robots in 2029.

    10. +
    +
    +
    +
    library(ggplot2)
    +library(dplyr)
    +library(data.table)
    +
    +set.seed(0)
    +
    +input_dataset_path = "moustachium_2025.Rda"  # Change this for points d and e
    +output_plot_path = "moustachium_2025.pdf"  # Change this for points d and e
    +
    +df <- readRDS(input_dataset_path)  # Data comes from here
    +n_contractors <- ncol(df)
    +results_df <- data.frame(region=row.names(df))  # Store CI bounds here
    +
    +# 1. average concentration for different mining sites
    +results_df$average_concetration <- rowMeans(df)
    +
    +# CI for the mean based on asymptotic normality (biased SE estimate)
    +biased_SE <- sqrt(apply(df, 1, function(vec) {sum((vec - mean(vec))^2) / length(vec)}) / n_contractors)
    +results_df$ci95.an.biased_var.low <- results_df$average_concetration - 1.96 * biased_SE
    +results_df$ci95.an.biased_var.high <- results_df$average_concetration + 1.96 * biased_SE
    +
    +# CI for the mean based on asymptotic normality (unbiased SE estimate)
    +unbiased_SE <- sqrt(apply(df, 1, var) / n_contractors)
    +results_df$ci95.an.unbiased_var.low <- results_df$average_concetration - 1.96 * unbiased_SE
    +results_df$ci95.an.unbiased_var.high <- results_df$average_concetration + 1.96 * unbiased_SE
    +
    +# Standard bootstrap CI with 1000 samples
    +bootstrap_variance <- function(data, n_samples) {
    +  # n_samples is m in pseudocode
    +  output <- numeric(n_samples)
    +  for (i in 1:n_samples) {
    +    index <- sample(1:length(data), length(data), rep = TRUE)
    +    resampled_data <- data[index]
    +    output[i] <- mean(resampled_data)
    +  }
    +  return(var(output))
    +}
    +
    +bootstrap_1000_sd <- sqrt(apply(df, 1, function(vec){bootstrap_variance(vec, 1000)}))
    +results_df$ci95.bootstrap.standard.1000.low <- results_df$average_concetration - 1.96 * bootstrap_1000_sd
    +results_df$ci95.bootstrap.standard.1000.high <- results_df$average_concetration + 1.96 * bootstrap_1000_sd
    +
    +# Bootstrap percentile CI with 1000 samples
    +
    +bootstrap_quantile <- function(data, functional, n_samples, probs) {
    +    # n_samples is m in pseudocode
    +    output <- numeric(n_samples)
    +    for (i in 1:n_samples) {
    +        index <- sample(1:length(data), length(data), rep = TRUE)
    +        resampled_data <- data[index]
    +        output[i] <- functional(resampled_data)
    +    }
    +    return(quantile(output, probs=probs))
    +}
    +
    +results_df$ci95.bootstrap.percentile.1000.low <- apply(df, 1, function(vec){bootstrap_quantile(vec, mean, 1000, 0.025)})
    +results_df$ci95.bootstrap.percentile.1000.high <- apply(df, 1, function(vec){bootstrap_quantile(vec, mean, 1000, 0.975)})
    +
    +results_df
    +
    ##                  region average_concetration ci95.an.biased_var.low
    +## 1              pomurska             2.814731              1.5351811
    +## 2             podravska             2.646518              1.5358919
    +## 3               koroska             2.010216              0.5956186
    +## 4             savinjska             4.618001              4.4057369
    +## 5              zasavska             2.458873              2.0050840
    +## 6              posavska             2.153802              1.9001244
    +## 7          JV Slovenija             2.433503              1.6860397
    +## 8  primorsko-notranjska             3.165394              2.9640430
    +## 9     osrednjeslovenska             3.696875              3.5592419
    +## 10            gorenjska             1.341931              0.2784547
    +## 11              goriska             2.767328              2.3255569
    +## 12        obalno-kraska             1.580711              1.4533751
    +##    ci95.an.biased_var.high ci95.an.unbiased_var.low ci95.an.unbiased_var.high
    +## 1                 4.094281              1.005174095                  4.624288
    +## 2                 3.757144              1.075855548                  4.217180
    +## 3                 3.424813              0.009673183                  4.010759
    +## 4                 4.830264              4.317814385                  4.918187
    +## 5                 2.912662              1.817118318                  3.100628
    +## 6                 2.407479              1.795047746                  2.512556
    +## 7                 3.180965              1.376430415                  3.490575
    +## 8                 3.366746              2.880640556                  3.450148
    +## 9                 3.834508              3.502232367                  3.891518
    +## 10                2.405407             -0.162051549                  2.845913
    +## 11                3.209099              2.142569481                  3.392086
    +## 12                1.708047              1.400630772                  1.760792
    +##    ci95.bootstrap.standard.1000.low ci95.bootstrap.standard.1000.high
    +## 1                         1.5397542                          4.089708
    +## 2                         1.5388631                          3.754173
    +## 3                         0.5492603                          3.471171
    +## 4                         4.4062860                          4.829715
    +## 5                         1.9938049                          2.923942
    +## 6                         1.9010514                          2.406552
    +## 7                         1.6932573                          3.173748
    +## 8                         2.9670216                          3.363767
    +## 9                         3.5602064                          3.833544
    +## 10                        0.2845999                          2.399262
    +## 11                        2.3293359                          3.205320
    +## 12                        1.4543352                          1.707087
    +##    ci95.bootstrap.percentile.1000.low ci95.bootstrap.percentile.1000.high
    +## 1                           1.8914878                            3.737975
    +## 2                           1.8451596                            3.447876
    +## 3                           0.9895308                            3.030901
    +## 4                           4.4648444                            4.771157
    +## 5                           2.1314473                            2.786299
    +## 6                           1.9707640                            2.336840
    +## 7                           1.8941800                            2.972825
    +## 8                           3.0201118                            3.310677
    +## 9                           3.5975676                            3.796183
    +## 10                          0.5745928                            2.109269
    +## 11                          2.4485735                            3.086082
    +## 12                          1.4888334                            1.672589
    +
    # Visualization: we use a bar chart with uncertainty bands
    +
    +plot_moustachium_per_region <- function(region_names, average, ci_low, ci_high) {
    +    df_visualization <- data.frame(region=region_names, average=average, low=ci_low, high=ci_high)
    +    ggplot(df_visualization, aes(x=region, y=average)) + geom_bar(stat="identity")
    +}
    +
    +mask <- endsWith(colnames(results_df), "low")
    +mask[c(1, 2)] <- T
    +results_df_low <- results_df[, mask]
    +colnames(results_df_low) <- gsub('.low','', colnames(results_df_low))
    +
    +mask <- endsWith(colnames(results_df), "high")
    +mask[c(1, 2)] <- T
    +results_df_high <- results_df[, mask]
    +colnames(results_df_high) <- gsub('.high','', colnames(results_df_high))
    +
    +long_results_df_low <- melt(setDT(results_df_low), id.vars=c("region", "average_concetration"))
    +names(long_results_df_low) <- c("region", "average_concentration", "variable", "low")
    +
    +long_results_df_high <- melt(setDT(results_df_high), id.vars=c("region", "average_concetration"))
    +names(long_results_df_high) <- c("region", "average_concentration", "variable", "high")
    +
    +long_results_df <- merge(long_results_df_low, long_results_df_high, by=c("region", "variable", "average_concentration"), all=T)
    +
    +moustachium_plot <- ggplot(long_results_df, aes(x=region, y=average_concentration)) +
    +    geom_bar(stat="identity", position="dodge", alpha=0.2) +
    +    geom_errorbar(aes(ymin=low, ymax=high, color=variable), width=0.2, position=position_dodge(0.9)) +
    +    scale_x_discrete(guide = guide_axis(angle = 45)) +
    +    ylim(-1, 8)
    +
    +# ggsave(plot=moustachium_plot, width=12, height=8, dpi=300, filename=output_plot_path)
    +moustachium_plot
    +

    +
    # Visualization: we can also use a map. Circle size denotes concentration in region, low transparency denotes high uncertainty.
    +
    +library(maps)
    +
    +map_data_slo <- map_data('world')[map_data('world')$region == "Slovenia",]
    +
    +map_df <- long_results_df[long_results_df$variable == "ci95.an.biased_var", ]
    +
    +# VERY approximate longitudes and latitudes for different regions.
    +map_df$long <- rep(0, nrow(map_df))
    +map_df$lat <- rep(0, nrow(map_df))
    +
    +map_df[map_df$region == "gorenjska"]$long <- 14.2
    +map_df[map_df$region == "gorenjska"]$lat <- 46.3
    +
    +map_df[map_df$region == "goriska"]$long <- 13.85
    +map_df[map_df$region == "goriska"]$lat <- 46.0
    +
    +map_df[map_df$region == "obalno-kraska"]$long <- 13.9
    +map_df[map_df$region == "obalno-kraska"]$lat <- 45.65
    +
    +map_df[map_df$region == "osrednjeslovenska"]$long <- 14.5
    +map_df[map_df$region == "osrednjeslovenska"]$lat <- 46.
    +
    +map_df[map_df$region == "primorsko-notranjska"]$long <- 14.3
    +map_df[map_df$region == "primorsko-notranjska"]$lat <- 45.7
    +
    +map_df[map_df$region == "zasavska"]$long <- 15
    +map_df[map_df$region == "zasavska"]$lat <- 46.1
    +
    +map_df[map_df$region == "savinjska"]$long <- 15.2
    +map_df[map_df$region == "savinjska"]$lat <- 46.25
    +
    +map_df[map_df$region == "posavska"]$long <- 15.4
    +map_df[map_df$region == "posavska"]$lat <- 46
    +
    +map_df[map_df$region == "koroska"]$long <- 15.1
    +map_df[map_df$region == "koroska"]$lat <- 46.5
    +
    +map_df[map_df$region == "podravska"]$long <- 15.7
    +map_df[map_df$region == "podravska"]$lat <- 46.45
    +
    +map_df[map_df$region == "pomurska"]$long <- 16.2
    +map_df[map_df$region == "pomurska"]$lat <- 46.65
    +
    +map_df[map_df$region == "JV Slovenija"]$long <- 15.
    +map_df[map_df$region == "JV Slovenija"]$lat <- 45.7
    +
    +map_df$ci_size <- (map_df$high - map_df$low)
    +map_df$ci_y <- map_df$lat - 0.05
    +map_df$ci_label <- sprintf("(%.2f, %.2f)", map_df$low, map_df$high)
    +map_df$avg_label <- sprintf("%.2f", map_df$average_concentration)
    +
    +country_plot <- ggplot() +
    +    # First layer: worldwide map
    +    geom_polygon(data = map_data("world"),
    +                 aes(x=long, y=lat, group = group),
    +                 color = '#9c9c9c', fill = '#f3f3f3') +
    +    # Second layer: Country map
    +    geom_polygon(
    +        data = map_data_slo,
    +        aes(x=long, y=lat, group = group),
    +        color='darkgreen', 
    +        fill='green', 
    +        alpha=0.2
    +    ) +
    +    geom_point(data=map_df, aes(x=long, y=lat, fill=region, size=average_concentration, alpha=ci_size), color="black", pch=21) +
    +    geom_text(data=map_df, aes(x=long, y=ci_y, label=ci_label), size=3) +
    +    geom_text(data=map_df, aes(x=long, y=lat, label=avg_label), size=3) +
    +    scale_size_continuous(range = c(3, 12), trans = "exp") +
    +    scale_alpha_continuous(range = c(0.15, 0.75), trans = "reverse") +
    +    ggtitle("Estimated average Moustachium concentration with 95% CI") +
    +    coord_cartesian(xlim=c(13.2, 16.7), ylim=c(45.4, 47.))
    +
    +# ggsave(plot=country_plot, width=18, height=12, dpi=300, filename="country.pdf")
    +country_plot
    +

    diff --git a/docs/ml.html b/docs/ml.html index b30e532..19f47b7 100644 --- a/docs/ml.html +++ b/docs/ml.html @@ -6,7 +6,7 @@ Chapter 15 Maximum likelihood | Principles of Uncertainty – exercises - + @@ -23,7 +23,7 @@ - + @@ -332,15 +332,15 @@

    Chapter 15 Maximum likelihood

    15.1 Deriving MLE

    -

    Exercise 15.1

    +

    Exercise 15.1

    1. Derive the maximum likelihood estimator of variance for N\((\mu, \sigma^2)\).
    2. -
    3. Compare with results from 13.3. What does that say about the MLE estimator?
    4. +
    5. Compare with results from ??. What does that say about the MLE estimator?
    -

    Solution.

    +

    Solution.

    1. The mean is assumed constant, so we have the likelihood \[\begin{align} @@ -369,7 +369,7 @@
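The derivation is truncated in this diff. For reference (a standard completion, not quoted from the source), with \(\mu\) treated as known we have
\[\begin{align*}
l(\sigma^2; x) &= -\frac{n}{2}\ln(2\pi\sigma^2) - \frac{1}{2\sigma^2}\sum_{i=1}^n (x_i - \mu)^2, \\
\frac{\partial l}{\partial \sigma^2} &= -\frac{n}{2\sigma^2} + \frac{1}{2\sigma^4}\sum_{i=1}^n (x_i - \mu)^2 = 0
\quad \Longrightarrow \quad
\hat{\sigma}^2 = \frac{1}{n}\sum_{i=1}^n (x_i - \mu)^2.
\end{align*}\]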

      15.1 Deriving MLE -

      Exercise 15.2 (Multivariate normal distribution)

      +

      Exercise 15.2 (Multivariate normal distribution)

      1. Derive the maximum likelihood estimate for the mean and covariance matrix of the multivariate normal.

      2. Simulate \(n = 40\) samples from a bivariate normal distribution (choose non-trivial parameters, that is, mean \(\neq 0\) and covariance \(\neq 0\)). Compute the MLE for the sample. Overlay the data with an ellipse that is determined by the MLE and an ellipse that is determined by the chosen true parameters.

      3. @@ -379,7 +379,7 @@

        15.1 Deriving MLE
        -

        Solution. The log likelihood of the MVN distribution is +

    Solution. The log likelihood of the MVN distribution is \[\begin{align*} l(\mu, \Sigma ; x) &= -\frac{1}{2}\Big(\sum_{i=1}^n k\ln(2\pi) + \ln|\Sigma| + (x_i - \mu)^T \Sigma^{-1} (x_i - \mu)\Big) \\ &= -\frac{n}{2}\ln|\Sigma| - \frac{1}{2}\Big(\sum_{i=1}^n(x_i - \mu)^T \Sigma^{-1} (x_i - \mu)\Big) + c, @@ -408,33 +408,33 @@
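For reference (a standard completion of the truncated derivation above, not quoted from the source), setting the gradients with respect to \(\mu\) and \(\Sigma\) to zero gives
\[\begin{align*}
\hat{\mu} = \frac{1}{n}\sum_{i=1}^n x_i, \qquad
\hat{\Sigma} = \frac{1}{n}\sum_{i=1}^n (x_i - \hat{\mu})(x_i - \hat{\mu})^T,
\end{align*}\]
which is what mu_est and Sigma_est compute in the code below.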

        15.1 Deriving MLE
        set.seed(1)
        -n     <- 40
        -mu    <- c(1, -2)
        -Sigma <- matrix(data = c(2, -1.6, -1.6, 1.8), ncol = 2)
        -X     <- mvrnorm(n = n, mu = mu, Sigma = Sigma)
        -colnames(X) <- c("X1", "X2")
        -X     <- as.data.frame(X)
        -
        -# plot.new()
        -tru_ellip <- ellipse(mu, Sigma, draw = FALSE)
        -colnames(tru_ellip) <- c("X1", "X2")
        -tru_ellip <- as.data.frame(tru_ellip)
        -
        -mu_est    <- apply(X, 2, mean)
        -tmp       <- as.matrix(sweep(X, 2, mu_est))
        -Sigma_est <- (1 / n) * t(tmp) %*% tmp
        -
        -est_ellip <- ellipse(mu_est, Sigma_est, draw = FALSE)
        -colnames(est_ellip) <- c("X1", "X2")
        -est_ellip <- as.data.frame(est_ellip)
        -
        -ggplot(data = X, aes(x = X1, y = X2)) +
        -  geom_point() +
        -  geom_path(data = tru_ellip, aes(x = X1, y = X2, color = "truth")) +
        -  geom_path(data = est_ellip, aes(x = X1, y = X2, color = "estimated")) +
        -  labs(color = "type")

        -

        +
        set.seed(1)
        +n     <- 40
        +mu    <- c(1, -2)
        +Sigma <- matrix(data = c(2, -1.6, -1.6, 1.8), ncol = 2)
        +X     <- mvrnorm(n = n, mu = mu, Sigma = Sigma)
        +colnames(X) <- c("X1", "X2")
        +X     <- as.data.frame(X)
        +
        +# plot.new()
        +tru_ellip <- ellipse(mu, Sigma, draw = FALSE)
        +colnames(tru_ellip) <- c("X1", "X2")
        +tru_ellip <- as.data.frame(tru_ellip)
        +
        +mu_est    <- apply(X, 2, mean)
        +tmp       <- as.matrix(sweep(X, 2, mu_est))
        +Sigma_est <- (1 / n) * t(tmp) %*% tmp
        +
        +est_ellip <- ellipse(mu_est, Sigma_est, draw = FALSE)
        +colnames(est_ellip) <- c("X1", "X2")
        +est_ellip <- as.data.frame(est_ellip)
        +
        +ggplot(data = X, aes(x = X1, y = X2)) +
        +  geom_point() +
        +  geom_path(data = tru_ellip, aes(x = X1, y = X2, color = "truth")) +
        +  geom_path(data = est_ellip, aes(x = X1, y = X2, color = "estimated")) +
        +  labs(color = "type")
        +

    Exercise 15.3 (Logistic regression) Logistic regression is a popular discriminative model when our target variable is binary (categorical with 2 values). One way of looking at logistic regression is that it is linear regression, but instead of using the linear term as the mean of a normal RV, we use it as the mean of a Bernoulli RV. Of course, the mean of a Bernoulli is bounded on \([0,1]\), so, to avoid nonsensical values, we squeeze the linear term between 0 and 1 with the inverse logit function inv_logit\((z) = 1 / (1 + e^{-z})\). This leads to the following model: \(y_i | \beta, x_i \sim \text{Bernoulli}(\text{inv_logit}(\beta x_i))\).

    @@ -445,62 +445,62 @@

    Let \(y2\) be a response defined below. Will logistic regression work well on this dataset? Why not? How can we still use the model without changing it?

    -
    inv_log <- function (z) {
    -  return (1 / (1 + exp(-z)))
    -}
    -set.seed(1)
    -x  <- rnorm(100)
    -y  <- rbinom(100, size = 1, prob = inv_log(1.2 * x))
    -y2 <- rbinom(100, size = 1, prob = inv_log(1.2 * x + 1.4 * x^2))
    +
    inv_log <- function (z) {
    +  return (1 / (1 + exp(-z)))
    +}
    +set.seed(1)
    +x  <- rnorm(100)
    +y  <- rbinom(100, size = 1, prob = inv_log(1.2 * x))
    +y2 <- rbinom(100, size = 1, prob = inv_log(1.2 * x + 1.4 * x^2))
    -

    Solution. \[\begin{align*} +

    Solution. \[\begin{align*} l(\beta; x, y) &= \ln p(y | x, \beta) \\ &= \ln(\prod_{i=1}^n \text{inv_logit}(\beta x_i)^{y_i} (1 - \text{inv_logit}(\beta x_i))^{1 - y_i}) \\ &= \sum_{i=1}^n y_i \ln(\text{inv_logit}(\beta x_i)) + (1 - y_i) \ln(1 - \text{inv_logit}(\beta x_i)). \end{align*}\]
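Although the code below maximizes this likelihood numerically with optim, the score function has a closed form that is handy for checking gradients (a standard result, added here for reference):
\[\begin{equation*}
\frac{\partial l}{\partial \beta} = \sum_{i=1}^n \big(y_i - \text{inv\_logit}(\beta x_i)\big) x_i.
\end{equation*}\]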

    -
    set.seed(1)
    -inv_log <- function (z) {
    -  return (1 / (1 + exp(-z)))
    -}
    -
    -x <- rnorm(100)
    -y <- x
    -y <- rbinom(100, size = 1, prob = inv_log(1.2 * x))
    -
    -l_logistic <- function (beta, X, y) {
    -  logl <- -sum(y * log(inv_log(as.vector(beta %*% X))) + (1 - y) * log((1 - inv_log(as.vector(beta %*% X)))))
    -  return(logl)
    -}
    -
    -my_optim <- optim(par = 0.5, fn = l_logistic, method = "L-BFGS-B",
    -                  lower = 0, upper = 10, X = x, y = y)
    -my_optim$par
    +
    set.seed(1)
    +inv_log <- function (z) {
    +  return (1 / (1 + exp(-z)))
    +}
    +
    +x <- rnorm(100)
    +y <- x
    +y <- rbinom(100, size = 1, prob = inv_log(1.2 * x))
    +
    +l_logistic <- function (beta, X, y) {
    +  logl <- -sum(y * log(inv_log(as.vector(beta %*% X))) + (1 - y) * log((1 - inv_log(as.vector(beta %*% X)))))
    +  return(logl)
    +}
    +
    +my_optim <- optim(par = 0.5, fn = l_logistic, method = "L-BFGS-B",
    +                  lower = 0, upper = 10, X = x, y = y)
    +my_optim$par
    ## [1] 1.166558
    -
    truth_p <- data.frame(x = x, prob = inv_log(1.2 * x), type = "truth")
    -est_p   <- data.frame(x = x, prob = inv_log(my_optim$par * x), type = "estimated")
    -plot_df <- rbind(truth_p, est_p)
    -ggplot(data = plot_df, aes(x = x, y = prob, color = type)) +
    -  geom_point(alpha = 0.3)
    -

    -
    y2 <- rbinom(2000, size = 1, prob = inv_log(1.2 * x + 1.4 * x^2))
    -X2 <- cbind(x, x^2)
    -my_optim2 <- optim(par = c(0, 0), fn = l_logistic, method = "L-BFGS-B",
    -                   lower = c(0, 0), upper = c(2, 2), X = t(X2), y = y2)
    -my_optim2$par
    +
    truth_p <- data.frame(x = x, prob = inv_log(1.2 * x), type = "truth")
    +est_p   <- data.frame(x = x, prob = inv_log(my_optim$par * x), type = "estimated")
    +plot_df <- rbind(truth_p, est_p)
    +ggplot(data = plot_df, aes(x = x, y = prob, color = type)) +
    +  geom_point(alpha = 0.3)
    +

    +
    y2 <- rbinom(2000, size = 1, prob = inv_log(1.2 * x + 1.4 * x^2))
    +X2 <- cbind(x, x^2)
    +my_optim2 <- optim(par = c(0, 0), fn = l_logistic, method = "L-BFGS-B",
    +                   lower = c(0, 0), upper = c(2, 2), X = t(X2), y = y2)
    +my_optim2$par
    ## [1] 1.153656 1.257649
    -
    tmp     <- sweep(data.frame(x = x, x2 = x^2), 2, my_optim2$par, FUN = "*")
    -tmp     <- tmp[ ,1] + tmp[ ,2]
    -truth_p <- data.frame(x = x, prob = inv_log(1.2 * x + 1.4 * x^2), type = "truth")
    -est_p   <- data.frame(x = x, prob = inv_log(tmp), type = "estimated")
    -plot_df <- rbind(truth_p, est_p)
    -ggplot(data = plot_df, aes(x = x, y = prob, color = type)) +
    -  geom_point(alpha = 0.3)
    -

    +
    tmp     <- sweep(data.frame(x = x, x2 = x^2), 2, my_optim2$par, FUN = "*")
    +tmp     <- tmp[ ,1] + tmp[ ,2]
    +truth_p <- data.frame(x = x, prob = inv_log(1.2 * x + 1.4 * x^2), type = "truth")
    +est_p   <- data.frame(x = x, prob = inv_log(tmp), type = "estimated")
    +plot_df <- rbind(truth_p, est_p)
    +ggplot(data = plot_df, aes(x = x, y = prob, color = type)) +
    +  geom_point(alpha = 0.3)
    +

    -

    Exercise 15.4 (Linear regression) For the data generated below, do the following:

    +

    Exercise 15.4 (Linear regression) For the data generated below, do the following:

    1. Compute the least squares (MLE) estimate of coefficients beta using the matrix exact solution.
    2. Compute the MLE by minimizing the sum of squared residuals using black-box optimization (optim()).
    3. @@ -509,37 +509,37 @@

    Compute 95% CI on the beta coefficients by using (a) or (b) and the bootstrap with the percentile method for the CI. Compare with (d).

    -
    set.seed(1)
    -n <- 100
    -x1 <- rnorm(n)
    -x2 <- rnorm(n)
    -x3 <- rnorm(n)
    -
    -X <- cbind(x1, x2, x3)
    -beta <- c(0.2, 0.6, -1.2)
    -
    -y <- as.vector(t(beta %*% t(X))) + rnorm(n, sd = 0.2)
    +
    set.seed(1)
    +n <- 100
    +x1 <- rnorm(n)
    +x2 <- rnorm(n)
    +x3 <- rnorm(n)
    +
    +X <- cbind(x1, x2, x3)
    +beta <- c(0.2, 0.6, -1.2)
    +
    +y <- as.vector(t(beta %*% t(X))) + rnorm(n, sd = 0.2)
    -
    set.seed(1)
    -n <- 100
    -x1 <- rnorm(n)
    -x2 <- rnorm(n)
    -x3 <- rnorm(n)
    -
    -X <- cbind(x1, x2, x3)
    -beta <- c(0.2, 0.6, -1.2)
    -
    -y <- as.vector(t(beta %*% t(X))) + rnorm(n, sd = 0.2)
    -LS_fun <- function (beta, X, y) {
    -  return(sum((y - beta %*% t(X))^2))
    -}
    -my_optim <- optim(par = c(0, 0, 0), fn = LS_fun, lower = -5, upper = 5,
    -                  X = X, y = y, method = "L-BFGS-B")
    -my_optim$par
    +
    set.seed(1)
    +n <- 100
    +x1 <- rnorm(n)
    +x2 <- rnorm(n)
    +x3 <- rnorm(n)
    +
    +X <- cbind(x1, x2, x3)
    +beta <- c(0.2, 0.6, -1.2)
    +
    +y <- as.vector(t(beta %*% t(X))) + rnorm(n, sd = 0.2)
    +LS_fun <- function (beta, X, y) {
    +  return(sum((y - beta %*% t(X))^2))
    +}
    +my_optim <- optim(par = c(0, 0, 0), fn = LS_fun, lower = -5, upper = 5,
    +                  X = X, y = y, method = "L-BFGS-B")
    +my_optim$par
    ## [1]  0.1898162  0.5885946 -1.1788264
    -
    df <- data.frame(y = y, x1 = x1, x2 = x2, x3 = x3)
    -my_lm <- lm(y ~ x1 + x2 + x3 - 1, data = df)
    -my_lm
    +
    df <- data.frame(y = y, x1 = x1, x2 = x2, x3 = x3)
    +my_lm <- lm(y ~ x1 + x2 + x3 - 1, data = df)
    +my_lm
    ## 
     ## Call:
     ## lm(formula = y ~ x1 + x2 + x3 - 1, data = df)
    @@ -547,41 +547,41 @@ 

    15.1 Deriving MLE
    # matrix solution
    -beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y
    -beta_hat

    +
    # matrix solution
    +beta_hat <- solve(t(X) %*% X) %*% t(X) %*% y
    +beta_hat
    ##          [,1]
     ## x1  0.1898162
     ## x2  0.5885946
     ## x3 -1.1788264
    -
    out <- summary(my_lm)
    -out$coefficients[ ,2]
    +
    out <- summary(my_lm)
    +out$coefficients[ ,2]
    ##         x1         x2         x3 
     ## 0.02209328 0.02087542 0.01934506
    -
    # bootstrap CI
    -nboot <- 1000
    -beta_boot <- matrix(data = NA, ncol = length(beta), nrow = nboot)
    -for (i in 1:nboot) {
    -  inds     <- sample(1:n, n, replace = T)
    -  new_df   <- df[inds, ]
    -  X_tmp    <- as.matrix(new_df[ ,-1])
    -  y_tmp    <- new_df[ ,1]
    -  # print(nrow(new_df))
    -  tmp_beta <- solve(t(X_tmp) %*% X_tmp) %*% t(X_tmp) %*% y_tmp
    -  beta_boot[i, ] <- tmp_beta
    -}
    -apply(beta_boot, 2, mean)
    +
    # bootstrap CI
    +nboot <- 1000
    +beta_boot <- matrix(data = NA, ncol = length(beta), nrow = nboot)
    +for (i in 1:nboot) {
    +  inds     <- sample(1:n, n, replace = T)
    +  new_df   <- df[inds, ]
    +  X_tmp    <- as.matrix(new_df[ ,-1])
    +  y_tmp    <- new_df[ ,1]
    +  # print(nrow(new_df))
    +  tmp_beta <- solve(t(X_tmp) %*% X_tmp) %*% t(X_tmp) %*% y_tmp
    +  beta_boot[i, ] <- tmp_beta
    +}
    +apply(beta_boot, 2, mean)
    ## [1]  0.1893281  0.5887068 -1.1800738
    -
    apply(beta_boot, 2, quantile, probs = c(0.025, 0.975))
    +
    apply(beta_boot, 2, quantile, probs = c(0.025, 0.975))
    ##            [,1]      [,2]      [,3]
     ## 2.5%  0.1389441 0.5436911 -1.221560
     ## 97.5% 0.2386295 0.6363102 -1.140416
    -
    out$coefficients[ ,2]
    +
    out$coefficients[ ,2]
    ##         x1         x2         x3 
     ## 0.02209328 0.02087542 0.01934506
    -

    Exercise 15.5 (Principal component analysis) Load the olympic data set from package ade4. The data show decathlon results for 33 men in the 1988 Olympic Games. This data set serves as a great example of finding the latent structure in the data, as there are certain characteristics of the athletes that make them excel at different events. For example, an explosive athlete will do particularly well in sprints and long jumps.

    +

    Exercise 15.5 (Principal component analysis) Load the olympic data set from package ade4. The data show decathlon results for 33 men in the 1988 Olympic Games. This data set serves as a great example of finding the latent structure in the data, as there are certain characteristics of the athletes that make them excel at different events. For example, an explosive athlete will do particularly well in sprints and long jumps.

    1. Perform PCA (prcomp) on the data set and interpret the first 2 latent dimensions. Hint: Standardize the data first to get meaningful results.

    2. Use MLE to estimate the covariance of the standardized multivariate distribution.

    3. @@ -589,12 +589,12 @@

      15.1 Deriving MLE -
      data(olympic)
      -
      -X        <- olympic$tab
      -X_scaled <- scale(X)
      -my_pca   <- prcomp(X_scaled)
      -summary(my_pca)
      +
      data(olympic)
      +
      +X        <- olympic$tab
      +X_scaled <- scale(X)
      +my_pca   <- prcomp(X_scaled)
      +summary(my_pca)
      ## Importance of components:
       ##                           PC1    PC2     PC3    PC4     PC5     PC6     PC7
       ## Standard deviation     1.8488 1.6144 0.97123 0.9370 0.74607 0.70088 0.65620
      @@ -604,47 +604,47 @@ 

      15.1 Deriving MLE
      autoplot(my_pca, data = X, loadings = TRUE, loadings.colour = 'blue',
      -         loadings.label = TRUE, loadings.label.size = 3)

    -

    -
    Sigma_est <- (1 / nrow(X_scaled)) * t(X_scaled) %*% X_scaled
    -Sigma_dec <- eigen(Sigma_est)
    -
    -Sigma_dec$vectors
    +
    autoplot(my_pca, data = X, loadings = TRUE, loadings.colour = 'blue',
    +         loadings.label = TRUE, loadings.label.size = 3)
    +

    +
    Sigma_est <- (1 / nrow(X_scaled)) * t(X_scaled) %*% X_scaled
    +Sigma_dec <- eigen(Sigma_est)
    +
    +Sigma_dec$vectors
    ##             [,1]       [,2]        [,3]        [,4]         [,5]        [,6]
    -##  [1,]  0.4158823  0.1488081 -0.26747198 -0.08833244 -0.442314456  0.03071237
    -##  [2,] -0.3940515 -0.1520815 -0.16894945 -0.24424963  0.368913901 -0.09378242
    -##  [3,] -0.2691057  0.4835374  0.09853273 -0.10776276 -0.009754680  0.23002054
    -##  [4,] -0.2122818  0.0278985 -0.85498656  0.38794393 -0.001876311  0.07454380
    -##  [5,]  0.3558474  0.3521598 -0.18949642  0.08057457  0.146965351 -0.32692886
    -##  [6,]  0.4334816  0.0695682 -0.12616012 -0.38229029 -0.088802794  0.21049130
    -##  [7,] -0.1757923  0.5033347  0.04609969  0.02558404  0.019358607  0.61491241
    -##  [8,] -0.3840821  0.1495820  0.13687235  0.14396548 -0.716743474 -0.34776037
    -##  [9,] -0.1799436  0.3719570 -0.19232803 -0.60046566  0.095582043 -0.43744387
    -## [10,]  0.1701426  0.4209653  0.22255233  0.48564231  0.339772188 -0.30032419
    +##  [1,]  0.4158823  0.1488081 -0.26747198  0.08833244 -0.442314456  0.03071237
    +##  [2,] -0.3940515 -0.1520815 -0.16894945  0.24424963  0.368913901 -0.09378242
    +##  [3,] -0.2691057  0.4835374  0.09853273  0.10776276 -0.009754680  0.23002054
    +##  [4,] -0.2122818  0.0278985 -0.85498656 -0.38794393 -0.001876311  0.07454380
    +##  [5,]  0.3558474  0.3521598 -0.18949642 -0.08057457  0.146965351 -0.32692886
    +##  [6,]  0.4334816  0.0695682 -0.12616012  0.38229029 -0.088802794  0.21049130
    +##  [7,] -0.1757923  0.5033347  0.04609969 -0.02558404  0.019358607  0.61491241
    +##  [8,] -0.3840821  0.1495820  0.13687235 -0.14396548 -0.716743474 -0.34776037
    +##  [9,] -0.1799436  0.3719570 -0.19232803  0.60046566  0.095582043 -0.43744387
    +## [10,]  0.1701426  0.4209653  0.22255233 -0.48564231  0.339772188 -0.30032419
     ##             [,7]         [,8]        [,9]       [,10]
    -##  [1,]  0.2543985  0.663712826 -0.10839531  0.10948045
    -##  [2,]  0.7505343  0.141264141  0.04613910  0.05580431
    -##  [3,] -0.1106637  0.072505560  0.42247611  0.65073655
    -##  [4,] -0.1351242 -0.155435871 -0.10206505  0.11941181
    -##  [5,]  0.1413388 -0.146839303  0.65076229 -0.33681395
    -##  [6,]  0.2725296 -0.639003579 -0.20723854  0.25971800
    -##  [7,]  0.1439726  0.009400445 -0.16724055 -0.53450315
    -##  [8,]  0.2732665 -0.276873049 -0.01766443 -0.06589572
    -##  [9,] -0.3419099  0.058519366 -0.30619617 -0.13093187
    -## [10,]  0.1868704  0.007310045 -0.45688227  0.24311846
    -
    my_pca$rotation
    +##  [1,]  0.2543985  0.663712826 -0.10839531 -0.10948045
    +##  [2,]  0.7505343  0.141264141  0.04613910 -0.05580431
    +##  [3,] -0.1106637  0.072505560  0.42247611 -0.65073655
    +##  [4,] -0.1351242 -0.155435871 -0.10206505 -0.11941181
    +##  [5,]  0.1413388 -0.146839303  0.65076229  0.33681395
    +##  [6,]  0.2725296 -0.639003579 -0.20723854 -0.25971800
    +##  [7,]  0.1439726  0.009400445 -0.16724055  0.53450315
    +##  [8,]  0.2732665 -0.276873049 -0.01766443  0.06589572
    +##  [9,] -0.3419099  0.058519366 -0.30619617  0.13093187
    +## [10,]  0.1868704  0.007310045 -0.45688227 -0.24311846
    +
    my_pca$rotation
    ##             PC1        PC2         PC3         PC4          PC5         PC6
    -## 100  -0.4158823  0.1488081  0.26747198 -0.08833244 -0.442314456  0.03071237
    -## long  0.3940515 -0.1520815  0.16894945 -0.24424963  0.368913901 -0.09378242
    -## poid  0.2691057  0.4835374 -0.09853273 -0.10776276 -0.009754680  0.23002054
    -## haut  0.2122818  0.0278985  0.85498656  0.38794393 -0.001876311  0.07454380
    -## 400  -0.3558474  0.3521598  0.18949642  0.08057457  0.146965351 -0.32692886
    -## 110  -0.4334816  0.0695682  0.12616012 -0.38229029 -0.088802794  0.21049130
    -## disq  0.1757923  0.5033347 -0.04609969  0.02558404  0.019358607  0.61491241
    -## perc  0.3840821  0.1495820 -0.13687235  0.14396548 -0.716743474 -0.34776037
    -## jave  0.1799436  0.3719570  0.19232803 -0.60046566  0.095582043 -0.43744387
    -## 1500 -0.1701426  0.4209653 -0.22255233  0.48564231  0.339772188 -0.30032419
    +## 100  -0.4158823  0.1488081 -0.26747198  0.08833244 -0.442314456  0.03071237
    +## long  0.3940515 -0.1520815 -0.16894945  0.24424963  0.368913901 -0.09378242
    +## poid  0.2691057  0.4835374  0.09853273  0.10776276 -0.009754680  0.23002054
    +## haut  0.2122818  0.0278985 -0.85498656 -0.38794393 -0.001876311  0.07454380
    +## 400  -0.3558474  0.3521598 -0.18949642 -0.08057457  0.146965351 -0.32692886
    +## 110  -0.4334816  0.0695682 -0.12616012  0.38229029 -0.088802794  0.21049130
    +## disq  0.1757923  0.5033347  0.04609969 -0.02558404  0.019358607  0.61491241
    +## perc  0.3840821  0.1495820  0.13687235 -0.14396548 -0.716743474 -0.34776037
    +## jave  0.1799436  0.3719570 -0.19232803  0.60046566  0.095582043 -0.43744387
    +## 1500 -0.1701426  0.4209653  0.22255233 -0.48564231  0.339772188 -0.30032419
     ##             PC7          PC8         PC9        PC10
     ## 100   0.2543985 -0.663712826  0.10839531 -0.10948045
     ## long  0.7505343 -0.141264141 -0.04613910 -0.05580431
    @@ -657,11 +657,151 @@ 

    15.1 Deriving MLE +

    Exercise 15.6 (Classification) Let \(D = \{(x_i, y_i)\}_{i=1}^n\) be a dataset of feature vectors and their corresponding integer class labels. We wish to classify feature vectors into correct classes.

    +
      +
    1. Choose a suitable probability distribution \(P_\theta(Y|X)\) and write its log likelihood \(\ell\).
    2. +
    3. Choose a differentiable function \(f_\phi\) that predicts parameters \(\theta\) from a feature vector, i.e. \(f_\phi(x_i) = \theta_i\).
    4. +
    5. Load the iris dataset with data(iris) and split it into train and test subsets.
    6. +
    7. Use gradient descent to find parameters \(\phi\) that minimize the negative log likelihood on the iris dataset (equivalently: maximize the log likelihood). Reminder: gradient descent is an iterative optimization procedure \(\phi_{t+1} = \phi_t - \eta \nabla_\phi \ell\). Try \(\eta = 0.01\) and run optimization for 30 steps. Compute the gradient with numDeriv::grad.
    8. +
    9. Print the classification accuracy for the train and test subsets.
    10. +
    + +
    +
    +

    Solution.

    +
      +
    1. We pick the categorical distribution.
    2. +
    3. Categorical distribution parameters are class probabilities that sum to 1. If there are \(m\) classes, we can pick any differentiable function that takes as input a vector of features and predicts a vector of size \(m\) whose elements are real numbers. We can then use a softmax transformation to map the predicted vector into one with non-negative entries that sum to 1. For simplicity, we can pick a linear transformation with \(\phi = (W, b)\), followed by softmax: +\[\begin{align*} +f_\phi(x) &= \textrm{softmax}(Wx + b), \\ +\textrm{softmax}(u)_i &= \frac{\exp(u_i)}{\sum_{j=1}^m \exp(u_j)}, + \end{align*}\] +where \(W \in \mathbb{R}^{m\times d}, b \in \mathbb{R}^m\) and \(d\) is the number of features.
    4. +
    +
    +
    data(iris)
    +head(iris)
    +
    ##   Sepal.Length Sepal.Width Petal.Length Petal.Width Species
    +## 1          5.1         3.5          1.4         0.2  setosa
    +## 2          4.9         3.0          1.4         0.2  setosa
    +## 3          4.7         3.2          1.3         0.2  setosa
    +## 4          4.6         3.1          1.5         0.2  setosa
    +## 5          5.0         3.6          1.4         0.2  setosa
    +## 6          5.4         3.9          1.7         0.4  setosa
    +
    # Model:
    +# y ~ Categorical(softmax(weights * features + bias))
    +# Want to maximize the (log) likelihood of y w.r.t. weights and bias.
    +# Need gradient of log likelihood w.r.t. weights and bias.
    +# Proceed by gradient descent on negative log likelihood.
    +
    +weights <- matrix(data=rnorm(4 * 3), nrow=4, ncol=3)
    +bias <- matrix(data=rnorm(3), nrow=1, ncol=3)
    +
    +model <- function(features, weights, bias) {
    +  # weights: 4 x 3 matrix of per-class coefficients, bias: length-3 vector (or 1 x 3 matrix).
    +  # Returns a 3 x n matrix of linear scores (classes in rows, observations in columns).
    +  return(t(features %*% weights + bias))
    +}
    +
    +softmax <- function(v) {
    +  return(exp(v) / sum(exp(v)))
    +}
    +
    +categorical_mass <- function(targets, probs) {
    +  # targets: matrix of size (n_data, n_classes) whose rows are one-hot vectors
    +  # probs: matrix of size (n_data, n_classes) whose rows are class probabilities
    +  apply(probs * targets, 1, sum)
    +}
    +
    +predict_probs <- function(features, model, parameters) {
    +  weights <- parameters[1:4, ]
    +  bias <- parameters[5, ]
    +  u <- model(features, weights, bias)
    +  apply(u, 1, softmax)
    +}
    +
    +accuracy <- function(features, targets, model, parameters) {
    +  probs <- predict_probs(features, model, parameters)
    +  argmax_mat <- t(apply(probs, 1, function(v) {v == max(v)}))
    +  correct_predictions <- apply(argmax_mat * targets, 1, sum)
    +  return(mean(correct_predictions))
    +}
    +
    +neg_log_lik <- function(features, targets, model, parameters) {
    +  probs <- predict_probs(features, model, parameters)
    +  -sum(log(categorical_mass(targets, probs)))
    +}
    +
    +grad_neg_log_lik <- function(features, targets, model, parameters){
    +  numDeriv::grad(function(par){neg_log_lik(features, targets, model, par)}, parameters)
    +}
    +
    +gradient_descent <- function(initial_parameters, features, targets, step_size = 0.01, n_steps = 30) {
    +  parameters <- initial_parameters
    +  for (i in 1:n_steps) {
    +    print(sprintf("[%d] loss: %.4f, accuracy: %.2f", i, neg_log_lik(features, targets, model, parameters), accuracy(features, targets, model, parameters)))
    +    parameters <- parameters - step_size * grad_neg_log_lik(features, targets, model, parameters)
    +  }
    +  return(parameters)
    +}
    +
    +
    +x <- as.matrix(subset(iris, select=-c(Species)))
    +y <- matrix(nrow=nrow(iris), ncol=3)
    +y[, 1] <- iris$Species == "setosa"
    +y[, 2] <- iris$Species == "versicolor"
    +y[, 3] <- iris$Species == "virginica"
    +
    +# Take an equal number of representatives for every class for the training and test subsets
    +# Note: code is written so that shuffling does not matter during optimization
    +x_train <- x[c(1:35, 51:85, 101:135), ]
    +y_train <- y[c(1:35, 51:85, 101:135), ]
    +
    +x_test <- x[-c(1:35, 51:85, 101:135), ]
    +y_test <- y[-c(1:35, 51:85, 101:135), ]
    +
    +set.seed(0)
    +optimized_parameters <- gradient_descent(rbind(weights, bias), x_train, y_train)
    +
    ## [1] "[1] loss: 687.3208, accuracy: 0.02"
    +## [1] "[2] loss: 452.4356, accuracy: 0.66"
    +## [1] "[3] loss: 429.3215, accuracy: 0.92"
    +## [1] "[4] loss: 427.2969, accuracy: 0.93"
    +## [1] "[5] loss: 425.9729, accuracy: 0.93"
    +## [1] "[6] loss: 424.9825, accuracy: 0.95"
    +## [1] "[7] loss: 424.1846, accuracy: 0.95"
    +## [1] "[8] loss: 423.5169, accuracy: 0.95"
    +## [1] "[9] loss: 422.9466, accuracy: 0.95"
    +## [1] "[10] loss: 422.4535, accuracy: 0.94"
    +## [1] "[11] loss: 422.0234, accuracy: 0.94"
    +## [1] "[12] loss: 421.6456, accuracy: 0.93"
    +## [1] "[13] loss: 421.3116, accuracy: 0.92"
    +## [1] "[14] loss: 421.0144, accuracy: 0.92"
    +## [1] "[15] loss: 420.7485, accuracy: 0.92"
    +## [1] "[16] loss: 420.5091, accuracy: 0.92"
    +## [1] "[17] loss: 420.2926, accuracy: 0.92"
    +## [1] "[18] loss: 420.0955, accuracy: 0.92"
    +## [1] "[19] loss: 419.9153, accuracy: 0.92"
    +## [1] "[20] loss: 419.7498, accuracy: 0.92"
    +## [1] "[21] loss: 419.5969, accuracy: 0.92"
    +## [1] "[22] loss: 419.4552, accuracy: 0.92"
    +## [1] "[23] loss: 419.3233, accuracy: 0.92"
    +## [1] "[24] loss: 419.2000, accuracy: 0.92"
    +## [1] "[25] loss: 419.0843, accuracy: 0.92"
    +## [1] "[26] loss: 418.9755, accuracy: 0.92"
    +## [1] "[27] loss: 418.8727, accuracy: 0.92"
    +## [1] "[28] loss: 418.7754, accuracy: 0.92"
    +## [1] "[29] loss: 418.6831, accuracy: 0.92"
    +## [1] "[30] loss: 418.5952, accuracy: 0.92"
    +
    accuracy(x_train, y_train, model, optimized_parameters)
    +
    ## [1] 0.9238095
    +
    accuracy(x_test, y_test, model, optimized_parameters)
    +
    ## [1] 0.9555556
    +

    15.2 Fisher information

    -

    Exercise 15.6 Let us assume a Poisson likelihood.

    +

    Exercise 15.7 Let us assume a Poisson likelihood.

    1. Derive the MLE estimate of the mean.

    2. Derive the Fisher information.

    3. @@ -669,10 +809,10 @@

    Use the bootstrap to construct the CI for the mean. Compare with c) and discuss.

    -
    x <- c(2, 5, 3, 1, 2, 1, 0, 3, 0, 2)
    +
    x <- c(2, 5, 3, 1, 2, 1, 0, 3, 0, 2)
    -

    Solution.

    +

    Solution.

    1. The log likelihood of the Poisson is \[\begin{align*} @@ -696,28 +836,28 @@
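The derivation is truncated here; for reference (consistent with the code below, though not quoted from the source), the standard results are
\[\begin{align*}
l(\lambda; x) &= \sum_{i=1}^n \big(x_i \ln\lambda - \lambda - \ln x_i!\big), \qquad
\hat{\lambda} = \bar{x}, \qquad
I(\lambda) = \frac{n}{\lambda},
\end{align*}\]
where \(I(\lambda)\) is the Fisher information of the whole sample, which is why the code uses finfo <- length(x) / lambda_hat and the asymptotic CI \(\hat{\lambda} \pm 1.96 \sqrt{1 / I(\hat{\lambda})}\).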

      15.2 Fisher information
      set.seed(1)
      -x          <- c(2, 5, 3, 1, 2, 1, 0, 3, 0, 2)
      -lambda_hat <- mean(x)
      -finfo      <- length(x) / lambda_hat
      -mle_CI     <- c(lambda_hat - 1.96 * sqrt(1 / finfo),
      -                lambda_hat + 1.96 * sqrt(1 / finfo))
      -boot_lambda <- c()
      -nboot       <- 1000
      -for (i in 1:nboot) {
      -  tmp_x          <- sample(x, length(x), replace = T)
      -  boot_lambda[i] <- mean(tmp_x)
      -}
      -boot_CI <- c(quantile(boot_lambda, 0.025),
      -             quantile(boot_lambda, 0.975))
      -mle_CI

    +
    set.seed(1)
    +x          <- c(2, 5, 3, 1, 2, 1, 0, 3, 0, 2)
    +lambda_hat <- mean(x)
    +finfo      <- length(x) / lambda_hat
    +mle_CI     <- c(lambda_hat - 1.96 * sqrt(1 / finfo),
    +                lambda_hat + 1.96 * sqrt(1 / finfo))
    +boot_lambda <- c()
    +nboot       <- 1000
    +for (i in 1:nboot) {
    +  tmp_x          <- sample(x, length(x), replace = T)
    +  boot_lambda[i] <- mean(tmp_x)
    +}
    +boot_CI <- c(quantile(boot_lambda, 0.025),
    +             quantile(boot_lambda, 0.975))
    +mle_CI
    ## [1] 1.045656 2.754344
    -
    boot_CI
    +
    boot_CI
    ##  2.5% 97.5% 
     ##   1.0   2.7
    -

    Exercise 15.7

    +

    Exercise 15.8

    1. Find the Fisher information matrix for the Gamma distribution.

    2. Generate 20 samples from a Gamma distribution and plot a confidence ellipse of the inverse of Fisher information matrix around the ML estimates of the parameters. Also plot the theoretical values. Repeat the sampling several times. What do you observe?

    3. @@ -727,7 +867,7 @@

      15.2 Fisher information
      -

      Solution.

      +

      Solution.

      1. The log likelihood of the Gamma is \[\begin{equation*} @@ -761,50 +901,113 @@
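The derivation is truncated in this diff; for reference (consistent with fish_mat constructed in the code below, not quoted from the source), the log likelihood and the Fisher information matrix of the sample are
\[\begin{align*}
l(\alpha, \beta; x) &= n\alpha\ln\beta - n\ln\Gamma(\alpha) + (\alpha - 1)\sum_{i=1}^n \ln x_i - \beta\sum_{i=1}^n x_i, \\
I(\alpha, \beta) &= n\begin{bmatrix} \psi_1(\alpha) & -\frac{1}{\beta} \\ -\frac{1}{\beta} & \frac{\alpha}{\beta^2} \end{bmatrix},
\end{align*}\]
where \(\psi_1\) is the trigamma function (obtained numerically in the code as grad(digamma, pars_mle[1])).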

        15.2 Fisher information
        set.seed(1)
        -n  <- 20
        -pars_theor <- c(5, 2)
        -x  <- rgamma(n, 5, 2)
        -
        -
        -# MLE for alpha and beta
        -log_lik <- function (pars, x) {
        -  n <- length(x)
        -  return (- (n * pars[1] * log(pars[2]) -
        -             n * log(gamma(pars[1])) +
        -             (pars[1] - 1) * sum(log(x)) -
        -             pars[2] * sum(x)))
        -}
        -my_optim <- optim(par = c(1,1), fn = log_lik, method = "L-BFGS-B",
        -                  lower = c(0.001, 0.001), upper = c(8, 8), x = x)
        -pars_mle <- my_optim$par
        -
        -fish_mat <- matrix(data = NA, nrow = 2, ncol = 2)
        -fish_mat[1,2] <- - n / pars_mle[2]
        -fish_mat[2,1] <- - n / pars_mle[2]
        -fish_mat[2,2] <- (n * pars_mle[1]) / (pars_mle[2]^2)
        -fish_mat[1,1] <- n * grad(digamma, pars_mle[1])
        -
        -fish_mat_inv <- solve(fish_mat)
        -
        -est_ellip <- ellipse(pars_mle, fish_mat_inv, draw = FALSE)
        -colnames(est_ellip) <- c("X1", "X2")
        -est_ellip <- as.data.frame(est_ellip)
        -
        -ggplot() +
        -  geom_point(data = data.frame(x = pars_mle[1], y = pars_mle[2]), aes(x = x, y = y)) +
        -  geom_path(data = est_ellip, aes(x = X1, y = X2)) +
        -  geom_point(aes(x = pars_theor[1], y = pars_theor[2]), color = "red") +
        -  geom_text(aes(x = pars_theor[1], y = pars_theor[2], label = "Theoretical parameters"), 
        -            color = "red",
        -            nudge_y = -0.2)

      -

      +
      set.seed(1)
      +n  <- 20
      +pars_theor <- c(5, 2)
      +x  <- rgamma(n, 5, 2)
      +
      +
      +# MLE for alpha and beta
      +log_lik <- function (pars, x) {
      +  n <- length(x)
      +  return (- (n * pars[1] * log(pars[2]) -
      +             n * log(gamma(pars[1])) +
      +             (pars[1] - 1) * sum(log(x)) -
      +             pars[2] * sum(x)))
      +}
      +my_optim <- optim(par = c(1,1), fn = log_lik, method = "L-BFGS-B",
      +                  lower = c(0.001, 0.001), upper = c(8, 8), x = x)
      +pars_mle <- my_optim$par
      +
      +fish_mat <- matrix(data = NA, nrow = 2, ncol = 2)
      +fish_mat[1,2] <- - n / pars_mle[2]
      +fish_mat[2,1] <- - n / pars_mle[2]
      +fish_mat[2,2] <- (n * pars_mle[1]) / (pars_mle[2]^2)
      +fish_mat[1,1] <- n * grad(digamma, pars_mle[1])
      +
      +fish_mat_inv <- solve(fish_mat)
      +
      +est_ellip <- ellipse(pars_mle, fish_mat_inv, draw = FALSE)
      +colnames(est_ellip) <- c("X1", "X2")
      +est_ellip <- as.data.frame(est_ellip)
      +
      +ggplot() +
      +  geom_point(data = data.frame(x = pars_mle[1], y = pars_mle[2]), aes(x = x, y = y)) +
      +  geom_path(data = est_ellip, aes(x = X1, y = X2)) +
      +  geom_point(aes(x = pars_theor[1], y = pars_theor[2]), color = "red") +
      +  geom_text(aes(x = pars_theor[1], y = pars_theor[2], label = "Theoretical parameters"), 
      +            color = "red",
      +            nudge_y = -0.2)
      +

      +

    +
    +

    Exercise 15.9 Find the unit Fisher information matrix for the univariate normal distribution.

    +
    +
    +
    +

    Solution. The normal density is +\[\begin{equation*} + p(x; \mu, \sigma) = \frac{1}{\sqrt{2\pi \sigma^2}} \exp\left(-0.5 \frac{(x-\mu)^2}{\sigma^2}\right). +\end{equation*}\]

    +

    Its logarithm is +\[\begin{equation*} + \log p(x; \mu, \sigma) = -0.5\log(2\pi) - \log \sigma - 0.5 \frac{(x-\mu)^2}{\sigma^2}. +\end{equation*}\]

    +

    The second order partial derivatives of the log density are +\[\begin{align*} + \frac{\partial^2}{\partial \mu^2} \log p(x; \mu, \sigma) &= -\frac{1}{\sigma^2}, \\ + \frac{\partial^2}{\partial \mu \partial \sigma} \log p(x; \mu, \sigma) &= -\frac{2(x-\mu)}{\sigma^3}, \\ + \frac{\partial^2}{\partial \sigma^2} \log p(x; \mu, \sigma) &= \frac{1}{\sigma^2} - \frac{3(x-\mu)^2}{\sigma^4}. +\end{align*}\]

    +

    The unit Fisher information matrix is then +\[\begin{align*} +I(\mu, \sigma) = + - E\left[ + \begin{bmatrix} + -\frac{1}{\sigma^2} & -\frac{2(x-\mu)}{\sigma^3} \\ + -\frac{2(x-\mu)}{\sigma^3} & \frac{1}{\sigma^2} - \frac{3(x-\mu)^2}{\sigma^4} + \end{bmatrix} + \right] = + \begin{bmatrix} + \frac{1}{\sigma^2} & 0 \\ + 0 & \frac{2}{\sigma^2} + \end{bmatrix}, +\end{align*}\] +where we used the fact that \(E[X - \mu] = 0\) and \(E[(X - \mu)^2] = \sigma^2\).
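As a quick numerical sanity check (a sketch, not part of the original solution), we can average the negative Hessian of the log density over simulated data and compare it with the analytic unit Fisher information diag\((1/\sigma^2, 2/\sigma^2)\); the values of mu and sigma below are illustrative.

    # Sketch: Monte Carlo estimate of -E[Hessian of log p] vs the analytic result.
    library(numDeriv)
    set.seed(1)
    mu    <- 1
    sigma <- 2
    log_dens <- function(pars, obs) dnorm(obs, mean = pars[1], sd = pars[2], log = TRUE)
    x <- rnorm(2000, mu, sigma)
    hess_sum <- matrix(0, nrow = 2, ncol = 2)
    for (obs in x) {
      hess_sum <- hess_sum + hessian(log_dens, c(mu, sigma), obs = obs)
    }
    -hess_sum / length(x)                          # Monte Carlo estimate of I(mu, sigma)
    matrix(c(1 / sigma^2, 0, 0, 2 / sigma^2), 2)   # analytic unit Fisher information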

    +
    +
    +
    +

    Exercise 15.10 Find the unit Fisher information for the binomial distribution with fixed \(n\).

    +
    +
    +
    +

    Solution. The binomial mass is +\[\begin{equation*} + P(X = k; n, p) = \binom{n}{k}p^k(1-p)^{n-k}. +\end{equation*}\]

    +

    Its logarithm is +\[\begin{equation*} + \log P(X = k; n, p) = \log \binom{n}{k} + k\log p + (n-k)\log(1-p). +\end{equation*}\]

    +

    The partial derivatives are +\[\begin{align*} + \frac{\partial}{\partial p} \log P(X = k; n, p) &= \frac{k}{p} - \frac{n-k}{1-p}, \\ + \frac{\partial^2}{\partial p^2} \log P(X = k; n, p) &= -\frac{k}{p^2} - \frac{n-k}{(1-p)^2}. +\end{align*}\]

    +

    The unit Fisher information is +\[\begin{align*} +I(p) = + - E\left[ \frac{\partial^2}{\partial p^2} \log P(X = k; n, p) \right] = + \frac{n}{p(1-p)}, +\end{align*}\] +where we used the fact that \(E[k] = np\) for \(k \sim \text{Binomial}(n, p)\).
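A short numerical check (a sketch, not from the original solution; n and p below are illustrative) compares a Monte Carlo estimate of \(-E\left[\frac{\partial^2}{\partial p^2} \log P(X = k)\right]\) with the analytic value \(n / (p(1-p))\).

    # Sketch: Monte Carlo check of the binomial Fisher information.
    set.seed(1)
    n <- 10
    p <- 0.4
    k <- rbinom(100000, size = n, prob = p)
    mean(k / p^2 + (n - k) / (1 - p)^2)  # Monte Carlo estimate of I(p)
    n / (p * (1 - p))                    # analytic value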

    +

    15.3 The German tank problem

    -

    Exercise 15.8 (The German tank problem) During WWII, Allied intelligence faced the important problem of estimating the total production of certain German tanks, such as the Panther. An approach that turned out to be successful was to estimate the maximum from the serial numbers of a small sample of captured or destroyed tanks (describe the statistical model used).

    +

    Exercise 15.11 (The German tank problem) During WWII, Allied intelligence faced the important problem of estimating the total production of certain German tanks, such as the Panther. An approach that turned out to be successful was to estimate the maximum from the serial numbers of a small sample of captured or destroyed tanks (describe the statistical model used).

    1. What assumptions were made by using the above model? Do you think they are reasonable assumptions in practice?
    2. Show that the plug-in estimate for the maximum (i.e. the maximum of the sample) is a biased estimator.
    3. @@ -814,7 +1017,7 @@
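A small simulation sketch for point 2 (added for illustration, not from the original solution; N and k below are illustrative values), assuming serial numbers are drawn uniformly from \(1, \dots, N\) without replacement:

    # Sketch: the sample maximum underestimates the true maximum N on average,
    # so the plug-in estimator is biased.
    set.seed(1)
    N <- 1000                                     # true (unknown) maximum
    k <- 10                                       # number of captured tanks
    plugin_max <- replicate(10000, max(sample(1:N, k)))
    mean(plugin_max)                              # noticeably below N
    k * (N + 1) / (k + 1)                         # theoretical E[sample maximum]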

      15.3 The German tank problem
      -
      library(ggplot2)
      -my_fun <- function (x, m, k) {
      -  tmp        <-  1 / (choose(x, k))
      -  tmp[x < m] <- 0
      -  return (tmp)
      -}
      -x  <- 1:20
      -y  <- my_fun(x, 10, 4)
      -df <- data.frame(x = x, y = y)
      -ggplot(data = df, aes(x = x, y = y)) +
      -  geom_line()
      -

      +

      library(ggplot2)
      +my_fun <- function (x, m, k) {
      +  tmp        <-  1 / (choose(x, k))
      +  tmp[x < m] <- 0
      +  return (tmp)
      +}
      +x  <- 1:20
      +y  <- my_fun(x, 10, 4)
      +df <- data.frame(x = x, y = y)
      +ggplot(data = df, aes(x = x, y = y)) +
      +  geom_line()
      +

      ::: {.solution}

      1. (continued) We observe that the maximum of this function lies at the maximum value of the sample. Therefore \(n^* = m\) and the ML estimate equals the plug-in estimate.

      2. diff --git a/docs/reference-keys.txt b/docs/reference-keys.txt index ed6807c..04d9363 100644 --- a/docs/reference-keys.txt +++ b/docs/reference-keys.txt @@ -210,3 +210,12 @@ exr:unnamed-chunk-32 exr:unnamed-chunk-35 exr:unnamed-chunk-5 exr:unnamed-chunk-40 +exr:unnamed-chunk-10 +exr:unnamed-chunk-19 +exr:unnamed-chunk-22 +exr:unnamed-chunk-24 +exr:unnamed-chunk-26 +exr:unnamed-chunk-28 +exr:unnamed-chunk-27 +exr:unnamed-chunk-29 +exr:unnamed-chunk-12 diff --git a/docs/search_index.json b/docs/search_index.json index 479182c..6f2fbb8 100644 --- a/docs/search_index.json +++ b/docs/search_index.json @@ -1 +1 @@ -[["rvs.html", "Chapter 4 Random variables 4.1 General properties and calculations 4.2 Discrete random variables 4.3 Continuous random variables 4.4 Singular random variables 4.5 Transformations", " Chapter 4 Random variables This chapter deals with random variables and their distributions. The students are expected to acquire the following knowledge: Theoretical Identification of random variables. Convolutions of random variables. Derivation of PDF, PMF, CDF, and quantile function. Definitions and properties of common discrete random variables. Definitions and properties of common continuous random variables. Transforming univariate random variables. R Familiarize with PDF, PMF, CDF, and quantile functions for several distributions. Visual inspection of probability distributions. Analytical and empirical calculation of probabilities based on distributions. New R functions for plotting (for example, facet_wrap). Creating random number generators based on the Uniform distribution. .fold-btn { float: right; margin: 5px 5px 0 0; } .fold { border: 1px solid black; min-height: 40px; } 4.1 General properties and calculations Exercise 4.1 Which of the functions below are valid CDFs? Find their respective densities. R: Plot the three functions. \\[\\begin{equation} F(x) = \\begin{cases} 1 - e^{-x^2} & x \\geq 0 \\\\ 0 & x < 0. \\end{cases} \\end{equation}\\] \\[\\begin{equation} F(x) = \\begin{cases} e^{-\\frac{1}{x}} & x > 0 \\\\ 0 & x \\leq 0. \\end{cases} \\end{equation}\\] \\[\\begin{equation} F(x) = \\begin{cases} 0 & x \\leq 0 \\\\ \\frac{1}{3} & 0 < x \\leq \\frac{1}{2} \\\\ 1 & x > \\frac{1}{2}. \\end{cases} \\end{equation}\\] Solution. Yes. First, let us check the limits. \\(\\lim_{x \\rightarrow -\\infty} (0) = 0\\). \\(\\lim_{x \\rightarrow \\infty} (1 - e^{-x^2}) = 1 - \\lim_{x \\rightarrow \\infty} e^{-x^2} = 1 - 0 = 1\\). Second, let us check whether the function is increasing. Let \\(x > y \\geq 0\\). Then \\(1 - e^{-x^2} \\geq 1 - e^{-y^2}\\). We only have to check right continuity for the point zero. \\(F(0) = 0\\) and \\(\\lim_{\\epsilon \\downarrow 0}F (0 + \\epsilon) = \\lim_{\\epsilon \\downarrow 0} 1 - e^{-\\epsilon^2} = 1 - \\lim_{\\epsilon \\downarrow 0} e^{-\\epsilon^2} = 1 - 1 = 0\\). We get the density by differentiating the CDF. \\(p(x) = \\frac{d}{dx} 1 - e^{-x^2} = 2xe^{-x^2}.\\) Students are encouraged to check that this is a proper PDF. Yes. First, let us check the limits. $_{x -} (0) = 0 and \\(\\lim_{x \\rightarrow \\infty} (e^{-\\frac{1}{x}}) = 1\\). Second, let us check whether the function is increasing. Let \\(x > y \\geq 0\\). Then \\(e^{-\\frac{1}{x}} \\geq e^{-\\frac{1}{y}}\\). We only have to check right continuity for the point zero. \\(F(0) = 0\\) and \\(\\lim_{\\epsilon \\downarrow 0}F (0 + \\epsilon) = \\lim_{\\epsilon \\downarrow 0} e^{-\\frac{1}{\\epsilon}} = 0\\). We get the density by differentiating the CDF. 
\\(p(x) = \\frac{d}{dx} e^{-\\frac{1}{x}} = \\frac{1}{x^2}e^{-\\frac{1}{x}}.\\) Students are encouraged to check that this is a proper PDF. No. The function is not right continuous as \\(F(\\frac{1}{2}) = \\frac{1}{3}\\), but \\(\\lim_{\\epsilon \\downarrow 0} F(\\frac{1}{2} + \\epsilon) = 1\\). f1 <- function (x) { tmp <- 1 - exp(-x^2) tmp[x < 0] <- 0 return(tmp) } f2 <- function (x) { tmp <- exp(-(1 / x)) tmp[x <= 0] <- 0 return(tmp) } f3 <- function (x) { tmp <- x tmp[x == x] <- 1 tmp[x <= 0.5] <- 1/3 tmp[x <= 0] <- 0 return(tmp) } cdf_data <- tibble(x = seq(-1, 20, by = 0.001), f1 = f1(x), f2 = f2(x), f3 = f3(x)) %>% melt(id.vars = "x") cdf_plot <- ggplot(data = cdf_data, aes(x = x, y = value, color = variable)) + geom_hline(yintercept = 1) + geom_line() plot(cdf_plot) Exercise 4.2 Let \\(X\\) be a random variable with CDF \\[\\begin{equation} F(x) = \\begin{cases} 0 & x < 0 \\\\ \\frac{x^2}{2} & 0 \\leq x < 1 \\\\ \\frac{1}{2} + \\frac{p}{2} & 1 \\leq x < 2 \\\\ \\frac{1}{2} + \\frac{p}{2} + \\frac{1 - p}{2} & x \\geq 2 \\end{cases} \\end{equation}\\] R: Plot this CDF for \\(p = 0.3\\). Is it a discrete, continuous, or mixed random varible? Find the probability density/mass of \\(X\\). f1 <- function (x, p) { tmp <- x tmp[x >= 2] <- 0.5 + (p * 0.5) + ((1-p) * 0.5) tmp[x < 2] <- 0.5 + (p * 0.5) tmp[x < 1] <- (x[x < 1])^2 / 2 tmp[x < 0] <- 0 return(tmp) } cdf_data <- tibble(x = seq(-1, 5, by = 0.001), y = f1(x, 0.3)) cdf_plot <- ggplot(data = cdf_data, aes(x = x, y = y)) + geom_hline(yintercept = 1) + geom_line(color = "blue") plot(cdf_plot) ::: {.solution} \\(X\\) is a mixed random variable. Since \\(X\\) is a mixed random variable, we have to find the PDF of the continuous part and the PMF of the discrete part. We get the continuous part by differentiating the corresponding CDF, \\(\\frac{d}{dx}\\frac{x^2}{2} = x\\). So the PDF, when \\(0 \\leq x < 1\\), is \\(p(x) = x\\). Let us look at the discrete part now. It has two steps, so this is a discrete distribution with two outcomes – numbers 1 and 2. The first happens with probability \\(\\frac{p}{2}\\), and the second with probability \\(\\frac{1 - p}{2}\\). This reminds us of the Bernoulli distribution. The PMF for the discrete part is \\(P(X = x) = (\\frac{p}{2})^{2 - x} (\\frac{1 - p}{2})^{x - 1}\\). ::: Exercise 4.3 (Convolutions) Convolutions are probability distributions that correspond to sums of independent random variables. Let \\(X\\) and \\(Y\\) be independent discrete variables. Find the PMF of \\(Z = X + Y\\). Hint: Use the law of total probability. Let \\(X\\) and \\(Y\\) be independent continuous variables. Find the PDF of \\(Z = X + Y\\). Hint: Start with the CDF. Solution. \\[\\begin{align} P(Z = z) &= P(X + Y = z) & \\\\ &= \\sum_{k = -\\infty}^\\infty P(X + Y = z | Y = k) P(Y = k) & \\text{ (law of total probability)} \\\\ &= \\sum_{k = -\\infty}^\\infty P(X + k = z | Y = k) P(Y = k) & \\\\ &= \\sum_{k = -\\infty}^\\infty P(X + k = z) P(Y = k) & \\text{ (independence of $X$ and $Y$)} \\\\ &= \\sum_{k = -\\infty}^\\infty P(X = z - k) P(Y = k). & \\end{align}\\] Let \\(f\\) and \\(g\\) be the PDFs of \\(X\\) and \\(Y\\) respectively. 
\\[\\begin{align} F(z) &= P(Z < z) \\\\ &= P(X + Y < z) \\\\ &= \\int_{-\\infty}^{\\infty} P(X + Y < z | Y = y)P(Y = y)dy \\\\ &= \\int_{-\\infty}^{\\infty} P(X + y < z | Y = y)P(Y = y)dy \\\\ &= \\int_{-\\infty}^{\\infty} P(X + y < z)P(Y = y)dy \\\\ &= \\int_{-\\infty}^{\\infty} P(X < z - y)P(Y = y)dy \\\\ &= \\int_{-\\infty}^{\\infty} (\\int_{-\\infty}^{z - y} f(x) dx) g(y) dy \\end{align}\\] Now \\[\\begin{align} p(z) &= \\frac{d}{dz} F(z) & \\\\ &= \\int_{-\\infty}^{\\infty} (\\frac{d}{dz}\\int_{-\\infty}^{z - y} f(x) dx) g(y) dy & \\\\ &= \\int_{-\\infty}^{\\infty} f(z - y) g(y) dy & \\text{ (fundamental theorem of calculus)}. \\end{align}\\] 4.2 Discrete random variables Exercise 4.4 (Binomial random variable) Let \\(X_k\\), \\(k = 1,...,n\\), be random variables with the Bernoulli measure as the PMF. Let \\(X = \\sum_{k=1}^n X_k\\). We call \\(X_k\\) a Bernoulli random variable with parameter \\(p \\in (0,1)\\). Find the CDF of \\(X_k\\). Find PMF of \\(X\\). This is a Binomial random variable with support in \\(\\{0,1,2,...,n\\}\\) and parameters \\(p \\in (0,1)\\) and \\(n \\in \\mathbb{N}_0\\). We denote \\[\\begin{equation} X | n,p \\sim \\text{binomial}(n,p). \\end{equation}\\] Find CDF of \\(X\\). R: Simulate from the binomial distribution with \\(n = 10\\) and \\(p = 0.5\\), and from \\(n\\) Bernoulli distributions with \\(p = 0.5\\). Visually compare the sum of Bernoullis and the binomial. Hint: there is no standard function like rpois for a Bernoulli random variable. Check exercise ?? to find out how to sample from a Bernoulli distribution. Solution. There are two outcomes – zero and one. Zero happens with probability \\(1 - p\\). Therefore \\[\\begin{equation} F(k) = \\begin{cases} 0 & k < 0 \\\\ 1 - p & 0 \\leq k < 1 \\\\ 1 & k \\geq 1. \\end{cases} \\end{equation}\\] For the probability of \\(X\\) to be equal to some \\(k \\leq n\\), exactly \\(k\\) Bernoulli variables need to be one, and the others zero. So \\(p^k(1-p)^{n-k}\\). There are \\(\\binom{n}{k}\\) such possible arrangements. Therefore \\[\\begin{align} P(X = k) = \\binom{n}{k} p^k (1 - p)^{n-k}. \\end{align}\\] \\[\\begin{equation} F(k) = \\sum_{i = 0}^{\\lfloor k \\rfloor} \\binom{n}{i} p^i (1 - p)^{n - i} \\end{equation}\\] set.seed(1) nsamps <- 10000 binom_samp <- rbinom(nsamps, size = 10, prob = 0.5) bernoulli_mat <- matrix(data = NA, nrow = nsamps, ncol = 10) for (i in 1:nsamps) { bernoulli_mat[i, ] <- rbinom(10, size = 1, prob = 0.5) } bern_samp <- apply(bernoulli_mat, 1, sum) b_data <- tibble(x = c(binom_samp, bern_samp), type = c(rep("binomial", 10000), rep("Bernoulli_sum", 10000))) b_plot <- ggplot(data = b_data, aes(x = x, fill = type)) + geom_bar(position = "dodge") plot(b_plot) Exercise 4.5 (Geometric random variable) A variable with PMF \\[\\begin{equation} P(k) = p(1-p)^k \\end{equation}\\] is a geometric random variable with support in non-negative integers. It has one parameter \\(p \\in (0,1]\\). We denote \\[\\begin{equation} X | p \\sim \\text{geometric}(p) \\end{equation}\\] Derive the CDF of a geometric random variable. R: Draw 1000 samples from the geometric distribution with \\(p = 0.3\\) and compare their frequencies to theoretical values. Solution. 
\\[\\begin{align} P(X \\leq k) &= \\sum_{i = 0}^k p(1-p)^i \\\\ &= p \\sum_{i = 0}^k (1-p)^i \\\\ &= p \\frac{1 - (1-p)^{k+1}}{1 - (1 - p)} \\\\ &= 1 - (1-p)^{k + 1} \\end{align}\\] set.seed(1) geo_samp <- rgeom(n = 1000, prob = 0.3) geo_samp <- data.frame(x = geo_samp) %>% count(x) %>% mutate(n = n / 1000, type = "empirical_frequencies") %>% bind_rows(data.frame(x = 0:20, n = dgeom(0:20, prob = 0.3), type = "theoretical_measure")) geo_plot <- ggplot(data = geo_samp, aes(x = x, y = n, fill = type)) + geom_bar(stat="identity", position = "dodge") plot(geo_plot) Exercise 4.6 (Poisson random variable) A variable with PMF \\[\\begin{equation} P(k) = \\frac{\\lambda^k e^{-\\lambda}}{k!} \\end{equation}\\] is a Poisson random variable with support in non-negative integers. It has one positive parameter \\(\\lambda\\), which also represents its mean value and variance (a measure of the deviation of the values from the mean – more on mean and variance in the next chapter). We denote \\[\\begin{equation} X | \\lambda \\sim \\text{Poisson}(\\lambda). \\end{equation}\\] This distribution is usually the default choice for modeling counts. We have already encountered a Poisson random variable in exercise ??, where we also sampled from this distribution. The CDF of a Poisson random variable is \\(P(X <= x) = e^{-\\lambda} \\sum_{i=0}^x \\frac{\\lambda^{i}}{i!}\\). R: Draw 1000 samples from the Poisson distribution with \\(\\lambda = 5\\) and compare their empirical cumulative distribution function with the theoretical CDF. set.seed(1) pois_samp <- rpois(n = 1000, lambda = 5) pois_samp <- data.frame(x = pois_samp) pois_plot <- ggplot(data = pois_samp, aes(x = x, colour = "ECDF")) + stat_ecdf(geom = "step") + geom_step(data = tibble(x = 0:17, y = ppois(x, 5)), aes(x = x, y = y, colour = "CDF")) + scale_colour_manual("Lgend title", values = c("black", "red")) plot(pois_plot) Exercise 4.7 (Negative binomial random variable) A variable with PMF \\[\\begin{equation} p(k) = \\binom{k + r - 1}{k}(1-p)^r p^k \\end{equation}\\] is a negative binomial random variable with support in non-negative integers. It has two parameters \\(r > 0\\) and \\(p \\in (0,1)\\). We denote \\[\\begin{equation} X | r,p \\sim \\text{NB}(r,p). \\end{equation}\\] Let us reparameterize the negative binomial distribution with \\(q = 1 - p\\). Find the PMF of \\(X \\sim \\text{NB}(1, q)\\). Do you recognize this distribution? Show that the sum of two negative binomial random variables with the same \\(p\\) is also a negative binomial random variable. Hint: Use the fact that the number of ways to place \\(n\\) indistinct balls into \\(k\\) boxes is \\(\\binom{n + k - 1}{n}\\). R: Draw samples from \\(X \\sim \\text{NB}(5, 0.4)\\) and \\(Y \\sim \\text{NB}(3, 0.4)\\). Draw samples from \\(Z = X + Y\\), where you use the parameters calculated in b). Plot both distributions, their sum, and \\(Z\\) using facet_wrap. Be careful, as R uses a different parameterization size=\\(r\\) and prob=\\(1 - p\\). Solution. \\[\\begin{align} P(X = k) &= \\binom{k + 1 - 1}{k}q^1 (1-q)^k \\\\ &= q(1-q)^k. \\end{align}\\] This is the geometric distribution. Let \\(X \\sim \\text{NB}(r_1, p)\\) and \\(Y \\sim \\text{NB}(r_2, p)\\). Let \\(Z = X + Y\\). 
\\[\\begin{align} P(Z = z) &= \\sum_{k = 0}^{\\infty} P(X = z - k)P(Y = k), \\text{ if k < 0, then the probabilities are 0} \\\\ &= \\sum_{k = 0}^{z} P(X = z - k)P(Y = k), \\text{ if k > z, then the probabilities are 0} \\\\ &= \\sum_{k = 0}^{z} \\binom{z - k + r_1 - 1}{z - k}(1 - p)^{r_1} p^{z - k} \\binom{k + r_2 - 1}{k}(1 - p)^{r_2} p^{k} & \\\\ &= \\sum_{k = 0}^{z} \\binom{z - k + r_1 - 1}{z - k} \\binom{k + r_2 - 1}{k}(1 - p)^{r_1 + r_2} p^{z} & \\\\ &= (1 - p)^{r_1 + r_2} p^{z} \\sum_{k = 0}^{z} \\binom{z - k + r_1 - 1}{z - k} \\binom{k + r_2 - 1}{k}& \\end{align}\\] The part before the sum reminds us of the negative binomial distribution with parameters \\(r_1 + r_2\\) and \\(p\\). To complete this term to the negative binomial PMF we need \\(\\binom{z + r_1 + r_2 -1}{z}\\). So the only thing we need to prove is that the sum equals this term. Both terms in the sum can be interpreted as numbers of ways to place a number of balls into boxes. For the left term it is \\(z-k\\) balls into \\(r_1\\) boxes, and for the right \\(k\\) balls into \\(r_2\\) boxes. For each \\(k\\) we are distributing \\(z\\) balls in total. By summing over all \\(k\\), we actually get all the possible placements of \\(z\\) balls into \\(r_1 + r_2\\) boxes. Therefore \\[\\begin{align} P(Z = z) &= (1 - p)^{r_1 + r_2} p^{z} \\sum_{k = 0}^{z} \\binom{z - k + r_1 - 1}{z - k} \\binom{k + r_2 - 1}{k}& \\\\ &= \\binom{z + r_1 + r_2 -1}{z} (1 - p)^{r_1 + r_2} p^{z}. \\end{align}\\] From this it also follows that the sum of geometric distributions with the same parameter is a negative binomial distribution. \\(Z \\sim \\text{NB}(8, 0.4)\\). set.seed(1) nsamps <- 10000 x <- rnbinom(nsamps, size = 5, prob = 0.6) y <- rnbinom(nsamps, size = 3, prob = 0.6) xpy <- x + y z <- rnbinom(nsamps, size = 8, prob = 0.6) samps <- tibble(x, y, xpy, z) samps <- melt(samps) ggplot(data = samps, aes(x = value)) + geom_bar() + facet_wrap(~ variable) 4.3 Continuous random variables Exercise 4.8 (Exponential random variable) A variable \\(X\\) with PDF \\(\\lambda e^{-\\lambda x}\\) is an exponential random variable with support in non-negative real numbers. It has one positive parameter \\(\\lambda\\). We denote \\[\\begin{equation} X | \\lambda \\sim \\text{Exp}(\\lambda). \\end{equation}\\] Find the CDF of an exponential random variable. Find the quantile function of an exponential random variable. Calculate the probability \\(P(1 \\leq X \\leq 3)\\), where \\(X \\sim \\text{Exp(1.5)}\\). R: Check your answer to c) with a simulation (rexp). Plot the probability in a meaningful way. R: Implement PDF, CDF, and the quantile function and compare their values with corresponding R functions visually. Hint: use the size parameter to make one of the curves wider. Solution. \\[\\begin{align} F(x) &= \\int_{0}^{x} \\lambda e^{-\\lambda t} dt \\\\ &= \\lambda \\int_{0}^{x} e^{-\\lambda t} dt \\\\ &= \\lambda (\\frac{1}{-\\lambda}e^{-\\lambda t} |_{0}^{x}) \\\\ &= \\lambda(\\frac{1}{\\lambda} - \\frac{1}{\\lambda} e^{-\\lambda x}) \\\\ &= 1 - e^{-\\lambda x}. \\end{align}\\] \\[\\begin{align} F(F^{-1}(x)) &= x \\\\ 1 - e^{-\\lambda F^{-1}(x)} &= x \\\\ e^{-\\lambda F^{-1}(x)} &= 1 - x \\\\ -\\lambda F^{-1}(x) &= \\ln(1 - x) \\\\ F^{-1}(x) &= - \\frac{ln(1 - x)}{\\lambda}. \\end{align}\\] \\[\\begin{align} P(1 \\leq X \\leq 3) &= P(X \\leq 3) - P(X \\leq 1) \\\\ &= P(X \\leq 3) - P(X \\leq 1) \\\\ &= 1 - e^{-1.5 \\times 3} - 1 + e^{-1.5 \\times 1} \\\\ &\\approx 0.212. 
\\end{align}\\] set.seed(1) nsamps <- 1000 samps <- rexp(nsamps, rate = 1.5) sum(samps >= 1 & samps <= 3) / nsamps ## [1] 0.212 exp_plot <- ggplot(data.frame(x = seq(0, 5, by = 0.01)), aes(x = x)) + stat_function(fun = dexp, args = list(rate = 1.5)) + stat_function(fun = dexp, args = list(rate = 1.5), xlim = c(1,3), geom = "area", fill = "red") plot(exp_plot) exp_pdf <- function(x, lambda) { return (lambda * exp(-lambda * x)) } exp_cdf <- function(x, lambda) { return (1 - exp(-lambda * x)) } exp_quant <- function(q, lambda) { return (-(log(1 - q) / lambda)) } ggplot(data = data.frame(x = seq(0, 5, by = 0.01)), aes(x = x)) + stat_function(fun = dexp, args = list(rate = 1.5), aes(color = "R"), size = 2.5) + stat_function(fun = exp_pdf, args = list(lambda = 1.5), aes(color = "Mine"), size = 1.2) + scale_color_manual(values = c("red", "black")) ggplot(data = data.frame(x = seq(0, 5, by = 0.01)), aes(x = x)) + stat_function(fun = pexp, args = list(rate = 1.5), aes(color = "R"), size = 2.5) + stat_function(fun = exp_cdf, args = list(lambda = 1.5), aes(color = "Mine"), size = 1.2) + scale_color_manual(values = c("red", "black")) ggplot(data = data.frame(x = seq(0, 1, by = 0.01)), aes(x = x)) + stat_function(fun = qexp, args = list(rate = 1.5), aes(color = "R"), size = 2.5) + stat_function(fun = exp_quant, args = list(lambda = 1.5), aes(color = "Mine"), size = 1.2) + scale_color_manual(values = c("red", "black")) Exercise 4.9 (Uniform random variable) Continuous uniform random variable with parameters \\(a\\) and \\(b\\) has the PDF \\[\\begin{equation} p(x) = \\begin{cases} \\frac{1}{b - a} & x \\in [a,b] \\\\ 0 & \\text{otherwise}. \\end{cases} \\end{equation}\\] Find the CDF of the uniform random variable. Find the quantile function of the uniform random variable. Let \\(X \\sim \\text{Uniform}(a,b)\\). Find the CDF of the variable \\(Y = \\frac{X - a}{b - a}\\). This is the standard uniform random variable. Let \\(X \\sim \\text{Uniform}(-1, 3)\\). Find such \\(z\\) that \\(P(X < z + \\mu_x) = \\frac{1}{5}\\). R: Check your result from d) using simulation. Solution. \\[\\begin{align} F(x) &= \\int_{a}^x \\frac{1}{b - a} dt \\\\ &= \\frac{1}{b - a} \\int_{a}^x dt \\\\ &= \\frac{x - a}{b - a}. \\end{align}\\] \\[\\begin{align} F(F^{-1}(p)) &= p \\\\ \\frac{F^{-1}(p) - a}{b - a} &= p \\\\ F^{-1}(p) &= p(b - a) + a. \\end{align}\\] \\[\\begin{align} F_Y(y) &= P(Y < y) \\\\ &= P(\\frac{X - a}{b - a} < y) \\\\ &= P(X < y(b - a) + a) \\\\ &= F_X(y(b - a) + a) \\\\ &= \\frac{(y(b - a) + a) - a}{b - a} \\\\ &= y. \\end{align}\\] \\[\\begin{align} P(X < z + 1) &= \\frac{1}{5} \\\\ F(z + 1) &= \\frac{1}{5} \\\\ z + 1 &= F^{-1}(\\frac{1}{5}) \\\\ z &= \\frac{1}{5}4 - 1 - 1 \\\\ z &= -1.2. \\end{align}\\] set.seed(1) a <- -1 b <- 3 nsamps <- 10000 unif_samp <- runif(nsamps, a, b) mu_x <- mean(unif_samp) new_samp <- unif_samp - mu_x quantile(new_samp, probs = 1/5) ## 20% ## -1.203192 punif(-0.2, -1, 3) ## [1] 0.2 Exercise 4.10 (Beta random variable) A variable \\(X\\) with PDF \\[\\begin{equation} p(x) = \\frac{x^{\\alpha - 1} (1 - x)^{\\beta - 1}}{\\text{B}(\\alpha, \\beta)}, \\end{equation}\\] where \\(\\text{B}(\\alpha, \\beta) = \\frac{\\Gamma(\\alpha) \\Gamma(\\beta)}{\\Gamma(\\alpha + \\beta)}\\) and \\(\\Gamma(x) = \\int_0^{\\infty} x^{z - 1} e^{-x} dx\\) is a Beta random variable with support on \\([0,1]\\). It has two positive parameters \\(\\alpha\\) and \\(\\beta\\). 
Notation: \\[\\begin{equation} X | \\alpha, \\beta \\sim \\text{Beta}(\\alpha, \\beta) \\end{equation}\\] It is often used in modeling rates. Calculate the PDF for \\(\\alpha = 1\\) and \\(\\beta = 1\\). What do you notice? R: Plot densities of the beta distribution for parameter pairs (2, 2), (4, 1), (1, 4), (2, 5), and (0.1, 0.1). R: Sample from \\(X \\sim \\text{Beta}(2, 5)\\) and compare the histogram with the Beta PDF. Solution. \\[\\begin{equation} p(x) = \\frac{x^{1 - 1} (1 - x)^{1 - 1}}{\\text{B}(1, 1)} = 1. \\end{equation}\\] This is the standard uniform distribution. set.seed(1) ggplot(data = data.frame(x = seq(0, 1, by = 0.01)), aes(x = x)) + stat_function(fun = dbeta, args = list(shape1 = 2, shape2 = 2), aes(color = "Beta(2,2)")) + stat_function(fun = dbeta, args = list(shape1 = 4, shape2 = 1), aes(color = "Beta(4,1)")) + stat_function(fun = dbeta, args = list(shape1 = 1, shape2 = 4), aes(color = "Beta(1,4)")) + stat_function(fun = dbeta, args = list(shape1 = 2, shape2 = 5), aes(color = "Beta(2,5)")) + stat_function(fun = dbeta, args = list(shape1 = 0.1, shape2 = 0.1), aes(color = "Beta(0.1,0.1)")) set.seed(1) nsamps <- 1000 samps <- rbeta(nsamps, 2, 5) ggplot(data = data.frame(x = samps), aes(x = x)) + geom_histogram(aes(y = ..density..), color = "black") + stat_function(data = data.frame(x = seq(0, 1, by = 0.01)), aes(x = x), fun = dbeta, args = list(shape1 = 2, shape2 = 5), color = "red", size = 1.2) Exercise 4.11 (Gamma random variable) A random variable with PDF \\[\\begin{equation} p(x) = \\frac{\\beta^\\alpha}{\\Gamma(\\alpha)} x^{\\alpha - 1}e^{-\\beta x} \\end{equation}\\] is a Gamma random variable with support on the positive numbers and parameters shape \\(\\alpha > 0\\) and rate \\(\\beta > 0\\). We write \\[\\begin{equation} X | \\alpha, \\beta \\sim \\text{Gamma}(\\alpha, \\beta) \\end{equation}\\] and its CDF is \\[\\begin{equation} \\frac{\\gamma(\\alpha, \\beta x)}{\\Gamma(\\alpha)}, \\end{equation}\\] where \\(\\gamma(s, x) = \\int_0^x t^{s-1} e^{-t} dt\\) is the lower incomplete gamma function. It is usually used in modeling positive phenomena (for example, insurance claims and rainfall). Let \\(X \\sim \\text{Gamma}(1, \\beta)\\). Find the PDF of \\(X\\). Do you recognize this PDF? Let \\(k = \\alpha\\) and \\(\\theta = \\frac{1}{\\beta}\\). Find the PDF of \\(X | k, \\theta \\sim \\text{Gamma}(k, \\theta)\\). Random variables can be reparameterized, and sometimes a reparameterized distribution is more convenient for certain calculations. The shape-rate parameterization, for example, is usually used in Bayesian statistics, while the shape-scale parameterization is more common in econometrics and some other applied fields. Note that you also need to pay attention to the parameters in statistical software, so diligently read the help files when using functions like rgamma to see how the function is parameterized. R: Plot the gamma CDF for random variables with shape and rate parameters (1,1), (10,1), (1,10). Solution. \\[\\begin{align} p(x) &= \\frac{\\beta^1}{\\Gamma(1)} x^{1 - 1}e^{-\\beta x} \\\\ &= \\beta e^{-\\beta x} \\end{align}\\] This is the PDF of the exponential distribution with parameter \\(\\beta\\). \\[\\begin{align} p(x) &= \\frac{1}{\\Gamma(k)\\theta^k} x^{k - 1}e^{-\\frac{x}{\\theta}}.
\\end{align}\\] set.seed(1) ggplot(data = data.frame(x = seq(0, 25, by = 0.01)), aes(x = x)) + stat_function(fun = pgamma, args = list(shape = 1, rate = 1), aes(color = "Gamma(1,1)")) + stat_function(fun = pgamma, args = list(shape = 10, rate = 1), aes(color = "Gamma(10,1)")) + stat_function(fun = pgamma, args = list(shape = 1, rate = 10), aes(color = "Gamma(1,10)")) Exercise 4.12 (Normal random variable) A random variable with PDF \\[\\begin{equation} p(x) = \\frac{1}{\\sqrt{2 \\pi \\sigma^2}} e^{-\\frac{(x - \\mu)^2}{2 \\sigma^2}} \\end{equation}\\] is a normal random variable with support on the real axis and parameters \\(\\mu \\in \\mathbb{R}\\) and \\(\\sigma^2 > 0\\). The first is the mean parameter and the second is the variance parameter. Many statistical methods assume a normal distribution. We denote \\[\\begin{equation} X | \\mu, \\sigma \\sim \\text{N}(\\mu, \\sigma^2), \\end{equation}\\] and its CDF is \\[\\begin{equation} F(x) = \\int_{-\\infty}^x \\frac{1}{\\sqrt{2 \\pi \\sigma^2}} e^{-\\frac{(t - \\mu)^2}{2 \\sigma^2}} dt, \\end{equation}\\] which has no closed-form expression and is usually evaluated numerically. It is also one of the most thoroughly studied distributions; for that reason statisticians often transform variables so that they are approximately normal, or approximate other distributions with the normal distribution. Show that the variable \\(\\frac{X - \\mu}{\\sigma} \\sim \\text{N}(0,1)\\). This transformation is called standardization, and \\(\\text{N}(0,1)\\) is the standard normal distribution. R: Plot the normal distribution with \\(\\mu = 0\\) and different values of the \\(\\sigma\\) parameter. R: The normal distribution provides a good approximation for the Poisson distribution with a large \\(\\lambda\\). Let \\(X \\sim \\text{Poisson}(50)\\). Approximate \\(X\\) with the normal distribution and compare its density with the Poisson histogram. What are the values of \\(\\mu\\) and \\(\\sigma^2\\) that should provide the best approximation? Note that the R function rnorm takes the standard deviation (\\(\\sigma\\)) as a parameter, not the variance. Solution. \\[\\begin{align} P(\\frac{X - \\mu}{\\sigma} < x) &= P(X < \\sigma x + \\mu) \\\\ &= F(\\sigma x + \\mu) \\\\ &= \\int_{-\\infty}^{\\sigma x + \\mu} \\frac{1}{\\sqrt{2 \\pi \\sigma^2}} e^{-\\frac{(t - \\mu)^2}{2\\sigma^2}} dt \\end{align}\\] Now let \\(s = f(t) = \\frac{t - \\mu}{\\sigma}\\), then \\(ds = \\frac{dt}{\\sigma}\\) and \\(f(\\sigma x + \\mu) = x\\), so \\[\\begin{align} P(\\frac{X - \\mu}{\\sigma} < x) &= \\int_{-\\infty}^{x} \\frac{1}{\\sqrt{2 \\pi}} e^{-\\frac{s^2}{2}} ds. \\end{align}\\] There is no need to evaluate this integral, as we recognize it as the CDF of a normal distribution with \\(\\mu = 0\\) and \\(\\sigma^2 = 1\\).
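As a quick numerical sanity check of the standardization result, one can standardize draws from an arbitrary normal distribution and compare them with the standard normal. This is an illustrative sketch with arbitrarily chosen parameters (mean 3, standard deviation 2); it is not part of the original exercise.
# Sanity check (illustrative): standardized samples from N(3, 2^2) should behave like N(0, 1)
set.seed(1)
x <- rnorm(100000, mean = 3, sd = 2)
z <- (x - 3) / 2          # standardization
mean(z)                   # should be close to 0
var(z)                    # should be close to 1
ks.test(z, "pnorm")       # compares the empirical CDF of z with the standard normal CDF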
set.seed(1) # b ggplot(data = data.frame(x = seq(-15, 15, by = 0.01)), aes(x = x)) + stat_function(fun = dnorm, args = list(mean = 0, sd = 1), aes(color = "sd = 1")) + stat_function(fun = dnorm, args = list(mean = 0, sd = 0.4), aes(color = "sd = 0.4")) + stat_function(fun = dnorm, args = list(mean = 0, sd = 2), aes(color = "sd = 2")) + stat_function(fun = dnorm, args = list(mean = 0, sd = 5), aes(color = "sd = 5")) # c mean_par <- 50 nsamps <- 100000 pois_samps <- rpois(nsamps, lambda = mean_par) norm_samps <- rnorm(nsamps, mean = mean_par, sd = sqrt(mean_par)) my_plot <- ggplot() + geom_bar(data = tibble(x = pois_samps), aes(x = x, y = (..count..)/sum(..count..))) + geom_density(data = tibble(x = norm_samps), aes(x = x), color = "red") plot(my_plot) Exercise 4.13 (Logistic random variable) A logistic random variable has CDF \\[\\begin{equation} F(x) = \\frac{1}{1 + e^{-\\frac{x - \\mu}{s}}}, \\end{equation}\\] where \\(\\mu\\) is real and \\(s > 0\\). The support is on the real axis. We denote \\[\\begin{equation} X | \\mu, s \\sim \\text{Logistic}(\\mu, s). \\end{equation}\\] The distribution of the logistic random variable resembles that of a normal random variable, but it has heavier tails. Find the PDF of a logistic random variable. R: Implement the logistic PDF and CDF and visually compare both for \\(X \\sim \\text{N}(0, 1)\\) and \\(Y \\sim \\text{Logistic}(0, \\sqrt{\\frac{3}{\\pi^2}})\\). These distributions have the same mean and variance. Additionally, produce the same plot on the interval [5,10] to better see the difference in the tails. R: For the distributions in b) find the probability \\(P(|X| > 4)\\) and interpret the result. Solution. \\[\\begin{align} p(x) &= \\frac{d}{dx} \\frac{1}{1 + e^{-\\frac{x - \\mu}{s}}} \\\\ &= \\frac{- \\frac{d}{dx} (1 + e^{-\\frac{x - \\mu}{s}})}{(1 + e^{-\\frac{x - \\mu}{s}})^2} \\\\ &= \\frac{e^{-\\frac{x - \\mu}{s}}}{s(1 + e^{-\\frac{x - \\mu}{s}})^2}. \\end{align}\\] # b # Note: the code below uses sd = 2 for the normal and s = sqrt(12/pi^2) for the logistic # (both with variance 4), a scaled version of the variance-1 pair in the exercise statement. set.seed(1) logit_pdf <- function (x, mu, s) { return ((exp(-(x - mu)/(s))) / (s * (1 + exp(-(x - mu)/(s)))^2)) } nl_plot <- ggplot(data = data.frame(x = seq(-12, 12, by = 0.01)), aes(x = x)) + stat_function(fun = dnorm, args = list(mean = 0, sd = 2), aes(color = "normal")) + stat_function(fun = logit_pdf, args = list(mu = 0, s = sqrt(12/pi^2)), aes(color = "logit")) plot(nl_plot) nl_plot <- ggplot(data = data.frame(x = seq(5, 10, by = 0.01)), aes(x = x)) + stat_function(fun = dnorm, args = list(mean = 0, sd = 2), aes(color = "normal")) + stat_function(fun = logit_pdf, args = list(mu = 0, s = sqrt(12/pi^2)), aes(color = "logit")) plot(nl_plot) # c logit_cdf <- function (x, mu, s) { return (1 / (1 + exp(-(x - mu) / s))) } p_logistic <- 1 - logit_cdf(4, 0, sqrt(12/pi^2)) + logit_cdf(-4, 0, sqrt(12/pi^2)) p_norm <- 1 - pnorm(4, 0, 2) + pnorm(-4, 0, 2) p_logistic ## [1] 0.05178347 p_norm ## [1] 0.04550026 # The logistic distribution has heavier tails, therefore the probability of large # absolute values is higher. 4.4 Singular random variables Exercise 4.14 (Cantor distribution) The Cantor set is a subset of \\([0,1]\\), which we create by iteratively deleting the middle third of each remaining interval. For example, in the first iteration, we get the sets \\([0,\\frac{1}{3}]\\) and \\([\\frac{2}{3},1]\\). In the second iteration, we get \\([0,\\frac{1}{9}]\\), \\([\\frac{2}{9},\\frac{1}{3}]\\), \\([\\frac{2}{3}, \\frac{7}{9}]\\), and \\([\\frac{8}{9}, 1]\\).
On the \\(n\\)-th iteration, we have \\[\\begin{equation} C_n = \\frac{C_{n-1}}{3} \\cup \\bigg(\\frac{2}{3} + \\frac{C_{n-1}}{3} \\bigg), \\end{equation}\\] where \\(C_0 = [0,1]\\). The Cantor set is then defined as the intersection of these sets \\[\\begin{equation} C = \\cap_{n=1}^{\\infty} C_n. \\end{equation}\\] It has the same cardinality as \\([0,1]\\). Another way to define the Cantor set is as the set of all numbers in \\([0,1]\\) that do not have a 1 in their ternary representation \\(x = \\sum_{i=1}^\\infty \\frac{x_i}{3^i}\\), \\(x_i \\in \\{0,1,2\\}\\). A random variable follows the Cantor distribution if its CDF is the Cantor function. You can find implementations of the random number generator, CDF, and quantile function for the Cantor distribution at https://github.com/Henrygb/CantorDist.R. Show that the Lebesgue measure of the Cantor set is 0. (Jagannathan) Let us look at an infinite sequence of independent fair-coin tosses. If the \\(i\\)-th outcome is heads, let \\(x_i = 2\\); if it is tails, let \\(x_i = 0\\). Then use these to create \\(x = \\sum_{i=1}^\\infty \\frac{x_i}{3^i}\\). This is a random variable with the Cantor distribution. Show that \\(X\\) has a singular distribution. Solution. \\[\\begin{align} \\lambda(C) &= 1 - \\lambda(C^c) \\\\ &= 1 - \\frac{1}{3}\\sum_{k = 0}^\\infty (\\frac{2}{3})^k \\\\ &= 1 - \\frac{\\frac{1}{3}}{1 - \\frac{2}{3}} \\\\ &= 0. \\end{align}\\] First, for every \\(x\\), the probability of observing it is \\(\\lim_{n \\rightarrow \\infty} \\frac{1}{2^n} = 0\\), so the distribution has no point masses and therefore no discrete component. Second, every realization lies in the Cantor set, so \\(P(X \\in C) = 1\\) even though \\(\\lambda(C) = 0\\), which means the distribution cannot have a density (no absolutely continuous component). Therefore \\(X\\) has a singular distribution. Its CDF increases only on the elements of the Cantor set. 4.5 Transformations Exercise 4.15 Let \\(X\\) be a random variable that is uniformly distributed on \\(\\{-2, -1, 0, 1, 2\\}\\). Find the PMF of \\(Y = X^2\\). Solution. \\[\\begin{align} P_Y(y) = \\sum_{x \\in \\{-\\sqrt{y}, \\sqrt{y}\\}} P_X(x) = \\begin{cases} 0 & y \\notin \\{0,1,4\\} \\\\ \\frac{1}{5} & y = 0 \\\\ \\frac{2}{5} & y \\in \\{1,4\\} \\end{cases} \\end{align}\\] Exercise 4.16 (Continuous uniform random variable) Let \\(X \\sim U(-1, 1)\\) be uniformly distributed on \\([-1, 1]\\) and let \\(Y = X^2\\). Find the CDF of \\(Y\\). Find the PDF of \\(Y\\) and verify that it integrates to 1. R: Plot histograms of samples from \\(X\\) and \\(Y\\) and overlay the true densities. Solution. We first observe that \\(Y \\in [0, 1]\\). If a sample from \\(Y\\) is less than \\(\\alpha\\), the corresponding sample from \\(X\\) must be in \\(\\left[-\\sqrt{\\alpha}, \\sqrt{\\alpha}\\right]\\). \\[\\begin{align} F_Y(\\alpha) = P_X(X \\in \\left[-\\sqrt{\\alpha}, \\sqrt{\\alpha}\\right]) &= F_X(\\sqrt{\\alpha}) - F_X(-\\sqrt{\\alpha}) \\\\ &= \\frac{\\sqrt{\\alpha} + 1}{2} - \\frac{-\\sqrt{\\alpha} + 1}{2} = \\sqrt{\\alpha}. \\end{align}\\] \\[\\begin{align} f_Y(\\alpha) = \\frac{\\partial}{\\partial \\alpha} F_Y(\\alpha) = \\frac{\\partial}{\\partial \\alpha} \\sqrt{\\alpha} = \\frac{1}{2\\sqrt{\\alpha}}. \\end{align}\\] \\[\\begin{align} \\int_{0}^1 f_Y(\\alpha)d\\alpha = 0.5\\int_{0}^1 \\frac{1}{\\sqrt{\\alpha}}d\\alpha = 0.5\\left[\\frac{\\alpha^{0.5}}{0.5}\\right]_0^1 = 1 - 0 = 1.
\\end{align}\\] # Draw samples to plot set.seed(0) n <- 10000 x <- runif(n, min=-1, max=1) y <- runif(n, min=-1, max=1)^2 # Dataframe of samples df <- data.frame(value=c(x, y), RV=c( rep("X", n), rep("Y", n) )) # Dataframe with analytically computed density of Y df_y_analytic <- data.frame(y=seq(1e-2, 1, length.out=1000)) df_y_analytic$density <- 1 / (2 * sqrt(df_y_analytic$y)) library(ggplot2) ggplot(df) + geom_histogram(aes(x=value, after_stat(density), fill=RV), alpha=0.5, bins=50, position = "identity") + stat_function(fun = function(u) 1/2, color="red") + geom_path(data=df_y_analytic, aes(x=y, y=density), color="blue") Exercise 4.17 (Lognormal random variable) A lognormal random variable is a variable whose logarithm is normally distributed. In practice, we often encounter skewed data. Applying a log transformation to such data usually makes it more symmetric and therefore more suitable for modeling with the normal distribution (more on why we wish to model data with the normal distribution in the following chapters). Let \\(X \\sim \\text{N}(\\mu, \\sigma^2)\\). Find the PDF of \\(Y: \\log(Y) = X\\). R: Sample from the lognormal distribution with parameters \\(\\mu = 5\\) and \\(\\sigma = 2\\). Plot a histogram of the samples. Then log-transform the samples and plot a histogram along with the theoretical normal PDF. Solution. \\[\\begin{align} p_Y(y) &= p_X(\\log(y)) \\frac{d}{dy} \\log(y) \\\\ &= \\frac{1}{\\sqrt{2 \\pi \\sigma^2}} e^{-\\frac{(\\log(y) - \\mu)^2}{2 \\sigma^2}} \\frac{1}{y} \\\\ &= \\frac{1}{y \\sqrt{2 \\pi \\sigma^2}} e^{-\\frac{(\\log(y) - \\mu)^2}{2 \\sigma^2}}. \\end{align}\\] set.seed(1) nsamps <- 10000 # Note: smaller parameter values than in the exercise statement are used here. mu <- 0.5 sigma <- 0.4 ln_samps <- rlnorm(nsamps, mu, sigma) ln_plot <- ggplot(data = data.frame(x = ln_samps), aes(x = x)) + geom_histogram(color = "black") plot(ln_plot) norm_samps <- log(ln_samps) n_plot <- ggplot(data = data.frame(x = norm_samps), aes(x = x)) + geom_histogram(aes(y = ..density..), color = "black") + stat_function(fun = dnorm, args = list(mean = mu, sd = sigma), color = "red") plot(n_plot) Exercise 4.18 (Probability integral transform) This exercise is borrowed from Wasserman. Let \\(X\\) have a continuous, strictly increasing CDF \\(F\\). Let \\(Y = F(X)\\). Find the density of \\(Y\\). This is called the probability integral transform. Let \\(U \\sim \\text{Uniform}(0,1)\\) and let \\(X = F^{-1}(U)\\). Show that \\(X \\sim F\\). R: Implement a program that takes Uniform(0,1) random variables and generates random variables from an exponential(\\(\\beta\\)) distribution. Compare your implemented function with the function rexp in R. Solution. \\[\\begin{align} F_Y(y) &= P(Y < y) \\\\ &= P(F(X) < y) \\\\ &= P(X < F_X^{-1}(y)) \\\\ &= F_X(F_X^{-1}(y)) \\\\ &= y. \\end{align}\\] From the above it follows that \\(p(y) = 1\\), so \\(Y\\) is standard uniform. Note that we need to know the inverse CDF to be able to apply this procedure. \\[\\begin{align} P(X < x) &= P(F^{-1}(U) < x) \\\\ &= P(U < F(x)) \\\\ &= F_U(F(x)) \\\\ &= F(x). \\end{align}\\] set.seed(1) nsamps <- 10000 beta <- 4 generate_exp <- function (n, beta) { tmp <- runif(n) X <- qexp(tmp, beta) return (X) } df <- tibble("R" = rexp(nsamps, beta), "myGenerator" = generate_exp(nsamps, beta)) %>% gather() ggplot(data = df, aes(x = value, fill = key)) + geom_histogram(position = "dodge")
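The solution above relies on R's qexp for the inverse CDF. As an illustrative sketch of doing the inversion by hand, using the quantile function \\(F^{-1}(u) = -\\ln(1 - u)/\\beta\\) derived in Exercise 4.8 (the helper name my_rexp below is ours, not from the original text):
# Illustrative sketch: inverse-CDF sampling from Exp(beta) without qexp
my_rexp <- function(n, beta) {
  u <- runif(n)          # U ~ Uniform(0, 1)
  -log(1 - u) / beta     # F^{-1}(u) = -ln(1 - u) / beta
}
set.seed(2)
summary(my_rexp(10000, 4))   # compare with summary(rexp(10000, 4))
mean(my_rexp(10000, 4))      # should be close to 1 / 4 = 0.25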
Chapter 14 Bootstrap This chapter deals with the bootstrap. The students are expected to acquire the following knowledge: How to use the bootstrap to generate coverage intervals. Exercise 14.1 Ideally, a \\(1-\\alpha\\) CI would have \\(1-\\alpha\\) coverage. That is, say a 95% CI should, in the long run, contain the true value of the parameter 95% of the time. In practice, it is impossible to assess the coverage of our CI method, because we rarely know the true parameter. In simulation, however, we can. Let’s assess the coverage of bootstrap percentile intervals. Pick a univariate distribution with a readily available mean and one that you can easily sample from. Draw \\(n = 30\\) random samples from the chosen distribution and use the bootstrap (with large enough m) and the percentile CI method to construct a 95% CI. Repeat the process many times and count how many times the CI contains the true mean. That is, compute the actual coverage probability (don’t forget to include the standard error of the coverage probability!). What can you observe? Try one or two different distributions. What can you observe? Repeat (b) and (c) using BCa intervals (R package boot). How does the coverage compare to percentile intervals? As (d) but using intervals based on asymptotic normality (+/- 1.96 SE). How do results from (b), (d), and (e) change if we increase the sample size to n = 200? What about n = 5? library(boot) set.seed(0) nit <- 1000 # Repeat the process "many times" alpha <- 0.05 # CI parameter nboot <- 100 # m parameter for bootstrap ("large enough m") # f: change this to 200 or 5. nsample <- 30 # n = 30 random samples from the chosen distribution. Comment out BCa code if it breaks.
covers <- matrix(nrow = nit, ncol = 3) covers_BCa <- matrix(nrow = nit, ncol = 3) covers_asymp_norm <- matrix(nrow = nit, ncol = 3) isin <- function (x, lower, upper) { (x > lower) & (x < upper) } for (j in 1:nit) { # Repeating many times # a: pick a univariate distribution - standard normal x1 <- rnorm(nsample) # c: one or two different distributions - beta and poisson x2 <- rbeta(nsample, 1, 2) x3 <- rpois(nsample, 5) X1 <- matrix(data = NA, nrow = nsample, ncol = nboot) X2 <- matrix(data = NA, nrow = nsample, ncol = nboot) X3 <- matrix(data = NA, nrow = nsample, ncol = nboot) for (i in 1:nboot) { X1[ ,i] <- sample(x1, nsample, replace = T) X2[ ,i] <- sample(x2, nsample, T) X3[ ,i] <- sample(x3, nsample, T) } X1_func <- apply(X1, 2, mean) X2_func <- apply(X2, 2, mean) X3_func <- apply(X3, 2, mean) X1_quant <- quantile(X1_func, probs = c(alpha / 2, 1 - alpha / 2)) X2_quant <- quantile(X2_func, probs = c(alpha / 2, 1 - alpha / 2)) X3_quant <- quantile(X3_func, probs = c(alpha / 2, 1 - alpha / 2)) covers[j,1] <- (0 > X1_quant[1]) & (0 < X1_quant[2]) covers[j,2] <- ((1 / 3) > X2_quant[1]) & ((1 / 3) < X2_quant[2]) covers[j,3] <- (5 > X3_quant[1]) & (5 < X3_quant[2]) mf <- function (x, i) return(mean(x[i])) bootX1 <- boot(x1, statistic = mf, R = nboot) bootX2 <- boot(x2, statistic = mf, R = nboot) bootX3 <- boot(x3, statistic = mf, R = nboot) X1_quant_BCa <- boot.ci(bootX1, type = "bca")$bca X2_quant_BCa <- boot.ci(bootX2, type = "bca")$bca X3_quant_BCa <- boot.ci(bootX3, type = "bca")$bca covers_BCa[j,1] <- (0 > X1_quant_BCa[4]) & (0 < X1_quant_BCa[5]) covers_BCa[j,2] <- ((1 / 3) > X2_quant_BCa[4]) & ((1 / 3) < X2_quant_BCa[5]) covers_BCa[j,3] <- (5 > X3_quant_BCa[4]) & (5 < X3_quant_BCa[5]) # e: estimate mean and standard error # sample mean: x1_bar <- mean(x1) x2_bar <- mean(x2) x3_bar <- mean(x3) # standard error (of the sample mean) estimate: sample standard deviation / sqrt(n) x1_bar_SE <- sd(x1) / sqrt(nsample) x2_bar_SE <- sd(x2) / sqrt(nsample) x3_bar_SE <- sd(x3) / sqrt(nsample) covers_asymp_norm[j,1] <- isin(0, x1_bar - 1.96 * x1_bar_SE, x1_bar + 1.96 * x1_bar_SE) covers_asymp_norm[j,2] <- isin(1/3, x2_bar - 1.96 * x2_bar_SE, x2_bar + 1.96 * x2_bar_SE) covers_asymp_norm[j,3] <- isin(5, x3_bar - 1.96 * x3_bar_SE, x3_bar + 1.96 * x3_bar_SE) } apply(covers, 2, mean) ## [1] 0.918 0.925 0.905 apply(covers, 2, sd) / sqrt(nit) ## [1] 0.008680516 0.008333333 0.009276910 apply(covers_BCa, 2, mean) ## [1] 0.927 0.944 0.927 apply(covers_BCa, 2, sd) / sqrt(nit) ## [1] 0.008230355 0.007274401 0.008230355 apply(covers_asymp_norm, 2, mean) ## [1] 0.939 0.937 0.930 apply(covers_asymp_norm, 2, sd) / sqrt(nit) ## [1] 0.007572076 0.007687008 0.008072494 Exercise 14.2 You are given a sample of independent observations from a process of interest: Index 1 2 3 4 5 6 7 8 X 7 2 4 6 4 5 9 10 Compute the plug-in estimate of mean and 95% symmetric CI based on asymptotic normality. Use the plug-in estimate of SE. Same as (a), but use the unbiased estimate of SE. Apply nonparametric bootstrap with 1000 bootstrap replications and estimate the 95% CI for the mean with percentile-based CI. 
# a x <- c(7, 2, 4, 6, 4, 5, 9, 10) n <- length(x) mu <- mean(x) SE <- sqrt(mean((x - mu)^2)) / sqrt(n) SE ## [1] 0.8915839 z <- qnorm(1 - 0.05 / 2) c(mu - z * SE, mu + z * SE) ## [1] 4.127528 7.622472 # b SE <- sd(x) / sqrt(n) SE ## [1] 0.9531433 c(mu - z * SE, mu + z * SE) ## [1] 4.006873 7.743127 # c set.seed(0) m <- 1000 T_mean <- function(x) {mean(x)} est_boot <- array(NA, m) for (i in 1:m) { x_boot <- x[sample(1:n, n, rep = T)] est_boot[i] <- T_mean(x_boot) } quantile(est_boot, p = c(0.025, 0.975)) ## 2.5% 97.5% ## 4.250 7.625 Exercise 14.3 We are given a sample of 10 independent paired (bivariate) observations: Index 1 2 3 4 5 6 7 8 9 10 X 1.26 -0.33 1.33 1.27 0.41 -1.54 -0.93 -0.29 -0.01 2.40 Y 2.64 0.33 0.48 0.06 -0.88 -2.14 -2.21 0.95 0.83 1.45 Compute Pearson correlation between X and Y. Use the cor.test() from R to estimate a 95% CI for the estimate from (a). Apply nonparametric bootstrap with 1000 bootstrap replications and estimate the 95% CI for the Pearson correlation with percentile-based CI. Compare CI from (b) and (c). Are they similar? How would the bootstrap estimation of CI change if we were interested in Spearman or Kendall correlation instead? x <- c(1.26, -0.33, 1.33, 1.27, 0.41, -1.54, -0.93, -0.29, -0.01, 2.40) y <- c(2.64, 0.33, 0.48, 0.06, -0.88, -2.14, -2.21, 0.95, 0.83, 1.45) # a cor(x, y) ## [1] 0.6991247 # b res <- cor.test(x, y) res$conf.int[1:2] ## [1] 0.1241458 0.9226238 # c set.seed(0) m <- 1000 n <- length(x) T_cor <- function(x, y) {cor(x, y)} est_boot <- array(NA, m) for (i in 1:m) { idx <- sample(1:n, n, rep = T) # !!! important to use same indices to keep dependency between x and y est_boot[i] <- T_cor(x[idx], y[idx]) } quantile(est_boot, p = c(0.025, 0.975)) ## 2.5% 97.5% ## 0.2565537 0.9057664 # d # Yes, but the bootstrap CI is more narrow. # e # We just use the functions for Kendall/Spearman coefficients instead: T_kendall <- function(x, y) {cor(x, y, method = "kendall")} T_spearman <- function(x, y) {cor(x, y, method = "spearman")} # Put this in a function that returns the CI bootstrap_95_ci <- function(x, y, t, m = 1000) { n <- length(x) est_boot <- array(NA, m) for (i in 1:m) { idx <- sample(1:n, n, rep = T) # !!! important to use same indices to keep dependency between x and y est_boot[i] <- t(x[idx], y[idx]) } quantile(est_boot, p = c(0.025, 0.975)) } bootstrap_95_ci(x, y, T_kendall) ## 2.5% 97.5% ## -0.08108108 0.78378378 bootstrap_95_ci(x, y, T_spearman) ## 2.5% 97.5% ## -0.1701115 0.8867925 Exercise 14.4 In this problem we will illustrate the use of the nonparametric bootstrap for estimating CIs of regression model coefficients. Load the longley dataset from base R with data(longley). Use lm() to apply linear regression using “Employed” as the target (dependent) variable and all other variables as the predictors (independent). Using lm() results, print the estimated regression coefficients and standard errors. Estimate 95% CI for the coefficients using +/- 1.96 * SE. Use nonparametric bootstrap with 100 replications to estimate the SE of the coefficients from (b). Compare the SE from (c) with those from (b). # a data(longley) # b res <- lm(Employed ~ . 
, longley) tmp <- data.frame(summary(res)$coefficients[,1:2]) tmp$LB <- tmp[,1] - 1.96 * tmp[,2] tmp$UB <- tmp[,1] + 1.96 * tmp[,2] tmp ## Estimate Std..Error LB UB ## (Intercept) -3.482259e+03 8.904204e+02 -5.227483e+03 -1.737035e+03 ## GNP.deflator 1.506187e-02 8.491493e-02 -1.513714e-01 1.814951e-01 ## GNP -3.581918e-02 3.349101e-02 -1.014616e-01 2.982320e-02 ## Unemployed -2.020230e-02 4.883997e-03 -2.977493e-02 -1.062966e-02 ## Armed.Forces -1.033227e-02 2.142742e-03 -1.453204e-02 -6.132495e-03 ## Population -5.110411e-02 2.260732e-01 -4.942076e-01 3.919994e-01 ## Year 1.829151e+00 4.554785e-01 9.364136e-01 2.721889e+00 # c set.seed(0) m <- 100 n <- nrow(longley) T_coef <- function(x) { lm(Employed ~ . , x)$coefficients } est_boot <- array(NA, c(m, ncol(longley))) for (i in 1:m) { idx <- sample(1:n, n, rep = T) est_boot[i,] <- T_coef(longley[idx,]) } SE <- apply(est_boot, 2, sd) SE ## [1] 1.826011e+03 1.605981e-01 5.693746e-02 8.204892e-03 3.802225e-03 ## [6] 3.907527e-01 9.414436e-01 # Show the standard errors around coefficients library(ggplot2) library(reshape2) df <- data.frame(index = 1:7, bootstrap_SE = SE, lm_SE = tmp$Std..Error) melted_df <- melt(df[2:nrow(df), ], id.vars = "index") # Ignore bias which has a really large magnitude ggplot(melted_df, aes(x = index, y = value, fill = variable)) + geom_bar(stat="identity", position="dodge") + xlab("Coefficient") + ylab("Standard error") # + scale_y_continuous(trans = "log") # If you want to also plot bias Exercise 14.5 This exercise shows a shortcoming of the bootstrap method when using the plug in estimator for the maximum. Compute the 95% bootstrap CI for the maximum of a standard normal distribution. Compute the 95% bootstrap CI for the maximum of a binomial distribution with n = 15 and p = 0.2. Repeat (b) using p = 0.9. Why is the result different? # bootstrap CI for maximum alpha <- 0.05 T_max <- function(x) {max(x)} # Equal to T_max = max bootstrap <- function(x, t, m = 1000) { n <- length(x) values <- rep(0, m) for (i in 1:m) { values[i] <- t(sample(x, n, replace = T)) } quantile(values, probs = c(alpha / 2, 1 - alpha / 2)) } # a # Meaningless, as the normal distribution can yield arbitrarily large values. x <- rnorm(100) bootstrap(x, T_max) ## 2.5% 97.5% ## 1.819425 2.961743 # b x <- rbinom(100, size = 15, prob = 0.2) # min = 0, max = 15 bootstrap(x, T_max) ## 2.5% 97.5% ## 6 7 # c x <- rbinom(100, size = 15, prob = 0.9) # min = 0, max = 15 bootstrap(x, T_max) ## 2.5% 97.5% ## 15 15 # Observation: to estimate the maximum, we need sufficient probability mass near the maximum value the distribution can yield. # Using bootstrap is pointless when there is too little mass near the true maximum. # In general, bootstrap will fail when estimating the CI for the maximum. Exercise 14.6 (Practical - and fictional - coverage interval comparison) In this exercise, we investigate how different kinds of CI’s behave as we vary the number of measurements. The story behind the data: it’s 2025 and we’ve discovered that Slovenia has rich deposits of a rare mineral called Moustachium, which can be used to accelerate moustache growth. This mineral is highly sought, so the government has decided to contract two different companies to provide information on where to begin mining. Both companies investigated mining sites in each statistical region and gave their best estimate of the average Moustachium concentration in tonnes per square kilometer. 
The Data Science team has been called to estimate the uncertainty in these estimates and help avoid mining in the wrong region. Generate synthetic data with the script below: set.seed(0) library(comprehenr) regions <- c("pomurska", "podravska", "koroska", "savinjska", "zasavska", "posavska", "JV Slovenija", "primorsko-notranjska", "osrednjeslovenska", "gorenjska", "goriska", "obalno-kraska") region_rates <- seq(1.3, 2.3, length.out=length(regions)) region_rates <- region_rates[sample.int(length(regions), length(regions))] make_dataset <- function(n_contractors) { measurements <- matrix(nrow=length(regions), ncol=n_contractors) for (i in 1:length(regions)) { measurements[i,] <- rgamma(n_contractors, 5.0, region_rates[i]) } df <- data.frame(measurements) row.names(df) <- regions names(df) <- to_vec(for(i in 1:n_contractors) paste("Contractor", i)) return(df) } set.seed(0) df_2025 <- make_dataset(2) set.seed(0) df_2027 <- make_dataset(10) set.seed(0) df_2028 <- make_dataset(100) set.seed(0) df_2029 <- make_dataset(1000) saveRDS(df_2025, file="moustachium_2025.Rda") saveRDS(df_2027, file="moustachium_2027.Rda") saveRDS(df_2028, file="moustachium_2028.Rda") saveRDS(df_2029, file="moustachium_2029.Rda") Estimate the average concentration for different regions. Estimate the average concentration uncertainty using 95% CI’s (asymptotic normality with biased and unbiased standard error, standard bootstrap CI, bootstrap percentile CI). Visualize uncertainties with a histogram and discuss the best location to start mining. The year is 2027 and the government has decided to contract 10 companies. Rerun the code with new measurements and discuss how CI’s change. Technological advancements in robotics have enabled site surveys on a massive scale. Repeat the last point for 100 surveyor robots in 2028 and 1000 surveyor robots in 2029. library(ggplot2) library(dplyr) library(data.table) set.seed(0) input_dataset_path = "moustachium_2025.Rda" # Change this for points d and e output_plot_path = "moustachium_2025.pdf" # Change this for points d and e df <- readRDS(input_dataset_path) # Data comes from here n_contractors <- ncol(df) results_df <- data.frame(region=row.names(df)) # Store CI bounds here # 1. 
average concentration for different mining sites results_df$average_concetration <- rowMeans(df) # CI for the mean based on asymptotic normality (biased SE estimate) biased_SE <- sqrt(apply(df, 1, function(vec) {sum((vec - mean(vec))^2) / length(vec)}) / n_contractors) results_df$ci95.an.biased_var.low <- results_df$average_concetration - 1.96 * biased_SE results_df$ci95.an.biased_var.high <- results_df$average_concetration + 1.96 * biased_SE # CI for the mean based on asymptotic normality (unbiased SE estimate) unbiased_SE <- sqrt(apply(df, 1, var) / n_contractors) results_df$ci95.an.unbiased_var.low <- results_df$average_concetration - 1.96 * unbiased_SE results_df$ci95.an.unbiased_var.high <- results_df$average_concetration + 1.96 * unbiased_SE # Standard bootstrap CI with 1000 samples bootstrap_variance <- function(data, n_samples) { # n_samples is m in pseudocode output <- numeric(n_samples) for (i in 1:n_samples) { index <- sample(1:length(data), length(data), rep = TRUE) resampled_data <- data[index] output[i] <- mean(resampled_data) } return(var(output)) } bootstrap_1000_sd <- sqrt(apply(df, 1, function(vec){bootstrap_variance(vec, 1000)})) results_df$ci95.bootstrap.standard.1000.low <- results_df$average_concetration - 1.96 * bootstrap_1000_sd results_df$ci95.bootstrap.standard.1000.high <- results_df$average_concetration + 1.96 * bootstrap_1000_sd # Bootstrap percentile CI with 1000 samples bootstrap_quantile <- function(data, functional, n_samples, probs) { # n_samples is m in pseudocode output <- numeric(n_samples) for (i in 1:n_samples) { index <- sample(1:length(data), length(data), rep = TRUE) resampled_data <- data[index] output[i] <- functional(resampled_data) } return(quantile(output, probs=probs)) } results_df$ci95.bootstrap.percentile.1000.low <- apply(df, 1, function(vec){bootstrap_quantile(vec, mean, 1000, 0.025)}) results_df$ci95.bootstrap.percentile.1000.high <- apply(df, 1, function(vec){bootstrap_quantile(vec, mean, 1000, 0.975)}) results_df ## region average_concetration ci95.an.biased_var.low ## 1 pomurska 2.814731 1.5351811 ## 2 podravska 2.646518 1.5358919 ## 3 koroska 2.010216 0.5956186 ## 4 savinjska 4.618001 4.4057369 ## 5 zasavska 2.458873 2.0050840 ## 6 posavska 2.153802 1.9001244 ## 7 JV Slovenija 2.433503 1.6860397 ## 8 primorsko-notranjska 3.165394 2.9640430 ## 9 osrednjeslovenska 3.696875 3.5592419 ## 10 gorenjska 1.341931 0.2784547 ## 11 goriska 2.767328 2.3255569 ## 12 obalno-kraska 1.580711 1.4533751 ## ci95.an.biased_var.high ci95.an.unbiased_var.low ci95.an.unbiased_var.high ## 1 4.094281 1.005174095 4.624288 ## 2 3.757144 1.075855548 4.217180 ## 3 3.424813 0.009673183 4.010759 ## 4 4.830264 4.317814385 4.918187 ## 5 2.912662 1.817118318 3.100628 ## 6 2.407479 1.795047746 2.512556 ## 7 3.180965 1.376430415 3.490575 ## 8 3.366746 2.880640556 3.450148 ## 9 3.834508 3.502232367 3.891518 ## 10 2.405407 -0.162051549 2.845913 ## 11 3.209099 2.142569481 3.392086 ## 12 1.708047 1.400630772 1.760792 ## ci95.bootstrap.standard.1000.low ci95.bootstrap.standard.1000.high ## 1 1.5397542 4.089708 ## 2 1.5388631 3.754173 ## 3 0.5492603 3.471171 ## 4 4.4062860 4.829715 ## 5 1.9938049 2.923942 ## 6 1.9010514 2.406552 ## 7 1.6932573 3.173748 ## 8 2.9670216 3.363767 ## 9 3.5602064 3.833544 ## 10 0.2845999 2.399262 ## 11 2.3293359 3.205320 ## 12 1.4543352 1.707087 ## ci95.bootstrap.percentile.1000.low ci95.bootstrap.percentile.1000.high ## 1 1.8914878 3.737975 ## 2 1.8451596 3.447876 ## 3 0.9895308 3.030901 ## 4 4.4648444 4.771157 ## 5 2.1314473 2.786299 ## 6 
1.9707640 2.336840 ## 7 1.8941800 2.972825 ## 8 3.0201118 3.310677 ## 9 3.5975676 3.796183 ## 10 0.5745928 2.109269 ## 11 2.4485735 3.086082 ## 12 1.4888334 1.672589 # Visualization: we use a bar chart with uncertainty bands plot_moustachium_per_region <- function(region_names, average, ci_low, ci_high) { df_visualization <- data.frame(region=region_names, average=average, low=ci_low, high=ci_high) ggplot(df_visualization, aes(x=region, y=average)) + geom_bar(stat="identity") } mask <- endsWith(colnames(results_df), "low") mask[c(1, 2)] <- T results_df_low <- results_df[, mask] colnames(results_df_low) <- gsub('.low','', colnames(results_df_low)) mask <- endsWith(colnames(results_df), "high") mask[c(1, 2)] <- T results_df_high <- results_df[, mask] colnames(results_df_high) <- gsub('.high','', colnames(results_df_high)) long_results_df_low <- melt(setDT(results_df_low), id.vars=c("region", "average_concetration")) names(long_results_df_low) <- c("region", "average_concentration", "variable", "low") long_results_df_high <- melt(setDT(results_df_high), id.vars=c("region", "average_concetration")) names(long_results_df_high) <- c("region", "average_concentration", "variable", "high") long_results_df <- merge(long_results_df_low, long_results_df_high, by=c("region", "variable", "average_concentration"), all=T) moustachium_plot <- ggplot(long_results_df, aes(x=region, y=average_concentration)) + geom_bar(stat="identity", position="dodge", alpha=0.2) + geom_errorbar(aes(ymin=low, ymax=high, color=variable), width=0.2, position=position_dodge(0.9)) + scale_x_discrete(guide = guide_axis(angle = 45)) + ylim(-1, 8) # ggsave(plot=moustachium_plot, width=12, height=8, dpi=300, filename=output_plot_path) moustachium_plot # Visualization: we can also use a map. Circle size denotes concentration in region, low transparency denotes high uncertainty. library(maps) map_data_slo <- map_data('world')[map_data('world')$region == "Slovenia",] map_df <- long_results_df[long_results_df$variable == "ci95.an.biased_var", ] # VERY approximate longitudes and latitudes for different regions. map_df$long <- rep(0, nrow(map_df)) map_df$lat <- rep(0, nrow(map_df)) map_df[map_df$region == "gorenjska"]$long <- 14.2 map_df[map_df$region == "gorenjska"]$lat <- 46.3 map_df[map_df$region == "goriska"]$long <- 13.85 map_df[map_df$region == "goriska"]$lat <- 46.0 map_df[map_df$region == "obalno-kraska"]$long <- 13.9 map_df[map_df$region == "obalno-kraska"]$lat <- 45.65 map_df[map_df$region == "osrednjeslovenska"]$long <- 14.5 map_df[map_df$region == "osrednjeslovenska"]$lat <- 46. map_df[map_df$region == "primorsko-notranjska"]$long <- 14.3 map_df[map_df$region == "primorsko-notranjska"]$lat <- 45.7 map_df[map_df$region == "zasavska"]$long <- 15 map_df[map_df$region == "zasavska"]$lat <- 46.1 map_df[map_df$region == "savinjska"]$long <- 15.2 map_df[map_df$region == "savinjska"]$lat <- 46.25 map_df[map_df$region == "posavska"]$long <- 15.4 map_df[map_df$region == "posavska"]$lat <- 46 map_df[map_df$region == "koroska"]$long <- 15.1 map_df[map_df$region == "koroska"]$lat <- 46.5 map_df[map_df$region == "podravska"]$long <- 15.7 map_df[map_df$region == "podravska"]$lat <- 46.45 map_df[map_df$region == "pomurska"]$long <- 16.2 map_df[map_df$region == "pomurska"]$lat <- 46.65 map_df[map_df$region == "JV Slovenija"]$long <- 15. 
map_df[map_df$region == "JV Slovenija"]$lat <- 45.7 map_df$ci_size <- (map_df$high - map_df$low) map_df$ci_y <- map_df$lat - 0.05 map_df$ci_label <- sprintf("(%.2f, %.2f)", map_df$low, map_df$high) map_df$avg_label <- sprintf("%.2f", map_df$average_concentration) country_plot <- ggplot() + # First layer: worldwide map geom_polygon(data = map_data("world"), aes(x=long, y=lat, group = group), color = '#9c9c9c', fill = '#f3f3f3') + # Second layer: Country map geom_polygon( data = map_data_slo, aes(x=long, y=lat, group = group), color='darkgreen', fill='green', alpha=0.2 ) + geom_point(data=map_df, aes(x=long, y=lat, fill=region, size=average_concentration, alpha=ci_size), color="black", pch=21) + geom_text(data=map_df, aes(x=long, y=ci_y, label=ci_label), size=3) + geom_text(data=map_df, aes(x=long, y=lat, label=avg_label), size=3) + scale_size_continuous(range = c(3, 12), trans = "exp") + scale_alpha_continuous(range = c(0.15, 0.75), trans = "reverse") + ggtitle("Estimated average Moustachium concentration with 95% CI") + coord_cartesian(xlim=c(13.2, 16.7), ylim=c(45.4, 47.)) # ggsave(plot=country_plot, width=18, height=12, dpi=300, filename="country.pdf") country_plot
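For the later parts of the exercise, rather than changing input_dataset_path by hand and eyeballing the plots, a small helper can loop over the four generated datasets and report the average width of the 95% bootstrap percentile CI, making the shrinkage of the intervals with a growing number of contractors explicit. This is an illustrative sketch; it assumes the four moustachium_*.Rda files produced by the generation script are in the working directory, and the helper name compare_ci_widths is ours.
# Illustrative sketch: average width of the 95% bootstrap percentile CI per dataset
compare_ci_widths <- function(paths, m = 1000) {
  sapply(paths, function(p) {
    df <- readRDS(p)
    widths <- apply(df, 1, function(vec) {
      boots <- replicate(m, mean(sample(vec, length(vec), replace = TRUE)))
      diff(quantile(boots, probs = c(0.025, 0.975)))
    })
    mean(widths)  # average CI width across the regions
  })
}
set.seed(0)
compare_ci_widths(c("moustachium_2025.Rda", "moustachium_2027.Rda",
                    "moustachium_2028.Rda", "moustachium_2029.Rda"))
# Expectation: the widths shrink roughly like 1 / sqrt(n_contractors).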