From 2fda7d59942f396c420fbd7d99d70a77f52220e1 Mon Sep 17 00:00:00 2001 From: LPREM Date: Mon, 30 Sep 2024 11:08:20 +0000 Subject: [PATCH] upload week04 assignments --- materials/tutorial_04/tests_tutorial_04.R | 316 +++ materials/tutorial_04/tutorial_04.ipynb | 1527 ++++++++++++++ materials/worksheet_04/tests_worksheet_04.R | 328 +++ materials/worksheet_04/worksheet_04.ipynb | 2085 +++++++++++++++++++ 4 files changed, 4256 insertions(+) create mode 100644 materials/tutorial_04/tests_tutorial_04.R create mode 100644 materials/tutorial_04/tutorial_04.ipynb create mode 100644 materials/worksheet_04/tests_worksheet_04.R create mode 100644 materials/worksheet_04/worksheet_04.ipynb diff --git a/materials/tutorial_04/tests_tutorial_04.R b/materials/tutorial_04/tests_tutorial_04.R new file mode 100644 index 0000000..5ccde2e --- /dev/null +++ b/materials/tutorial_04/tests_tutorial_04.R @@ -0,0 +1,316 @@ +library(digest) +library(testthat) + +test_1.1 <- function() { + test_that('Did not assign answer to an object called "crabs_vs_width_scatterplot"', { + expect_true(exists("crabs_vs_width_scatterplot")) + }) + + test_that("Solution should be a ggplot object", { + expect_true(is.ggplot(crabs_vs_width_scatterplot)) + }) + + properties <- c(crabs_vs_width_scatterplot$layers[[1]]$mapping, crabs_vs_width_scatterplot$mapping) + + test_that("Plot should have width on the x-axis", { + expect_true("width" == rlang::get_expr(properties$x)) + }) + + test_that("Plot does not have the correct layers", { + expect_true("GeomPoint" %in% class(crabs_vs_width_scatterplot$layers[[1]]$geom)) + + # Remove if not needed: + # expect_true("GeomVline" %in% class(crabs_vs_width_scatterplot$layers[[2]]$geom)) + }) + + test_that("Plot does not use the correct data", { + expect_equal(digest(nrow(crabs_vs_width_scatterplot$data)), "8c2afe893b01f3a8c63e1aef3b5aad9e") + expect_equal(digest(round(sum(crabs_vs_width_scatterplot$data$width))), "17a701ffb51e7429bcd57678dd80b402") + + # If width is not known: + # expect_equal(digest(round(sum(pull(crabs_vs_width_scatterplot$data, rlang::get_expr(properties$x))))), "HASH_HERE") + }) + + test_that("x-axis label should be descriptive and human readable", { + expect_false(crabs_vs_width_scatterplot$labels$x == toString(rlang::get_expr(properties$x))) + }) + + test_that("Plot should have a title", { + expect_true("title" %in% names(crabs_vs_width_scatterplot$labels)) + }) + + print("Success!") +} + +test_1.2 <- function() { + test_that('Did not assign answer to an object called "crabs_group_avg_width"', { + expect_true(exists("crabs_group_avg_width")) + }) + + test_that("Solution should be a data frame", { + expect_true("data.frame" %in% class(crabs_group_avg_width)) + }) + + expected_colnames <- c("width_intervals", "mean_n_males") + given_colnames <- colnames(crabs_group_avg_width) + test_that("Data frame does not have the correct columns", { + expect_equal(length(setdiff( + union(expected_colnames, given_colnames), + intersect(expected_colnames, given_colnames) + )), 0) + }) + + test_that("Data frame does not contain the correct number of rows", { + expect_equal(digest(as.integer(nrow(crabs_group_avg_width))), "71db8a6cad03244e6e50f0ad8bc95a65") + }) + + test_that("Data frame does not contain the correct data", { + expect_equal(digest(as.integer(sum(crabs_group_avg_width$mean_n_males) * 10e4)), "12273d78c0faff1ea67c8f8e9c42edd0") + }) + + print("Success!") +} + +test_1.3 <- function() { + test_that('Did not assign answer to an object called "crabs_avg_width_scatterplot"', { + expect_true(exists("crabs_avg_width_scatterplot")) + }) + + test_that("Solution should be a ggplot object", { + expect_true(is.ggplot(crabs_avg_width_scatterplot)) + }) + + properties <- c(crabs_avg_width_scatterplot$layers[[1]]$mapping, crabs_avg_width_scatterplot$mapping) + + test_that("Plot should have width_intervals on the x-axis", { + expect_true("width_intervals" == rlang::get_expr(properties$x)) + }) + + test_that("Plot does not have the correct layers", { + expect_true("GeomPoint" %in% class(crabs_avg_width_scatterplot$layers[[1]]$geom)) + + # Remove if not needed: + # expect_true("GeomVline" %in% class(crabs_avg_width_scatterplot$layers[[2]]$geom)) + }) + + test_that("Plot does not use the correct data", { + expect_equal(digest(nrow(crabs_avg_width_scatterplot$data)), "71db8a6cad03244e6e50f0ad8bc95a65") + expect_equal(digest(round(sum(crabs_avg_width_scatterplot$data$mean_n_males))), "e1d9279a9999b2d3bb972ab3267b49c7") + + # If width_intervals is not known: + # expect_equal(digest(round(sum(pull(crabs_avg_width_scatterplot$data, rlang::get_expr(properties$x))))), "HASH_HERE") + }) + + test_that("x-axis label should be descriptive and human readable", { + expect_false(crabs_avg_width_scatterplot$labels$x == toString(rlang::get_expr(properties$x))) + }) + + test_that("Plot should have a title", { + expect_true("title" %in% names(crabs_avg_width_scatterplot$labels)) + }) + + print("Success!") +} + +test_1.4 <- function() { + test_that('Did not assign answer to an object called "answer1.4"', { + expect_true(exists("answer1.4")) + }) + + test_that('Solution should be a single character ("A", "B", or "C")', { + expect_match(answer1.4, "a|b|c", ignore.case = TRUE) + }) + + answer_hash <- digest(tolower(answer1.4)) + + test_that("Solution is incorrect", { + expect_equal(answer_hash, "127a2ec00989b9f7faf671ed470be7f8") + }) + + print("Success!") +} + +test_1.5 <- function() { + test_that('Did not assign answer to an object called "crabs_vs_width_scatterplot"', { + expect_true(exists("crabs_vs_width_scatterplot")) + }) + + test_that("Solution should be a ggplot object", { + expect_true(is.ggplot(crabs_vs_width_scatterplot)) + }) + + properties <- c(crabs_vs_width_scatterplot$layers[[1]]$mapping, crabs_vs_width_scatterplot$mapping) + + test_that("Plot should have width on the x-axis", { + expect_true("width" == rlang::get_expr(properties$x)) + }) + + test_that("Plot does not have the correct layers", { + expect_true("GeomPoint" %in% class(crabs_vs_width_scatterplot$layers[[1]]$geom)) + }) + + + test_that("Plot does not use the correct data", { + expect_equal(digest(nrow(crabs_vs_width_scatterplot$data)), "8c2afe893b01f3a8c63e1aef3b5aad9e") + expect_equal(digest(round(sum(crabs_vs_width_scatterplot$data$width))), "17a701ffb51e7429bcd57678dd80b402") + + # If width is not known: + # expect_equal(digest(round(sum(pull(crabs_vs_width_scatterplot$data, rlang::get_expr(properties$x))))), "HASH_HERE") + }) + + test_that("x-axis label should be descriptive and human readable", { + expect_false(crabs_vs_width_scatterplot$labels$x == toString(rlang::get_expr(properties$x))) + }) + + test_that("Plot should have a title", { + expect_true("title" %in% names(crabs_vs_width_scatterplot$labels)) + }) + + print("Success!") +} + + +test_1.6 <- function() { + test_that('Did not assign answer to an object called "crabs_poisson_model"', { + expect_true(exists("crabs_poisson_model")) + }) + + test_that("Solution should be a glm object", { + expect_true("glm" %in% class(crabs_poisson_model)) + }) + + test_that("Data frame does not contain the correct data", { + expect_equal(digest(as.integer(sum(crabs_poisson_model$residuals) * 10e4)), "5ab35ebc157c4f75476569c445d5a1cc") + }) + + print("Success!") +} + +test_1.7 <- function() { + test_that('Did not assign answer to an object called "crabs_poisson_model_results"', { + expect_true(exists("crabs_poisson_model_results")) + }) + + test_that("Solution should be a data frame", { + expect_true("data.frame" %in% class(crabs_poisson_model_results)) + }) + + expected_colnames <- c('term','estimate','std.error','statistic','p.value','conf.low','conf.high') + + + given_colnames <- colnames(crabs_poisson_model_results) + test_that("Data frame does not have the correct columns", { + expect_equal(length(setdiff( + union(expected_colnames, given_colnames), + intersect(expected_colnames, given_colnames) + )), 0) + }) + + test_that("Data frame does not contain the correct number of rows", { + expect_equal(digest(as.integer(nrow(crabs_poisson_model_results))), "dd4ad37ee474732a009111e3456e7ed7") + }) + + test_that("Data frame does not contain the correct data", { + expect_equal(digest(as.integer(sum(crabs_poisson_model_results$conf.low) * 10e6)), "79cef645627327fe2632a29728949fa8") + expect_equal(digest(as.integer(sum(crabs_poisson_model_results$conf.high) * 10e6)), "b2f64ff96ebb309861a640e306397224") + expect_equal(digest(as.integer(sum(crabs_poisson_model_results$statistic) * 10e6)), "09ad422316ee480535d0735aedfb2543") + }) + + print("Success!") +} + +test_1.8 <- function() { + test_that('Did not assign answer to an object called "crabs_poisson_model_results"', { + expect_true(exists("crabs_poisson_model_results")) + }) + + test_that("Solution should be a data frame", { + expect_true("data.frame" %in% class(crabs_poisson_model_results)) + }) + + expected_colnames <- c('term','estimate','std.error','statistic','p.value','conf.low','conf.high', 'exp.estimate', 'exp.conf.low', 'exp.conf.high') + + given_colnames <- colnames(crabs_poisson_model_results) + test_that("Data frame does not have the correct columns", { + expect_equal(length(setdiff( + union(expected_colnames, given_colnames), + intersect(expected_colnames, given_colnames) + )), 0) + }) + + test_that("Data frame does not contain the correct number of rows", { + expect_equal(digest(as.integer(nrow(crabs_poisson_model_results))), "dd4ad37ee474732a009111e3456e7ed7") + }) + + test_that("Data frame does not contain the correct data", { + expect_equal(digest(as.integer(sum(crabs_poisson_model_results$exp.estimate) * 10e6)), "dd50342cffd6d0cde245787cbdd97021") + expect_equal(digest(as.integer(sum(crabs_poisson_model_results$exp.conf.low) * 10e6)), "5aa9e6879f68968bdb958bdc1a4cdf32") + expect_equal(digest(as.integer(sum(crabs_poisson_model_results$exp.conf.high) * 10e6)), "b6866c669fd240e038b16d7e35b90592") + }) + + print("Success!") +} + +test_1.9 <- function() { + test_that('Did not assign answer to an object called "answer1.9"', { + expect_true(exists("answer1.9")) + }) + + answer_hash <- digest(tolower(answer1.9)) + test_that("Solution is incorrect", { + expect_equal(answer_hash, "fd4d64bc84d8d1ac10b94c23bda1a016") + }) + + print("Success!") +} + +test_1.10 <- function() { + test_that('Did not assign answer to an object called "answer1.10"', { + expect_true(exists("answer1.10")) + }) + + test_that('Solution should be a single character ("A", "B", "C", or "D")', { + expect_match(answer1.10, "a|b|c|d", ignore.case = TRUE) + }) + + answer_hash <- digest(tolower(answer1.10)) + test_that("Solution is incorrect", { + expect_equal(answer_hash, "6e7a8c1c098e8817e3df3fd1b21149d1") + }) + + print("Success!") +} + +test_1.11 <- function() { + test_that('Did not assign answer to an object called "answer1.11"', { + expect_true(exists("answer1.11")) + }) + + test_that('Solution should be a single character ("A", "B", "C", or "D")', { + expect_match(answer1.11, "a|b|c|d", ignore.case = TRUE) + }) + + answer_hash <- digest(tolower(answer1.11)) + test_that("Solution is incorrect", { + expect_equal(answer_hash, "ddf100612805359cd81fdc5ce3b9fbba") + }) + + print("Success!") +} + +test_1.12 <- function() { + test_that('Did not assign answer to an object called "answer1.12"', { + expect_true(exists("answer1.12")) + }) + + answer_as_numeric <- as.numeric(answer1.12) + test_that("Solution should be a number", { + expect_false(is.na(answer_as_numeric)) + }) + + test_that("Solution is incorrect", { + expect_equal(digest(as.integer(answer_as_numeric * 10e6)), "07660fdc17d69e9b645c10b4a1f810be") + }) + + print("Success!") +} diff --git a/materials/tutorial_04/tutorial_04.ipynb b/materials/tutorial_04/tutorial_04.ipynb new file mode 100644 index 0000000..6d85eef --- /dev/null +++ b/materials/tutorial_04/tutorial_04.ipynb @@ -0,0 +1,1527 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "f24261fa7714655f9a524ad3407a23d9", + "grade": false, + "grade_id": "cell-f1e1d845873036f4", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "# Tutorial 04: Binary or Discrete Counts Responses" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "098120a10a47865f4be13447efc6b47f", + "grade": false, + "grade_id": "cell-82d9926086d47a80", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "#### Lecture and Tutorial Learning Goals:\n", + "After completing this week's lecture and tutorial work, you will be able to:\n", + "\n", + "1. Describe the logistic regression estimation procedure (categorical data as the response variable and explanatory variables) and the Poisson regression estimation procedure (discrete counts as the response variable and explanatory variables).\n", + "2. Discuss the relationship between linear regression and logistic and Poisson regression. Discuss the consequences of modelling data more suitable for logistic and Poisson regression models as a linear regression model.\n", + "3. Interpret the coefficients and $p$-values in the logistic and Poisson regression settings.\n", + "4. Discuss useful logistic and Poisson regression diagnostics and explain why they should be performed.\n", + "5. Write a computer script to perform logistic and Poisson regression and perform model diagnostics. Interpret and communicate the results from that computer script." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "1f326fc9153c6f2dd6d3520435304a2c", + "grade": false, + "grade_id": "cell-a2a153352bc44a68", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Run this cell before continuing.\n", + "library(repr)\n", + "library(infer)\n", + "library(gridExtra)\n", + "library(mlbench)\n", + "library(AER)\n", + "library(ISLR)\n", + "library(broom)\n", + "library(qqplotr)\n", + "library(performance)\n", + "library(see)\n", + "library(MASS)\n", + "library(glmbb)\n", + "library(cowplot)\n", + "library(tidyverse)\n", + "library(digest)\n", + "source(\"tests_tutorial_04.R\")\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "0b50907239904975577e0c33ba818d9a", + "grade": false, + "grade_id": "cell-801b28434bf00853", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "## 1. Poisson Regression\n", + "\n", + "Let us proceed with Poisson regression. This class of GLM is intended for cases where the response is a count (i.e., a non-negative integer). Let's start by loading the dataset to be used in this section. The data frame `crabs` ([Brockmann, 1996](https://ubc.summon.serialssolutions.com/2.0.0/link/0/eLvHCXMwrV3JasMwEBUlpZBLl7Sl6YY_oE4sy5sgFEpICKU9NadcjFYSmtghCyRf0t-tJC_EPhRaepMHS8jSSPM0M3oGALkdx67tCYRyKV0GacCp9CBX60RCySMcYiigy2qpOq_F1ZiMLqL0v-mFYrZvvd4JXXcPEnOUZbU1z6W-fxdoh6eO-XQUvDyGCnXrbK_hxCkDDCHKgs9KH20FelDOR_pzWxXbdYhljTEanoF10e8iC6V2SbDK9Pg_H3gOTnPsar1kynYBjkTSAs1yC923wMkkNaVL8NVbkNXn8wcxjJ8b0euaZysTvyuzVJUYH9i6Kpsl1ijVOSbTtPZ2f6V6nYuectnbbLGdb2tNLNP5fjkVi1J-BcbDwbg_svOfQNhMIRdsu4KHEnIsIiQYJoGQvsKQmAuOsE8lojRnrPEIlZQHHIUEEs8PBVHQjaJr0EjSRNwAS8ebmMNk4DHkRYgQSt2ARoxRnzuhJG2AitmNlxnVR3x4REI41sMf6-GP8-GPd20QmVn7RZV4MB7p0u3fq96BZpY-rn1B96CxWW3FgyGOeDR6_g1rIQSN)) is a dataset detailing the **counts** of satellite male crabs residing around a female crab nest: `n_males`. \n", + "\n", + "> The data frame `crabs` contains 173 observations on horseshoe crabs (*Limulus polyphemus*). The response is the count of male crabs (`n_males`) around a female breeding nest. It is subject to four input variables: a factor for the `color` of the prosoma with four levels, a factor for the condition of the posterior `spine` with three levels, the continuous variables for carapace `width` (cm), and `weight` (g).\n", + "\n", + "Run the cell below before proceeding." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "279a3293df69fc4d81f0dde58d7b1056", + "grade": false, + "grade_id": "cell-c42393c3147ab6d0", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Load the data\n", + "data(crabs)\n", + "\n", + "crabs <- \n", + " crabs %>%\n", + " rename(n_males = satell) %>%\n", + " select(-y)\n", + "\n", + "str(crabs)\n", + "head(crabs)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "375e97009d5be3b1fd98d26a65226100", + "grade": false, + "grade_id": "cell-3245a397a3c0a56f", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.1**\n", + "
{points: 1}\n", + "\n", + "Create a scatterplot of `n_males` versus carapace `width` (via `geom_point()`), even though `n_males` is not continuous. The `ggplot()` object's name will be `crabs_vs_width_scatterplot`. Recall that the response must be placed on the $y$-axis, whereas the continuous input must be on the $x$-axis. Include proper axis labels and title.\n", + "\n", + "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "3721a30c4a1ba8444bde457f8bb78ee2", + "grade": false, + "grade_id": "cell-267744b8584d386b", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Adjust these numbers so the plot looks good in your desktop.\n", + "options(repr.plot.width = 7, repr.plot.height = 5) \n", + "\n", + "# crabs_vs_width_scatterplot <- \n", + "# ... %>%\n", + "# ggplot() +\n", + "# ...(aes(..., ...)) +\n", + "# labs(y = ..., x = ...) +\n", + "# ggtitle(...) +\n", + "# theme(text = element_text(size = 14)) + \n", + "# scale_x_continuous(breaks = seq(20, 34, 2))\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "crabs_vs_width_scatterplot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "0ab5a2b196ff6c3c1ed3a2b30946f711", + "grade": true, + "grade_id": "cell-4ea95eb36fa39c91", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.1()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "2bfc539dbc008a472952707429620bc4", + "grade": false, + "grade_id": "cell-c3c5c48f76ee8a52", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Note the characteristic horizontal pattern in the points of `crabs_vs_width_scatterplot`, since the $y$-axis has repeated counts associated with different `width` values. Graphically speaking, is the carapace `width` variable associated with `n_males`?\n", + "\n", + "From the `crabs_vs_width_scatterplot` above, it is hard to graphically conclude anything about the relationship between `n_males` and caparace `width`. Hence, let us plot the average `n_males` by non-overlapped carapace `width` groups. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "161df85309f5e424a3259e1dccfdd3db", + "grade": false, + "grade_id": "cell-d8dc6bd50a1e51f1", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.2**\n", + "
{points: 1}\n", + "\n", + "Create a data frame called `crabs_group_avg_width`, which is created from `crabs` and has two columns:\n", + "\n", + "- `width_intervals`: a column created with column `width` via function `cut()` with `breaks = 10` (i.e., bins).\n", + "- `mean_n_males`: the average `n_males` by each bin.\n", + "\n", + "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "fd0a0503432435b9719b221f448c9bbe", + "grade": false, + "grade_id": "cell-e136e294390e75db", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# crabs_group_avg_width <- \n", + "# ... %>%\n", + "# ...(width_intervals = ...(..., ...)) %>%\n", + "# group_by(...) %>% \n", + "# summarise(... = ...(...)) \n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "crabs_group_avg_width" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "429c9f614730ce806230cd149ce6cd13", + "grade": true, + "grade_id": "cell-24bf5f18b3ee2717", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.2()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "99fa6435e8297d3e3b7726dd7c3bf377", + "grade": false, + "grade_id": "cell-fcf23138e5f9e32f", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.3**\n", + "
{points: 1}\n", + "\n", + "Create another scatterplot of `mean_n_males` on the $y$-axis versus the carapace `width_intervals` on the $x$-axis using `crabs_group_avg_width` with `geom_point()`. The `ggplot()` object's name will be `crabs_avg_width_scatterplot`. Include proper axis labels and title.\n", + "\n", + "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "6b094f63a750e461e09928bfb6064dbe", + "grade": false, + "grade_id": "cell-72dcede930e14421", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Adjust these numbers so the plot looks good in your desktop.\n", + "options(repr.plot.width = 8, repr.plot.height = 10)\n", + "\n", + "# Crabs_avg_width_scatterplot <- \n", + "# ... %>%\n", + "# ggplot() +\n", + "# ...(aes(..., ...), colour = \"red\", size = 4) +\n", + "# labs(y = ..., x = ...) +\n", + "# ggtitle(...) +\n", + "# theme(text = element_text(size = 14), \n", + "# axis.text.x = element_text(angle = 45, hjust = 1))\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "plot_grid(crabs_avg_width_scatterplot, crabs_vs_width_scatterplot, ncol = 1)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "06d23e2fc7522963d9ad92d2bc5ceedc", + "grade": true, + "grade_id": "cell-c470e7f855f8f4c8", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.3()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "a227e6065d0254d0e966f2b955bdde66", + "grade": false, + "grade_id": "cell-58f301f5cab01d1a", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.4**\n", + "
{points: 1}\n", + "\n", + "By looking at `Crabs_avg_width_scatterplot`, graphically speaking, what is the relationship between `n_males` and carapace `width`?\n", + "\n", + "**A.** Positive.\n", + "\n", + "**B.** Negative.\n", + "\n", + "**C.** No relationship.\n", + "\n", + "*Assign your answer to the object `answer1.4` (character type surrounded by quotes).*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "d417bb44aa9ac1ad7ea23c927da2496f", + "grade": false, + "grade_id": "cell-5f4531deea0754da", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# answer1.4 <- ...\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "answer1.4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "3fe969a9a11de52d0812d9d263da9ee1", + "grade": true, + "grade_id": "cell-32224ab4e0d00a59", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.4()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "f8fc0931470e4cfa7253a3dfed528cb7", + "grade": false, + "grade_id": "cell-98e957ee7f6dfc9c", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Let's compare `crabs_vs_width_scatterplot` and `crabs_avg_width_scatterplot`. We can see that working with the averages of `n_males` by carapace `width` intervals gives us a clearer perspective of the relationship between these two variables. Nonetheless, we need to find a suitable model to confirm this statistically.\n", + "\n", + "Recall that the residual component in an ordinary linear regression model, namely $\\varepsilon_i$, is assumed to be Normally distributed, making the response $Y_i$ Normally distributed. In this case, our response variable is the \"Number of male crabs\" (a count). Count distributions can be asymmetric, and they are non-negative. Thus, the Normal distribution might not be adequate. Nor the logistic regression since we are not estimating proportions. \n", + "\n", + "A very useful distribution to model counts is the Poisson distribution (it's not the only one). " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "f0fb4d253c9feb7f00dd8fa6bec74e85", + "grade": false, + "grade_id": "cell-d07f008bc4129b8f", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**The Poisson Regression**\n", + "\n", + "A Poisson random variable takes discrete non-negative integer values (i.e., 0, 1, 2,...) that count something in a given timeframe or even in a space such as a geographic unit. \n", + "\n", + "The Poisson regression model is given by:\n", + "\n", + "$$Y_i|\\mathbf{X}_i \\sim \\text{Poisson}(\\lambda_i),$$\n", + "\n", + "$$\\log(\\lambda_i) = \\beta_0 + \\beta_1X_{1,i} + \\ldots + \\beta_pX_{1,p}$$\n", + "\n", + "or equivalently,\n", + "\n", + "$$\\lambda_i = e^{\\beta_0 + \\beta_1X_{1,i} + \\ldots + \\beta_pX_{1,p}}$$\n", + "\n", + "where each variable has its own mean, $\\lambda_i$, and variance, also $\\lambda_i$. The parameter $\\lambda_i$ is interpreted as the risk of an event occurring in a given timeframe or even a space. Note that $\\lambda_i$ cannot be negative.\n", + "\n", + "A particularity of the Poisson distribution is that its mean equals its variance. Thus, any factor that affects the mean will also affect the variance. This fact could be a potential drawback for using a Poisson regression model." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "d35add8e63b91a046d7fa9f86237a9ae", + "grade": false, + "grade_id": "cell-27605b7706aedb71", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.5**\n", + "
{points: 1}\n", + "\n", + "For our `crabs` dataset, the events are the number of male crabs, `n_males,` around a space: the female breeding nest. Suppose we want to make an inference on whether the carapace `width` is related to the response `n_males.` Thus, we could use Poisson regression. Let $\\texttt{width}_i$ be the $i$th value for the input `width` in our dataset `crabs`. The model's regression equation will be:\n", + "\n", + "$$\\log(\\lambda_i) = \\beta_0 + \\beta_1\\texttt{width}_i$$\n", + "\n", + "Let us plot the predictions of this model on top of `crabs_vs_width_scatterplot`. Use `geom_smooth()` with `method = \"glm\"` and `method.args = list(family = poisson)`.\n", + "\n", + "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "6c463caf91e5d9f1b4a701c1e92f64b9", + "grade": false, + "grade_id": "cell-3a1e6d7e495e79d0", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Adjust these numbers so the plot looks good in your desktop.\n", + "options(repr.plot.width = 9, repr.plot.height = 5)\n", + "\n", + "# crabs_vs_width_scatterplot <- \n", + "# crabs_vs_width_scatterplot +\n", + "# ...(aes(..., ...), \n", + "# ...,\n", + "# se = FALSE,\n", + "# ...)\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "crabs_vs_width_scatterplot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "419c75caf1de4a00fff5719224bc8cb6", + "grade": true, + "grade_id": "cell-e6f45c72012f6548", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.5()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "d6e75e61e3d522f406840740c0695ff5", + "grade": false, + "grade_id": "cell-04bbdae892ba5783", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "As seen in other models, the parameters $\\beta_0, \\beta_1, \\dots, \\beta_{p}$ are unknown population coefficients that we want to estimate using data. \n", + "\n", + "In order to fit a Poisson regression model, we can also use the function `glm()` and its argument `family = poisson` (required to specify the Poisson nature of the response), which obtains the estimates $\\hat{\\beta}_0, \\hat{\\beta}_1, \\dots \\hat{\\beta}_{p}$. The estimates are obtained through maximum likelihood." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "78a6d0c6681eea49f326f46bc83543fa", + "grade": false, + "grade_id": "cell-1cd6b2d8388adfad", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.6**\n", + "
{points: 1}\n", + "\n", + "Using `glm()`, estimate a Poisson regression model with `n_males` as a response and two input variables: `width` \n", + "($\\texttt{width}_i$) and `color` ($\\text{colordarker}_i$, $\\texttt{colorlight}_i$, and $\\texttt{colormedium}_i$) for the $i$th observation:\n", + "\n", + "$$\n", + "log(\\lambda_i) = \\beta_0 + \\beta_1 \\texttt{width}_i + \\beta_2 \\texttt{colordarker}_i + \\beta_3 \\texttt{colorlight}_i + \\beta_4 \\texttt{colormedium}_i.\n", + "$$\n", + "\n", + "Note that the ordinal input `color` has four levels (where `dark` is the baseline): " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "ef6561481dd7f355a99408c62d1dcd9a", + "grade": false, + "grade_id": "cell-a5c51d7457cd83a8", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "levels(crabs$color)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "4a8d65de0162e423e9065edb029862bb", + "grade": false, + "grade_id": "cell-3474c3e0f1563e34", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Therefore, we have three dummy variables: $\\texttt{colordarker}_i$, $\\texttt{colorlight}_i$, and $\\texttt{colormedium}_i$. Depending on the `color`, these dummy variables take on the following values:\n", + "\n", + "- When `color` is `darker`, then $\\texttt{colordarker}_i = 1$ while the other two dummy variables $\\texttt{colorlight}_i = \\texttt{colormedium}_i = 0$.\n", + "- When `color` is `light`, then $\\texttt{light}_i = 1$ while the other two dummy variables $\\texttt{colordarker}_i = \\texttt{colormedium}_i = 0$.\n", + "- When `color` is `medium`, then $\\texttt{medium}_i = 1$ while the other two dummy variables $\\texttt{colordarker}_i = \\texttt{colorlight}_i = 0$.\n", + "\n", + "Call the model `crabs_poisson_model`.\n", + " \n", + "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.* " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "151d34a5c399f3312e1c423f492e01bc", + "grade": false, + "grade_id": "cell-cce6669279499835", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# crabs_poisson_model <- ...(...,\n", + "# ...,\n", + "# ...)\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "summary(crabs_poisson_model)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "01ca38e188374d6711e79151a5fec892", + "grade": true, + "grade_id": "cell-97cc9b41beb464bf", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.6()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "95cbd593551bb504c9e9d593818fb4b7", + "grade": false, + "grade_id": "cell-b7f592bf64da6cf8", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.7**\n", + "
{points: 1}\n", + "\n", + "Report the estimated coefficients, their standard errors, and corresponding $p$-values using `tidy()` with `crabs_poisson_model`. Include the corresponding asymptotic 95% confidence intervals. Store the results in the variable `crabs_poisson_model_results`.\n", + "\n", + "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "a38de77492b5554b2119df7df243eb5f", + "grade": false, + "grade_id": "cell-4deafdcc73d1ddc4", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# crabs_Poisson_model_results <- \n", + "# ...\n", + "# ...(... = TRUE) %>%\n", + "# mutate_if(is.numeric, round, 4)\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "crabs_poisson_model_results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "338e86ff29326cb30c941ede1a46255c", + "grade": true, + "grade_id": "cell-441e381e29ca858b", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.7()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "32289516563e1f9e7c19dac29f5f1a32", + "grade": false, + "grade_id": "cell-f95b8a861b29b15b", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.8**\n", + "
{points: 1}\n", + "\n", + "We can also interpret the exponentiated coefficients since we are using the logarithmic link function. Add to `crabs_poisson_model_results` the estimate and 95% confidence interval of $e^{\\beta_j}, j=0,...,p$. \n", + "\n", + "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "712679efc27fcf52ef832a7860a2ba41", + "grade": false, + "grade_id": "cell-faa353ab9211c114", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# crabs_poisson_model_results <- \n", + "# crabs_poisson_model_results %>%\n", + "# mutate(exp.estimate = ...,\n", + "# exp.conf.low = ...,\n", + "# exp.conf.high = ...) %>%\n", + "# mutate_if(is.numeric, round, 4)\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "crabs_poisson_model_results\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "a615caefcc12fee9864b794e383c0a3d", + "grade": true, + "grade_id": "cell-1822a6990bcbbd0f", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.8()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "7a2c5ac3641b01bf800bd44b047d7fdc", + "grade": false, + "grade_id": "cell-d029850491a79223", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Note that you can also get the exponentiated estimated coefficients using `tidy` when the link function is `log` or `logit`. Note that `std.error` and `statistic` are not adjusted and it does not report the raw coefficients." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "9c9fb3abcdd9bcdeef76b298f28b8635", + "grade": false, + "grade_id": "cell-d41aa0a8719a312b", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tidy(crabs_poisson_model, exponentiate = FALSE,conf.int = TRUE)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "2f0a3d44c3c01d060bfda78b4bb657b1", + "grade": false, + "grade_id": "cell-87f78f36becf5ba3", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "tidy(crabs_poisson_model, exponentiate = TRUE,conf.int = TRUE)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "6d061bd70509b2bfd45a531104c717f0", + "grade": false, + "grade_id": "cell-7c2691fb066e9a1e", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.9**\n", + "
{points: 1}\n", + "\n", + "Using a significance level $\\alpha = 0.05$, and the output in `crabs_Poisson_model_results`, which of the following statements is TRUE?\n", + "\n", + "**A.** There's enough evidence to reject the null hypothesis that the coefficient of carapace `width` is zero.\n", + "\n", + "**B.** There's enough evidence to reject the null hypothesis that, for any width, the mean numbers of male crabs with `dark` and `darker` colours of the prosoma are equal.\n", + "\n", + "**C.** There's enough evidence to reject the null hypothesis that, for any width, the mean numbers of male crabs with `dark` and `light` colours of the prosoma are equal. \n", + "\n", + "**D.** There's enough evidence to reject the null hypothesis that, for any width, the mean numbers of male crabs with `dark` and `medium` colours of the prosoma are equal. \n", + "\n", + "*Assign your answers to the object `answer1.9`. Your answers must be included in a single string indicating the correct options in alphabetical order and surrounded by quotes (e.g., `\"ABCD\"` indicates you are selecting the four options).*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "d65d30b90231faecc44c6d528236c653", + "grade": false, + "grade_id": "cell-d1466876e62cbd8d", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# answer1.9 <- \n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "answer1.9" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "9a60387f4dcfb40d6906afa52ea7a93e", + "grade": true, + "grade_id": "cell-c4b2bdbb07660385", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.9()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "edeb676291aa7628423840ae4daff67b", + "grade": false, + "grade_id": "cell-3cce4716202a3ac2", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Interpretation of estimated coefficients for continuous covariates** \n", + "\n", + "Firstly, let us focus on the coefficient interpretation corresponding to carapace `width`, *while keeping `color` constant*. Consider an observation with a given value ${\\texttt{width}} = \\texttt{w}$ cm, and another observation with a given ${\\texttt{width + 1}} = \\texttt{w} + 1$ cm (i.e., an increase of $1$ cm). Then we have their corresponding regression equations:\n", + "\n", + "$$\n", + "\\log \\lambda_{\\texttt{width}} = \\beta_0 + \\beta_1 \\overbrace{\\texttt{w}}^{{\\texttt{width}}} + \\overbrace{\\beta_2 {\\texttt{colordarker}} + \\beta_3 {\\texttt{colorlight}} + \\beta_4 {\\texttt{colormedium}}}^{\\text{Constant}}\n", + "$$\n", + "$$\n", + "\\log \\lambda_{\\texttt{width + 1}} = \\beta_0 + \\beta_1 \\underbrace{(\\texttt{w} + 1)}_{{\\texttt{width + 1}}} + \\underbrace{\\beta_2 {\\texttt{colordarker}} + \\beta_3 {\\texttt{colorlight}} + \\beta_4 {\\texttt{colormedium}}.}_{\\text{Constant}}\n", + "$$\n", + "\n", + "We take the difference between both equations as:\n", + "\n", + "\\begin{align*}\n", + "\\log \\lambda_{\\texttt{width + 1}} - \\log \\lambda_{\\texttt{width}} &= \\beta_1 (\\texttt{w} + 1) - \\beta_1 \\texttt{w} \\\\\n", + "&= \\beta_1\n", + "\\end{align*}\n", + "\n", + "Then, we apply the logarithm property for a ratio:\n", + "\n", + "\\begin{align*}\n", + "\\log \\frac{\\lambda_{\\texttt{width + 1}} }{\\lambda_{\\texttt{width}}} &= \\log \\lambda_{\\texttt{width + 1}} - \\log \\lambda_{\\texttt{width}} \\\\\n", + "&= \\beta_1\n", + "\\end{align*}\n", + "\n", + "Finally, we have to exponentiate the previous equation:\n", + "\n", + "$$\n", + "\\frac{\\lambda_{\\texttt{width + 1}} }{\\lambda_{\\texttt{width}}} = e^{\\beta_1}\n", + "$$\n", + "\n", + "This expression indicates that the mean count varies in a multiplicative way when a continuous covariate increases by 1 unit, i.e., $\\lambda_{\\texttt{width + 1}}= e^{\\beta_1}\\lambda_{\\texttt{width}}$." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "d18dc4097eb487b67385f4107bf2cafc", + "grade": false, + "grade_id": "cell-30680ad0b12b005a", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.10**\n", + "
{points: 1}\n", + "\n", + "**Using the column `exp.estimate` from `crabs_Poisson_model_results`**, what is the correct interpretation of the regression equation's estimated slope for `width`?\n", + "\n", + "**A.** The mean count of male crabs (`n_males`) around a female breeding nest decreases by $161\\%$ when increasing the carapace `width` by $1$ cm, *while keeping `color` constant*.\n", + "\n", + "**B.** The mean count of male crabs (`n_males`) around a female breeding nest increases by $161\\%$ when increasing the carapace `width` by $1$ cm, *while keeping `color` constant*.\n", + "\n", + "**C.** The mean count of male crabs (`n_males`) around a female breeding nest increases by $16.1\\%$ when increasing the carapace `width` by $1$ cm, *while keeping `color` constant*.\n", + "\n", + "**D.** The mean count of male crabs (`n_males`) around a female breeding nest decreases by $16.1\\%$ when increasing the carapace `width` by $1$ cm, *while keeping `color` constant*.\n", + "\n", + "**E.** The mean count of *dark* male crabs (`n_males`) around a female breeding nest increases by $161\\%$ when increasing the carapace `width` by $1$ cm.\n", + "\n", + "**F.** The mean count of *dark* male crabs (`n_males`) around a female breeding nest increases by $16.1\\%$ when increasing the carapace `width` by $1$ cm.\n", + "\n", + "*Assign your answer to the object `answer1.10`. Your answer should be one of `\"A\"`, `\"B\"`, `\"C\"`, `\"D\"`, `\"E\"`, or `\"F\"` surrounded by quotes.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "52bf049475f2209a74c35c60713d28d2", + "grade": false, + "grade_id": "cell-813f996317ff61dd", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# answer1.10 <- ...\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "answer1.10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "f7233916807a2fc8edae2069861a8ade", + "grade": true, + "grade_id": "cell-01b57f216ff8f262", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.10()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "d62e690cc37cbc4559a066dc9388f493", + "grade": false, + "grade_id": "cell-01590258a807acee", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Interpretation of estimated coefficients of dummy variables** \n", + "\n", + "*Keeping `width` constant*, at any value (recall assumption of additive models!) consider an observation from the `darker` group: \n", + "\n", + "Then we have their corresponding regression equations:\n", + "\n", + "$$\n", + "\\log \\lambda_{\\texttt{width}} = \\beta_0 + \\beta_1 \\overbrace{\\texttt{w}}^{\\text{Constant}} + \\beta_2 {\\texttt{colordarker}} + \\beta_3 {\\texttt{colorlight}} + \\beta_4 {\\texttt{colormedium}}\n", + "$$\n", + "\n", + "$$\n", + "\\log \\lambda_{\\texttt{width,dark}} = \\beta_0 + \\beta_1 \\texttt{w} + \\beta_2 \\times 0 + \\beta_3 \\times 0 + \\beta_4 \\times 0 \n", + "$$\n", + "\n", + "$$\n", + "\\log \\lambda_{\\texttt{width,darker}} = \\beta_0 + \\beta_1 \\texttt{w} + \\beta_2 \\times 1 + \\beta_3 \\times 0 + \\beta_4 \\times 0.\n", + "$$\n", + "\n", + "We take the difference between both equations as:\n", + "\n", + "\\begin{align*}\n", + "\\log \\lambda_{\\texttt{width,darker}} - \\log \\lambda_{\\texttt{width,dark}} = \\beta_2.\n", + "\\end{align*}\n", + "\n", + "Then, we apply the logarithm property for a ratio:\n", + "\n", + "\\begin{align*}\n", + "\\log \\frac{\\lambda_{\\texttt{width,darker}} }{\\lambda_{\\texttt{width,dark}}} &= \\log \\lambda_{\\texttt{width,darker}} - \\log \\lambda_{\\texttt{width,dark}} \\\\\n", + "&= \\beta_2.\n", + "\\end{align*}\n", + "\n", + "Finally, we have to exponentiate the previous equation:\n", + "\n", + "$$\n", + "\\frac{\\lambda_{\\texttt{width,darker}} }{\\lambda_{\\texttt{width,dark}}} = e^{\\beta_2}.\n", + "$$\n", + "\n", + "The expression $\\frac{\\lambda_{\\texttt{width,darker}} }{\\lambda_{\\texttt{width,dark}}} = e^{\\beta_2}$ indicates that the mean count changes in a multiplicative way between the two groups." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "2730f7c988816ff03732f5225d70d2eb", + "grade": false, + "grade_id": "cell-f8af88cea66668b0", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.11**\n", + "
{points: 1}\n", + "\n", + "Let us move on to the interpretation of the coefficient corresponding to `light` from `color` (with reference level `dark`).\n", + "\n", + "Using the `crabs_Poisson_model_results` tibble, what is the correct interpretation of the regression estimated coefficient for the dummy variable `light`?\n", + "\n", + "**A.** The mean count of male crabs (`n_males`) around a female breeding nest is $54.7\\%$ lower in the `light` prosoma group compared to `dark` group, *while keeping the carapace `width` constant (for any width value).*\n", + "\n", + "**B.** The mean count of male crabs (`n_males`) around a female breeding nest $54.7\\%$ higher in the `light` prosoma group compared to `dark` group, *while keeping the carapace `width` constant (for any width value).*\n", + "\n", + "**C.** The mean count of male crabs (`n_males`) around a female breeding nest $154.7\\%$ higher in the `light` prosoma group compared to `dark` group, *while keeping the carapace `width` constant (for any width value).*\n", + "\n", + "**D.** The mean count of male crabs (`n_males`) around a female breeding nest $154.7\\%$ lower in the `light` prosoma group compared to `dark` group, *while keeping the carapace `width` constant (for any width value).*\n", + "\n", + "**E.** The mean count of male crabs (`n_males`) around a female breeding nest is $54.7\\%$ lower in the `dark` prosoma group compared to `light` group, *while keeping the carapace `width` constant (for any width value).*\n", + "\n", + "**F.** The mean count of male crabs (`n_males`) around a female breeding nest $54.7\\%$ higher in the `dark` prosoma group compared to `light` group, *while keeping the carapace `width` constant (for any width value).*\n", + "\n", + "*Assign your answer to the object `answer1.11`. Your answer should be one of `\"A\"`, `\"B\"`, `\"C\"`, `\"D\"`, `\"E\"`, or `\"F\"` surrounded by quotes.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "ff760b89269d4912c27982a3fa0e2af6", + "grade": false, + "grade_id": "cell-9dd1384a6af3a36a", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# answer1.11 <- ...\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "answer1.11" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "d9edace5350e6c726a32663d2322356c", + "grade": true, + "grade_id": "cell-27d30eed0b7b8bae", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.11()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "441ed357f144184437bffc19a3893267", + "grade": false, + "grade_id": "cell-96565a64571f2a45", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.12**\n", + "
{points: 1}\n", + "\n", + "Suppose we want to predict the mean count of male crabs (`n_males`) around a female breeding nest with a carapace `width` of $27.5$ cm and a `light` `color` of the prosoma. Then, the corresponding prediction is obtained using the function `predict()` with the object `crabs_poisson_model`.\n", + "\n", + "> **Hint:** Check the argument `type` when coding this prediction.\n", + "\n", + "*Assign your answer to the object `answer1.12`. Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "e495e0ababf6655472414e9b0822ac7a", + "grade": false, + "grade_id": "cell-5f4ccaec6bf73492", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# answer1.12 <- \n", + "# ...(...,\n", + "# tibble(..., ..., ...),\n", + "# type = ...\n", + "# )\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "answer1.12" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "81c218f35362c5b32755ab5d436812c3", + "grade": true, + "grade_id": "cell-0df8c510b063bb57", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.12()" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,Rmd" + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.3.3" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +} diff --git a/materials/worksheet_04/tests_worksheet_04.R b/materials/worksheet_04/tests_worksheet_04.R new file mode 100644 index 0000000..77103fc --- /dev/null +++ b/materials/worksheet_04/tests_worksheet_04.R @@ -0,0 +1,328 @@ +library(digest) +library(testthat) + +test_1.1 <- function() { + test_that('Did not assign answer to an object called "default_SLR_plot"', { + expect_true(exists("default_SLR_plot")) + }) + + test_that("Solution should be a ggplot object", { + expect_true(is.ggplot(default_SLR_plot)) + }) + + properties <- c(default_SLR_plot$layers[[1]]$mapping, default_SLR_plot$mapping) + + test_that("Plot should have balance on the x-axis", { + expect_true("balance" == rlang::get_expr(properties$x)) + }) + + test_that("Plot does not have the correct layers", { + expect_true("GeomPoint" %in% class(default_SLR_plot$layers[[1]]$geom)) + }) + + + test_that("Plot does not use the correct data", { + expect_equal(digest(nrow(default_SLR_plot$data)), "f1a30baa3072c1aad822c059f35c6841") + expect_equal(digest(round(sum(default_SLR_plot$data$balance))), "3267136924eba9966e7fa0114b916d95") + }) + + test_that("x-axis label should be descriptive and human readable", { + expect_false(default_SLR_plot$labels$x == toString(rlang::get_expr(properties$x))) + }) + + test_that("Plot should have a title", { + expect_true("title" %in% names(default_SLR_plot$labels)) + }) + + print("Success!") +} + +test_1.2 <- function() { + test_that('Did not assign answer to an object called "logistic_curve"', { + expect_true(exists("logistic_curve")) + }) + + test_that("Solution should be a ggplot object", { + expect_true(is.ggplot(logistic_curve)) + }) + + properties <- c(logistic_curve$layers[[1]]$mapping, logistic_curve$mapping) + + test_that("Plot should have z on the x-axis", { + expect_true("z" == rlang::get_expr(properties$x)) + }) + + test_that("Plot does not have the correct layers", { + expect_true("GeomLine" %in% class(logistic_curve$layers[[1]]$geom)) + + # Remove if not needed: + # expect_true("GeomVline" %in% class(logistic_curve$layers[[2]]$geom)) + }) + + test_that("Plot does not use the correct data", { + expect_equal(digest(nrow(logistic_curve$data)), "cada6c6b62b103dca67f23bd7d60ac3c") + expect_equal(digest(round(sum(logistic_curve$data$z))), "908d1fd10b357ed0ceaaec823abf81bc") + }) + + test_that("Plot should have a title", { + expect_true("title" %in% names(logistic_curve$labels)) + }) + + print("Success!") +} + +test_1.3 <- function() { + test_that('Did not assign answer to an object called "answer1.3"', { + expect_true(exists("answer1.3")) + }) + + answer_as_numeric <- as.numeric(answer1.3) + test_that("Solution should be a number", { + expect_false(is.na(answer_as_numeric)) + }) + + test_that("Solution is incorrect", { + expect_equal(digest(as.integer(answer_as_numeric * 10e6)), "9ce8f1f78c56eccda919dc6404297331") + }) + + print("Success!") +} + +test_1.4 <- function() { + test_that('Did not assign answer to an object called "default_SLR_plot"', { + expect_true(exists("default_SLR_plot")) + }) + + test_that("Solution should be a ggplot object", { + expect_true(is.ggplot(default_SLR_plot)) + }) + + properties <- c(default_SLR_plot$layers[[1]]$mapping, default_SLR_plot$mapping) + + test_that("Plot should have balance on the x-axis", { + expect_true("balance" == rlang::get_expr(properties$x)) + }) + + test_that("Plot does not have the correct layers", { + expect_true("GeomPoint" %in% class(default_SLR_plot$layers[[1]]$geom)) + }) + + + test_that("Plot does not use the correct data", { + expect_equal(digest(nrow(default_SLR_plot$data)), "f1a30baa3072c1aad822c059f35c6841") + expect_equal(digest(round(sum(default_SLR_plot$data$balance))), "3267136924eba9966e7fa0114b916d95") + }) + + test_that("x-axis label should be descriptive and human readable", { + expect_false(default_SLR_plot$labels$x == toString(rlang::get_expr(properties$x))) + }) + + test_that("Plot should have a title", { + expect_true("title" %in% names(default_SLR_plot$labels)) + }) + + print("Success!") +} + +#----- +test_1.5 <- function() { + test_that('Did not assign answer to an object called "default_binary_log_student"', { + expect_true(exists("default_binary_log_student")) + }) + + test_that("Solution should be a glm object", { + expect_true("glm" %in% class(default_binary_log_student)) + }) + + test_that("Data frame does not contain the correct data", { + expect_equal(digest(as.integer(sum(default_binary_log_student$residuals) * 10e4)), + "1473d70e5646a26de3c52aa1abd85b1f") + }) + + print("Success!") +} + +test_1.6 <- function() { + test_that('Did not assign answer to an object called "answer1.6"', { + expect_true(exists("answer1.6")) + }) + + test_that('Solution should be a single character ("A", "B", "C", or "D")', { + expect_match(answer1.6, "a|b|c|d", ignore.case = TRUE) + }) + + answer_hash <- digest(tolower(answer1.6)) + + test_that("Solution is incorrect", { + expect_equal(answer_hash, "ddf100612805359cd81fdc5ce3b9fbba") + }) + + print("Success!") +} + + +test_1.7 <- function() { + test_that('Did not assign answer to an object called "default_binary_log_model"', { + expect_true(exists("default_binary_log_model")) + }) + + test_that("Solution should be a glm object", { + expect_true("glm" %in% class(default_binary_log_model)) + }) + + test_that("Data frame does not contain the correct data", { + expect_equal(digest(as.integer(sum(default_binary_log_model$residuals) * 10e4)), "4cd0d70c6d524273008afa4703cd9df0") + }) + + print("Success!") +} + + +test_1.8 <- function() { + test_that('Did not assign answer to an object called "answer1.8"', { + expect_true(exists("answer1.8")) + }) + + test_that('Solution should be a single character ("A", "B", "C", or "D")', { + expect_match(answer1.8, "a|b|c|d", ignore.case = TRUE) + }) + + answer_hash <- digest(tolower(answer1.8)) + + test_that("Solution is incorrect", { + expect_equal(answer_hash, "6e7a8c1c098e8817e3df3fd1b21149d1") + }) + + print("Success!") +} + +test_1.9 <- function() { + test_that('Did not assign answer to an object called "answer1.9"', { + expect_true(exists("answer1.9")) + }) + + test_that('Solution should be a single character ("A", "B", "C", or "D")', { + expect_match(answer1.9, "a|b|c|d", ignore.case = TRUE) + }) + + answer_hash <- digest(tolower(answer1.9)) + + test_that("Solution is incorrect", { + expect_equal(answer_hash, "127a2ec00989b9f7faf671ed470be7f8") + }) + + print("Success!") +} + +test_1.10 <- function() { + test_that('Did not assign answer to an object called "default_binary_log_model_results"', { + expect_true(exists("default_binary_log_model_results")) + }) + + test_that("Solution should be a data frame", { + expect_true("data.frame" %in% class(default_binary_log_model_results)) + }) + + expected_colnames <- c('term','estimate','std.error','statistic','p.value','conf.low','conf.high') + + + given_colnames <- colnames(default_binary_log_model_results) + test_that("Data frame does not have the correct columns", { + expect_equal(length(setdiff( + union(expected_colnames, given_colnames), + intersect(expected_colnames, given_colnames) + )), 0) + }) + + test_that("Data frame does not contain the correct number of rows", { + expect_equal(digest(as.integer(nrow(default_binary_log_model_results))), "234a2a5581872457b9fe1187d1616b13") + }) + + test_that("Data frame does not contain the correct data", { + expect_equal(digest(as.integer(sum(default_binary_log_model_results$conf.low) * 10e6)), "ba24456096c2af163742607b3e494ef7") + expect_equal(digest(as.integer(sum(default_binary_log_model_results$conf.high) * 10e6)), "f2c22cf099c18a563f722da22c975d80") + expect_equal(digest(as.integer(sum(default_binary_log_model_results$statistic) * 10e6)), "273c6d0ba411836667be62df7641a7b4") + }) + + print("Success!") +} + +test_1.11 <- function() { + test_that('Did not assign answer to an object called "default_binary_odds_model_results"', { + expect_true(exists("default_binary_odds_model_results")) + }) + + test_that("Solution should be a data frame", { + expect_true("data.frame" %in% class(default_binary_odds_model_results)) + }) + + expected_colnames <- c('term','estimate','std.error','statistic','p.value','conf.low','conf.high') + + given_colnames <- colnames(default_binary_odds_model_results) + test_that("Data frame does not have the correct columns", { + expect_equal(length(setdiff( + union(expected_colnames, given_colnames), + intersect(expected_colnames, given_colnames) + )), 0) + }) + + test_that("Data frame does not contain the correct number of rows", { + expect_equal(digest(as.integer(nrow(default_binary_odds_model_results))), "234a2a5581872457b9fe1187d1616b13") + }) + + test_that("Data frame does not contain the correct data", { + expect_equal(digest(as.integer(sum(default_binary_odds_model_results$estimate) * 10e6)), "7a4bb9d19684e603bffc6f1961a7eda1") + expect_equal(digest(as.integer(sum(default_binary_odds_model_results$conf.low) * 10e6)), "55679ab470e07149e5a38ce2213a15df") + expect_equal(digest(as.integer(sum(default_binary_odds_model_results$conf.high) * 10e6)), "bdccf6329a8314620c8305cc31634021") + }) + + print("Success!") +} + +test_1.12 <- function() { + test_that('Did not assign answer to an object called "answer1.12"', { + expect_true(exists("answer1.12")) + }) + + answer_hash <- digest(tolower(answer1.12)) + test_that("Solution is incorrect", { + expect_equal(answer_hash, "8b63e49106226d2a45958a7e24c97c37") + }) + + print("Success!") +} + + +test_1.13 <- function() { + test_that('Did not assign answer to an object called "answer1.13"', { + expect_true(exists("answer1.13")) + }) + + answer_as_numeric <- as.numeric(answer1.13) + test_that("Solution should be a number", { + expect_false(is.na(answer_as_numeric)) + }) + + test_that("Solution is incorrect", { + expect_equal(digest(as.integer(answer_as_numeric * 10e6)), "7ea6059985f925469f09bb3af79c7dc1") + }) + + print("Success!") +} + +test_1.14 <- function() { + test_that('Did not assign answer to an object called "answer1.14"', { + expect_true(exists("answer1.14")) + }) + + answer_as_numeric <- as.numeric(answer1.14) + test_that("Solution should be a number", { + expect_false(is.na(answer_as_numeric)) + }) + + test_that("Solution is incorrect", { + expect_equal(digest(as.integer(answer_as_numeric * 10e6)), "26b7f981cf15e1b5f55fa6a141d8a3a0") + }) + + print("Success!") +} diff --git a/materials/worksheet_04/worksheet_04.ipynb b/materials/worksheet_04/worksheet_04.ipynb new file mode 100644 index 0000000..ccef425 --- /dev/null +++ b/materials/worksheet_04/worksheet_04.ipynb @@ -0,0 +1,2085 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "9289673bc88fef625580fcc14d337033", + "grade": false, + "grade_id": "cell-f1e1d845873036f4", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "# Worksheet 04: Binary or Discrete Counts Responses" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "03e626e6244044341d56ccfd007c0e05", + "grade": false, + "grade_id": "cell-82d9926086d47a80", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "## Lecture and Tutorial Learning Goals:\n", + "After completing this week's lecture and tutorial work, you will be able to:\n", + "\n", + "1. Describe the logistic regression estimation procedure (binary response variable) and Poisson regression estimation procedure (discrete counts as the response variable).\n", + "2. Discuss the relationship between linear regression and logistic and Poisson regression. Discuss the consequences of modelling data more suitable for logistic and Poisson regression models as a linear regression model.\n", + "3. Interpret the coefficients and $p$-values in the logistic and Poisson regression settings.\n", + "4. Write a computer script to perform logistic and Poisson regression and perform model diagnostics. Interpret and communicate the results from that computer script." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "f42454b22db7b193c8ee787d760af4c5", + "grade": false, + "grade_id": "cell-a2a153352bc44a68", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Run this cell before continuing.\n", + "library(MASS)\n", + "library(tidyverse)\n", + "library(repr)\n", + "library(digest)\n", + "library(infer)\n", + "library(gridExtra)\n", + "library(mlbench)\n", + "library(AER)\n", + "library(ISLR)\n", + "library(broom)\n", + "library(qqplotr)\n", + "library(performance)\n", + "library(see)\n", + "library(glmbb)\n", + "library(cowplot)\n", + "source(\"tests_worksheet_04.R\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "fa1c9c1b59e593fe70dc770887794dd8", + "grade": false, + "grade_id": "cell-82e529f0a9216400", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "## 0. Intro\n", + "\n", + "So far, you have explored the Multiple Linear Regression (MLR) as a way to model the mean of a numeric response variable, $Y$, given a set of covariate $\\mathbf{X}$:\n", + "\n", + "$$\n", + "E\\left[Y\\left|\\mathbf{X}=\\left(X_1,...,X_p\\right)\\right.\\right] = \\beta_0 + \\beta_1X_1 + \\ldots + \\beta_pX_p\n", + "$$\n", + "\n", + "However, in some situations, the MLR is not suitable. This week, we are going to study two of those situations that commonly arise in practice:\n", + "\n", + "- **in this worksheet**: the case of dichotomous response variables (e.g., yes/no, success/failure, win/lose, sick/not sick) \n", + "\n", + "- **in the tutorial**: the case of response variables representing counts (e.g., number of cases of a rare disease in Vancouver in one year; the number of accidents on the Canada Highway in one month)." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "8d3d95ab3add8b9d5b28701adf33b335", + "grade": false, + "grade_id": "cell-9be71f65643c5906", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "## 1. Logistic Regression\n", + "\n", + "Logistic Regressions are commonly used to model the probability of an event based on a set of observed covariates. As with Linear Regression, Logistic Regression can also be used to:\n", + "\n", + "- estimate and test the true relation between different types of variables and a *binary response*.\n", + "\n", + "\n", + "- predict the probability of a *binary response* (aka, classifier)\n", + "\n", + "
\n", + "\n", + "For example, we can use a logistic regression to\n", + "\n", + "1. compare the presence of bacteria between groups taking a new drug and a placebo, respectively.\n", + " - Response: *present* or *not present*\n", + "\n", + "
\n", + "\n", + "2. predict whether or not a customer will default on a loan given their income and demographic variables.\n", + " - Response: *default* or *not default*\n", + "\n", + "
\n", + "\n", + "3. know how GPA, ACT score, and number of AP classes taken are associated with the probability of getting accepted into a particular university.\n", + " - Response: *accepted* or *not accepted*\n", + " \n", + "\n", + "
\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "fcfb8d63afba7acaba42f32cd34c60ac", + "grade": false, + "grade_id": "cell-f1fa818e59b35d73", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### The response variable:\n", + "\n", + "The function in R to fit a logistic regression requires either a numerical response (0 and 1) or a `factor`, with two levels (note that R stores factors as integers). \n", + "\n", + "Mathematically, we have to construct a binary response $Y_i$ that flags the successes (S) for a given event of interest: \n", + "\n", + "$$\n", + "Y_i =\n", + "\\begin{cases}\n", + "1 \\; \\; \\; \\; \\mbox{if the $i$th observation is S},\\\\\n", + "0 \\; \\; \\; \\; \t\\mbox{otherwise.}\n", + "\\end{cases}\n", + "$$\n", + "\n", + "In the examples above, we would set:\n", + "\n", + "1. $Y_i = 1$ if the bacteria is present in the blood sample of the $i$th patient \n", + "\n", + "\n", + "2. $Y_i = 1$ if the $i$th customer defaulted on their loan\n", + "\n", + "\n", + "3. $Y_i = 1$ if the $i$th student was accepted to a particular university\n", + "\n", + "\n", + "In statistics, we refer to each $Y_i$ as a Bernoulli trial with a $p_i$ probability of success, i.e., \n", + "\n", + "$$Y_i \\sim \\text{Bernoulli}(p_i)$$\n", + "\n", + "where \n", + "\n", + "$$\n", + "E(Y_i) = p_i\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "6ff9798f84754777daec6b68c675351a", + "grade": false, + "grade_id": "cell-7aad649d27ed77eb", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "This worksheet will focus on the dataset `Default` from [*An Introduction to Statistical Learning*](https://www.statlearning.com/) (James et al., 2013). This is a dataset of $n = 10,000$ observations with the following variables:\n", + "\n", + "- `default`: a binary response indicating whether the customer defaulted on their debt (`Yes` or `No`).\n", + "- `student`: a binary input variable indicating whether the customer is a student (`Yes` or `No`).\n", + "- `balance`: a continuous input variable indicating the remaining average balance of the customer's credit card (after the monthly payment).\n", + "- `income`: a continuous input variable indicating the customer's income." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "b96b0bf2d2dc5fe1c06048f774f68692", + "grade": false, + "grade_id": "cell-be65b9244ff2e164", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "head(Default)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "8c3ac8a25cf24d8b986e1b8d1202b847", + "grade": false, + "grade_id": "cell-52cf22804e4f9008", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### Simple Linear Regression for Binary Data\n", + "\n", + "In this first exercise, we use a simple linear regression (SLR) to estimate the relation between `balance` and the response `default`.\n", + "\n", + "A SLR will model: \n", + "\n", + "$$\n", + "E\\left[\\ \\texttt{default}_i\\ |\\ \\texttt{balance}_i\\ \\right] = \\beta_0 + \\beta_1\\times \\texttt{balance}_i\n", + "$$\n", + "\n", + "However, since `default` is a binary variable, we also know that: \n", + "\n", + "$$\n", + "E\\left[\\ \\texttt{default}_i\\ |\\ \\texttt{balance}_i\\ \\right] = p_{\\texttt{default} | \\texttt{balance}_i}\n", + "$$\n", + "\n", + "this means that the conditional expectation $E\\left[\\ \\texttt{default}_i\\ |\\ \\texttt{balance}_i\\ \\right]$ is equal to the **probability** that a customer with $\\texttt{balance}_i$ will default (denoted above as $p_{\\texttt{default} | \\texttt{balance}_i}$).\n", + "\n", + "Let's examine if there is a problem using a linear regression to model this conditional expectation. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "85f5261a542c656acd71d49ed3d59418", + "grade": false, + "grade_id": "cell-ec0e34aef0968060", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.1**\n", + "
{points: 1}\n", + "\n", + "Create a plot of the data (using `geom_point()`) along with the estimated regression line (using `geom_smooth()` with `method = \"lm\"`). Include proper axis labels. The `ggplot()` object's name will be `default_SLR_plot`.\n", + "\n", + "The first clue that fitting a simple linear regression (SLR) to this data may be problematic is that R will complain about fitting an SLR with a categorical variable. Therefore, we need to convert the 'default' variable to numeric in order to fit the SLR.\n", + "\n", + "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "3f988d2d418a69b617f3514c58d78e67", + "grade": false, + "grade_id": "cell-beeb841f20cbe0b7", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Adjust these numbers so the plot looks good in your desktop.\n", + "options(repr.plot.width = 7, repr.plot.height = 5) \n", + "\n", + "# default_SLR_plot <- \n", + "# Default %>%\n", + "# mutate(default = if_else(default == \"Yes\", ..., ...)) %>%\n", + "# ...\n", + "# ...(aes(..., ...)) +\n", + "# ...(aes(..., ...), method = ..., se = FALSE) +\n", + "# labs(y = ..., x = ...) +\n", + "# ggtitle(...) +\n", + "# ylim(-0.5, 1.5) +\n", + "# theme(text = element_text(size = 16.5)) +\n", + "# scale_x_continuous(breaks = seq(0, 2500, 500))\n", + "\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "default_SLR_plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "517a6cdb14fdc0d88ff2ae2218bd6f20", + "grade": true, + "grade_id": "cell-5f01635dd4cf6b6b", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.1()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "49b1467bfe839b6a4ffa78aaf209e340", + "grade": false, + "grade_id": "cell-a0c60c4720d6d478", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "#### Discussion\n", + "\n", + "Do you see any problems with our model? Discuss it with a colleague." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "f6b5645403ad15f34d738a47d80ec61d", + "grade": false, + "grade_id": "cell-e2bdf22d86042c11", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "source": [ + "### Logistic Regression: an alternative to LR\n", + "\n", + "The problem stems from using the *linear* model to estimate a probability. \n", + "\n", + "Mathematically, the linear component $\\beta_0 + \\beta_1X_{i1} + ... +\\beta_pX_{ip}$ can take any value, while probabilities is **always** between 0 and 1.\n", + "\n", + "A natural way to solve this problem is to use a curve, instead of a line, with a range between $[0,1]$. One such curve is the logistic curve:\n", + "\n", + "$$E(Y_i|X_i) = p_i = \\frac{e^{\\beta_0 + \\beta_1X_i}}{1+e^{\\beta_0 + \\beta_1X_i}}$$ \n", + "\n", + "Note that we are still using a linear component but not to model the conditional expectation directly. \n", + "\n", + "With some algebra, we can show that:\n", + "\n", + "\\begin{equation*}\n", + "\\log\\left(\\frac{p_i}{1 - p_i}\\right) = \\beta_0 + \\beta_1 X_i\n", + "\\end{equation*}\n", + "\n", + "**Definition**: $p_i$ to $1 - p_i$ are also known as the **odds** and can be estimated by *number of sucesses* to *number of failures*\n", + "\n", + "For example, among 7056 non-student customers, 206 defaulted on their debt. Thus, the odds that non-student customers default is 206 to 6850. \n", + "\n", + "The function $\\log\\left(\\frac{p_i}{1 - p_i}\\right)$ is called **logit**, and it is *logarithm of the odds*. \n", + "\n", + "Let's explore this new function a bit." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "fc08d9d4d3dddb3cd4882cd89a017330", + "grade": false, + "grade_id": "cell-33c529c35aa6d29d", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.2**\n", + "
{points: 1}\n", + "\n", + "Let's see what the logistic curve looks like. In this exercise, you will plot the logistic curve to see how it behaves.\n", + "\n", + "_Save the plot in an object named `logistic_curve`._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "a9065d3e33d8b03430b7e82093470e67", + "grade": false, + "grade_id": "cell-ebd9f096f835b1b0", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# logistic_curve <-\n", + "# tibble(z = seq(-10,10,0.01),\n", + "# logistic_z = ...) %>% \n", + "# ggplot(aes(z, ...)) + \n", + "# geom_line() +\n", + "# geom_hline(yintercept = 1, lty=2) + \n", + "# geom_hline(yintercept = 0, lty=2) +\n", + "# theme(text = element_text(size = 20)) + \n", + "# ggtitle(\"Logistic curve\")\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "logistic_curve" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "ecf65b1c668b9efa77e328b0933067be", + "grade": true, + "grade_id": "cell-7728e71dc711badb", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.2()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "78e4c162edcc6ccf20c36e44cad9613f", + "grade": false, + "grade_id": "cell-1f6a141622c13896", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.3: Understanding the odds**\n", + "
{points: 1}\n", + "\n", + "Vancouver Canucks is playing against Calgary Flames in the Final of the NHL. The match will be at Rogers' arena, Canucks home. It is expected that out of 18,910 seats in the arena, 13,700 seats will be occupied by Canucks fans. During the match, prizes are randomly distributed among the seats. What are the odds that a Canucks fan wins a given prize? \n", + "\n", + "Assign your answer to an object named `answer1.3`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "836ce27b9d3fa4140dea20c3ceb8bb8f", + "grade": false, + "grade_id": "cell-ee8b9ad3125f8346", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "#answer1.3 <- ...\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "answer1.3" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "15f77319d1592683114eb33627f52723", + "grade": true, + "grade_id": "cell-85f66b5726146ab1", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.3()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "8be5d7606a3bf1c040bbfc760dfaa24b", + "grade": false, + "grade_id": "cell-8a7c7401c453d69c", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.4:**\n", + "
{points: 1}\n", + "\n", + "Let us plot the predictions of the binary logistic regression model on top of `default_SLR_plot`. Use `geom_smooth()` with `method = \"glm\"` and `method.args = c(family = binomial)`.\n", + "\n", + "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "6bc21a65be21dbb77b90f7294bc13166", + "grade": false, + "grade_id": "cell-47acd27ded854e48", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# default_SLR_plot <- \n", + "# default_SLR_plot +\n", + "# ...(aes(..., ...),\n", + "# method = ...,\n", + "# method.args = ..., \n", + "# se = FALSE, color = \"red\") +\n", + "# ggtitle(\"Simple Linear Regression and Binary Logistic Regression\")\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "default_SLR_plot" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "e2abe04eb35bee83f8eb82ef34ad35bb", + "grade": true, + "grade_id": "cell-4989d973853e512d", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.4()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "55d066dcc3c785ec147c6536046c0507", + "grade": false, + "grade_id": "cell-a628cff07e7aa9ce", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Much better, isn't it? " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "d22fd78f493931ed5c86a64ecee143e2", + "grade": false, + "grade_id": "cell-ed0a86b15f54a046", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "#### The model\n", + "\n", + "Let's review our model!\n", + "\n", + "The response: \n", + "\n", + "$$\n", + "Y_i =\n", + "\\begin{cases}\n", + "1 \\; \\; \\; \\; \\mbox{if the $i$th observation is a success},\\\\\n", + "0 \\; \\; \\; \\; \t\\mbox{otherwise}\n", + "\\end{cases}\n", + "$$\n", + "\n", + "can only take the values $0$ or $1$. \n", + "\n", + "The conditional expected value of this variable is the probability that $Y_i$ takes on the value of $1$ (or probability of success), given the values of $X_i$, denoted as $p_i$. Hence:\n", + "\n", + "$$\\left.Y_i\\right|\\mathbf{X}_{i} \\sim \\text{Bernoulli}(p_i).$$\n", + "\n", + "The **logistic regression** models the conditional probability $p_i$ given the information of a set of covariates but *not* directly as a linear function of them. Instead, it re-expresses $p_i$ on an unrestricted scale, called the log-odds:\n", + "\n", + "$$\n", + "\\mbox{logit}(p_i) = \\log \\bigg( \\frac{p_i}{1 - p_i}\\bigg) = \\beta_0 + \\beta_1 X_{i1} + \\beta_2 X_{i2} + \\ldots + \\beta_{q} X_{iq},\n", + "$$\n", + "\n", + "Again, the logarithm of the odds is the logarithm of the ratio of the probability of a sucess to the probability of a failure of an event\n", + "\n", + "Or equivalently\n", + "\n", + "$$\n", + "p_i = \\frac{\\exp\\big[\\beta_0 + \\beta_1 X_{i1} + \\beta_2 X_{i2} + \\ldots + \\beta_{q} X_{iq}\\big]}{1 + \\exp\\big[\\beta_0 + \\beta_1 X_{i1} + \\beta_2 X_{i2} + \\ldots + \\beta_{q} X_{iq}\\big]}.\n", + "$$\n", + "\n", + "In our example,\n", + "\n", + "$$\n", + "\\mbox{logit}(p_{i,\\texttt{default}}) = \\log \\bigg( \\frac{p_{i,\\texttt{default}}}{1 - p_{i,\\texttt{default}}}\\bigg) = \\beta_0 + \\beta_1\\texttt{balance}_i\n", + "$$\n", + "\n", + "or equivalently\n", + "\n", + "$$\n", + "p_{i,\\texttt{default}} = \\frac{e^{\\beta_0 + \\beta_1 \\texttt{balance}_i}}{1 + e^{\\beta_0 + \\beta_1 \\texttt{balance}_i}}.\n", + "$$\n", + "\n", + "In this example, the *odds* are interpreted as how likely the $i$th customer is to be in default compared to how unlikely it is at a fixed value of their balance. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "9ae7f462308f928cb7d9be38f5c8544f", + "grade": false, + "grade_id": "cell-f7510c5644481477", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### 1.1 Estimation\n", + "\n", + "The question now is, how do we estimate the coefficients $\\beta_j$'s? \n", + "\n", + "So far, in the case of linear regression, we have been using the Least Square Estimators. However, due to the type of response of the logistic model, this objective function is no longer appropriate.\n", + "\n", + "A common method for estimating a logistic regression (and many other models) is *Maximum Likelihood Estimation (MLE)*. Details of MLE are outside the scope of this course, but we will still implement it using R! " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "54572438085e0f4594ef67e567ab7a8d", + "grade": false, + "grade_id": "cell-9e11a13acd8b9ad1", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "\n", + "#### Interpretation\n", + "\n", + "As before, the interpretation of the coefficients will depend on the type of input variable. \n", + "\n", + "Let's start by looking at a model with only 1 categorical covariate, such as being or not being a student. We know that `R` will create a dummy variable $X$ to include in the model in this case. But how do we interpret its coefficient?\n", + "\n", + "Let's refresh our memory for **Linear Regression**:\n", + "\n", + "- *Intercept:* was the mean of the response of the reference group (and the *estimate* was the *sample* mean)\n", + "\n", + "- *Slope:* was the *difference* between the mean of the response of the treatment group vs that of the reference group (and the *estimate* was the difference of *sample* means)\n", + "\n", + "However, for LR, we modelled the expected value of the response directly. We now need to adjust the interpretations to changes in log-odds.\n", + "\n", + "- Intercept: $\\hat{\\beta}_0$ represents the log odds of the reference group (e.g., non-students)\n", + "\n", + "- Slope: $\\hat{\\beta}_1$ represents the difference in log odds between the treatment and the reference group (e.g., students vs. non-students)\n", + "\n", + "Since log-odds are difficult to interpret, it is common to also interpret the exponentiated version of the coefficients:\n", + "\n", + "- Intercept: $e^{\\hat{\\beta}_0}$ represents the odds of the reference group, i.e., the proportion of success relative to the proportion of failures in the sample \n", + "\n", + "- Slope: $e^{\\hat{\\beta}_1}$ represents the *odds ratio*, i.e., the ratio between the odds of the treatment vs the odds of the reference group\n", + "\n", + "*Run the cell below to compute these quantities. Read and follow calculations*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "Default %>% \n", + " select(default, student) %>% \n", + " group_by(default, student) %>%\n", + " summarise(n=n(), .groups = 'drop')" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "442a5bb6c66d4fdb142bd4fc55b21962", + "grade": false, + "grade_id": "cell-30b101cea83cb90f", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# b0 = \\hat{beta}_0\n", + "\n", + "tot_studentNo <- 6850 + 206\n", + "\n", + "# probability of default if non-student\n", + "# note that the same is true for #default1_studentNo/#default0_studentNo\n", + "p_studentNo <- 206/tot_studentNo\n", + "\n", + "b0 <- log(p_studentNo/(1-p_studentNo))\n", + "\n", + "# b1 = \\hat{beta}_1\n", + "tot_studentYes <- 2817 + 127\n", + "\n", + "# probability of default if student\n", + "p_studentYes <- 127/tot_studentYes\n", + "\n", + "# log odds of students vs. log odds of non-students\n", + "b1 <- log(p_studentYes/(1-p_studentYes)) - b0\n", + "\n", + "print(\"log-odds for non-students (b0) and for students\")\n", + "c(b0, log(p_studentYes/(1-p_studentYes))) %>%\n", + " round(3)\n", + "\n", + "print(\"odds for non-students (exp(b0)) and for students\")\n", + "c(exp(b0), (p_studentYes/(1-p_studentYes))) %>%\n", + " round(3)\n", + "\n", + "print(\"odds ratio (exp(b1))\")\n", + "c(exp(b1), (p_studentYes/(1-p_studentYes))/(p_studentNo/(1-p_studentNo))) %>% \n", + " round(3)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "2f0ea6be60de86e4f4fd190e8f3fb3c8", + "grade": false, + "grade_id": "cell-a3e8e0c543a5d4ea", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.5:**\n", + "
{points: 1}\n", + "\n", + "To fit the model, we can use the function `glm()` and its argument `family = binomial` (required to specify the binary nature of the response). \n", + "\n", + "Now, let's put this into practice. We'll estimate a binary logistic regression utilizing the function `glm()` with `default` as the response and `student` as the input variable. The dataset is `Default`.\n", + " \n", + "Store the model in an object named `default_binary_log_student`. The `glm()` parameters are analogous to `lm()` (`formula` and `data`) with the addition of `family = binomial` for this specific model. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "c909654f04f5d2c449e53b1686710d3a", + "grade": false, + "grade_id": "cell-8ac0e64c9daf4ffc", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# default_binary_log_student <- \n", + "# ...(formula = ...,\n", + "# data = ...,\n", + "# family = ...)\n", + "\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "summary(default_binary_log_student)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "9f286a3c5b2ad6c51b25dc02ee65f1ac", + "grade": true, + "grade_id": "cell-debd081e1606b49a", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.5()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "04b74bd9ec3596fb96c3024af41b5e06", + "grade": false, + "grade_id": "cell-de489b368280eba8", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "Note that you can also use the function `tidy()` to obtain a summary table of results." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "0e2913d1d2f988dfc00b2acac6a636b1", + "grade": false, + "grade_id": "cell-68c04287f7d21da3", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "#Run this cell to get a tidy summary table\n", + "\n", + "default_binary_log_student_results <-\n", + " default_binary_log_student %>%\n", + " tidy() %>% \n", + " mutate(exp.estimate = exp(estimate)) \n", + "\n", + "default_binary_log_student_results " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "You can also get the exponentiated coefficients with the argument `exponentiate = TRUE` in `tidy()`." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "tags": [] + }, + "outputs": [], + "source": [ + "default_binary_log_student %>% \n", + " tidy(exponentiate = TRUE) " + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "**Note**: that the `std.error` column still refers to the non-exponentiated estimators, e.g., it refers to $\\hat{\\beta}_0$ not $e^{\\hat{\\beta}_0}$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "62fc3fe37294849aa1d43d3bacb220b3", + "grade": false, + "grade_id": "cell-6665a0ff5a730bdf", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.6**\n", + "
{points: 1}\n", + "\n", + "Considering the `default_binary_log_student_results` tibble, what is the correct interpretation of the $\\hat{\\beta}_\\textit{student}$?\n", + "\n", + "**A.** The odds of default are $49.9\\%$ higher for non-student customers than student customers.\n", + "\n", + "**B.** The odds of default are $49.9\\%$ higher for student customers than non-student customers.\n", + "\n", + "**C.** The odds of default are $40.5\\%$ higher for non-student customers than student customers.\n", + "\n", + "**D.** The odds of default are $40.5\\%$ higher for student customers than non-student customers.\n", + "\n", + "*Assign your answer to the object `answer1.6` (character type surrounded by quotes).*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "5061c90ecf4e6a196e542aea2f516667", + "grade": false, + "grade_id": "cell-9d862eda2ad912b4", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# answer1.6 <- ...\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "48a900d29872eb46d146064466276e40", + "grade": true, + "grade_id": "cell-454d0bf3ba416081", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.6()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "e94e9e68ec304d2d1c4e6a2225841d46", + "grade": false, + "grade_id": "cell-4bf8d13d2489425d", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "#### Multiple covariates\n", + "\n", + "The interpretation extends to the case of more covariates of different types.\n", + "\n", + "- $\\hat{\\beta}_1$ gives the changes in log odds per one-unit increase in X, \n", + " - or equivalently it multiplies the odds by $e^{\\hat{\\beta}_1}$\n", + " \n", + "Because the relationship between $p_i$ and $X_i$ is not a straight line, $\\hat{\\beta}_1$ does not correspond to the change in $p_i$ associated with a one-unit change in $X_i$. But regardless,\n", + "if $\\hat{\\beta}_1$ is positive then increasing $X_i$ will be associated with increasing $p_i$ and visceversa.\n", + "\n", + "> Note that when the odds decrease, i.e., negative change in log-odds, it is easier to interpret $1/e^{\\hat{\\beta}_1}$.\n", + ">\n", + "> That is, to redefine\n", + " $\\text{odds} = \\frac{\\text{failures}}{\\text{successes}}$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "69f7988a4e800709127db086ff5b4c47", + "grade": false, + "grade_id": "cell-148b1dc1b2667581", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.7:**\n", + "
{points: 1}\n", + "\n", + "In order to fit the model, we can use the function `glm()` and its argument `family = binomial` (required to specify the binary nature of the response). \n", + "\n", + "Let us use the function `glm()` to estimate a binary logistic regression. Using the `Default` dataset, we will fit a binary logistic model with `default` as the response and `student`, `balance`, and `income` as input variables.\n", + " \n", + "Store the model in an object named `default_binary_log_model`. The `glm()` parameters are analogous to `lm()` (`formula` and `data`) with the addition of `family = binomial` for this specific model. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "3523cd610678bfd1ea994ecdf69327c4", + "grade": false, + "grade_id": "cell-7097573be6a38bf2", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# default_binary_log_model <- \n", + "# ...(...,\n", + "# ...,\n", + "# ...)\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "default_binary_log_model %>%\n", + " tidy() %>% \n", + " mutate_if(is.numeric, round, 3)\n", + "\n", + "default_binary_log_model %>%\n", + " tidy(exponentiate = TRUE) %>% \n", + " mutate_if(is.numeric, round, 3)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "b186bf9f116ab25d76f66c78304a40df", + "grade": true, + "grade_id": "cell-a0e923b260f01656", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.7()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "ef737cd98f224e50beb0e389dd5b1766", + "grade": false, + "grade_id": "cell-5fe352592ed23811", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.8**\n", + "
{points: 1}\n", + "\n", + "Considering the `default_binary_log_model_results` tibble, what is the correct interpretation of the $\\hat{\\beta}_\\textit{student}$?\n", + "\n", + "**A.** Since $1 / 0.524 = 1.908$, we estimate that the odds of non-default are $90.8\\%$ higher for non-student customers than student customers while keeping the rest of the input variables constant.\n", + "\n", + "**B.** Since $1 / 0.524 = 1.908$, we estimate that the odds of default are $90.8\\%$ higher for non-student customers than student customers while keeping the rest of the input variables constant.\n", + "\n", + "**C.** Since $1 / 0.524 = 1.908$, we estimate that the odds of non-default are $90.8\\%$ higher for student customers than non-student customers while keeping the rest of the input variables constant.\n", + "\n", + "**D.** Since $1 / 0.524 = 1.908$, we estimate that the odds of default are $90.8\\%$ higher for student custormers than non-student customers, while keeping the rest of the input variables constant.\n", + "\n", + "*Assign your answer to the object `answer1.8` (character type surrounded by quotes).*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "a52e8dcb232f6c3226e33b6217364aed", + "grade": false, + "grade_id": "cell-11677807d101b133", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# answer1.8 <- ...\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "f8b8b2dbc78416121455ee492281fa03", + "grade": true, + "grade_id": "cell-30f034478ea08799", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "test_1.8()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "db7ca2eb2ff9bab301a1d53317dfafdd", + "grade": false, + "grade_id": "cell-7960f6f613499491", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.9**\n", + "
{points: 1}\n", + "\n", + "What is the correct interpretation of the regression equation's estimated slope for `balance`?\n", + "\n", + "**A.** A $\\$1$ increase in `balance` is associated with a $0.6\\%$ increase in the odds of a customer defaulting while keeping the rest of the input variables constant. \n", + "\n", + "**B.** A $\\$1$ increase in `balance` is associated with a $0.6\\%$ increase in the odds of a customer not being in default while keeping the rest of the input variables constant. \n", + "\n", + "**C.** A $\\$1$ increase in `balance` is associated with a $0.6\\%$ increase in the odds of a student customer defaulting while keeping the rest of the input variables constant. \n", + "\n", + "**D.** A $\\$1$ increase in `balance` is associated with a $0.6\\%$ increase in the odds of a non-student customer defaulting while keeping the rest of the input variables constant.\n", + "\n", + "*Assign your answer to the object `answer1.9` (character type surrounded by quotes).*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "19623ffd8cd12291e421b7f8a0cc12cb", + "grade": false, + "grade_id": "cell-7441e6aa1307023f", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# answer1.9 <- ...\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "4809d604ee9b9b23b53ce01c29a0e108", + "grade": true, + "grade_id": "cell-347ad929fad50479", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.9()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "edd81efc8c977b63e330f8059e68fe84", + "grade": false, + "grade_id": "cell-8357d4c87f849b10", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### 1.2 Inference\n", + "\n", + "We can use this estimated model to make inferences about the population parameters, i.e., we can determine whether an input variable is statistically associated with the logarithm of the odds through hypothesis testing for the parameters $\\beta_j$. \n", + "\n", + "To do that, we need additional information about the estimators of the regression coefficients, $\\hat{\\beta}_j$. In particular, we need their sampling distribution and corresponding standard errors, $\\mbox{SE}\\left(\\hat{\\beta}_j\\right)$. \n", + "\n", + "**Theoretical test**:\n", + "\n", + "To test the hypotheses\n", + "\\begin{gather*}\n", + "H_0: \\beta_j = 0\\\\\n", + "H_a: \\beta_j \\neq 0.\n", + "\\end{gather*}\n", + "\n", + "You can use the Wald statistic $z_j$\n", + "\n", + "$$z_j = \\frac{\\hat{\\beta}_j}{\\mbox{SE}\\left(\\hat{\\beta}_j\\right)}$$\n", + "\n", + "which under $H_0$ has an approximately standard normal distribution provided the sample size $n$ is large enough. This statistic is analogous to the $t$-value used in LR. \n", + "\n", + "Furthermore, given a specified level of confidence, we can construct approximate $(1 - \\alpha) \\times 100\\%$ confidence intervals for the corresponding true value of $\\beta_j$:\n", + "\n", + "$$\\hat{\\beta}_j \\pm z_{\\alpha/2}\\mbox{SE}\\left(\\hat{\\beta}_j\\right),$$\n", + "\n", + "where $z_{\\alpha/2}$ is the upper $\\alpha/2$ quantile of the standard normal distribution." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "d2bcf7253b1378ab67cead3e877bb314", + "grade": false, + "grade_id": "cell-c18026a5fd84fca7", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.10: Inference**\n", + "
{points: 1}\n", + "\n", + "Report the estimated coefficients, their standard errors, and corresponding $p$-values by calling `tidy()` on `Default_binary_log_model`. Include the corresponding asymptotic 95% confidence intervals. \n", + "\n", + "_Store the results in the variable `Default_binary_log_model_results`._" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "be61c0828e85b141f7ee55cccd636b52", + "grade": false, + "grade_id": "cell-ad37611aa165ca7a", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# default_binary_log_model_results <- \n", + "# ... %>%\n", + "# ...(conf.int = TRUE) \n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "default_binary_log_model_results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "92a3208529cd7c7a0d2ccc4fdaf2e45f", + "grade": true, + "grade_id": "cell-38f7c2eea95906c9", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.10()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "1e4787dd502ce65c32c66f1f31c6342b", + "grade": false, + "grade_id": "cell-82749136a082e937", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.11: Inference**\n", + "
{points: 1}\n", + "\n", + "Use `tidy()` to the estimated effect each of the variables has on the *odds* to the `default_binary_log_model_results` tibble. Make sure to also include the confidence interval for these effects." + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "2433e0c3214fcf1f1bf9f295971cdc37", + "grade": false, + "grade_id": "cell-366329ae6e91a284", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# default_binary_odds_model_results <- \n", + "# default_binary_log_model %>%\n", + "# ... %>%\n", + "# mutate_if(is.numeric, round, 6)\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "default_binary_odds_model_results" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "505b346f8721786c9ec10aa064641509", + "grade": true, + "grade_id": "cell-b905bb95b0e06997", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.11()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "ec0ac9a05f3646b0a35d2e9da17edd97", + "grade": false, + "grade_id": "cell-d9141f6447689b01", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.12**\n", + "
{points: 1}\n", + "\n", + "Using a **significance level $\\alpha = 0.05$**, which inputs are statistically associated to the probability of default in `Default_binary_log_model_results`?\n", + "\n", + "**A.** The categorical input `student`.\n", + "\n", + "**B.** The continuous input `balance`.\n", + "\n", + "**C.** The continuous input `income`.\n", + "\n", + "*Assign your answers to the object `answer1.12`. Your answers must be included in a single string indicating the correct options in alphabetical order and surrounded by quotes (e.g., `\"ABC\"` indicates you are selecting the three options).*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "1921a0f4b26d34ce61fe714041abb812", + "grade": false, + "grade_id": "cell-822cb96c53313b4e", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# answer1.12 <- \n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "535f53f696cf6606d87d1581eff0df56", + "grade": true, + "grade_id": "cell-a04e784c42d742dc", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.12()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "9bbf6b488ba2755eb18e28b751f6eb0c", + "grade": false, + "grade_id": "cell-a1792c454ca79341", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "### 1.3 Prediction\n", + "\n", + "Besides inference, we can use an estimated logistic regression model to predict the probability of success. \n", + "\n", + "\\begin{gather*} \n", + "\\log \\bigg( \\frac{\\hat{p}_i}{1 - \\hat{p}_i}\\bigg) = \\hat{\\beta}_0 +\\hat{\\beta}_1 x_{i1} + \\ldots + \\hat{\\beta}_p x_{iq} \\\\\n", + "\\end{gather*}\n", + "\n", + "where lower letters $x$ denote a particular observed value for the $i$th experimental unit.\n", + "\n", + "For example, suppose we want to predict the odds of a student who has a credit card balance of \\\\$2200 and an income of \\\\$35000 to be in default relative to not being in default.\n", + "\n", + "Mathematically, our predicted log odds will be \n", + "\n", + "\\begin{gather*} \n", + "\\log \\bigg( \\frac{\\hat{p}_\\texttt{default}}{1 - \\hat{p}_\\texttt{default}}\\bigg) = \\underbrace{-10.869045}_{\\hat{\\beta}_0} - \\underbrace{0.646776}_{\\hat{\\beta}_1} \\times 1 + \\underbrace{0.005737}_{\\hat{\\beta}_2} \\times 2200 + \\underbrace{0.000003}_{\\hat{\\beta}_2} \\times 35000= 1.21 \\\\\n", + "\\end{gather*}\n", + "\n", + "Next, by taking the exponential on both sides of the equation, we obtain our predicted *odds*: \n", + "\n", + "$$\n", + "\\frac{\\hat{p}_\\texttt{default}}{1 - \\hat{p}_\\texttt{default}} = e^{1.21} = 3.36.\n", + "$$\n", + "\n", + "Finally, solving the above for $\\hat{p}_\\texttt{default}$, we obtain our predicted probability of default\n", + "\n", + "$$\n", + "\\hat{p}_\\texttt{default} = 3.36/4.36 = 0.7706\n", + "$$" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "c8bf378e5da617b077312f6211397a6c", + "grade": false, + "grade_id": "cell-cfbfd19ea63f8152", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.13**\n", + "
{points: 1}\n", + "\n", + "Using `predict` and `default_binary_log_model`, obtain the odds prediction above.\n", + "\n", + "*Hint: Check the argument `type` when coding this prediction.*\n", + "\n", + "*Assign your answer to the object `answer1.13`. Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "14a4a41a0bb7ceaeea980f40bec11227", + "grade": false, + "grade_id": "cell-23f0d706c79c9f1c", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# answer1.13 <- \n", + "# ...(...,\n", + "# tibble(student = ..., balance = ..., income = ...),\n", + "# type = ...) %>%\n", + "# ... # how did you obtain the predicted odds above? \n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "answer1.13" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "dfa62c02c59648c0455073c973242fe7", + "grade": true, + "grade_id": "cell-e8a22ef2e4ac8fe6", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.13()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "9db5228591aa82a6986d938803c6a7a3", + "grade": false, + "grade_id": "cell-f84da54cd4da6b01", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.14**\n", + "
{points: 1}\n", + "\n", + "We can also predict probabilities for classification purposes, i.e., whether the customer will default. Using the function `predict()` with the object `default_binary_log_model`, obtain the estimated probability that a given customer will default. This customer is a `student` who has a credit card `balance `2200` and income of `35000`.\n", + "\n", + "\n", + "*Hint: Check the argument `type` when coding this prediction.*\n", + "\n", + "*Assign your answer to the object `answer1.14`. Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "0b6a9848b2206991d9d3002b6617a6c4", + "grade": false, + "grade_id": "cell-ee0c66cf674510cc", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# answer1.14 <- \n", + "# ...(...,\n", + "# tibble(...),\n", + "# type = ...)\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "answer1.14" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "6c96e6a22f12b6ca559b6781d603b09f", + "grade": true, + "grade_id": "cell-03c8e23b29226318", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.14()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "1631325951b990154c70c0585215fe8d", + "grade": false, + "grade_id": "cell-5d4b54303734b4c1", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Overdispersion**\n", + "\n", + "The variance of a binary response variable is a function of the mean: $p(1-p)$. This means that the estimate of the mean also provides an estimate of the variance of the response. \n", + "\n", + "Since the logistic regression is built assuming that the response is *Bernoulli*, the estimated $\\hat{p}$ conditions the estimated variance of the response to be $\\hat{p}(1-\\hat{p})$. \n", + "\n", + "Unfortunately, in real applications, even in situations where the model seems to be estimating the mean well, the data's variability is not quite compatible with the model's assumed variance. This misspecification in the variance affects the SE of the coefficients, not their estimates.\n", + "\n", + "A way around this problem is to estimate a dispersion parameter, usually called $\\phi$, to correct the standard error of our estimators. An easy implementation is to change the `family` argument to a `quasibinomial`. Let's see an example. " + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "8add326ccea85b3907d01445cdf32c31", + "grade": false, + "grade_id": "cell-5d6361d8c9695383", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "summary(glm(\n", + " formula = default ~ student + balance + income,\n", + " data = Default,\n", + " family = quasibinomial))" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "6289539de2f5b485ee0c293a9e41fa9f", + "grade": false, + "grade_id": "cell-69697cc0a73122ef", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "source": [ + "Note that the estimates haven't changed, but the SE was adjusted to account for some overdispersion in the data.\n", + "\n", + "### 1.4 Conclusions\n", + "\n", + "- The (conditional) expectation of a binary response is the probability of success.\n", + "\n", + "- A LR can not be used to model the conditional expectation of a binary response since its range extends beyond the interval $[0,1]$\n", + "\n", + "- Instead, one can model a function of the conditional probability. A common choice in logistic regression is to use the *logit* function (logarithm of odds)\n", + "\n", + "- The interpretation of the coefficients depends on the type of variables and the form of the model:\n", + "\n", + "The raw coefficients are interpreted as:\n", + "\n", + "- log odds of a reference group\n", + "- difference of log odds of a treatment vs a control group\n", + "- changes in log odds per unit change in the input\n", + " \n", + "The exponentiated coefficients are interpreted as:\n", + "- odds of a reference group\n", + "- odds ratio of a treatment vs a control group\n", + "- multiplicative changes in odds per unit change in the input\n", + " \n", + "- The estimated logistic model can be used to make inference using the Wald's test\n", + "\n", + "- The estimated logistic model can be used to make predictions\n", + " - the probability of success\n", + " - the odds of success relative to failure" + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,Rmd" + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.3.3" + }, + "latex_envs": { + "LaTeX_envs_menu_present": true, + "autoclose": false, + "autocomplete": true, + "bibliofile": "biblio.bib", + "cite_by": "apalike", + "current_citInitial": 1, + "eqLabelWithNumbers": true, + "eqNumInitial": 1, + "hotkeys": { + "equation": "Ctrl-E", + "itemize": "Ctrl-I" + }, + "labels_anchors": false, + "latex_user_defs": false, + "report_style_numbering": false, + "user_envs_cfg": false + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}