diff --git a/materials/tutorial_02/tests_tutorial_02.R b/materials/tutorial_02/tests_tutorial_02.R new file mode 100644 index 0000000..7d0d6ab --- /dev/null +++ b/materials/tutorial_02/tests_tutorial_02.R @@ -0,0 +1,240 @@ +library(digest) +library(testthat) + +# Question 1.0 + +test_1.0 <- function() { + test_that('Did not assign answer to an object called "answer1.0"', { + expect_true(exists("answer1.0")) + }) + + test_that('Solution should be "true" or "false"', { + expect_match(answer1.0, "true|false", ignore.case = TRUE) + }) + + answer_hash <- digest(tolower(answer1.0)) + #if (answer_hash == "") { + # print("HINT_HERE") + #} + + test_that("Solution is incorrect", { + expect_equal(answer_hash, "d2a90307aac5ae8d0ef58e2fe730d38b") + }) + + print("Success! Good start!!") +} + +# Question 1.1 + +test_1.1 <- function() { + test_that('Did not assign answer to an object called "answer1.1"', { + expect_true(exists("answer1.1")) + }) + + test_that('Solution should be "true" or "false"', { + expect_match(answer1.1, "true|false", ignore.case = TRUE) + }) + + answer_hash <- digest(tolower(answer1.1)) + #if (answer_hash == "") { + # print("HINT_HERE") + #} + + test_that("Solution is incorrect", { + expect_equal(answer_hash, "d2a90307aac5ae8d0ef58e2fe730d38b") + }) + + print("Success!") +} + +# Question 2.0 +test_2.0 <- function() { + test_that('Did not assign answer to an object called "crit_pocock_20"', { + expect_true(exists("crit_pocock_20")) + }) + + test_that("Solution should be a vector of length 20", { + expect_equal(digest(length(crit_pocock_20)), "be3c152f6f6bcd5f85f9e4cba49b1e48") + }) + + test_that("Solution is incorrect", { + expect_equal(digest(sum(crit_pocock_20)), "7bb25ec05af390eb51ebe476fa9ef96d") + }) + + print("Success!") +} + +# Question 2.1 +test_2.1 <- function() { + test_that('Did not assign answer to an object called "crit_pocock_10"', { + expect_true(exists("crit_pocock_10")) + }) + + test_that("Solution should be a vector of length 10", { + 
expect_equal(digest(length(crit_pocock_10)), "71db8a6cad03244e6e50f0ad8bc95a65") + }) + + test_that("Solution is incorrect", { + expect_equal(digest(sum(crit_pocock_10)), "87970dc4131c4aa6927d4ec68b3565c7") + }) + + print("Success!") +} + +# Question 2.3 + +test_2.3 <- function() { + test_that('Did not assign answer to an object called "answer2.3"', { + expect_true(exists("answer2.3")) + }) + + test_that('Solution should be a single character ("A", "B", "C", or "D")', { + expect_match(answer2.3, "a|b|c|d", ignore.case = TRUE) + }) + + answer_hash <- digest(tolower(answer2.3)) + + test_that("Solution is incorrect", { + expect_equal(answer_hash, "127a2ec00989b9f7faf671ed470be7f8") + }) + + print("Success!") +} + +# Question 3.0 + +test_3.0 <- function() { + test_that('Did not assign answer to an object called "answer3.0"', { + expect_true(exists("answer3.0")) + }) + + test_that("answer3.0 should be a data frame", { + expect_true("data.frame" %in% class(answer3.0)) + }) + + test_that("answer3.0 does not contain the correct number of columns", { + expect_equal(digest(ncol(answer3.0)), "11946e7a3ed5e1776e81c0f0ecd383d0") + }) + + test_that("answer3.0 does not contain the correct data", { + expect_equal(digest(as.integer(sum(answer3.0$p_value) * 1e5)), "2d6c2c2e53d7ed419407077a10907ca7") + + }) + + print("Success!") +} + +# Question 3.1 +test_3.1 <- function() { + test_that('Did not assign answer to an object called "crit_of_10"', { + expect_true(exists("crit_of_10")) + }) + + test_that("Solution should be a vector of length 10", { + expect_equal(digest(length(crit_of_10)), "71db8a6cad03244e6e50f0ad8bc95a65") + }) + + test_that("Solution is incorrect", { + expect_equal(digest(sum(crit_of_10)), "a99cf80f84dadaf35494cbe5b806032d") + }) + print("Success!") +} +# Question 3.2 + + test_3.2 <- function() { + + test_that('Did not assign answer to an object called "sequential_stat"', { + expect_true(exists("sequential_stat")) + }) + + test_that("Solution should be a ggplot 
object", { + expect_true(is.ggplot(sequential_stat)) + }) + + properties <- c(sequential_stat$layers[[1]]$mapping, sequential_stat$mapping) + + test_that("Plot should have inc_sample_size on the x-axis", { + expect_true("inc_sample_size" == rlang::get_expr(properties$x)) + }) + + test_that("Plot should have Statistic on the y-axis", { + expect_true("statistic" == rlang::get_expr(properties$y)) + }) + + test_that("Plot does not have the correct layers", { + expect_true("GeomLine" %in% class(sequential_stat$layers[[1]]$geom)) + }) + + test_that("x-axis label should be descriptive and human readable", { + expect_false(sequential_stat$labels$x == toString(rlang::get_expr(properties$x))) + }) + + test_that("y-axis label should be descriptive and human readable", { + expect_false(sequential_stat$labels$y == toString(rlang::get_expr(properties$y))) + }) + + test_that("Plot should have a title", { + expect_true("title" %in% names(sequential_stat$labels)) + }) + + print("Success!") + } + + +# Question 3.3 + +test_3.3 <- function() { + test_that('Did not assign answer to an object called "answer3.3"', { + expect_true(exists("answer3.3")) + }) + + test_that('Solution should be "true" or "false"', { + expect_match(answer3.3, "true|false", ignore.case = TRUE) + }) + + answer_hash <- digest(tolower(answer3.3)) + #if (answer_hash == "") { + # print("HINT_HERE") + #} + + test_that("Solution is incorrect", { + expect_equal(answer_hash, "05ca18b596514af73f6880309a21b5dd") + }) + + print("Success! 
Finish strong!!") +} + +# Question 3.4 + +test_3.4 <- function() { + test_that('Did not assign answer to an object called "answer3.4"', { + expect_true(exists("answer3.4")) + }) + + test_that("Solution should be a data frame", { + expect_true("data.frame" %in% class(answer3.4)) + }) + + expected_colnames <- c("n_rejections_OF", "n_rejections_unadj", "expected_n_rejections") + given_colnames <- colnames(answer3.4) + test_that("Data frame does not have the correct number of columns or column names", { + expect_equal(length(setdiff( + union(expected_colnames, given_colnames), + intersect(expected_colnames, given_colnames) + )), 0) + }) + + test_that("Data frame does not contain the correct number of rows", { + expect_equal(digest(as.integer(nrow(answer3.4))), "4b5630ee914e848e8d07221556b0a2fb") + }) + + test_that("Data frame does not contain the correct data", { + expect_equal(digest(as.integer(sum(answer3.4$n_rejections_OF) * 1e3)), "189e2f1b2fbb3743811990e9708c226a") + }) + + test_that("Data frame does not contain the correct data", { + expect_equal(digest(as.integer(sum(answer3.4$n_rejections_unadj) * 1e3)), "7ea55401005f54e88bdc2ce0c9a9ceb1") + }) + + print("Success! 
One more and you are done!") +} diff --git a/materials/tutorial_02/tutorial_02.ipynb b/materials/tutorial_02/tutorial_02.ipynb new file mode 100644 index 0000000..9999c97 --- /dev/null +++ b/materials/tutorial_02/tutorial_02.ipynb @@ -0,0 +1,1259 @@ +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "a7d11e98092cd6ffd0b55bcc09235725", + "grade": false, + "grade_id": "cell-f1e1d845873036f4", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "# Tutorial 2: A/B Testing and principled peeking" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "d3087353c3938a93a70166f52bdaeece", + "grade": false, + "grade_id": "cell-82d9926086d47a80", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "## Learning Objectives\n", + "\n", + "After completing this week's worksheet and tutorial work, you will be able to:\n", + "\n", + "1. Discuss why the methods learned in past courses are not sufficient to answer the more complex research problems being posed in this course (in particular stopping an A/B test early).\n", + "2. Explain sequential testing and principled peeking and how it can be used for early stopping of an experiment (e.g., A/B testing).\n", + "3. Write a computer script to perform A/B testing optimization with and without using principled peeking.\n", + "4. Discuss the tradeoff between stopping earlier and certainty of significance, and the real-world implications (e.g., what does the FDA require for early stopping of clinical trials versus Facebook ads optimization?).\n", + "5. 
List other questions related to A/B testing optimization that may be relevant in a real data application (e.g., what features cause a Facebook ad to perform best?)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "c7b523de4676b23894dba6b6a4710557", + "grade": false, + "grade_id": "cell-a2a153352bc44a68", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Run this cell before continuing.\n", + "library(tidyverse)\n", + "library(gsDesign)\n", + "\n", + "source(\"tests_tutorial_02.R\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "47147eb690a367cac56f019736b247a0", + "grade": false, + "grade_id": "cell-0647a289e8c93c6e", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "## 1. Warm up questions" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "e91859697791998cedfd8d871142490f", + "grade": false, + "grade_id": "cell-016e99af9ac357da", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.0**\n", + "
{points: 1}\n", + "\n", + "Sequential A/B testing is used to analyze only continuous variables. **True or False??**\n", + "\n", + "*Assign your answer to an object called answer1.0. Your answer should be either \"true\" or \"false\", surrounded by quotes.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "9ff78776eec3c949085483c7225bec0a", + "grade": false, + "grade_id": "cell-4f765b97eddc4200", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "#answer1.0 <- \n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "52fd489a854726c1b416ba6f9b73eb37", + "grade": true, + "grade_id": "cell-444b4babc585c9ff", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.0()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "f3eba5dcff43cfca99bf4ca0ba64e349", + "grade": false, + "grade_id": "cell-ade4b5a0e3e3478d", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.1**\n", + "
{points: 1}\n", + "\n", + "When performing sequential A/B testing, a power analysis is not required since the analyst will check the data before collecting all data. **True or False??**\n", + "\n", + "*Assign your answer to an object called answer1.1. Your answer should be either \"true\" or \"false\", surrounded by quotes.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "f82b6b57c12aee4d084e0ae48c6d14f9", + "grade": false, + "grade_id": "cell-734fae2ecfda4719", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "#answer1.1 <- \n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "299f29f594738c9608d92c55aa7a7462", + "grade": true, + "grade_id": "cell-3a3f763f43ee517a", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_1.1()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "5433f06ef0245e398a9866e1ddbe2216", + "grade": false, + "grade_id": "cell-ca338da0504ad817", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 1.2**\n", + "
{points: 1}\n", + "\n", + "In **full sequential designs**, the analyst performs an analysis after every new observation, sequentially. Would you recommend using a Bonferroni correction for **full sequential designs** of large experiments so that it can be stopped as soon as possible??\n", + "\n", + "*Briefly, justify your recommendation. Think of pros and cons of the correction for the experiment described.*" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "dbb0388184f02b3e6209ef88ac6e76b0", + "grade": true, + "grade_id": "cell-d52f0c41674cca3b", + "locked": false, + "points": 1, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "source": [ + "DOUBLE CLICK TO EDIT **THIS CELL** AND REPLACE THIS TEXT WITH YOUR ANSWER." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "f5e05eb95872332dc976640c78821b24", + "grade": false, + "grade_id": "cell-ad493deab495ffd1", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "## 2. Early peeking in A/B testing\n", + "\n", + "In worksheet_02, we studied by means of an A/A testing (i.e., the case where we know that there's no difference in the distributions from the groups) how peeking can inflate the probability of Type I Error. \n", + "\n", + "In addition, we used different methods to implement principled peeking and early stopping rules in sequential A/B testing.\n", + "\n", + "- **Bonferroni's method** provides an adjustment to classical $p$-values (or equivalently the significance level or critical values) to control the type I error rate. 
\n", + "\n", + "- **Pocock's method**, available in `gsDesign` offers a less conservative way of controlling the type I error rate in sequential testing with early stops.\n", + "\n", + "- These 2 methods used uniform boundaries (common critical values) for all interim tests.\n", + "\n", + "In this section, you will examine how the number of interim tests may affect the results and the boundary used. " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "c5fe6a38bdaef0a54c1391366995c5b9", + "grade": false, + "grade_id": "cell-6fbbd797464475f3", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 2.0**\n", + "
{points: 1}\n", + "\n", + "Use the package `gsDesign` to get critical values for the **Pocock's method** for the following experimental design:\n", + "\n", + "- one-sided test to compare two population means (recall that the package will implement $z$-tests which are similar to $t$-tests when sample size is large)\n", + "\n", + "- 20 sequential (interim) tests\n", + "\n", + "- A/A testing design, i.e., effect size = 0 \n", + "\n", + "- a significance level of $5\\%$ \n", + "\n", + "- a power of $80\\%$\n", + "\n", + "Use examples in worksheet_02 to write the appropriate code and get the resulting critical values for the experiment described. \n", + "\n", + "*Assign your final answer to an object called `crit_pocock_20`. Your solution should be a vector with 20 equal values.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "51cad3d3c22ae5df02f30760dc8a27b7", + "grade": false, + "grade_id": "cell-291025e2bc8a15fe", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Your code goes here. 
No skeleton code provided.\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "crit_pocock_20" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "4eb42c8db1b71fe4b5fa51bb70d84290", + "grade": true, + "grade_id": "cell-fadc6b4fa788f892", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_2.0()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "8fd62f3b7cb192116ece3f4fbd1469cd", + "grade": false, + "grade_id": "cell-a52d0000faa9a48e", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 2.1**\n", + "
{points: 1}\n", + "\n", + "Repeat **Question 2.0** but this time get Pocock's critical values for an experimental design that will peek at the data 10 times (i.e., 10 sequential (interim) tests).\n", + "\n", + "*Write the appropriate code and assign your answer to an object called `crit_pocock_10`. Your solution should be a vector with 10 equal values.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "174e1bc568489a7ba9c0d4968677a879", + "grade": false, + "grade_id": "cell-092b0878bb03c233", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Your code goes here. No skeleton code provided.\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "crit_pocock_10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "122e8fc9681252987e5c0bbc5c4686a4", + "grade": true, + "grade_id": "cell-8212cb6ebfa0c18c", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_2.1()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "f4125a8eb510bcbf5112893574ad9ea3", + "grade": false, + "grade_id": "cell-e44e65eabc8fb77e", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 2.2**\n", + "
{points: 1}\n", + "\n", + "The Pocock's critical values obtained in **Question 2.0** and **Question 2.1** show that as the number of peeks (interim tests) increases, the critical values also increase. \n", + "\n", + "Briefly, explain why." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "3e191ec1b2e77f6c9c8d871ea7b0f168", + "grade": true, + "grade_id": "cell-43cb68384dc3a6a4", + "locked": false, + "points": 1, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "source": [ + "DOUBLE CLICK TO EDIT **THIS CELL** AND REPLACE THIS TEXT WITH YOUR ANSWER." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "6ed5030c36828ba7e4cceeb9dfdae211", + "grade": false, + "grade_id": "cell-d3c3fad9bdace6b7", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 2.3**\n", + "
{points: 1}\n", + "\n", + "Suppose that another company decides to use the same test, the same significance level and also to peek 10 times. If the new company wants to increase the probability of finding a significant result when $H_0$ is false (i.e., have more power) and still control the type I error rate, which of the following strategies would you recommend:\n", + "\n", + "**A** Use the same Pocock's critical value as in **Question 2.1** to control the type I error and plan for a larger experiment (i.e., larger sample size)\n", + "\n", + "**B** Don't use the Pocock's method since it's too conservative. Just use raw $p$-values from the CLT sampling distribution.\n", + "\n", + "**C** Use a Bonferroni's correction\n", + "\n", + "**D** There is no way to increase the power of the test\n", + "\n", + "*Hint*: you can use code to design this new experiment\n", + "\n", + "*Assign your answer to an object called answer2.3. Your answer should be one of `\"A\"`, `\"B\"`, `\"C\"`, or `\"D\"` surrounded by quotes.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "677a843ff55b9b0479db413cbfcfae33", + "grade": false, + "grade_id": "cell-523621d2299f5a90", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# answer2.3 <- \n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "b0ecd99816a0d18161fede6ffe170358", + "grade": true, + "grade_id": "cell-b2a0e2c82435b7b0", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_2.3()" + ] + }, + { + "cell_type": "markdown", + "metadata": 
{ + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "c2b747fdd6db95d6cd8fb8e0fa315e8e", + "grade": false, + "grade_id": "cell-793479bb1437c485", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "## 3. Principled peeking: O’Brien-Fleming method\n", + "\n", + "In this section, we will implement and examine another method, also available in `gsDesign`: the **O’Brien-Fleming method**\n", + "\n", + "Unlike previous methods covered in worksheet_02, the O’Brien-Fleming method uses *non-uniform* boundaries, with conservative critical values for earlier interim analyses and less conservative values (closer to the unadjusted critical values) as more data are collected. \n", + "\n", + "As in **Question 3.2.1 of worksheet_02** we will use the function `incremental_t_test` to generate and sequentially analyze data.\n", + "\n", + "In the next few exercises, we will plot the statistics of the sequential tests and add 4 types of boundaries:\n", + "\n", + "- the unadjusted critical values (black line)\n", + "\n", + "- the Bonferroni's adjusted critical values (blue line)\n", + "\n", + "- the Pocock critical values (red line)\n", + "\n", + "- the O'Brien-Fleming critical values (green line)\n", + "\n", + "*Run the following cell to get the function to simulate and analyze data of sequential A/B testing*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "a7c929affeed65d04438d7bde654f39c", + "grade": false, + "grade_id": "cell-10f6fb3783a7c482", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Two-sample t-test with tracking sequential statistic and p-values by incremental sample sizes until getting to n.\n", + "\n", + "# @param n (numeric): Initially planned sample 
size for each group (for simplicity, n needs to be a multiple of sample_increase_step).\n", + "# @param d_0 (numeric): effect size.\n", + "# @param mean_current (numeric): Population mean for control variation.\n", + "# @param sd_current (numeric): Population standard deviation for current variation.\n", + "# @param sd_new (numeric): Population standard deviation for new variation.\n", + "# @param sample_increase_step (numeric): Sample size increment.\n", + "\n", + "# @return p.value.df: A tibble that has 3 columns:\n", + "# inc_sample_size, statistic, and p_value \n", + "\n", + "incremental_t_test <- function(n, d_0, mean_current, sd_current, sd_new, sample_increase_step) {\n", + " sample_current <- rnorm(n, mean = mean_current, sd = sd_current)\n", + " sample_new <- rnorm(n, mean = mean_current + d_0, sd = sd_new)\n", + "\n", + " p.value.df <- tibble(\n", + " inc_sample_size = rep(0, n / sample_increase_step),\n", + " statistic = rep(0, n / sample_increase_step),\n", + " p_value = rep(0, n / sample_increase_step)\n", + " )\n", + "\n", + " current_sample_size <- sample_increase_step\n", + " \n", + " for (i in 1:nrow(p.value.df))\n", + " {\n", + " t_test_results <- t.test(sample_new[1:current_sample_size], sample_current[1:current_sample_size],\n", + " var.equal = TRUE,\n", + " alternative = \"greater\" \n", + " )\n", + " p.value.df[i, \"statistic\"] <- as_tibble(t_test_results$statistic)\n", + " p.value.df[i, \"p_value\"] <- as_tibble(t_test_results$p.value)\n", + " p.value.df[i, \"inc_sample_size\"] <- current_sample_size\n", + " current_sample_size <- current_sample_size + sample_increase_step\n", + " }\n", + "\n", + " return(p.value.df)\n", + "}" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "8de5d221c0db1034486a1379ba66ad86", + "grade": false, + "grade_id": "cell-948ee1a9d21f3c52", + "locked": true, + "schema_version": 3, + "solution": false, + "task": 
false + } + }, + "source": [ + "**Question 3.0**\n", + "{points: 1}\n", + "\n", + "**A/A testing (cont.)**: as in worksheet_02, let's simulate data that reflects no difference in the population means (i.e., $H_0$ is true).\n", + "\n", + "In this question, analyze the data in batches of 100 experimental units per group until a total of $n = 1000$ per group is collected (i.e., plan for 10 sequential tests).\n", + " \n", + "We will assume again that data of both groups is generated from a Normal distribution with a mean equal to \\\\$200 and a standard deviation equal to \\\\$50.\n", + "\n", + "Use the `incremental_t_test` function to conduct this experiment.\n", + "\n", + "*Save the result in an object called answer3.0. Your answer should be a tibble with three columns: `inc_sample_size`, `statistic`, and `p_value`.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "d228b9862d5854ad77fff34f871eb4da", + "grade": false, + "grade_id": "cell-efd291d0304d92e1", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "set.seed(25) # do not change this.\n", + "\n", + "#answer3.0 <- \n", + "# incremental_t_test(n = ..., d_0 = ..., sample_increase_step = ..., mean_current = 200, sd_current = 50, sd_new = 50)\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "answer3.0" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "ecd1815ef2ec38c26519835b766321ca", + "grade": true, + "grade_id": "cell-0c7efb1f62225434", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_3.0()" + ] + }, + { + "cell_type": "markdown", + 
"metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "eef09b5a45f1189846857f09595ea9a0", + "grade": false, + "grade_id": "cell-de29a136e111ae85", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 3.1**\n", + "
{points: 1}\n", + "\n", + "Repeat **Question 2.0** but this time get O'Brien-Fleming's critical values for an experimental design that will peek at the data 10 times (i.e., 10 sequential (interim) tests).\n", + "\n", + "*Write the appropriate code and assign your answer to an object called `crit_of_10`. Your solution should be a vector with 10 numeric values.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "50f566a60d58ebc06f30775fdb55b3ef", + "grade": false, + "grade_id": "cell-da06b64c53195124", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# Get critical values of the O'Brien-Fleming design!\n", + "\n", + "# Your code goes here. No skeleton code provided.\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "crit_of_10" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "d0dac98f0627f8eb4ce9b73186cca231", + "grade": true, + "grade_id": "cell-7577b94a1406d9b2", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "test_3.1()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "ba9f5bd38670e527581cb16364c2696f", + "grade": false, + "grade_id": "cell-2f896d20a801d8e7", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 3.2**\n", + "
{points: 1}\n", + "\n", + "Using the data stored in `answer3.0`, plot the sequence of observed statistics for each interim analysis as a **line** with the incremental sample size on the $x$-axis and the value of the observed statistic on the $y$-axis. \n", + "\n", + "Add 4 dashed lines that indicate the following 4 boundaries (critical values): \n", + "\n", + "- a green line for the OF's critical values\n", + "\n", + "- a red line for the Pocock's critical values\n", + "\n", + "- a blue line for the Bonferroni's critical values\n", + "\n", + "- a black line for the unadjusted critical values\n", + "\n", + "The `ggplot()` object's name will be `sequential_stat`.\n", + "\n", + "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "lines_to_next_cell": 0, + "nbgrader": { + "cell_type": "code", + "checksum": "e102fcbf9704f5b5c73c4e03758de5fa", + "grade": false, + "grade_id": "cell-92328169e3d31d90", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "options(repr.plot.width = 15, repr.plot.height = 9) # Adjust these numbers so the plot looks good in your desktop.\n", + "\n", + "#crit_unadj <- qt(1 - ..., ...)\n", + "#crit_bonferroni <- ...(1 - ..., ...)\n", + "\n", + "#sequential_stat <- \n", + "# answer3.0 %>%\n", + "# ggplot() +\n", + "# geom_line(aes(x = inc_sample_size, y = statistic)) +\n", + "# geom_point(aes(x = ..., y = ...)) +\n", + "# geom_line(aes(x = inc_sample_size, y = crit_of_10),colour = 3, linetype = \"twodash\")+\n", + "# geom_point(aes(x = inc_sample_size, y = ...), colour = 3) +\n", + "# geom_text(x=150, y=crit_of_10[1] + 0.15, size=6, label=\"O'Brien-Fleming\",colour = 3) +\n", + "# geom_hline(yintercept = ..., colour = \"red\", linetype = \"twodash\") +\n", + "# geom_point(aes(x = inc_sample_size, y = ...), 
colour = \"red\") +\n", + "# geom_text(x=150, y=crit_pocock_10 + 0.15, size=6, label=\"Pocock\",colour = \"red\") +\n", + "# geom_hline(yintercept = ..., colour = \"blue\", linetype = \"twodash\") +\n", + "# geom_point(aes(x = inc_sample_size, y = rep(crit_bonferroni, ...)), colour = \"blue\") +\n", + "# geom_text(x=150, y=crit_bonferroni + 0.15, size=6, label=\"Bonferroni\",colour = \"blue\") +\n", + "# geom_hline(yintercept = ..., linetype = \"twodash\") +\n", + "# geom_point(aes(x = inc_sample_size, y = rep(..., ...))) +\n", + "# geom_text(x=150, y=crit_unadj + 0.15, size=6, label=\"Unadjusted\") +\n", + "# theme(\n", + "# text = element_text(size = 18),\n", + "# plot.title = element_text(face = \"bold\"),\n", + "# axis.title = element_text(face = \"bold\")\n", + "# ) +\n", + "# ggtitle(\"Critical values in Sequential Designs\") +\n", + "# ylab(\"Statistic\") +\n", + "# xlab(\"Sample Size\") +\n", + "# coord_cartesian(ylim = c(-1, 6)) +\n", + "# scale_y_continuous(breaks = seq(-1, 6, by = 0.5))\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + "\n", + "sequential_stat" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "43f96d155dfba9089e2ced87c7442d4e", + "grade": true, + "grade_id": "cell-c4ec82e4b947c36a", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "outputs": [], + "source": [ + "test_3.2()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "b07cf76d55a6fc79ae366eec5777ae08", + "grade": false, + "grade_id": "cell-d1f712563b8b6a85", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 3.3**\n", + "
{points: 1}\n", + "\n", + "Suppose that the generated data correspond to the political campaign experiment and that the organizers have decided to monitor the data every 100 visitors per website and stop the experiment earlier if there's evidence of a difference between the group means. \n", + "\n", + "According to the data plotted in **Question 3.2**, is the following statement **True or False**? \n", + "\n", + "> The campaign organizers will not erroneously stop the experiment if they compare the observed statistics with any of the boundaries that control the type I error rate\n", + "\n", + "*Assign your answer to an object called `answer3.3`. Your answer should be either `\"true\"` or `\"false\"`, surrounded by quotes.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "1773fcca9f2574e4683a56f43ed30091", + "grade": false, + "grade_id": "cell-cbdb916890270418", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "# answer3.3 <- \n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "f4db2be48400f9ff193671ffc3a5399a", + "grade": true, + "grade_id": "cell-c68fa6f64391c4e2", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_3.3()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "ffec72c72273779859b20f6b90dfe76d", + "grade": false, + "grade_id": "cell-df816ff9bb116e3f", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + 
"source": [ + "**Question 3.4**\n", + "\n", + "To examine how the O'Brien-Fleming (OF) controls the type I error rate, the campaign organizers decided to: \n", + "\n", + "- perform the **A/A testing** experiment 100 times \n", + "\n", + "- count how many times they would wrongly reject $H_0$ with their strategy, and\n", + "\n", + "- compare it with the expected number of rejections given the significance level $\\alpha = 0.05$\n", + "\n", + "Use the code below to run 100 experiments and then estimate the type I error rate for the OF method.\n", + "\n", + "Your answer will be a tibble with three columns: `n_rejections_OF`, `n_rejections_unadj`, and `expected_n_rejections`.\n", + "\n", + "These columns should contain: the number of wrong rejections among the 100 experiments for the OF and the classical methods (unadjusted), respectively, compared to the expected number of wrong rejections given the design.\n", + "\n", + "*Fill out those parts indicated with `...`, uncomment the corresponding code in the cell below, and run it.*" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "c2b2accd22b5ef863b62cd5e4bda74bc", + "grade": false, + "grade_id": "cell-80ce1a82c6388796", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "set.seed(120)\n", + "\n", + "### Run this before continuing\n", + "multiple_times_sequential_tests <- \n", + " tibble(experiment = 1:100) %>% \n", + " mutate(seq_test = map(.x = experiment, \n", + " .f = function(x) incremental_t_test(n = 1000, d_0 = 0, sample_increase_step = 100, \n", + " mean_current = 200, sd_current = 50, sd_new = 50)))" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "a074b2f1c8d97f27cf479995b395d19c", + "grade": false, + 
"grade_id": "cell-7e6502a2334285d4", + "locked": false, + "schema_version": 3, + "solution": true, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "#answer3.4 <- multiple_times_sequential_tests %>% \n", + "# mutate(reject_of = map_dbl(.x = seq_test, .f = function(x) sum(... ... ...) > 0),\n", + "# reject_unadj = map_dbl(.x = seq_test, .f = function(x) sum(... ... ...) >0)) %>% \n", + "# summarise(n_rejections_OF = ...(reject_of),\n", + "# n_rejections_unadj = ...(reject_unadj),\n", + "# expected_n_rejections = ...)\n", + "\n", + "# your code here\n", + "fail() # No Answer - remove if you provide an answer\n", + " \n", + "answer3.4" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "code", + "checksum": "9775ac1cfd45aa58410f6384414a1a47", + "grade": true, + "grade_id": "cell-065f7cddd3d93051", + "locked": true, + "points": 1, + "schema_version": 3, + "solution": false, + "task": false + }, + "tags": [] + }, + "outputs": [], + "source": [ + "test_3.4()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "editable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "6bd9e0ba95e40f6dcde8946857abc5ec", + "grade": false, + "grade_id": "cell-027ecb0d6c9f11ac", + "locked": true, + "schema_version": 3, + "solution": false, + "task": false + } + }, + "source": [ + "**Question 3.5**\n", + "
{points: 1}\n", + "\n", + "Explain briefly the results obtained in **Question 3.4**.\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "deletable": false, + "nbgrader": { + "cell_type": "markdown", + "checksum": "d20eacec1cd0c4a33592742545622c35", + "grade": true, + "grade_id": "cell-b9d8c96418c97fb1", + "locked": false, + "points": 1, + "schema_version": 3, + "solution": true, + "task": false + } + }, + "source": [ + "DOUBLE CLICK TO EDIT **THIS CELL** AND REPLACE THIS TEXT WITH YOUR ANSWER." + ] + } + ], + "metadata": { + "jupytext": { + "formats": "ipynb,Rmd" + }, + "kernelspec": { + "display_name": "R", + "language": "R", + "name": "ir" + }, + "language_info": { + "codemirror_mode": "r", + "file_extension": ".r", + "mimetype": "text/x-r-source", + "name": "R", + "pygments_lexer": "r", + "version": "4.2.3" + } + }, + "nbformat": 4, + "nbformat_minor": 4 +}