diff --git a/.gitattributes b/.gitattributes new file mode 100644 index 0000000..3173dbf --- /dev/null +++ b/.gitattributes @@ -0,0 +1 @@ +*.rds filter=lfs diff=lfs merge=lfs -text diff --git a/.github/workflows/lint-project.yaml b/.github/workflows/lint-project.yaml new file mode 100644 index 0000000..0236b03 --- /dev/null +++ b/.github/workflows/lint-project.yaml @@ -0,0 +1,28 @@ +name: Lint project + +on: + push: + +jobs: + lint-project: + runs-on: ubuntu-latest + steps: + - name: 🛎 Check out repo + uses: actions/checkout@v3 + with: + submodules: recursive + + - name: 🏗️ Setup R + uses: r-lib/actions/setup-r@v2 + with: + use-public-rspm: true + + - name: 🛠️ Install lintr + run: install.packages("lintr") + shell: Rscript {0} + + - name: ✨ Lint root directory + run: lintr::lint_dir() + shell: Rscript {0} + env: + LINTR_ERROR_ON_LINT: true diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml index 0f7d07a..1da1a68 100644 --- a/.github/workflows/test-build.yml +++ b/.github/workflows/test-build.yml @@ -2,38 +2,30 @@ name: Test build on: push: - branches: - - update-theme jobs: - build: - + test-build: runs-on: ubuntu-latest - env: HUGO_VERSION: 0.116.1 - steps: - name: 🛎 Check out repo uses: actions/checkout@v3 with: - ref: update-theme - fetch-depth: 1 submodules: recursive + - name: Use Node.js uses: actions/setup-node@master with: node-version: 10.x + - run: npm i -g postcss postcss-cli autoprefixer + - name: 🤵 Install Hugo run: | wget -O ${{ runner.temp }}/hugo.deb https://github.com/gohugoio/hugo/releases/download/v${HUGO_VERSION}/hugo_extended_${HUGO_VERSION}_linux-amd64.deb \ && sudo dpkg -i ${{ runner.temp }}/hugo.deb - - name: 🧹 Clean site - run: | - if [ -d "public" ]; then - rm -rf public/* - fi + - name: 🍳 Build site run: | hugo --gc --minify diff --git a/.lintr b/.lintr new file mode 100644 index 0000000..cde0b3b --- /dev/null +++ b/.lintr @@ -0,0 +1,10 @@ +linters: linters_with_defaults( + line_length_linter(80), + commented_code_linter = NULL + ) +exclusions: list( + "content/blog/2021-easy-map-norway/save_splmaps_data.R", + "content/blog/2022-dutch-performance-olympic-speed-skating/index.qmd", + "content/blog/2023-sunrise-sunset-differences/index.qmd", + "renv/" + ) diff --git a/config.toml b/config.toml index 4554e82..b0a9f49 100644 --- a/config.toml +++ b/config.toml @@ -16,6 +16,8 @@ ignoreFiles = ["\\.qmd$", "\\.ipynb$", "\\.Rmd$", "\\.Rmarkdown$", "_cache$", "i favicon = "avatar.png" + highlightstyle = "foundation" + iconsource = "simple-icons" description = "Daniel Roelfs' personal website" @@ -23,7 +25,7 @@ ignoreFiles = ["\\.qmd$", "\\.ipynb$", "\\.Rmd$", "\\.Rmarkdown$", "_cache$", "i customcss = ["custom_style.css"] - umami = true + umami = false umamilink = "https://analytics-danielroelfs.netlify.app/script.js" umamiid = "da48a88a-2e87-4024-8c99-639222aab54d" diff --git a/content/blog/2019-analyzing-bach/data/places.rds b/content/blog/2019-analyzing-bach/data/places.rds new file mode 100644 index 0000000..e40f78c --- /dev/null +++ b/content/blog/2019-analyzing-bach/data/places.rds @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d7b7287171aca46c42fea4efeb4f2daafa8e4f780e8a98f7b0841135e87f473 +size 799 diff --git a/content/blog/2019-analyzing-bach/data/scraped_data.rds b/content/blog/2019-analyzing-bach/data/scraped_data.rds new file mode 100644 index 0000000..87174d1 --- /dev/null +++ b/content/blog/2019-analyzing-bach/data/scraped_data.rds @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid 
sha256:e5e2e8fcb6bcb6b0328671bc7f4a11d45b4cf80bf2d3e8336787bf28f42465ff +size 324641 diff --git a/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/colplot-key-cat1-1.png b/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/colplot-key-cat1-1.png index c221510..afa7c38 100644 Binary files a/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/colplot-key-cat1-1.png and b/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/colplot-key-cat1-1.png differ diff --git a/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/lollipop-cat1-1.png b/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/lollipop-cat1-1.png index 9f195c9..23a7975 100644 Binary files a/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/lollipop-cat1-1.png and b/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/lollipop-cat1-1.png differ diff --git a/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/tileplot-key-cat2-1.png b/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/tileplot-key-cat2-1.png index b0e3482..bc84f91 100644 Binary files a/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/tileplot-key-cat2-1.png and b/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/tileplot-key-cat2-1.png differ diff --git a/content/blog/2019-analyzing-bach/index.md b/content/blog/2019-analyzing-bach/index.md index d7eb155..68cba11 100644 --- a/content/blog/2019-analyzing-bach/index.md +++ b/content/blog/2019-analyzing-bach/index.md @@ -14,6 +14,8 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- - ## Introduction A little while ago I was watching a documentary series on Dutch television about one of the most important composers in the Netherlands: Johann Sebastian Bach. The documentary discussed parts of Bach's life and the music he wrote during it. This documentary inspired me to learn more about the pieces Bach wrote, and since this is my life now, I might as well do it in R. - - ## Collecting the data In my search of a dataset, I found this old website, looking like it last got a major update in 2003, made by what appeared to be a Bach enthousiast called Bryen Travis. I tried to contact him to see if he was interested in collaborating, but I couldn't get a hold of him. This website contained a list of all works by Bach, with some information about each of them. The website listed a few options, an online browser, a pdf containing slimmed down entries, and two files that I couldn't get to work, presumably because they were in a format that have presumably become depracated since they were uploaded back in 1996, when the website was created. I could have used the pdf, but since it contained only the BWV number, the title of the piece, and the setting, I had to scrape the data I wanted from the website directly. 
@@ -60,7 +59,7 @@ Let's collect the BWVs: ``` r index_url <- "http://www.bachcentral.com/BWV/index.html" -BWVs <- paste(readLines(index_url), collapse = "\n") |> +bwvs <- paste(readLines(index_url), collapse = "\n") |> str_match_all(" unlist() |> unique() |> @@ -69,7 +68,7 @@ BWVs <- paste(readLines(index_url), collapse = "\n") |> unique() |> str_extract_all("[0-9]+") |> as.character() -str(BWVs) +str(bwvs) ``` chr [1:1075] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12" "13" "14" ... @@ -77,13 +76,15 @@ str(BWVs) I now have a list of 1075 numbers, corresponding to BWV numbers of which this website has an entry. The next thing I needed to do now was loop over these numbers, navigate to each of the webpages corresponding to that link, and scrape all 1075 webpages individually. This takes a little bit of time, but luckily the author of the pages was very consistent in the setup, so I didn't need to build in fail-safes or condititions to account for irregularities. Before we'll do that, I first initialized a data frame with the correct size for speed. I added the column names for convenience too. ``` r -col_names <- c("Title", "Subtitle_and_notes", - "BWV", "BWV_epifix", "CLC_BWV_w_epifix", "belongs after", - "voices_instruments", - "category1", "category2", "category3", - "cantate_cat1", "cantate_cat2") +col_names <- c( + "Title", "Subtitle_and_notes", + "BWV", "BWV_epifix", "CLC_BWV_w_epifix", "belongs after", + "voices_instruments", + "category1", "category2", "category3", + "cantate_cat1", "cantate_cat2" +) -scraped_data <- data.frame(matrix(ncol = 12, nrow = length(BWVs))) +scraped_data <- data.frame(matrix(ncol = 12, nrow = length(bwvs))) colnames(scraped_data) <- col_names ``` @@ -92,46 +93,48 @@ colnames(scraped_data) <- col_names We now have a variable of the same dimensions and column names as I'll scrape from the website. Now it's time to loop through the webpages and collect the data. Each webpage contains what looks like a table, but within the table it's bulleted lists (denoted as `ul`). This might just be how html builds up tables, but I thought it was a bit strange. Nonetheless, it provided me with an easy hook to grab the values. I remove all the white spaces (`\t`), and each new line was denoted by a `\n`, which I used to split the strings into separate values. The advantage of this approach is that when a field is empty, it will still occupy an element in the character array. Then all I needed to do to obtain the values is take every second element and add it as a row to the data frame I created earlier. Now I have a dataset containing the values from all of the 1075 webpages, with which I was quite pleased. ``` r -for (i in 1:length(BWVs)) { - - print(sprintf("Scraping data for BWV %s", BWVs[i])) - - url <- sprintf("http://www.bachcentral.com/BWV/%s.html", BWVs[i]) +for (i in seq_along(length(bwvs))) { + print(sprintf("Scraping data for BWV %s", bwvs[i])) + + url <- sprintf("http://www.bachcentral.com/BWV/%s.html", bwvs[i]) webpage <- read_html(url) - + text <- webpage |> html_nodes("ul") |> html_text() |> - gsub('[\t]', '', .) |> + gsub("[\t]", "", .) |> strsplit(., "\\n") |> unlist() - - values <- text[seq(2,length(text),2)] - + + values <- text[seq(2, length(text), 2)] + scraped_data[i, ] <- values - } + +scraped_data <- scraped_data |> + janitor::clean_names() ``` With this, I achieved the first goal I had for this little project, which was to find or create a dataset on Bach. 
Let's see what it looks like: ``` r -str(scraped_data) +glimpse(scraped_data) ``` - 'data.frame': 1075 obs. of 12 variables: - $ Title : chr "Wie schön leuchtet der Morgenstern" "Ach Gott, von Himmel sieh darein" "Ach Gott, wie manches Herzeleid" "Christ lag in Todes Banden" ... - $ Subtitle_and_notes: chr "Kantate am Fest Mariae Verkündigung (Festo annuntiationis Mariae)" "Kantate am zweiten Sonntag nach Trinitatis (Dominica 2 post Trinitatis)" "Kantate am zweiten Sonntag nach Epiphanias (Dominica 2 post Epiphanias)" "Kantate am Osterfest (Feria Paschatos)" ... - $ BWV : chr "1" "2" "3" "4" ... - $ BWV_epifix : chr "" "" "" "" ... - $ CLC_BWV_w_epifix : chr "1" "2" "3" "4" ... - $ belongs after : chr "" "" "" "" ... - $ voices_instruments: chr "Soli: S, T, B. Chor: S, A, T, B. Instr.: Corno I, II; Ob. da caccia I, II; Viol. conc. I, II; Viol. rip. I, II; Vla.; Cont." "Soli: A, T, B. Chor: S, A, T, B. Instr.: Tromb. I - IV; Ob. I, II; Viol. I, II; Vla.; Cont." "Soli: S, A, T, B. Chor: S, A, T, B. Instr.: Corno; Tromb.; Ob. d'amore I, II; Viol. I, II; Vla.; Cont." "Soli: S, A, T, B. Chor: S, A, T, B. Instr.: Cornetto; Tromb. I, II, III; Viol. I, II; Vla. I, II; Cont." ... - $ category1 : chr "Vokalwerke" "Vokalwerke" "Vokalwerke" "Vokalwerke" ... - $ category2 : chr "Kantaten" "Kantaten" "Kantaten" "Kantaten" ... - $ category3 : chr "" "" "" "" ... - $ cantate_cat1 : chr "A. Geistliche Kantaten an der Sonn- und Festtagen des Kirchenjahres" "A. Geistliche Kantaten an der Sonn- und Festtagen des Kirchenjahres" "A. Geistliche Kantaten an der Sonn- und Festtagen des Kirchenjahres" "A. Geistliche Kantaten an der Sonn- und Festtagen des Kirchenjahres" ... - $ cantate_cat2 : chr "Mariae Verkundigung" "2. Sonntag nach Trinitatis" "2. Sonntag nach Epiphanias" "1. Osterfesttag" ... + Rows: 1,075 + Columns: 12 + $ title "Wie schön leuchtet der Morgenstern", "Ach Gott, vo… + $ subtitle_and_notes "Kantate am Fest Mariae Verkündigung (Festo annunti… + $ bwv "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", … + $ bwv_epifix "", "", "", "", "", "", "", "", "", "", "", "", "",… + $ clc_bwv_w_epifix "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", … + $ belongs_after "", "", "", "", "", "", "", "", "", "", "249", "", … + $ voices_instruments "Soli: S, T, B. Chor: S, A, T, B. Instr.: Corno I, … + $ category1 "Vokalwerke", "Vokalwerke", "Vokalwerke", "Vokalwer… + $ category2 "Kantaten", "Kantaten", "Kantaten", "Kantaten", "Ka… + $ category3 "", "", "", "", "", "", "", "", "", "", "", "", "",… + $ cantate_cat1 "A. Geistliche Kantaten an der Sonn- und Festtagen … + $ cantate_cat2 "Mariae Verkundigung", "2. Sonntag nach Trinitatis"… All columns are currently character arrays, and this is appropriate for most of them. Although I think the BWV number alone could be a numeric array. Also, some columns are still a bit awkward. This is why I moved on to do another important and satisfying part, cleaning the data. 
@@ -141,30 +144,33 @@ The character arrays are appropriate, but for further analyes I'd prefer to make ``` r data <- scraped_data |> - mutate(BWV = as.numeric(BWV), - category1 = factor(category1), - category2 = factor(category2), - category3 = factor(category3), - cantate_cat1 = substring(cantate_cat1,4), - CLC_BWV_w_epifix = str_replace(CLC_BWV_w_epifix, " ", "")) |> - rename(BWV_w_epifix = CLC_BWV_w_epifix) |> - select(BWV, BWV_epifix, BWV_w_epifix, everything()) -str(data) + mutate( + bwv = as.numeric(bwv), + category1 = factor(category1), + category2 = factor(category2), + category3 = factor(category3), + cantate_cat1 = substring(cantate_cat1, 4), + clc_bwv_w_epifix = str_squish(clc_bwv_w_epifix) + ) |> + rename(bwv_w_epifix = clc_bwv_w_epifix) |> + select(bwv, bwv_epifix, bwv_w_epifix, everything()) |> + glimpse() ``` - 'data.frame': 1075 obs. of 12 variables: - $ BWV : num 1 2 3 4 5 6 7 8 9 10 ... - $ BWV_epifix : chr "" "" "" "" ... - $ BWV_w_epifix : chr "1" "2" "3" "4" ... - $ Title : chr "Wie schön leuchtet der Morgenstern" "Ach Gott, von Himmel sieh darein" "Ach Gott, wie manches Herzeleid" "Christ lag in Todes Banden" ... - $ Subtitle_and_notes: chr "Kantate am Fest Mariae Verkündigung (Festo annuntiationis Mariae)" "Kantate am zweiten Sonntag nach Trinitatis (Dominica 2 post Trinitatis)" "Kantate am zweiten Sonntag nach Epiphanias (Dominica 2 post Epiphanias)" "Kantate am Osterfest (Feria Paschatos)" ... - $ belongs after : chr "" "" "" "" ... - $ voices_instruments: chr "Soli: S, T, B. Chor: S, A, T, B. Instr.: Corno I, II; Ob. da caccia I, II; Viol. conc. I, II; Viol. rip. I, II; Vla.; Cont." "Soli: A, T, B. Chor: S, A, T, B. Instr.: Tromb. I - IV; Ob. I, II; Viol. I, II; Vla.; Cont." "Soli: S, A, T, B. Chor: S, A, T, B. Instr.: Corno; Tromb.; Ob. d'amore I, II; Viol. I, II; Vla.; Cont." "Soli: S, A, T, B. Chor: S, A, T, B. Instr.: Cornetto; Tromb. I, II, III; Viol. I, II; Vla. I, II; Cont." ... - $ category1 : Factor w/ 2 levels "Instrumentalwerke",..: 2 2 2 2 2 2 2 2 2 2 ... - $ category2 : Factor w/ 15 levels "Kammermusik",..: 3 3 3 3 3 3 3 3 3 3 ... - $ category3 : Factor w/ 26 levels "","Choralbearbeitungen",..: 1 1 1 1 1 1 1 1 1 1 ... - $ cantate_cat1 : chr "Geistliche Kantaten an der Sonn- und Festtagen des Kirchenjahres" "Geistliche Kantaten an der Sonn- und Festtagen des Kirchenjahres" "Geistliche Kantaten an der Sonn- und Festtagen des Kirchenjahres" "Geistliche Kantaten an der Sonn- und Festtagen des Kirchenjahres" ... - $ cantate_cat2 : chr "Mariae Verkundigung" "2. Sonntag nach Trinitatis" "2. Sonntag nach Epiphanias" "1. Osterfesttag" ... + Rows: 1,075 + Columns: 12 + $ bwv 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 16, … + $ bwv_epifix "", "", "", "", "", "", "", "", "", "", "", "", "",… + $ bwv_w_epifix "1", "2", "3", "4", "5", "6", "7", "8", "9", "10", … + $ title "Wie schön leuchtet der Morgenstern", "Ach Gott, vo… + $ subtitle_and_notes "Kantate am Fest Mariae Verkündigung (Festo annunti… + $ belongs_after "", "", "", "", "", "", "", "", "", "", "249", "", … + $ voices_instruments "Soli: S, T, B. Chor: S, A, T, B. Instr.: Corno I, … + $ category1 Vokalwerke, Vokalwerke, Vokalwerke, Vokalwerke, Vok… + $ category2 "Kantaten", "Kantaten", "Kantaten", "Kantaten", "Ka… + $ category3 "", "", "", "", "", "", "", "", "", "", "", "", "",… + $ cantate_cat1 "Geistliche Kantaten an der Sonn- und Festtagen des… + $ cantate_cat2 "Mariae Verkundigung", "2. Sonntag nach Trinitatis"… Now we have this data, we can do some descriptive visualizations of the data. 
Over time I hope I can dive into the setting (`voices_instruments`) and disect that, but for now I'll keep it simple and just do some descriptives. @@ -177,19 +183,27 @@ data |> group_by(category1) |> summarise(n = n()) |> ggplot(aes(x = category1, y = n, color = category1)) + - geom_segment(aes(xend = category1, yend = 0), size = 12, - color = "grey40", alpha = 0.9) + - geom_point(size = 20, shape = 16) + - geom_text(aes(label = n, y = n + 5), color = "white", size = 5, family = "Alegreya") + - labs(x = NULL, - y = "Number of compositions") + + geom_segment(aes(xend = category1, yend = 0), + linewidth = 12, + color = "grey40", alpha = 0.9 + ) + + geom_point(size = 20, shape = 16) + + geom_text(aes(label = n, y = n + 5), + color = "white", size = 5, family = "Alegreya" + ) + + geom_hline(yintercept = 0, size = 1) + + labs( + x = NULL, + y = "Number of compositions" + ) + scale_color_daniel(palette = "staalmeesters") + - scale_y_continuous(limits = c(0,650)) + + scale_y_continuous(limits = c(0, 650)) + theme_bach(base_family = "Alegreya", grid = "y") + theme( legend.position = "none", - aspect.ratio = 3/2, - axis.text.x = element_text(angle = 45, hjust = 1.1, vjust = 1.15)) + aspect.ratio = 3 / 2, + axis.text.x = element_text(angle = 45, hjust = 1.1, vjust = 1.15) + ) ``` Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0. @@ -203,20 +217,26 @@ It seems Bach didn't have a strong preference for either instrumental or choral data |> group_by(category2) |> summarise(n = n()) |> - ggplot(aes(x = reorder(category2,-n), y = n, color = category2)) + - geom_segment(aes(xend = category2, yend = 0), size = 6, - color = "grey40", alpha = 0.9) + - geom_point(size = 10, shape = 16) + - geom_text(aes(label = n, y = n + 3), color = "white", size = 4, family = "Alegreya") + - labs(x = NULL, - y = "Number of compositions") + + ggplot(aes(x = reorder(category2, -n), y = n, color = category2)) + + geom_segment(aes(xend = category2, yend = 0), + size = 6, + color = "grey40", alpha = 0.9 + ) + + geom_point(size = 10, shape = 16) + + geom_text(aes(label = n, y = n + 3), + color = "white", size = 4, family = "Alegreya" + ) + + labs( + x = NULL, + y = "Number of compositions" + ) + scale_color_daniel(palette = "staalmeesters") + - scale_y_continuous(limits = c(-5,300)) + + scale_y_continuous(limits = c(-5, 300)) + theme_bach(base_family = "Alegreya", grid = "y") + theme( legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1.05, vjust = 1.025) - ) + ) ``` @@ -229,7 +249,10 @@ data |> summarise(n = n()) |> mutate(category2 = sprintf("%s (%s)", category2, n)) |> ggplot(aes(fill = category2, values = n)) + - geom_waffle(n_rows = 17, size = 0.2, colour = "#FFFFFC", flip = FALSE, na.rm = TRUE) + + geom_waffle( + n_rows = 17, size = 0.2, colour = "#FFFFFC", + flip = FALSE, na.rm = TRUE + ) + scale_fill_daniel(palette = "staalmeesters", name = NULL) + coord_equal() + theme_bach(base_family = "Alegreya", base_size = 14) + @@ -246,40 +269,48 @@ data |> The dataset I just created was comprehensive and clean, but it didn't contain any information about the musical properties other than the setting. I want to dive into the setting later, but it's going to be a regular expression hell. I might come back to that later. In the meantime, Wikipedia has a page listing the compositions by Bach too (because of course Wikipedia does). This page contains approximate dates on each composition, as well as the key it was written in. I'm going to scrape this webpage too. 
The setup of this page was somewhat simpler, so scraping it was slightly simpler. ``` r -url <- "https://en.wikipedia.org/wiki/List_of_compositions_by_Johann_Sebastian_Bach" +website <- "https://en.wikipedia.org" +article <- "wiki/List_of_compositions_by_Johann_Sebastian_Bach" +url <- str_glue("{website}/{article}") webpage <- read_html(url) wikitext <- webpage |> - html_nodes(xpath='//*[@id="TOP"]') |> + html_nodes(xpath = '//*[@id="TOP"]') |> html_table(fill = TRUE) |> - as.data.frame() + as.data.frame() |> + janitor::clean_names() ``` Then I cleaned the data somewhat and extracted the number from the BWV columns. ``` r wikidata <- wikitext |> - rename(BWV_full = BWV) |> - mutate(BWV = sub('.*(\\d{3}).*', '\\1', BWV_full), - BWV = parse_integer(BWV)) |> - filter(!rev(duplicated(rev(BWV)))) + rename(bwv_full = bwv) |> + mutate( + bwv = sub(".*(\\d{3}).*", "\\1", bwv_full), + bwv = parse_integer(bwv) + ) |> + filter(!rev(duplicated(rev(bwv)))) ``` Then I merged the data. I lost a number of compositions in the process, but I was okay with it, mostly because it took me too much time and effort to try and fix it. Hurray for laziness. I extracted the year from the slightly messy `Date` column. Some strings in this column contain two dates, one for the first compilation and one for the date of completion. I extracted the last number, the year of completion. ``` r -merged_data <- merge(data, wikidata, by = "BWV") |> - mutate(year = sub(".*(\\d{4}).*", "\\1", Date), - year = as.numeric(year), - age = year - 1685) +merged_data <- data |> + inner_join(wikidata, by = "bwv") |> + mutate( + year = sub(".*(\\d{4}).*", "\\1", date), + year = as.numeric(year), + age = year - 1685 + ) ``` I noticed that some entries in the `year` column exceeded the year of Bach's death. Bach died in 1750. I assumed that these years indicated the year of first performance or publication. In this scenario, it would be possible that certain pieces were lost and rediscovered at a later date and only then published. I thought to make a barplot of the number of compositions Bach over the course of his life, trying to see if there was any period where he was particularly productive. I also added some annotations to give the plot some context. 
``` r -BWVperyear <- merged_data |> +bwvperyear <- merged_data |> filter(!is.na(year)) |> - group_by(year,age) |> + group_by(year, age) |> summarise(n = n()) palette <- daniel_pal("staalmeesters")(6) @@ -300,34 +331,66 @@ annotation2 <- data.frame( text = "italic(Schemellis~Gesangbuch)~comes~out" ) -ggplot(BWVperyear, aes(x = year, y = n, fill = n)) + +ggplot(bwvperyear, aes(x = year, y = n, fill = n)) + geom_bar(stat = "identity") + - geom_vline(xintercept = c(1750.5,1685), linetype = "dashed", color = "grey30") + - geom_text(data = data.frame(), mapping = aes(x = 1685.5, y = 75, label = "Bach was born\nMarch 31st 1685\nin Eisenach"), - inherit.aes = FALSE, family = "Alegreya", hjust = 0) + - geom_text(data = data.frame(), mapping = aes(x = 1750, y = 75, label = "Bach died\nJuly 28th 1750\nin Leipzich"), - inherit.aes = FALSE, family = "Alegreya", hjust = 1) + - geom_curve(data = annotation1, mapping = aes(x = x, y = y, xend = xend, yend = yend), - arrow = arrow(angle = 25, length = unit(6,"pt")), - curvature = 0.1, inherit.aes = FALSE) + - geom_text(data = annotation1, mapping = aes(x = x, y = y + 4, label = text), - family = "Alegreya", inherit.aes = FALSE, parse = FALSE) + - geom_curve(data = annotation2, mapping = aes(x = x, y = y, xend = xend, yend = yend), - arrow = arrow(angle = 25, length = unit(6,"pt")), - curvature = 0.2, inherit.aes = FALSE) + - geom_text(data = annotation2, mapping = aes(x = x, y = y + 3, label = text), - family = "Alegreya", inherit.aes = FALSE, parse = TRUE) + - labs(x = "Year", - y = "Number of compositions") + - scale_x_continuous(limits = c(1685,1751), breaks = seq(1690,1750,10)) + + geom_vline( + xintercept = c(1750.5, 1685), + linetype = "dashed", color = "grey30" + ) + + geom_text( + data = data.frame(), + mapping = aes( + x = 1685.5, y = 75, + label = "Bach was born\nMarch 31st 1685\nin Eisenach" + ), + inherit.aes = FALSE, family = "Alegreya", hjust = 0 + ) + + geom_text( + data = data.frame(), + mapping = aes( + x = 1750, y = 75, + label = "Bach died\nJuly 28th 1750\nin Leipzich" + ), + inherit.aes = FALSE, family = "Alegreya", hjust = 1 + ) + + geom_curve( + data = annotation1, + mapping = aes(x = x, y = y, xend = xend, yend = yend), + arrow = arrow(angle = 25, length = unit(6, "pt")), + curvature = 0.1, inherit.aes = FALSE + ) + + geom_text( + data = annotation1, + mapping = aes(x = x, y = y + 4, label = text), + family = "Alegreya", inherit.aes = FALSE, parse = FALSE + ) + + geom_curve( + data = annotation2, + mapping = aes(x = x, y = y, xend = xend, yend = yend), + arrow = arrow(angle = 25, length = unit(6, "pt")), + curvature = 0.2, inherit.aes = FALSE + ) + + geom_text( + data = annotation2, + mapping = aes(x = x, y = y + 3, label = text), + family = "Alegreya", inherit.aes = FALSE, parse = TRUE + ) + + labs( + x = "Year", + y = "Number of compositions" + ) + + scale_x_continuous( + limits = c(1685, 1751), + breaks = seq(1690, 1750, 10) + ) + scale_fill_gradient(low = palette[6], high = palette[2]) + theme_bach(base_family = "Alegreya", grid = "y") + theme( - legend.position = 'none' - ) + legend.position = "none" + ) ``` - + It seems there were two particularly productive years. But since the year column likely indicates year of publication, it's perhaps more likely that Bach published a collection of pieces in a batch. This is certainly true for the year 1736, when the *Schemellis Gesangbuch* came out, a song book with sacred music, written together with a student of the school he was cantor at: the *Thomasschule* in Leipzich. 
I'll come back to this plot later. @@ -335,28 +398,24 @@ The Wikipedia page also contained information on the key of most of the composit ``` r summ_key_cat1 <- merged_data |> - group_by(category1, Key) |> + group_by(category1, key) |> summarise(n = n()) |> - filter(nchar(Key) > 1) |> - mutate(Key = sub("\u266D", "b", Key), - Key = sub("\U266F", "#", Key), - Key = ifelse(nchar(Key) > 10, substring(Key,1,nchar(Key)/2), Key) + filter(nchar(key) > 1) |> + mutate( + Key = sub("\u266D", "b", key), + Key = sub("\U266F", "#", key), + Key = ifelse(nchar(key) > 10, substring(key, 1, nchar(key) / 2), key) ) |> - group_by(category1, Key) |> + group_by(category1, key) |> summarise(n = sum(n)) -``` - - `summarise()` has grouped output by 'category1'. You can override using the - `.groups` argument. - `summarise()` has grouped output by 'category1'. You can override using the - `.groups` argument. -``` r -ggplot(summ_key_cat1, aes(x = reorder(Key,-n), y = n, fill = category1)) + +ggplot(summ_key_cat1, aes(x = reorder(key, -n), y = n, fill = category1)) + geom_col(position = position_dodge2(preserve = "single", padding = 0)) + - labs(x = NULL, - y = "Number of compositions", - fill = NULL) + + labs( + x = NULL, + y = "Number of compositions", + fill = NULL + ) + scale_fill_daniel(palette = "staalmeesters") + theme_bach(base_family = "Alegreya", grid = "y") + theme( @@ -371,31 +430,42 @@ I noticed that there were no double keys, as in B flat is the same as A sharp. T ``` r summ_key_cat2 <- merged_data |> - group_by(category2, Key) |> + group_by(category2, key) |> summarise(n = n()) |> - filter(nchar(Key) > 1) |> - mutate(Key = sub("\u266D", "b", Key), - Key = sub("\U266F", "#", Key), - Key = ifelse(nchar(Key) > 10, substring(Key,1,nchar(Key)/2), Key) + filter(nchar(key) > 1) |> + mutate( + key = sub("\u266D", "b", key), + key = sub("\U266F", "#", key), + key = ifelse(nchar(key) > 10, substring(key, 1, nchar(key) / 2), key) ) |> - group_by(category2, Key) |> + group_by(category2, key) |> summarise(n = sum(n)) -plotdat <- rbind(summ_key_cat1 |> rename(category2 = category1), - summ_key_cat2) |> - cbind(gr = c(rep(1,nrow(summ_key_cat1)), - rep(2,nrow(summ_key_cat2))) - ) +plotdat <- rbind( + summ_key_cat1 |> rename(category2 = category1), + summ_key_cat2 +) |> + cbind(gr = c( + rep(1, nrow(summ_key_cat1)), + rep(2, nrow(summ_key_cat2)) + )) -ggplot(plotdat, aes(x = Key, y = reorder(category2,n), fill = n)) + +ggplot(plotdat, aes(x = key, y = reorder(category2, n), fill = n)) + geom_tile() + geom_text(aes(label = n), family = "Alegreya", color = "white", size = 4) + - geom_hline(yintercept = 8.5, color = "#FFFFF0", size = 1) + - labs(x = NULL, - y = NULL, - fill = NULL) + + geom_hline(yintercept = 8.5, color = "#FFFFF0", size = 1) + + labs( + x = NULL, + y = NULL, + fill = NULL + ) + scale_x_discrete(position = "top") + - scale_fill_gradient(low = palette[6], high = palette[2], breaks = c(1,seq(10,40,10)),guide = guide_colorbar(barheight = 8, barwidth = 0.75)) + + scale_fill_gradient( + low = palette[6], + high = palette[2], + breaks = c(1, seq(10, 40, 10)), + guide = guide_colorbar(barheight = 8, barwidth = 0.75) + ) + coord_fixed() + theme_bach(base_family = "Alegreya", grid = FALSE) + theme( @@ -415,9 +485,12 @@ In addition to this, I also wanted to do some classic journalism. 
Previously, I ``` r register_google(google_code) places <- data_frame( - city = c("Eisenach","Ohrdruf","Lüneburg","Weimar","Arnstadt","Mühlhausen","Weimar","Köthen","Leipzig"), - year_from = c(1685,1695,1700,1702,1703,1707,1708,1717,1723), - year_to = c(1695,1700,1702,1703,1707,1708,1717,1723,1750) + city = c( + "Eisenach", "Ohrdruf", "Lüneburg", "Weimar", + "Arnstadt", "Mühlhausen", "Weimar", "Köthen", "Leipzig" + ), + year_from = c(1685, 1695, 1700, 1702, 1703, 1707, 1708, 1717, 1723), + year_to = c(1695, 1700, 1702, 1703, 1707, 1708, 1717, 1723, 1750) ) |> mutate_geocode(city) |> mutate(duration = year_to - year_from) @@ -439,12 +512,14 @@ Lastly, just for fun, I created a map. places_unique <- places |> distinct(city, .keep_all = TRUE) -ggplot(places_unique, aes(x = lon, y = lat)) + +ggplot(places_unique, aes(x = lon, y = lat)) + borders("world", colour = palette[3], fill = palette[4], alpha = 1) + geom_point(color = palette[1], size = 4) + - ggrepel::geom_text_repel(aes(label = city), force = 1, size = 4, - seed = 2345, point.padding = 0.2, - family = "Alegreya") + + ggrepel::geom_text_repel(aes(label = city), + force = 1, size = 4, + seed = 2345, point.padding = 0.2, + family = "Alegreya" + ) + coord_quickmap(xlim = c(4, 18), ylim = c(48, 55)) + theme_void() ``` diff --git a/content/blog/2019-analyzing-bach/index.qmd b/content/blog/2019-analyzing-bach/index.qmd index 57fb017..dfe3835 100644 --- a/content/blog/2019-analyzing-bach/index.qmd +++ b/content/blog/2019-analyzing-bach/index.qmd @@ -14,6 +14,8 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- ```{css} @@ -34,12 +36,10 @@ h2, h3, h4, h5, h6 { } ``` - ## Introduction A little while ago I was watching a documentary series on Dutch television about one of the most important composers in the Netherlands: Johann Sebastian Bach. The documentary discussed parts of Bach's life and the music he wrote during it. This documentary inspired me to learn more about the pieces Bach wrote, and since this is my life now, I might as well do it in R. - ## Collecting the data In my search of a dataset, I found this old website, looking like it last got a major update in 2003, made by what appeared to be a Bach enthousiast called Bryen Travis. I tried to contact him to see if he was interested in collaborating, but I couldn't get a hold of him. This website contained a list of all works by Bach, with some information about each of them. The website listed a few options, an online browser, a pdf containing slimmed down entries, and two files that I couldn't get to work, presumably because they were in a format that have presumably become depracated since they were uploaded back in 1996, when the website was created. I could have used the pdf, but since it contained only the BWV number, the title of the piece, and the setting, I had to scrape the data I wanted from the website directly. 
@@ -66,9 +66,6 @@ library(danielR) #| echo: false theme_bach <- theme_daniel - -#palette <- c('#555555','#db735c', '#EFA86E', '#9A8A76', '#F3C57B', -# '#7A6752', '#2A91A2', '#87F28A', '#6EDCEF') ``` Let's collect the BWVs: @@ -77,7 +74,7 @@ Let's collect the BWVs: #| label: scrape-index index_url <- "http://www.bachcentral.com/BWV/index.html" -BWVs <- paste(readLines(index_url), collapse = "\n") |> +bwvs <- paste(readLines(index_url), collapse = "\n") |> str_match_all(" unlist() |> unique() |> @@ -86,7 +83,7 @@ BWVs <- paste(readLines(index_url), collapse = "\n") |> unique() |> str_extract_all("[0-9]+") |> as.character() -str(BWVs) +str(bwvs) ``` I now have a list of 1075 numbers, corresponding to BWV numbers of which this website has an entry. The next thing I needed to do now was loop over these numbers, navigate to each of the webpages corresponding to that link, and scrape all 1075 webpages individually. This takes a little bit of time, but luckily the author of the pages was very consistent in the setup, so I didn't need to build in fail-safes or condititions to account for irregularities. Before we'll do that, I first initialized a data frame with the correct size for speed. I added the column names for convenience too. @@ -94,42 +91,55 @@ I now have a list of 1075 numbers, corresponding to BWV numbers of which this we ```{r} #| label: collect-scrape-index -col_names <- c("Title", "Subtitle_and_notes", - "BWV", "BWV_epifix", "CLC_BWV_w_epifix", "belongs after", - "voices_instruments", - "category1", "category2", "category3", - "cantate_cat1", "cantate_cat2") +col_names <- c( + "Title", "Subtitle_and_notes", + "BWV", "BWV_epifix", "CLC_BWV_w_epifix", "belongs after", + "voices_instruments", + "category1", "category2", "category3", + "cantate_cat1", "cantate_cat2" +) -scraped_data <- data.frame(matrix(ncol = 12, nrow = length(BWVs))) +scraped_data <- data.frame(matrix(ncol = 12, nrow = length(bwvs))) colnames(scraped_data) <- col_names ``` ## Scraping the data + We now have a variable of the same dimensions and column names as I'll scrape from the website. Now it's time to loop through the webpages and collect the data. Each webpage contains what looks like a table, but within the table it's bulleted lists (denoted as `ul`). This might just be how html builds up tables, but I thought it was a bit strange. Nonetheless, it provided me with an easy hook to grab the values. I remove all the white spaces (`\t`), and each new line was denoted by a `\n`, which I used to split the strings into separate values. The advantage of this approach is that when a field is empty, it will still occupy an element in the character array. Then all I needed to do to obtain the values is take every second element and add it as a row to the data frame I created earlier. Now I have a dataset containing the values from all of the 1075 webpages, with which I was quite pleased. ```{r} #| label: scrape-data #| eval: false -for (i in 1:length(BWVs)) { - - print(sprintf("Scraping data for BWV %s", BWVs[i])) - - url <- sprintf("http://www.bachcentral.com/BWV/%s.html", BWVs[i]) +for (i in seq_along(length(bwvs))) { + print(sprintf("Scraping data for BWV %s", bwvs[i])) + + url <- sprintf("http://www.bachcentral.com/BWV/%s.html", bwvs[i]) webpage <- read_html(url) - + text <- webpage |> html_nodes("ul") |> html_text() |> - gsub('[\t]', '', .) |> + gsub("[\t]", "", .) 
|> strsplit(., "\\n") |> unlist() - - values <- text[seq(2,length(text),2)] - + + values <- text[seq(2, length(text), 2)] + scraped_data[i, ] <- values - } + +scraped_data <- scraped_data |> + janitor::clean_names() +``` + +```{r} +#| label: save-scraped +#| eval: false +#| echo: false + +scraped_data |> + write_rds("./data/scraped_data.rds") ``` With this, I achieved the first goal I had for this little project, which was to find or create a dataset on Bach. Let's see what it looks like: @@ -138,13 +148,14 @@ With this, I achieved the first goal I had for this little project, which was to #| label: load-scraped #| echo: false -load("scraped_data.Rdata") +scraped_data <- read_rds("./data/scraped_data.rds") |> + janitor::clean_names() ``` ```{r} #| label: str-scraped -str(scraped_data) +glimpse(scraped_data) ``` All columns are currently character arrays, and this is appropriate for most of them. Although I think the BWV number alone could be a numeric array. Also, some columns are still a bit awkward. This is why I moved on to do another important and satisfying part, cleaning the data. @@ -157,15 +168,17 @@ The character arrays are appropriate, but for further analyes I'd prefer to make #| label: clean-scraped data <- scraped_data |> - mutate(BWV = as.numeric(BWV), - category1 = factor(category1), - category2 = factor(category2), - category3 = factor(category3), - cantate_cat1 = substring(cantate_cat1,4), - CLC_BWV_w_epifix = str_replace(CLC_BWV_w_epifix, " ", "")) |> - rename(BWV_w_epifix = CLC_BWV_w_epifix) |> - select(BWV, BWV_epifix, BWV_w_epifix, everything()) -str(data) + mutate( + bwv = as.numeric(bwv), + category1 = factor(category1), + category2 = factor(category2), + category3 = factor(category3), + cantate_cat1 = substring(cantate_cat1, 4), + clc_bwv_w_epifix = str_squish(clc_bwv_w_epifix) + ) |> + rename(bwv_w_epifix = clc_bwv_w_epifix) |> + select(bwv, bwv_epifix, bwv_w_epifix, everything()) |> + glimpse() ``` Now we have this data, we can do some descriptive visualizations of the data. Over time I hope I can dive into the setting (`voices_instruments`) and disect that, but for now I'll keep it simple and just do some descriptives. @@ -181,19 +194,27 @@ data |> group_by(category1) |> summarise(n = n()) |> ggplot(aes(x = category1, y = n, color = category1)) + - geom_segment(aes(xend = category1, yend = 0), size = 12, - color = "grey40", alpha = 0.9) + - geom_point(size = 20, shape = 16) + - geom_text(aes(label = n, y = n + 5), color = "white", size = 5, family = "Alegreya") + - labs(x = NULL, - y = "Number of compositions") + + geom_segment(aes(xend = category1, yend = 0), + linewidth = 12, + color = "grey40", alpha = 0.9 + ) + + geom_point(size = 20, shape = 16) + + geom_text(aes(label = n, y = n + 5), + color = "white", size = 5, family = "Alegreya" + ) + + geom_hline(yintercept = 0, size = 1) + + labs( + x = NULL, + y = "Number of compositions" + ) + scale_color_daniel(palette = "staalmeesters") + - scale_y_continuous(limits = c(0,650)) + + scale_y_continuous(limits = c(0, 650)) + theme_bach(base_family = "Alegreya", grid = "y") + theme( legend.position = "none", - aspect.ratio = 3/2, - axis.text.x = element_text(angle = 45, hjust = 1.1, vjust = 1.15)) + aspect.ratio = 3 / 2, + axis.text.x = element_text(angle = 45, hjust = 1.1, vjust = 1.15) + ) ``` It seems Bach didn't have a strong preference for either instrumental or choral music. 
I suppose he didn't have infinite freedom with what to compose, since his employer might also request him to compose certain types of music. I wanted to see the same for the secondary category, which differentias between the type of composition (e.g. cantate, passions, organ pieces, symphonies, and so on). @@ -204,20 +225,26 @@ It seems Bach didn't have a strong preference for either instrumental or choral data |> group_by(category2) |> summarise(n = n()) |> - ggplot(aes(x = reorder(category2,-n), y = n, color = category2)) + - geom_segment(aes(xend = category2, yend = 0), size = 6, - color = "grey40", alpha = 0.9) + - geom_point(size = 10, shape = 16) + - geom_text(aes(label = n, y = n + 3), color = "white", size = 4, family = "Alegreya") + - labs(x = NULL, - y = "Number of compositions") + + ggplot(aes(x = reorder(category2, -n), y = n, color = category2)) + + geom_segment(aes(xend = category2, yend = 0), + size = 6, + color = "grey40", alpha = 0.9 + ) + + geom_point(size = 10, shape = 16) + + geom_text(aes(label = n, y = n + 3), + color = "white", size = 4, family = "Alegreya" + ) + + labs( + x = NULL, + y = "Number of compositions" + ) + scale_color_daniel(palette = "staalmeesters") + - scale_y_continuous(limits = c(-5,300)) + + scale_y_continuous(limits = c(-5, 300)) + theme_bach(base_family = "Alegreya", grid = "y") + theme( legend.position = "none", axis.text.x = element_text(angle = 45, hjust = 1.05, vjust = 1.025) - ) + ) ``` From this it seems that most of the intrumental pieces are made up by just solo pieces for organ and the harpsichord and that the choral pieces are made up mostly by cantates and chorales for four voices. While I appreciate the information dissemination qualities of a barplot (or lollipop plot) like this, in that it's good in communicating absolute quantities (while I admit that this could also be displayed in a table). One thing it is less good at, is communicating the subjective volume of the works. There's more than a thousand pieces in the BWV, and I feel that the plots above don't do a good enough job at communicating just how many pieces this is. Therefore, I created a tile plot (using `{waffle}`), where every tile represents one piece. I colored again based on the secondary category. In order to still maintain the quantities of each category, I added the number of compositions to the legend. 
@@ -231,7 +258,10 @@ data |> summarise(n = n()) |> mutate(category2 = sprintf("%s (%s)", category2, n)) |> ggplot(aes(fill = category2, values = n)) + - geom_waffle(n_rows = 17, size = 0.2, colour = "#FFFFFC", flip = FALSE, na.rm = TRUE) + + geom_waffle( + n_rows = 17, size = 0.2, colour = "#FFFFFC", + flip = FALSE, na.rm = TRUE + ) + scale_fill_daniel(palette = "staalmeesters", name = NULL) + coord_equal() + theme_bach(base_family = "Alegreya", base_size = 14) + @@ -246,11 +276,10 @@ data |> #| echo: false #| eval: false -library(tidytext) -title_words <- data |> - select(Title) |> - unnest_tokens(output = word, input = Title) |> - anti_join(get_stopwords(language = "de")) |> +title_words <- data |> + select(title) |> + tidytext::unnest_tokens(output = word, input = title) |> + anti_join(tidytext::get_stopwords(language = "de")) |> count(word, sort = TRUE) |> mutate(nchar = nchar(word)) |> filter(nchar >= 3) @@ -263,13 +292,16 @@ The dataset I just created was comprehensive and clean, but it didn't contain an ```{r} #| label: scrape-wiki -url <- "https://en.wikipedia.org/wiki/List_of_compositions_by_Johann_Sebastian_Bach" +website <- "https://en.wikipedia.org" +article <- "wiki/List_of_compositions_by_Johann_Sebastian_Bach" +url <- str_glue("{website}/{article}") webpage <- read_html(url) wikitext <- webpage |> - html_nodes(xpath='//*[@id="TOP"]') |> + html_nodes(xpath = '//*[@id="TOP"]') |> html_table(fill = TRUE) |> - as.data.frame() + as.data.frame() |> + janitor::clean_names() ``` Then I cleaned the data somewhat and extracted the number from the BWV columns. @@ -279,10 +311,12 @@ Then I cleaned the data somewhat and extracted the number from the BWV columns. #| warning: false wikidata <- wikitext |> - rename(BWV_full = BWV) |> - mutate(BWV = sub('.*(\\d{3}).*', '\\1', BWV_full), - BWV = parse_integer(BWV)) |> - filter(!rev(duplicated(rev(BWV)))) + rename(bwv_full = bwv) |> + mutate( + bwv = sub(".*(\\d{3}).*", "\\1", bwv_full), + bwv = parse_integer(bwv) + ) |> + filter(!rev(duplicated(rev(bwv)))) ``` Then I merged the data. I lost a number of compositions in the process, but I was okay with it, mostly because it took me too much time and effort to try and fix it. Hurray for laziness. I extracted the year from the slightly messy `Date` column. Some strings in this column contain two dates, one for the first compilation and one for the date of completion. I extracted the last number, the year of completion. @@ -291,23 +325,26 @@ Then I merged the data. I lost a number of compositions in the process, but I wa #| label: merge-scraped #| warning: false -merged_data <- merge(data, wikidata, by = "BWV") |> - mutate(year = sub(".*(\\d{4}).*", "\\1", Date), - year = as.numeric(year), - age = year - 1685) +merged_data <- data |> + inner_join(wikidata, by = "bwv") |> + mutate( + year = sub(".*(\\d{4}).*", "\\1", date), + year = as.numeric(year), + age = year - 1685 + ) ``` I noticed that some entries in the `year` column exceeded the year of Bach's death. Bach died in 1750. I assumed that these years indicated the year of first performance or publication. In this scenario, it would be possible that certain pieces were lost and rediscovered at a later date and only then published. I thought to make a barplot of the number of compositions Bach over the course of his life, trying to see if there was any period where he was particularly productive. I also added some annotations to give the plot some context. 
```{r} -#| label: BWVperyear-plot +#| label: bwvperyear-plot #| warning: false #| message: false #| fig-width: 12 -BWVperyear <- merged_data |> +bwvperyear <- merged_data |> filter(!is.na(year)) |> - group_by(year,age) |> + group_by(year, age) |> summarise(n = n()) palette <- daniel_pal("staalmeesters")(6) @@ -328,31 +365,63 @@ annotation2 <- data.frame( text = "italic(Schemellis~Gesangbuch)~comes~out" ) -ggplot(BWVperyear, aes(x = year, y = n, fill = n)) + +ggplot(bwvperyear, aes(x = year, y = n, fill = n)) + geom_bar(stat = "identity") + - geom_vline(xintercept = c(1750.5,1685), linetype = "dashed", color = "grey30") + - geom_text(data = data.frame(), mapping = aes(x = 1685.5, y = 75, label = "Bach was born\nMarch 31st 1685\nin Eisenach"), - inherit.aes = FALSE, family = "Alegreya", hjust = 0) + - geom_text(data = data.frame(), mapping = aes(x = 1750, y = 75, label = "Bach died\nJuly 28th 1750\nin Leipzich"), - inherit.aes = FALSE, family = "Alegreya", hjust = 1) + - geom_curve(data = annotation1, mapping = aes(x = x, y = y, xend = xend, yend = yend), - arrow = arrow(angle = 25, length = unit(6,"pt")), - curvature = 0.1, inherit.aes = FALSE) + - geom_text(data = annotation1, mapping = aes(x = x, y = y + 4, label = text), - family = "Alegreya", inherit.aes = FALSE, parse = FALSE) + - geom_curve(data = annotation2, mapping = aes(x = x, y = y, xend = xend, yend = yend), - arrow = arrow(angle = 25, length = unit(6,"pt")), - curvature = 0.2, inherit.aes = FALSE) + - geom_text(data = annotation2, mapping = aes(x = x, y = y + 3, label = text), - family = "Alegreya", inherit.aes = FALSE, parse = TRUE) + - labs(x = "Year", - y = "Number of compositions") + - scale_x_continuous(limits = c(1685,1751), breaks = seq(1690,1750,10)) + + geom_vline( + xintercept = c(1750.5, 1685), + linetype = "dashed", color = "grey30" + ) + + geom_text( + data = data.frame(), + mapping = aes( + x = 1685.5, y = 75, + label = "Bach was born\nMarch 31st 1685\nin Eisenach" + ), + inherit.aes = FALSE, family = "Alegreya", hjust = 0 + ) + + geom_text( + data = data.frame(), + mapping = aes( + x = 1750, y = 75, + label = "Bach died\nJuly 28th 1750\nin Leipzich" + ), + inherit.aes = FALSE, family = "Alegreya", hjust = 1 + ) + + geom_curve( + data = annotation1, + mapping = aes(x = x, y = y, xend = xend, yend = yend), + arrow = arrow(angle = 25, length = unit(6, "pt")), + curvature = 0.1, inherit.aes = FALSE + ) + + geom_text( + data = annotation1, + mapping = aes(x = x, y = y + 4, label = text), + family = "Alegreya", inherit.aes = FALSE, parse = FALSE + ) + + geom_curve( + data = annotation2, + mapping = aes(x = x, y = y, xend = xend, yend = yend), + arrow = arrow(angle = 25, length = unit(6, "pt")), + curvature = 0.2, inherit.aes = FALSE + ) + + geom_text( + data = annotation2, + mapping = aes(x = x, y = y + 3, label = text), + family = "Alegreya", inherit.aes = FALSE, parse = TRUE + ) + + labs( + x = "Year", + y = "Number of compositions" + ) + + scale_x_continuous( + limits = c(1685, 1751), + breaks = seq(1690, 1750, 10) + ) + scale_fill_gradient(low = palette[6], high = palette[2]) + theme_bach(base_family = "Alegreya", grid = "y") + theme( - legend.position = 'none' - ) + legend.position = "none" + ) ``` It seems there were two particularly productive years. But since the year column likely indicates year of publication, it's perhaps more likely that Bach published a collection of pieces in a batch. 
This is certainly true for the year 1736, when the _Schemellis Gesangbuch_ came out, a song book with sacred music, written together with a student of the school he was cantor at: the _Thomasschule_ in Leipzich. I'll come back to this plot later. @@ -361,23 +430,27 @@ The Wikipedia page also contained information on the key of most of the composit ```{r} #| label: colplot-key-cat1 +#| message: false summ_key_cat1 <- merged_data |> - group_by(category1, Key) |> + group_by(category1, key) |> summarise(n = n()) |> - filter(nchar(Key) > 1) |> - mutate(Key = sub("\u266D", "b", Key), - Key = sub("\U266F", "#", Key), - Key = ifelse(nchar(Key) > 10, substring(Key,1,nchar(Key)/2), Key) + filter(nchar(key) > 1) |> + mutate( + Key = sub("\u266D", "b", key), + Key = sub("\U266F", "#", key), + Key = ifelse(nchar(key) > 10, substring(key, 1, nchar(key) / 2), key) ) |> - group_by(category1, Key) |> + group_by(category1, key) |> summarise(n = sum(n)) -ggplot(summ_key_cat1, aes(x = reorder(Key,-n), y = n, fill = category1)) + +ggplot(summ_key_cat1, aes(x = reorder(key, -n), y = n, fill = category1)) + geom_col(position = position_dodge2(preserve = "single", padding = 0)) + - labs(x = NULL, - y = "Number of compositions", - fill = NULL) + + labs( + x = NULL, + y = "Number of compositions", + fill = NULL + ) + scale_fill_daniel(palette = "staalmeesters") + theme_bach(base_family = "Alegreya", grid = "y") + theme( @@ -394,31 +467,42 @@ I noticed that there were no double keys, as in B flat is the same as A sharp. T #| fig-width: 12 summ_key_cat2 <- merged_data |> - group_by(category2, Key) |> + group_by(category2, key) |> summarise(n = n()) |> - filter(nchar(Key) > 1) |> - mutate(Key = sub("\u266D", "b", Key), - Key = sub("\U266F", "#", Key), - Key = ifelse(nchar(Key) > 10, substring(Key,1,nchar(Key)/2), Key) + filter(nchar(key) > 1) |> + mutate( + key = sub("\u266D", "b", key), + key = sub("\U266F", "#", key), + key = ifelse(nchar(key) > 10, substring(key, 1, nchar(key) / 2), key) ) |> - group_by(category2, Key) |> + group_by(category2, key) |> summarise(n = sum(n)) -plotdat <- rbind(summ_key_cat1 |> rename(category2 = category1), - summ_key_cat2) |> - cbind(gr = c(rep(1,nrow(summ_key_cat1)), - rep(2,nrow(summ_key_cat2))) - ) +plotdat <- rbind( + summ_key_cat1 |> rename(category2 = category1), + summ_key_cat2 +) |> + cbind(gr = c( + rep(1, nrow(summ_key_cat1)), + rep(2, nrow(summ_key_cat2)) + )) -ggplot(plotdat, aes(x = Key, y = reorder(category2,n), fill = n)) + +ggplot(plotdat, aes(x = key, y = reorder(category2, n), fill = n)) + geom_tile() + geom_text(aes(label = n), family = "Alegreya", color = "white", size = 4) + - geom_hline(yintercept = 8.5, color = "#FFFFF0", size = 1) + - labs(x = NULL, - y = NULL, - fill = NULL) + + geom_hline(yintercept = 8.5, color = "#FFFFF0", size = 1) + + labs( + x = NULL, + y = NULL, + fill = NULL + ) + scale_x_discrete(position = "top") + - scale_fill_gradient(low = palette[6], high = palette[2], breaks = c(1,seq(10,40,10)),guide = guide_colorbar(barheight = 8, barwidth = 0.75)) + + scale_fill_gradient( + low = palette[6], + high = palette[2], + breaks = c(1, seq(10, 40, 10)), + guide = guide_colorbar(barheight = 8, barwidth = 0.75) + ) + coord_fixed() + theme_bach(base_family = "Alegreya", grid = FALSE) + theme( @@ -439,9 +523,12 @@ In addition to this, I also wanted to do some classic journalism. 
Previously, I register_google(google_code) places <- data_frame( - city = c("Eisenach","Ohrdruf","Lüneburg","Weimar","Arnstadt","Mühlhausen","Weimar","Köthen","Leipzig"), - year_from = c(1685,1695,1700,1702,1703,1707,1708,1717,1723), - year_to = c(1695,1700,1702,1703,1707,1708,1717,1723,1750) + city = c( + "Eisenach", "Ohrdruf", "Lüneburg", "Weimar", + "Arnstadt", "Mühlhausen", "Weimar", "Köthen", "Leipzig" + ), + year_from = c(1685, 1695, 1700, 1702, 1703, 1707, 1708, 1717, 1723), + year_to = c(1695, 1700, 1702, 1703, 1707, 1708, 1717, 1723, 1750) ) |> mutate_geocode(city) |> mutate(duration = year_to - year_from) @@ -452,14 +539,14 @@ places <- data_frame( #| echo: false #| eval: false -save(places, file = "places.Rdata") +places |> write_rds("./data/places.rds") ``` ```{r} #| label: load-places #| echo: false -load("places.Rdata") +places <- read_rds("./data/places.rds") ``` Since this time I wanted to visualize whether Bach was more productive in certain places, I recreated the plot from earlier, shaded the background with the city Bach inhabited at the time, and transformed the bars into a density plot, which smoothed over the data and removed the high outliers. @@ -470,21 +557,42 @@ Since this time I wanted to visualize whether Bach was more productive in certai #| fig-width: 12 places_plot <- places |> - mutate(meanyear = year_from + ((year_to - year_from)/2)) + mutate(meanyear = year_from + ((year_to - year_from) / 2)) ggplot(places_plot) + - geom_rect(aes(xmin = year_from + 0.5, xmax = year_to + 0.5, ymin = 0, ymax = 60, fill = reorder(city,year_from)), - alpha = 0.75) + - stat_smooth(data = BWVperyear |> filter(year < 1751), mapping = aes(x = year + 0.5, y = n, group = 1), fill = "black", geom = 'area', method = 'loess', span = 0.225, alpha = 0.75) + - geom_text(aes(x = meanyear, label = city, y = 62.5), - family = "Alegreya", angle = 60, hjust = 0) + - labs(x = NULL, - y = "Number of compositions", - fill = "City") + + geom_rect( + aes( + xmin = year_from + 0.5, xmax = year_to + 0.5, + ymin = 0, ymax = 60, + fill = reorder(city, year_from) + ), + alpha = 0.75 + ) + + stat_smooth( + data = bwvperyear |> filter(year < 1751), + mapping = aes(x = year + 0.5, y = n, group = 1), + fill = "black", geom = "area", + method = "loess", span = 0.225, alpha = 0.75 + ) + + geom_text(aes(x = meanyear, label = city, y = 62.5), + family = "Alegreya", angle = 60, hjust = 0 + ) + + labs( + x = NULL, + y = "Number of compositions", + fill = "City" + ) + scale_fill_daniel(palette = "staalmeesters") + - scale_x_continuous(breaks = seq(1690,1750,10), expand = c(0,0)) + - scale_y_continuous(limits = c(0,85), breaks = seq(0,60,20), expand = c(0,0)) + - theme_bach(base_family = "Alegreya", grid = FALSE, ticks = FALSE, axis_title_just = "c") + + scale_x_continuous(breaks = seq(1690, 1750, 10), expand = c(0, 0)) + + scale_y_continuous( + limits = c(0, 85), + breaks = seq(0, 60, 20), + expand = c(0, 0) + ) + + theme_bach( + base_family = "Alegreya", grid = FALSE, + ticks = FALSE, axis_title_just = "c" + ) + theme( legend.position = "none" ) @@ -500,12 +608,14 @@ Lastly, just for fun, I created a map. 
places_unique <- places |> distinct(city, .keep_all = TRUE) -ggplot(places_unique, aes(x = lon, y = lat)) + +ggplot(places_unique, aes(x = lon, y = lat)) + borders("world", colour = palette[3], fill = palette[4], alpha = 1) + geom_point(color = palette[1], size = 4) + - ggrepel::geom_text_repel(aes(label = city), force = 1, size = 4, - seed = 2345, point.padding = 0.2, - family = "Alegreya") + + ggrepel::geom_text_repel(aes(label = city), + force = 1, size = 4, + seed = 2345, point.padding = 0.2, + family = "Alegreya" + ) + coord_quickmap(xlim = c(4, 18), ylim = c(48, 55)) + theme_void() ``` @@ -514,4 +624,4 @@ This was a fun exercise in scraping data and trying out new things with visualiz ## Future plans -In the future, I hope to do some more in-depth statistical analysis on any of this. Perhaps I'll finally dive into the setting, there are some cool more complex visualization opportunities there. With this I could also make a Shiny app where one can select the voices you are interested in, and it will show you the pieces that are written for that combination of voices, or select the time of the year, and it will return the pieces written for that part of the Christian calendar (i.e. Advent, Epiphany, or Easter). Most importantly, I might actually do some statistics, instead of just visualizing data. Creating nice figures is great fun, but the real interest lies in perfoming adequate statistical tests, though I still need to think about what statistics would be appropriate for this context. Any tips or requests? \ No newline at end of file +In the future, I hope to do some more in-depth statistical analysis on any of this. Perhaps I'll finally dive into the setting, there are some cool more complex visualization opportunities there. With this I could also make a Shiny app where one can select the voices you are interested in, and it will show you the pieces that are written for that combination of voices, or select the time of the year, and it will return the pieces written for that part of the Christian calendar (i.e. Advent, Epiphany, or Easter). Most importantly, I might actually do some statistics, instead of just visualizing data. Creating nice figures is great fun, but the real interest lies in perfoming adequate statistical tests, though I still need to think about what statistics would be appropriate for this context. Any tips or requests? 
diff --git a/content/blog/2019-analyzing-bach/places.Rdata b/content/blog/2019-analyzing-bach/places.Rdata deleted file mode 100644 index 721144c..0000000 Binary files a/content/blog/2019-analyzing-bach/places.Rdata and /dev/null differ diff --git a/content/blog/2019-analyzing-bach/scraped_data.Rdata b/content/blog/2019-analyzing-bach/scraped_data.Rdata deleted file mode 100644 index aaeb321..0000000 Binary files a/content/blog/2019-analyzing-bach/scraped_data.Rdata and /dev/null differ diff --git a/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.markdown_strict_files/figure-markdown_strict/print-plot-1.png b/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.markdown_strict_files/figure-markdown_strict/print-plot-1.png index 0f1411d..5feaf77 100644 Binary files a/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.markdown_strict_files/figure-markdown_strict/print-plot-1.png and b/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.markdown_strict_files/figure-markdown_strict/print-plot-1.png differ diff --git a/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.md b/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.md index bd35e87..8290a33 100644 --- a/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.md +++ b/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.md @@ -14,13 +14,15 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- -### Introduction +## Introduction There are many ways to create a Manhattan plot. There's a number of online tools that create Manhattan plots for you, it's implemented in a number of toolboxes that are often used in genetics, and there's a couple of packages for R that can create these plots. However, these options often don't offer the customizability that some people (like me) would want. One of the most flexible ways to plot a Manhattan plot (I know of) is the `{manhattan}` package, but how nice would it be to have full control over the properties of the plot. Therefore, whenever I need to create a Manhattan plot, my preference is to go to the awesome `{ggplot2}` package. In my opinion, it gives me more control over the lay-out and properties of the Manhattan plot, so I thought I'd go through how I go about creating Manhattan plots in R using the `{ggplot2}` package. I've tried this code on GWAS summary statistics from several sources, and it works for a bunch of them. Because data can look somewhat different, I'll describe the concept behind some of the code, as to show what my reasoning is behind each step. One thing I should point out is that it's probably best to run this code on a computer that is a little bit powerful, since it will need to deal with an enormous amount of SNPs, that will create massive dataframes and plot objects. -### Import data into R +## Import data into R The first step, as always, is to load the packages we need. I personally prefer to use as little packages as possible and write most of the code myself, because that gives me total control over what happens to my data. However, there are some packages (in particular some of the packages developed by Hadley Wickham) that are very powerful. So, all code described below depends on one of two packages, the `{tidyverse}` package, which includes the `{ggplot2}` and `{dplyr}` package. Next, I load the file that contains the GWAS summary statistics. Different tools use different file formats. 
The file I use is the output from PLINK with the `--meta-analysis` flag. This file can simple be loaded as a table. Here I use a function from our own `{normentR}` package, called `simulateGWAS()`, which does as the name suggest, and simulate the output from a GWAS analysis. @@ -33,7 +35,7 @@ library(normentR) ``` r set.seed(2404) -gwas_data_load <- simulateGWAS(nSNPs = 1e6, nSigCols = 3) |> +gwas_data_load <- simulateGWAS(nSNPs = 1e6, nSigCols = 3) |> janitor::clean_names() ``` @@ -55,64 +57,84 @@ This will create a dataframe with as many rows as there are SNPs in the summary A vast majority of the datapoints will overlap in the non-significant region of the Manhattan plot, these data points are not particularly informative, and it's possible to select a random number of SNPs in this region to make it less computationally heavy. The data I used here does not have such a large volume, so I only needed to filter a small number of SNPs (10% in this case). ``` r -sig_data <- gwas_data_load |> +sig_data <- gwas_data_load |> subset(p < 0.05) -notsig_data <- gwas_data_load |> + +notsig_data <- gwas_data_load |> subset(p >= 0.05) |> - group_by(chr) |> + group_by(chr) |> sample_frac(0.1) + gwas_data <- bind_rows(sig_data, notsig_data) ``` -### Preparing the data +## Preparing the data Since the only columns we have indicating position are the chromosome number and the base pair position of the SNP on that chromosome, we want to combine those so that we have one column with position that we can use for the x-axis. So, what we want to do is to create a column with cumulative base pair position in a way that puts the SNPs on the first chromosome first, and the SNPs on chromosome 22 last. I create a data frame frame called `data_cum` (for cumulative), which selects the largest position for each chromosome, and then calculates the cumulative sum of those. Since we don't need to add anything to the first chromosome, we move everything one row down (using the `lag()` function). We then merge this data frame with the original dataframe and calculate a the cumulative basepair position for each SNP by adding the relative position and the adding factor together. This will create a column (here called `bp_cum`) in which the relative base pair position is the position as if it was stitched together. This code is shown below: ``` r -data_cum <- gwas_data |> - group_by(chr) |> - summarise(max_bp = max(bp)) |> - mutate(bp_add = lag(cumsum(max_bp), default = 0)) |> +data_cum <- gwas_data |> + group_by(chr) |> + summarise(max_bp = max(bp)) |> + mutate(bp_add = lag(cumsum(max_bp), default = 0)) |> select(chr, bp_add) -gwas_data <- gwas_data |> - inner_join(data_cum, by = "chr") |> +gwas_data <- gwas_data |> + inner_join(data_cum, by = "chr") |> mutate(bp_cum = bp + bp_add) ``` -When this is done, the next thing I want to do is to get a couple of parameters that I'll use for the plot later. First, I want the centre position of each chromosome. This position I'll use later to place the labels on the x-axis of the Manhattan plot neatly in the middle of each chromosome. In order to get this position, I'll pipe the `gwas_data` dataframe into this powerful `{dplyr}` function which I then ask to calculate the difference between the maximum and minimum cumulative base pair position for each chromosome and divide it by two to get the middle of each chromosome. I also want to set the limit of the y-axis, as not to cut off any highly significant SNPs. 
If you want to compare multiple GWAS statistics, then I highly recommend to hard code the limit of the y-axis, and then explore the data beforehand to make sure your chosen limit does not cut off any SNPs. Since the y-axis will be log transformed, we need an integer that is lower than the largest negative exponent. But since the y-axis will be linear and positive, I transform the largest exponent to positive and add 2, to give some extra space on the top edge of the plot. When plotting, I actually convert it back to a log scale, but it's a bit easier to add a constant to it by transforming it to a regular integer first. Then, we also want to indicate the significance threshold, I prefer to save this in a variable. Here, I choose to get a Bonferroni-corrected threshold, which is 0.05 divided by the number of SNPs in the summary statistics. I believe many scientists will use the "standard" threshold of 0.05 divided by 1e-6, which is 5e-8. However, in the data I had I believed it to be best to use the Bonferroni-corrected threshold since the sample encompassed different populations, and because it contained less than a million SNPs were used in the association testing, which would make a standard correction overly conservative. These three parameters were calculated as follows: +When this is done, the next thing I want to do is to get a couple of parameters that I'll use for the plot later. First, I want the centre position of each chromosome. This position I'll use later to place the labels on the x-axis of the Manhattan plot neatly in the middle of each chromosome. In order to get this position, I'll pipe the `gwas_data` dataframe into this powerful `{dplyr}` function which I then ask to calculate the difference between the maximum and minimum cumulative base pair position for each chromosome and divide it by two to get the middle of each chromosome. I also want to set the limit of the y-axis, as not to cut off any highly significant SNPs. If you want to compare multiple GWAS statistics, then I highly recommend to hard code the limit of the y-axis, and then explore the data beforehand to make sure your chosen limit does not cut off any SNPs. Since the y-axis will be log transformed, we need an integer that is lower than the largest negative exponent. But since the y-axis will be linear and positive, I transform the largest exponent to positive and add 2, to give some extra space on the top edge of the plot. When plotting, I actually convert it back to a log scale, but it's a bit easier to add a constant to it by transforming it to a regular integer first. + +{{< sidenote br="5em" >}} +Note that the 5e-8 threshold [has its own issues](https://doi.org/10.1038%2Fejhg.2015.269) +{{< /sidenote >}} + +Then, we also want to indicate the significance threshold, I prefer to save this in a variable. Here, I choose to get a Bonferroni-corrected threshold, which is 0.05 divided by the number of SNPs in the summary statistics. Usually, I would use the "default" threshold of 0.05 divided by 1e-6 which is 5e-8. However, in the data I had I believed it to be best to use the Bonferroni-corrected threshold to be a little bit more conservative given the dataset I had. 
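As a quick arithmetic check of the conventional threshold mentioned above (a two-line sketch, not part of the original code):

``` r
# the "default" genome-wide threshold: 0.05 divided by roughly one million tests
0.05 / 1e6
#> [1] 5e-08
```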
These three parameters were calculated as follows: ``` r -axis_set <- gwas_data |> - group_by(chr) |> +axis_set <- gwas_data |> + group_by(chr) |> summarize(center = mean(bp_cum)) -ylim <- gwas_data |> - filter(p == min(p)) |> - mutate(ylim = abs(floor(log10(p))) + 2) |> +ylim <- gwas_data |> + filter(p == min(p)) |> + mutate(ylim = abs(floor(log10(p))) + 2) |> pull(ylim) -sig <- 5e-8 +sig <- 0.05 / nrow(gwas_data_load) ``` -### Plotting the data +## Plotting the data Finally, we're ready to plot the data. As promised, I use the `ggplot()` function for this. I build this up like I would any other `ggplot` object, each SNP will be one point on the plot. Each SNP will be colored based on the chromosome. I manually set the colors, add the labels based on the definitions I determined earlier, and set the y limits to the limit calculated before as well. I add a horizontal dashed line indicating the significance threshold. In Manhattan plots we really don't need a grid, all that would be nice are horizontal grid lines, so I keep those but remove vertical lines. Manhattan plots can be a bit intimidating, so I prefer them as minimal as possible. Shown below is the code I use to create the plot and to save the plot as an image. ``` r -manhplot <- ggplot(gwas_data, aes(x = bp_cum, y = -log10(p), - color = as_factor(chr), size = -log10(p))) + - geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") + +manhplot <- ggplot(gwas_data, aes( + x = bp_cum, y = -log10(p), + color = as_factor(chr), size = -log10(p) +)) + + geom_hline( + yintercept = -log10(sig), color = "grey40", + linetype = "dashed" + ) + geom_point(alpha = 0.75) + - scale_x_continuous(label = axis_set$chr, breaks = axis_set$center) + - scale_y_continuous(expand = c(0,0), limits = c(0, ylim)) + - scale_color_manual(values = rep(c("#276FBF", "#183059"), - unique(length(axis_set$chr)))) + - scale_size_continuous(range = c(0.5,3)) + - labs(x = NULL, - y = "-log10(p)") + + scale_x_continuous( + label = axis_set$chr, + breaks = axis_set$center + ) + + scale_y_continuous(expand = c(0, 0), limits = c(0, ylim)) + + scale_color_manual(values = rep( + c("#276FBF", "#183059"), + unique(length(axis_set$chr)) + )) + + scale_size_continuous(range = c(0.5, 3)) + + labs( + x = NULL, + y = "-log10(p)" + ) + theme_minimal() + - theme( + theme( legend.position = "none", panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), diff --git a/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.qmd b/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.qmd index 3ad0694..2ebc8e0 100644 --- a/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.qmd +++ b/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.qmd @@ -14,12 +14,16 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- -### Introduction +## Introduction + There are many ways to create a Manhattan plot. There's a number of online tools that create Manhattan plots for you, it's implemented in a number of toolboxes that are often used in genetics, and there's a couple of packages for R that can create these plots. However, these options often don't offer the customizability that some people (like me) would want. One of the most flexible ways to plot a Manhattan plot (I know of) is the `{manhattan}` package, but how nice would it be to have full control over the properties of the plot. 
Therefore, whenever I need to create a Manhattan plot, my preference is to go to the awesome `{ggplot2}` package. In my opinion, it gives me more control over the lay-out and properties of the Manhattan plot, so I thought I'd go through how I go about creating Manhattan plots in R using the `{ggplot2}` package. I've tried this code on GWAS summary statistics from several sources, and it works for a bunch of them. Because data can look somewhat different, I'll describe the concept behind some of the code, as to show what my reasoning is behind each step. One thing I should point out is that it's probably best to run this code on a computer that is a little bit powerful, since it will need to deal with an enormous amount of SNPs, that will create massive dataframes and plot objects. -### Import data into R +## Import data into R + The first step, as always, is to load the packages we need. I personally prefer to use as little packages as possible and write most of the code myself, because that gives me total control over what happens to my data. However, there are some packages (in particular some of the packages developed by Hadley Wickham) that are very powerful. So, all code described below depends on one of two packages, the `{tidyverse}` package, which includes the `{ggplot2}` and `{dplyr}` package. Next, I load the file that contains the GWAS summary statistics. Different tools use different file formats. The file I use is the output from PLINK with the `--meta-analysis` flag. This file can simple be loaded as a table. Here I use a function from our own `{normentR}` package, called `simulateGWAS()`, which does as the name suggest, and simulate the output from a GWAS analysis. ```{r} @@ -36,7 +40,7 @@ library(normentR) set.seed(2404) -gwas_data_load <- simulateGWAS(nSNPs = 1e6, nSigCols = 3) |> +gwas_data_load <- simulateGWAS(nSNPs = 1e6, nSigCols = 3) |> janitor::clean_names() ``` @@ -47,68 +51,90 @@ A vast majority of the datapoints will overlap in the non-significant region of ```{r} #| label: reduce-size -sig_data <- gwas_data_load |> +sig_data <- gwas_data_load |> subset(p < 0.05) -notsig_data <- gwas_data_load |> + +notsig_data <- gwas_data_load |> subset(p >= 0.05) |> - group_by(chr) |> + group_by(chr) |> sample_frac(0.1) + gwas_data <- bind_rows(sig_data, notsig_data) ``` -### Preparing the data +## Preparing the data + Since the only columns we have indicating position are the chromosome number and the base pair position of the SNP on that chromosome, we want to combine those so that we have one column with position that we can use for the x-axis. So, what we want to do is to create a column with cumulative base pair position in a way that puts the SNPs on the first chromosome first, and the SNPs on chromosome 22 last. I create a data frame frame called `data_cum` (for cumulative), which selects the largest position for each chromosome, and then calculates the cumulative sum of those. Since we don't need to add anything to the first chromosome, we move everything one row down (using the `lag()` function). We then merge this data frame with the original dataframe and calculate a the cumulative basepair position for each SNP by adding the relative position and the adding factor together. This will create a column (here called `bp_cum`) in which the relative base pair position is the position as if it was stitched together. 
This code is shown below: ```{r} #| label: cumulative-bp -data_cum <- gwas_data |> - group_by(chr) |> - summarise(max_bp = max(bp)) |> - mutate(bp_add = lag(cumsum(max_bp), default = 0)) |> +data_cum <- gwas_data |> + group_by(chr) |> + summarise(max_bp = max(bp)) |> + mutate(bp_add = lag(cumsum(max_bp), default = 0)) |> select(chr, bp_add) -gwas_data <- gwas_data |> - inner_join(data_cum, by = "chr") |> +gwas_data <- gwas_data |> + inner_join(data_cum, by = "chr") |> mutate(bp_cum = bp + bp_add) ``` -When this is done, the next thing I want to do is to get a couple of parameters that I'll use for the plot later. First, I want the centre position of each chromosome. This position I'll use later to place the labels on the x-axis of the Manhattan plot neatly in the middle of each chromosome. In order to get this position, I'll pipe the `gwas_data` dataframe into this powerful `{dplyr}` function which I then ask to calculate the difference between the maximum and minimum cumulative base pair position for each chromosome and divide it by two to get the middle of each chromosome. I also want to set the limit of the y-axis, as not to cut off any highly significant SNPs. If you want to compare multiple GWAS statistics, then I highly recommend to hard code the limit of the y-axis, and then explore the data beforehand to make sure your chosen limit does not cut off any SNPs. Since the y-axis will be log transformed, we need an integer that is lower than the largest negative exponent. But since the y-axis will be linear and positive, I transform the largest exponent to positive and add 2, to give some extra space on the top edge of the plot. When plotting, I actually convert it back to a log scale, but it's a bit easier to add a constant to it by transforming it to a regular integer first. Then, we also want to indicate the significance threshold, I prefer to save this in a variable. Here, I choose to get a Bonferroni-corrected threshold, which is 0.05 divided by the number of SNPs in the summary statistics. I believe many scientists will use the "standard" threshold of 0.05 divided by 1e-6, which is 5e-8. However, in the data I had I believed it to be best to use the Bonferroni-corrected threshold since the sample encompassed different populations, and because it contained less than a million SNPs were used in the association testing, which would make a standard correction overly conservative. These three parameters were calculated as follows: +When this is done, the next thing I want to do is to get a couple of parameters that I'll use for the plot later. First, I want the centre position of each chromosome. This position I'll use later to place the labels on the x-axis of the Manhattan plot neatly in the middle of each chromosome. In order to get this position, I'll pipe the `gwas_data` dataframe into this powerful `{dplyr}` function which I then ask to calculate the difference between the maximum and minimum cumulative base pair position for each chromosome and divide it by two to get the middle of each chromosome. I also want to set the limit of the y-axis, as not to cut off any highly significant SNPs. If you want to compare multiple GWAS statistics, then I highly recommend to hard code the limit of the y-axis, and then explore the data beforehand to make sure your chosen limit does not cut off any SNPs. Since the y-axis will be log transformed, we need an integer that is lower than the largest negative exponent. 
But since the y-axis will be linear and positive, I transform the largest exponent to positive and add 2, to give some extra space on the top edge of the plot. When plotting, I actually convert it back to a log scale, but it's a bit easier to add a constant to it by transforming it to a regular integer first. + +{{{< sidenote br="5em" >}}} +Note that the 5e-8 threshold [has its own issues](https://doi.org/10.1038%2Fejhg.2015.269) +{{{< /sidenote >}}} + +Then, we also want to indicate the significance threshold, I prefer to save this in a variable. Here, I choose to get a Bonferroni-corrected threshold, which is 0.05 divided by the number of SNPs in the summary statistics. Usually, I would use the "default" threshold of 0.05 divided by 1e-6 which is 5e-8. However, in the data I had I believed it to be best to use the Bonferroni-corrected threshold to be a little bit more conservative given the dataset I had. These three parameters were calculated as follows: ```{r} #| label: set-axes -axis_set <- gwas_data |> - group_by(chr) |> +axis_set <- gwas_data |> + group_by(chr) |> summarize(center = mean(bp_cum)) -ylim <- gwas_data |> - filter(p == min(p)) |> - mutate(ylim = abs(floor(log10(p))) + 2) |> +ylim <- gwas_data |> + filter(p == min(p)) |> + mutate(ylim = abs(floor(log10(p))) + 2) |> pull(ylim) -sig <- 5e-8 +sig <- 0.05 / nrow(gwas_data_load) ``` -### Plotting the data +## Plotting the data + Finally, we're ready to plot the data. As promised, I use the `ggplot()` function for this. I build this up like I would any other `ggplot` object, each SNP will be one point on the plot. Each SNP will be colored based on the chromosome. I manually set the colors, add the labels based on the definitions I determined earlier, and set the y limits to the limit calculated before as well. I add a horizontal dashed line indicating the significance threshold. In Manhattan plots we really don't need a grid, all that would be nice are horizontal grid lines, so I keep those but remove vertical lines. Manhattan plots can be a bit intimidating, so I prefer them as minimal as possible. Shown below is the code I use to create the plot and to save the plot as an image. 
```{r} #| label: create-plot -manhplot <- ggplot(gwas_data, aes(x = bp_cum, y = -log10(p), - color = as_factor(chr), size = -log10(p))) + - geom_hline(yintercept = -log10(sig), color = "grey40", linetype = "dashed") + +manhplot <- ggplot(gwas_data, aes( + x = bp_cum, y = -log10(p), + color = as_factor(chr), size = -log10(p) +)) + + geom_hline( + yintercept = -log10(sig), color = "grey40", + linetype = "dashed" + ) + geom_point(alpha = 0.75) + - scale_x_continuous(label = axis_set$chr, breaks = axis_set$center) + - scale_y_continuous(expand = c(0,0), limits = c(0, ylim)) + - scale_color_manual(values = rep(c("#276FBF", "#183059"), - unique(length(axis_set$chr)))) + - scale_size_continuous(range = c(0.5,3)) + - labs(x = NULL, - y = "-log10(p)") + + scale_x_continuous( + label = axis_set$chr, + breaks = axis_set$center + ) + + scale_y_continuous(expand = c(0, 0), limits = c(0, ylim)) + + scale_color_manual(values = rep( + c("#276FBF", "#183059"), + unique(length(axis_set$chr)) + )) + + scale_size_continuous(range = c(0.5, 3)) + + labs( + x = NULL, + y = "-log10(p)" + ) + theme_minimal() + - theme( + theme( legend.position = "none", panel.grid.major.x = element_blank(), panel.grid.minor.x = element_blank(), diff --git a/content/blog/2019-how-i-plot-erps-in-r/data.RData b/content/blog/2019-how-i-plot-erps-in-r/data.RData deleted file mode 100644 index 33d2dfd..0000000 Binary files a/content/blog/2019-how-i-plot-erps-in-r/data.RData and /dev/null differ diff --git a/content/blog/2019-how-i-plot-erps-in-r/times.txt b/content/blog/2019-how-i-plot-erps-in-r/data/times.txt similarity index 100% rename from content/blog/2019-how-i-plot-erps-in-r/times.txt rename to content/blog/2019-how-i-plot-erps-in-r/data/times.txt diff --git a/content/blog/2019-how-i-plot-erps-in-r/index.markdown_strict_files/figure-markdown_strict/erp-plot-1.png b/content/blog/2019-how-i-plot-erps-in-r/index.markdown_strict_files/figure-markdown_strict/erp-plot-1.png index ec7d191..432cbf2 100644 Binary files a/content/blog/2019-how-i-plot-erps-in-r/index.markdown_strict_files/figure-markdown_strict/erp-plot-1.png and b/content/blog/2019-how-i-plot-erps-in-r/index.markdown_strict_files/figure-markdown_strict/erp-plot-1.png differ diff --git a/content/blog/2019-how-i-plot-erps-in-r/index.md b/content/blog/2019-how-i-plot-erps-in-r/index.md index f29d7af..a10799f 100644 --- a/content/blog/2019-how-i-plot-erps-in-r/index.md +++ b/content/blog/2019-how-i-plot-erps-in-r/index.md @@ -15,6 +15,8 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- ## Introduction @@ -32,28 +34,32 @@ library(normentR) Since I cannot share participants' data, I used an open EEG dataset downloaded here, this data comes from a psychophysics group of 5 participants with 2 conditions. It came in the EEGLAB format, same as my own data. The first step is to take the data out of MATLAB and take it to a format that can be easily understood by R. The most convenient way I thought of was to take the `EEG.data` field, transform it into a two-dimensional metrix and store it in a .csv file. -If you have a large amount of participants, I can recommend to only extract data from the channels of interest or the conditions of interest. One can make one file per channel or participant, or one large file that contains everything. I usually choose the latter, and that's what I'll work with here. So my file looks as follows: one column with the channel, one column with the ID, one with the condition (or trigger). 
If I'm doing a between-groups analysis, I'll also have a column with the group. All the other columns are the amplitudes with across the timepoints. +If you have a large amount of participants, I can recommend to only extract data from the channels of interest or the conditions of interest. One can make one file per channel or participant, or one large file that contains everything. I usually choose the latter, and that's what I'll work with here. So my file looks as follows: one column with the channel, one column with the ID, one with the condition (or trigger). If I'm doing a between-groups analysis, I'll also have a column with the group. All the other columns are the amplitudes with across the timepoints. I also have a file with the timepoints for each value. i.e. with epoch of 500 milliseconds ranging from 100 milliseconds pre-stimulus to 400 milliseconds post-stimulus, all the timepoints according to the sampling rate are the values in this file. It is basically nothing more than a print of the values in the `EEG.times` field from the EEGLAB dataset. I'll load this list of time points here also. ``` r -data <- read_delim("AllChannels_ERP.txt", delim = "\t", col_names = FALSE) -``` +data <- read_delim("./data/all_channels_erp.txt", + delim = "\t", col_names = FALSE +) -I also have a file with the timepoints for each value. i.e. with epoch of 500 milliseconds ranging from 100 milliseconds pre-stimulus to 400 milliseconds post-stimulus, all the timepoints according to the sampling rate are the values in this file. It is basically nothing more than a print of the values in the `EEG.times` field from the EEGLAB dataset. +times <- read_table("./data/times.txt", col_names = FALSE) +``` -Since I didn't include any headers in my file, I rename them here. I give the the identifying columns their appropriate names, and for the amplitudes, I attach the values from the `times` variable as names to these columns, so -100 will be one column, -99 will be another, and so on. +{{< sidenote >}} +The `janitor` package helps cleaning up column names, in this instance it converts all default columns (e.g. `V1`) to lowercase (i.e. `v1`) +{{< /sidenote >}} ``` r -ERPdata <- data |> - rename(Channel = V1, - ID = V2, - Condition = V3) |> - mutate(ID = factor(ID), - Condition = factor(Condition)) - -oldnames <- sprintf("V%s", 1:ncol(times) + 3) - -ERPdata <- ERPdata |> - rename_at(vars(all_of(oldnames)), ~ as.character(times)) +erp_data <- data |> + janitor::clean_names() |> + rename( + channel = v1, + id = v2, + condition = v3 + ) |> + mutate( + id = as_factor(id), + condition = as_factor(condition) + ) ``` ## Preparing the data @@ -61,43 +67,57 @@ ERPdata <- ERPdata |> Then I specify a variable with the channels of interest. These will be the channels I'll average across later. ``` r -coi <- c("P1", "P2", "Pz", "P3", "P4", "PO3", "PO4", "POz"); +coi <- c("P1", "P2", "Pz", "P3", "P4", "PO3", "PO4", "POz") ``` -Then I calculate the means across channels and conditions. This goes in two steps. First I'll select only the channels of interest, then I'll group by ID, condition, and channel. And then calculate the average of every other column, in this case column 4 to the end of the file. Then I'll do the same, except now I'll group by ID and condition. So then we have one average ERP for every condition in all participants. +Then I calculate the means across channels and conditions. This goes in two steps. First I'll select only the channels of interest, then I'll group by ID, condition, and channel. 
And then calculate the average of every other column, in this case all columns starting with "v". Then I'll do the same, except now I'll group by ID and condition. So then we have one average ERP for every condition in all participants. + +{{< sidenote >}} +The `summarise()` function will throw a warning about the grouping variable, this can be silenced by setting `.groups = "drop"` +{{< /sidenote >}} ``` r -ERPdata_mChan <- ERPdata |> - filter(Channel %in% coi) |> - group_by(ID,Condition,Channel) %>% - summarise_at(vars(names(.)[4:ncol(.)]), list(~ mean(., na.rm = TRUE))) |> +erp_data_chan_mean <- erp_data |> + filter(channel %in% coi) |> + group_by(id, condition, channel) |> + summarise(across(starts_with("v"), ~ mean(.x, na.rm = TRUE))) |> ungroup() -ERPdata_mCond <- ERPdata_mChan |> - group_by(ID,Condition) %>% - summarise_at(vars(names(.)[4:ncol(.)]), list(~ mean(., na.rm = TRUE))) |> +erp_data_cond_mean <- erp_data_chan_mean |> + group_by(id, condition) |> + summarise(across(starts_with("v"), ~ mean(.x, na.rm = TRUE))) |> ungroup() -MeanERPs <- ERPdata_mCond +mean_erp <- erp_data_cond_mean ``` ## Calculate grand average and confidence interval -The next piece of code calculates the grand average. I will also calculate the confidence interval and then transform it from the interval relative to the mean to the absolute values representing the upper and lower boundaries of the confidence interval. Here I use a confidence interval of 95%. We first transform from wide to long format using the `pivot_longer()` function from the `{tidyr}` package. Then we convert the (now character) `Time` variable to numeric. Then we will calculate the average amplitude per time point. Then using the `CI()` function from the `{Rmisc}` package, we calculate the upper and lower bounds of the confidence interval. +The next piece of code calculates the grand average. I will also add the time for each timepoint in milliseconds from the "times.txt" file we loaded earlier. Afterwards, I will alculate the confidence interval and then transform it from the interval relative to the mean to the absolute values representing the upper and lower boundaries of the confidence interval. Here I use a confidence interval of 95%. We first transform from wide to long format using the `pivot_longer()` function from the `{tidyr}` package. I merge the data frame containing the times to the long data frame with the mean ERP so that we have a column with numerical time points. Then we calculate the average amplitude per time point. Then using the `CI()` function from the `{Rmisc}` package, we calculate the upper and lower bounds of the confidence interval and add each as a separate column in the data frame. 
``` r -ERP_plotdata <- MeanERPs |> - pivot_longer(-c(ID,Condition), names_to = "Time", values_to = "Amplitude") |> - mutate(Time = as.numeric(Time)) |> - group_by(Condition,Time) |> - summarise(Mean_Amplitude = mean(Amplitude), - CIlower = Rmisc::CI(Amplitude, ci = 0.95)["lower"], - CIupper = Rmisc::CI(Amplitude, ci = 0.95)["upper"]) +times <- times |> + pivot_longer( + cols = everything(), + names_to = "index", values_to = "time" + ) |> + mutate(timepoint = str_glue("v{row_number() + 3}")) |> + select(-index) + +erp_plotdata <- mean_erp |> + pivot_longer( + cols = -c(id, condition), + names_to = "timepoint", values_to = "amplitude" + ) |> + inner_join(times, by = "timepoint") |> + group_by(condition, time) |> + summarise( + mean_amplitude = mean(amplitude), + ci_lower = Rmisc::CI(amplitude, ci = 0.95)["lower"], + ci_upper = Rmisc::CI(amplitude, ci = 0.95)["upper"] + ) ``` - `summarise()` has grouped output by 'Condition'. You can override using the - `.groups` argument. - ## Preparing to plot Before we can start plotting, I just wanted to share one piece of code I found on StackOverflow. The question there was to move the x-axis and y-axis line towards 0, rather than the edge of the plot. You can find the question (and answers) here. I combined two answers into a function that moves both the y- and the x-axis to 0. It is my personal preference to have a line indicating 0. This makes the plots easier to read and looks prettier too, but of course this is subjective. Alternatively we could also add a `geom_hline()` and `geom_vline()`, both with intercept at 0, and turn the axes off (as default in `theme_norment()`. This will still give you the lines at x=0 and y=0 but will keep the axis labels at 0. @@ -111,30 +131,31 @@ After running the chunk below, we have a beautiful ERP with a nice confidence in ``` r colors <- norment_pal(palette = "logo")(2) -ERPplot <- ggplot(ERP_plotdata, aes(x = Time, y = Mean_Amplitude, - colour = Condition, group = Condition)) + - geom_ribbon(aes(ymin = CIlower, ymax = CIupper, fill = Condition), - alpha = 0.1, linetype = 0) + - geom_line(size = 0.75) + - scale_color_manual(values = colors) + - scale_fill_manual(values = colors) + - scale_x_continuous(breaks = c(seq(-2000,2000,500))) + - scale_y_continuous(breaks = c(seq(-12,-1,2), seq(2,12,2))) + +erp_plot <- ggplot(erp_plotdata, aes( + x = time, y = mean_amplitude, + colour = condition, group = condition +)) + + geom_ribbon(aes(ymin = ci_lower, ymax = ci_upper, fill = condition), + alpha = 0.1, linetype = 0 + ) + + geom_line(linewidth = 0.75) + + scale_color_manual(values = colors) + + scale_fill_manual(values = colors) + + scale_x_continuous(breaks = c(seq(-2000, 2000, 500))) + + scale_y_continuous(breaks = c(seq(-12, -1, 2), seq(2, 12, 2))) + coord_cartesian() + - labs(x = "Time (ms)", - y = "Amplitude (µV)") + + labs( + x = "Time (ms)", + y = "Amplitude (µV)" + ) + theme_norment(ticks = TRUE, grid = FALSE) + theme( - legend.position = c(0.9,0.1), + legend.position = c(0.9, 0.1), axis.text.x = element_text(vjust = -0.1) ) -``` - - Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0. - ℹ Please use `linewidth` instead. - -``` r -shift_axes(ERPplot, x = 0, y = 0) +shift_axes(erp_plot, x = 0, y = 0) ``` + +**EDIT (2023-10-30)**: In the process of making some general updates to this site, I also revisited some of the code here. I improved the standard somewhat from when I first wrote this post in 2019 and brought it up to the code standards of 2023. 
The old version is of course available on GitHub. diff --git a/content/blog/2019-how-i-plot-erps-in-r/index.qmd b/content/blog/2019-how-i-plot-erps-in-r/index.qmd index aebeb37..8cc074a 100644 --- a/content/blog/2019-how-i-plot-erps-in-r/index.qmd +++ b/content/blog/2019-how-i-plot-erps-in-r/index.qmd @@ -15,6 +15,8 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- ## Introduction @@ -35,57 +37,63 @@ library(normentR) Since I cannot share participants' data, I used an open EEG dataset downloaded [here](https://sccn.ucsd.edu/~arno/fam2data/publicly_available_EEG_data.html){target="_blank"}, this data comes from a psychophysics group of 5 participants with 2 conditions. It came in the EEGLAB format, same as my own data. The first step is to take the data out of MATLAB and take it to a format that can be easily understood by R. The most convenient way I thought of was to take the `EEG.data` field, transform it into a two-dimensional metrix and store it in a .csv file. -If you have a large amount of participants, I can recommend to only extract data from the channels of interest or the conditions of interest. One can make one file per channel or participant, or one large file that contains everything. I usually choose the latter, and that's what I'll work with here. So my file looks as follows: one column with the channel, one column with the ID, one with the condition (or trigger). If I'm doing a between-groups analysis, I'll also have a column with the group. All the other columns are the amplitudes with across the timepoints. +If you have a large amount of participants, I can recommend to only extract data from the channels of interest or the conditions of interest. One can make one file per channel or participant, or one large file that contains everything. I usually choose the latter, and that's what I'll work with here. So my file looks as follows: one column with the channel, one column with the ID, one with the condition (or trigger). If I'm doing a between-groups analysis, I'll also have a column with the group. All the other columns are the amplitudes with across the timepoints. I also have a file with the timepoints for each value. i.e. with epoch of 500 milliseconds ranging from 100 milliseconds pre-stimulus to 400 milliseconds post-stimulus, all the timepoints according to the sampling rate are the values in this file. It is basically nothing more than a print of the values in the `EEG.times` field from the EEGLAB dataset. I'll load this list of time points here also. ```{r} #| label: read-data #| eval: false -data <- read_delim("AllChannels_ERP.txt", delim = "\t", col_names = FALSE) -``` +data <- read_delim("./data/all_channels_erp.txt", + delim = "\t", col_names = FALSE +) -I also have a file with the timepoints for each value. i.e. with epoch of 500 milliseconds ranging from 100 milliseconds pre-stimulus to 400 milliseconds post-stimulus, all the timepoints according to the sampling rate are the values in this file. It is basically nothing more than a print of the values in the `EEG.times` field from the EEGLAB dataset. 
+times <- read_table("./data/times.txt", col_names = FALSE) +``` ```{r} #| label: save-data #| echo: false #| eval: false -save(data, file = "data.RData") +write_rds(data, file = "./data/data.rds") ``` ```{r} #| label: load-data #| echo: false +#| message: false -load("data.RData") -times <- read.table("times.txt") +data <- read_rds("./data/data.rds") + +times <- read_table("./data/times.txt", col_names = FALSE) ``` -Since I didn't include any headers in my file, I rename them here. I give the the identifying columns their appropriate names, and for the amplitudes, I attach the values from the `times` variable as names to these columns, so -100 will be one column, -99 will be another, and so on. +{{{< sidenote >}}} +The `janitor` package helps cleaning up column names, in this instance it converts all default columns (e.g. `V1`) to lowercase (i.e. `v1`) +{{{< /sidenote >}}} ```{r} #| label: prep-data -ERPdata <- data |> - rename(Channel = V1, - ID = V2, - Condition = V3) |> - mutate(ID = factor(ID), - Condition = factor(Condition)) - -oldnames <- sprintf("V%s", 1:ncol(times) + 3) - -ERPdata <- ERPdata |> - rename_at(vars(all_of(oldnames)), ~ as.character(times)) +erp_data <- data |> + janitor::clean_names() |> + rename( + channel = v1, + id = v2, + condition = v3 + ) |> + mutate( + id = as_factor(id), + condition = as_factor(condition) + ) ``` ```{r} #| label: remove-empty-col #| echo: false -ERPdata <- ERPdata |> - select(-V824) +erp_data <- erp_data |> + select_if(~ !all(is.na(.))) ``` ## Preparing the data @@ -95,50 +103,70 @@ Then I specify a variable with the channels of interest. These will be the chann ```{r} #| label: set-coi -coi <- c("P1", "P2", "Pz", "P3", "P4", "PO3", "PO4", "POz"); +coi <- c("P1", "P2", "Pz", "P3", "P4", "PO3", "PO4", "POz") ``` -Then I calculate the means across channels and conditions. This goes in two steps. First I'll select only the channels of interest, then I'll group by ID, condition, and channel. And then calculate the average of every other column, in this case column 4 to the end of the file. Then I'll do the same, except now I'll group by ID and condition. So then we have one average ERP for every condition in all participants. +Then I calculate the means across channels and conditions. This goes in two steps. First I'll select only the channels of interest, then I'll group by ID, condition, and channel. And then calculate the average of every other column, in this case all columns starting with "v". Then I'll do the same, except now I'll group by ID and condition. So then we have one average ERP for every condition in all participants. 
+ +{{{< sidenote >}}} +The `summarise()` function will throw a warning about the grouping variable, this can be silenced by setting `.groups = "drop"` +{{{< /sidenote >}}} ```{r} #| label: calculate-grand-averages +#| warning: false -ERPdata_mChan <- ERPdata |> - filter(Channel %in% coi) |> - group_by(ID,Condition,Channel) %>% - summarise_at(vars(names(.)[4:ncol(.)]), list(~ mean(., na.rm = TRUE))) |> +erp_data_chan_mean <- erp_data |> + filter(channel %in% coi) |> + group_by(id, condition, channel) |> + summarise(across(starts_with("v"), ~ mean(.x, na.rm = TRUE))) |> ungroup() -ERPdata_mCond <- ERPdata_mChan |> - group_by(ID,Condition) %>% - summarise_at(vars(names(.)[4:ncol(.)]), list(~ mean(., na.rm = TRUE))) |> +erp_data_cond_mean <- erp_data_chan_mean |> + group_by(id, condition) |> + summarise(across(starts_with("v"), ~ mean(.x, na.rm = TRUE))) |> ungroup() -MeanERPs <- ERPdata_mCond +mean_erp <- erp_data_cond_mean ``` ## Calculate grand average and confidence interval -The next piece of code calculates the grand average. I will also calculate the confidence interval and then transform it from the interval relative to the mean to the absolute values representing the upper and lower boundaries of the confidence interval. Here I use a confidence interval of 95%. We first transform from wide to long format using the `pivot_longer()` function from the `{tidyr}` package. Then we convert the (now character) `Time` variable to numeric. Then we will calculate the average amplitude per time point. Then using the `CI()` function from the `{Rmisc}` package, we calculate the upper and lower bounds of the confidence interval. +The next piece of code calculates the grand average. I will also add the time for each timepoint in milliseconds from the "times.txt" file we loaded earlier. Afterwards, I will alculate the confidence interval and then transform it from the interval relative to the mean to the absolute values representing the upper and lower boundaries of the confidence interval. Here I use a confidence interval of 95%. We first transform from wide to long format using the `pivot_longer()` function from the `{tidyr}` package. I merge the data frame containing the times to the long data frame with the mean ERP so that we have a column with numerical time points. Then we calculate the average amplitude per time point. Then using the `CI()` function from the `{Rmisc}` package, we calculate the upper and lower bounds of the confidence interval and add each as a separate column in the data frame. 
```{r} #| label: long-format - -ERP_plotdata <- MeanERPs |> - pivot_longer(-c(ID,Condition), names_to = "Time", values_to = "Amplitude") |> - mutate(Time = as.numeric(Time)) |> - group_by(Condition,Time) |> - summarise(Mean_Amplitude = mean(Amplitude), - CIlower = Rmisc::CI(Amplitude, ci = 0.95)["lower"], - CIupper = Rmisc::CI(Amplitude, ci = 0.95)["upper"]) +#| warning: false + +times <- times |> + pivot_longer( + cols = everything(), + names_to = "index", values_to = "time" + ) |> + mutate(timepoint = str_glue("v{row_number() + 3}")) |> + select(-index) + +erp_plotdata <- mean_erp |> + pivot_longer( + cols = -c(id, condition), + names_to = "timepoint", values_to = "amplitude" + ) |> + inner_join(times, by = "timepoint") |> + group_by(condition, time) |> + summarise( + mean_amplitude = mean(amplitude), + ci_lower = Rmisc::CI(amplitude, ci = 0.95)["lower"], + ci_upper = Rmisc::CI(amplitude, ci = 0.95)["upper"] + ) ``` ## Preparing to plot + Before we can start plotting, I just wanted to share one piece of code I found on StackOverflow. The question there was to move the x-axis and y-axis line towards 0, rather than the edge of the plot. You can find the question (and answers) [here](https://stackoverflow.com/questions/39071002/moving-x-or-y-axis-together-with-tick-labels-to-the-middle-of-a-single-ggplot-n){target="_blank"}. I combined two answers into a function that moves both the y- and the x-axis to 0. It is my personal preference to have a line indicating 0. This makes the plots easier to read and looks prettier too, but of course this is subjective. Alternatively we could also add a `geom_hline()` and `geom_vline()`, both with intercept at 0, and turn the axes off (as default in `theme_norment()`. This will still give you the lines at x=0 and y=0 but will keep the axis labels at 0. I personally prefer to have a plot that is as clean as possible, to really put focus on the ERP curve. So I’d typically turn off the grid lines. I also remove one of the 0-ticks to avoid any awkward overlay with the axes, especially since the 0 is also indicated by the axis line already. -I take the colors from our own `{normentR}` package. I ask for two colors from the "logo" palette. +I take the colors from our own `{normentR}` package. I ask for two colors from the "logo" palette. After running the chunk below, we have a beautiful ERP with a nice confidence interval. I hope this was helpful and informative. Good luck! 
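The `geom_hline()`/`geom_vline()` alternative described above could look roughly like this; a minimal sketch that assumes the same `erp_plotdata` columns as in the post and swaps `theme_norment()` for `theme_minimal()`:

``` r
erp_plotdata |>
  ggplot(aes(x = time, y = mean_amplitude, colour = condition)) +
  # reference lines at x = 0 and y = 0 instead of shifted axes
  geom_hline(yintercept = 0, colour = "grey30") +
  geom_vline(xintercept = 0, colour = "grey30") +
  geom_line(linewidth = 0.75) +
  labs(x = "Time (ms)", y = "Amplitude (µV)") +
  theme_minimal() +
  theme(panel.grid = element_blank())
```

This draws the zero lines while the axis text stays at the plot margin.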
@@ -149,22 +177,29 @@ After running the chunk below, we have a beautiful ERP with a nice confidence in colors <- norment_pal(palette = "logo")(2) -ERPplot <- ggplot(ERP_plotdata, aes(x = Time, y = Mean_Amplitude, - colour = Condition, group = Condition)) + - geom_ribbon(aes(ymin = CIlower, ymax = CIupper, fill = Condition), - alpha = 0.1, linetype = 0) + - geom_line(size = 0.75) + - scale_color_manual(values = colors) + - scale_fill_manual(values = colors) + - scale_x_continuous(breaks = c(seq(-2000,2000,500))) + - scale_y_continuous(breaks = c(seq(-12,-1,2), seq(2,12,2))) + +erp_plot <- ggplot(erp_plotdata, aes( + x = time, y = mean_amplitude, + colour = condition, group = condition +)) + + geom_ribbon(aes(ymin = ci_lower, ymax = ci_upper, fill = condition), + alpha = 0.1, linetype = 0 + ) + + geom_line(linewidth = 0.75) + + scale_color_manual(values = colors) + + scale_fill_manual(values = colors) + + scale_x_continuous(breaks = c(seq(-2000, 2000, 500))) + + scale_y_continuous(breaks = c(seq(-12, -1, 2), seq(2, 12, 2))) + coord_cartesian() + - labs(x = "Time (ms)", - y = "Amplitude (µV)") + + labs( + x = "Time (ms)", + y = "Amplitude (µV)" + ) + theme_norment(ticks = TRUE, grid = FALSE) + theme( - legend.position = c(0.9,0.1), + legend.position = c(0.9, 0.1), axis.text.x = element_text(vjust = -0.1) ) -shift_axes(ERPplot, x = 0, y = 0) +shift_axes(erp_plot, x = 0, y = 0) ``` + +**EDIT (2023-10-30)**: In the process of making some general updates to this site, I also revisited some of the code here. I improved the standard somewhat from when I first wrote this post in 2019 and brought it up to the code standards of 2023. The old version is of course available on GitHub. diff --git a/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.markdown_strict_files/figure-markdown_strict/print-plot-1.png b/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.markdown_strict_files/figure-markdown_strict/print-plot-1.png index 8a4531e..adefa09 100644 Binary files a/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.markdown_strict_files/figure-markdown_strict/print-plot-1.png and b/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.markdown_strict_files/figure-markdown_strict/print-plot-1.png differ diff --git a/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.md b/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.md index 5cf53be..b0f5c0c 100644 --- a/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.md +++ b/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.md @@ -14,25 +14,29 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- -### Introduction +## Introduction -Whenever I show my colleagues in the genetics group my results, the first things they say are "*can you show me the Manhattan plots?*" and "*can you show me the QQ plots?*". I covered how to make Manhattan plots in ggplot before (click [here](https://danielroelfs.com/blog/how-i-create-manhattan-plots-using-ggplot/) for a link. But now I want to go through how I make QQ plots. I'm aware that there's a number of packages available that offer this funcionality, but I feel they're for the most part a bit limiting compared to making the plot yourself using the `{ggplot2}` package. Another advantage I found of creating QQ plots myself, is that I got a better understanding of how GWAS summary statistics are projected on the QQ plot, and thus I got a better understanding of QQ plots. 
For this process, I'll use the `{tidyverse}` package (which includes `{ggplot2}`) for all operations, and the `{normentR}` package to simulate some summary statistics and get some of my preferred color palettes. The code to calculate the confidence interval is based on code from Kamil Slowikowski (click [here](https://gist.github.com/slowkow/9041570) to go to the Gist). +Whenever I show my colleagues in the genetics group my results, the first things they say are "*can you show me the Manhattan plots?*" and "*can you show me the QQ plots?*". I covered how to make Manhattan plots in ggplot before (click [here](https://danielroelfs.com/blog/how-i-create-manhattan-plots-using-ggplot/) for a link. But now I want to go through how I make QQ plots. I'm aware that there's a number of packages available that offer this funcionality, but I feel they're for the most part a bit limiting compared to making the plot yourself using the `{ggplot2}` package. Another advantage I found of creating QQ plots myself, is that I got a better understanding of how GWAS summary statistics are projected on the QQ plot, and thus I got a better understanding of QQ plots. For this process, I'll use the `{tidyverse}` package (which includes `{ggplot2}`) for all operations, the `{ggtext}` package for some fancy plot labels, and the `{normentR}` package to simulate some summary statistics and get some of my preferred color palettes. The code to calculate the confidence interval is based on code from Kamil Slowikowski (click [here](https://gist.github.com/slowkow/9041570) to go to the Gist). ``` r library(tidyverse) +library(ggtext) library(normentR) ``` -### Import data into R +## Import data into R First we'll simulate some summary statistics data. We can do this using the `simulateGWAS()` function in the `{normentR}` package for now. If you have summary statistics ready, you can load it in and use that instead. ``` r set.seed(1994) -sumstats.data <- simulateGWAS(nSNPs = 1e6, nSigCols = 1) +sumstats_data <- simulateGWAS(nSNPs = 1e6, nSigCols = 1) |> + janitor::clean_names() ``` GENERATING SIMULATED GWAS DATASET @@ -46,26 +50,34 @@ sumstats.data <- simulateGWAS(nSNPs = 1e6, nSigCols = 1) 8. Adding significant column in chromosome 8 DONE! -Now we have a data frame called `sumstats.data` that contains a simulated GWAS summary statistic dataset. Note that I only generated a small number of SNPs. The QQ plot can take quite a while to generate if there's a lot of SNPs in the file. QQ plots require only the p-values, but we'll keep the entire data frame because you might need this data frame at some point later on. +Now we have a data frame called `sumstats_data` that contains a simulated GWAS summary statistic dataset. Note that I only generated a small number of SNPs. The QQ plot can take quite a while to generate if there's a lot of SNPs in the file. QQ plots require only the p-values, but we'll keep the entire data frame because you might need this data frame at some point later on. -### Preparing the data +## Preparing the data -First I'll specify two variables, just to make the code later somehwat cleaner. I'll want to plot an area indicating the confidence interval, so we specify the confidence interval (e.g. 95%) and the number of SNPs (which is just the length of your `sumstats.data` data frame) +First I'll specify two variables, just to make the code later somehwat cleaner. I'll want to plot an area indicating the confidence interval, so we specify the confidence interval (e.g. 
95%) and the number of SNPs (which is just the length of your `sumstats_data` data frame) ``` r ci <- 0.95 -nSNPs <- nrow(sumstats.data) +n_snps <- nrow(sumstats_data) ``` Next, we'll create a data frame that has all the data we need for the plot. We'll call that data frame `plotdata`. We initiate a data frame with four columns. The first column is the observed p-values, sorted in decreasing order, and then -log10 transformed. So then the SNPs with the lowest p-values will have the highest value. These SNPs will end up on the right side of the figure later. Based on this sorted vector, we can also generate the expected vector of log transformed p-values. We could do this manually, but I much prefer to use one of the (somewhat obscure but very useful) base functions in R called `ppoints()`. This generates the sequence of probabilities based on the input vector. In this case, we'll input just the number of SNPs. Since we also want to plot the confidence interval, we'll have to calculate the upper and lower limits of the confidence interval at each point. For this we'll use another base function called `qbeta()`. This generates the `$\beta$` distribution for a given probabily value or range of values. For the lower half of the confidence interval, we'll take 1 (i.e. the null line) minus the confidence interval (`0.95`), and since this is only half of the interval, we'll divide that value by 2. We'll do the same for the upper half of the confidence interval, except not it's 1 plus the confidence interval. For both the upper and lower interval, we'll supply a vector from 1 to the number of SNPs in your data frame and then also the reverse. These two vectors will be the parameters of the `$\beta$` distribution. The output from all the functions below are the same length as the number of SNPs in your original data frame. ``` r -plotdata <- data.frame( - observed = -log10(sort(sumstats.data$P)), - expected = -log10(ppoints(nSNPs)), - clower = -log10(qbeta(p = (1 - ci) / 2, shape1 = seq(nSNPs), shape2 = rev(seq(nSNPs)))), - cupper = -log10(qbeta(p = (1 + ci) / 2, shape1 = seq(nSNPs), shape2 = rev(seq(nSNPs)))) +plotdata <- tibble( + observed = -log10(sort(sumstats_data$p)), + expected = -log10(ppoints(n_snps)), + clower = -log10(qbeta( + p = (1 - ci) / 2, + shape1 = seq(n_snps), + shape2 = rev(seq(n_snps)) + )), + cupper = -log10(qbeta( + p = (1 + ci) / 2, + shape1 = seq(n_snps), + shape2 = rev(seq(n_snps)) + )) ) ``` @@ -81,42 +93,53 @@ plotdata_sub <- plotdata |> plotdata_sup <- plotdata |> filter(expected > 2) -plotdata_small <- rbind(plotdata_sub, plotdata_sup) +plotdata_small <- bind_rows(plotdata_sub, plotdata_sup) ``` Now we have a much smaller data frame, which should make plotting a lot faster! -### Plotting the data +## Plotting the data Let's make a plot! We want the expected values on the x-axis, and the observed values on the y-axis. Then I'll make a `geom_ribbon()` layer with the upper and lower bounds of the confidence interval. Now, there's a few options. Some people prefer to see individual points (with `geom_point()`), but if you have a million SNPs, I don't think this always makes much sense. You can also use `geom_line()`, but I especially prefer `geom_step()`. This function creates a line between the dots in a stepwise manner, so strictly speaking there are no diagonal lines (but with many points in a region, it may look like a diagonal line). 
There are two options for the direction of the `geom_step()` function: going from one point to the next first in the vertical direction and then in the horizontal direction is `"vh"`, the other way around is denoted by `"hv"`. What is most suitable for your plot depends on what you want to look at. I'd recommend being conservative, so if you're trying to assess inflation, go for `"vh"`, which will visually bias the curve upwards. If you're looking only at the quality of genetic signal in your GWAS, then go for `"hv"`, since that will visually bias the curve to be closer to the null line. If you're not sure, then just go for `geom_line()`. When there is no signal in your GWAS, the expected and observed p-values should overlap perfectly. In that case there'll be a correlation of exactly 1. This is our reference value. In order to indicate this null line, we can add a line with an intercept at 0 and a slope of 1 using the `geom_abline()` function. This `geom_abline()` will plot a line that exceeds the observed values. I prefer to use a `geom_segment()` so that I can specify the range of the line. I select the largest expected x- and y-value and plot a line from (0,0) to that point, which will perfectly correspond to a line with a slope of 1. Your line with observed p-values should never go below this null line. If it does, it means that the p-values that you measured are higher than could reasonably be expected under a null distribution. -We'll also add some labels (with the `expression()` and `paste()` functions). You may just copy paste this. I'll add my preferred theme (`theme_norment()`) and I added an empty `theme()` layer. I haven't added anything there yet, but perhaps if there's some unwanted behavior in the theme, you can fix it there. +We'll also add some labels written in Markdown (these will be parsed with the `element_markdown()` function from the `{ggtext}` package). I'll add my preferred theme (`theme_norment()`), which looks a lot like `theme_minimal()`. ``` r -qqplot <- ggplot(plotdata_small, aes(x = expected, y = observed)) + - geom_ribbon(aes(ymax = cupper, ymin = clower), fill = "grey30", alpha = 0.5) + - geom_step(color = norment_colors[["purple"]], size = 1.1, direction = "vh") + - geom_segment(data = . %>% filter(expected == max(expected)), - aes(x = 0, xend = expected, y = 0, yend = expected), - size = 1.25, alpha = 0.5, color = "grey30", lineend = "round") + - labs(x = expression(paste("Expected -log"[10],"(", plain(P),")")), - y = expression(paste("Observed -log"[10],"(", plain(P),")"))) + +qqplot <- plotdata_small |> + ggplot(aes(x = expected, y = observed)) + + geom_ribbon(aes(ymax = cupper, ymin = clower), + fill = "grey30", alpha = 0.5 + ) + + geom_step( + color = norment_colors[["purple"]], + linewidth = 1.1, direction = "vh" + ) + + geom_segment( + data = . %>% filter(expected == max(expected)), + aes(x = 0, xend = expected, y = 0, yend = expected), + linewidth = 1.25, alpha = 0.5, + color = "grey30", lineend = "round" + ) + + labs( + x = str_glue("Expected -log10(P)"), + y = str_glue("Observed -log10(P)") + ) + theme_norment() + - theme() + theme( + axis.title.x = element_markdown(), + axis.title.y = element_markdown() + ) ``` - Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0. - ℹ Please use `linewidth` instead. - I must issue a small warning now: since these GWAS files can be quite large, plotting all these values can take quite some time, so if you're working on an older MacBook Air (like I am) you may have to be a little patient.
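If you also want a single number to go with the visual check for inflation, the genomic inflation factor (lambda) can be computed from the same p-values; a minimal sketch using the simulated `p` column (not part of the original workflow):

``` r
# median chi-square of the observed p-values divided by the expected median
chisq_stats <- qchisq(sumstats_data$p, df = 1, lower.tail = FALSE)
lambda_gc <- median(chisq_stats) / qchisq(0.5, df = 1)
lambda_gc
```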
Let's see what the plot looks like. ``` r print(qqplot) ``` - + Now we have our QQ plot. We see that our (simulated) GWAS dataset has a very nice signal! I'd be dissapointed if it didn't since I coded it to be this way. It is possible to annotate this plot further or to add multiple lines depending on thresholds (I'm thinking about conditional/conjunctional FDR thresholds for instance). Good luck! diff --git a/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.qmd b/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.qmd index 6e33364..adeaed6 100644 --- a/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.qmd +++ b/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.qmd @@ -14,20 +14,23 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- -### Introduction -Whenever I show my colleagues in the genetics group my results, the first things they say are "_can you show me the Manhattan plots?_" and "_can you show me the QQ plots?_". I covered how to make Manhattan plots in ggplot before (click [here](https://danielroelfs.com/blog/how-i-create-manhattan-plots-using-ggplot/) for a link. But now I want to go through how I make QQ plots. I'm aware that there's a number of packages available that offer this funcionality, but I feel they're for the most part a bit limiting compared to making the plot yourself using the `{ggplot2}` package. Another advantage I found of creating QQ plots myself, is that I got a better understanding of how GWAS summary statistics are projected on the QQ plot, and thus I got a better understanding of QQ plots. For this process, I'll use the `{tidyverse}` package (which includes `{ggplot2}`) for all operations, and the `{normentR}` package to simulate some summary statistics and get some of my preferred color palettes. The code to calculate the confidence interval is based on code from Kamil Slowikowski (click [here](https://gist.github.com/slowkow/9041570) to go to the Gist). +## Introduction +Whenever I show my colleagues in the genetics group my results, the first things they say are "_can you show me the Manhattan plots?_" and "_can you show me the QQ plots?_". I covered how to make Manhattan plots in ggplot before (click [here](https://danielroelfs.com/blog/how-i-create-manhattan-plots-using-ggplot/) for a link. But now I want to go through how I make QQ plots. I'm aware that there's a number of packages available that offer this funcionality, but I feel they're for the most part a bit limiting compared to making the plot yourself using the `{ggplot2}` package. Another advantage I found of creating QQ plots myself, is that I got a better understanding of how GWAS summary statistics are projected on the QQ plot, and thus I got a better understanding of QQ plots. For this process, I'll use the `{tidyverse}` package (which includes `{ggplot2}`) for all operations, the `{ggtext}` package for some fancy plot labels, and the `{normentR}` package to simulate some summary statistics and get some of my preferred color palettes. The code to calculate the confidence interval is based on code from Kamil Slowikowski (click [here](https://gist.github.com/slowkow/9041570) to go to the Gist). ```{r} #| label: pkgs #| message: false library(tidyverse) +library(ggtext) library(normentR) ``` -### Import data into R +## Import data into R First we'll simulate some summary statistics data. We can do this using the `simulateGWAS()` function in the `{normentR}` package for now. 
If you have summary statistics ready, you can load it in and use that instead. ```{r} @@ -35,20 +38,21 @@ First we'll simulate some summary statistics data. We can do this using the `sim set.seed(1994) -sumstats.data <- simulateGWAS(nSNPs = 1e6, nSigCols = 1) +sumstats_data <- simulateGWAS(nSNPs = 1e6, nSigCols = 1) |> + janitor::clean_names() ``` -Now we have a data frame called `sumstats.data` that contains a simulated GWAS summary statistic dataset. Note that I only generated a small number of SNPs. The QQ plot can take quite a while to generate if there's a lot of SNPs in the file. QQ plots require only the p-values, but we'll keep the entire data frame because you might need this data frame at some point later on. +Now we have a data frame called `sumstats_data` that contains a simulated GWAS summary statistic dataset. Note that I only generated a small number of SNPs. The QQ plot can take quite a while to generate if there's a lot of SNPs in the file. QQ plots require only the p-values, but we'll keep the entire data frame because you might need this data frame at some point later on. -### Preparing the data +## Preparing the data -First I'll specify two variables, just to make the code later somehwat cleaner. I'll want to plot an area indicating the confidence interval, so we specify the confidence interval (e.g. 95%) and the number of SNPs (which is just the length of your `sumstats.data` data frame) +First I'll specify two variables, just to make the code later somehwat cleaner. I'll want to plot an area indicating the confidence interval, so we specify the confidence interval (e.g. 95%) and the number of SNPs (which is just the length of your `sumstats_data` data frame) ```{r} #| label: set-params ci <- 0.95 -nSNPs <- nrow(sumstats.data) +n_snps <- nrow(sumstats_data) ``` Next, we'll create a data frame that has all the data we need for the plot. We'll call that data frame `plotdata`. We initiate a data frame with four columns. The first column is the observed p-values, sorted in decreasing order, and then -log~10~ transformed. So then the SNPs with the lowest p-values will have the highest value. These SNPs will end up on the right side of the figure later. Based on this sorted vector, we can also generate the expected vector of log transformed p-values. We could do this manually, but I much prefer to use one of the (somewhat obscure but very useful) base functions in R called `ppoints()`. This generates the sequence of probabilities based on the input vector. In this case, we'll input just the number of SNPs. 
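If `ppoints()` is unfamiliar, calling it on a small number shows what it produces (a toy illustration, not part of the post's pipeline):

``` r
# ppoints() returns n equally spaced probabilities strictly between 0 and 1
ppoints(5)

# -log10-transforming them gives the expected values for the QQ plot's x-axis
-log10(ppoints(5))
```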
@@ -57,11 +61,19 @@ Since we also want to plot the confidence interval, we'll have to calculate the ```{r} #| label: calculate-data -plotdata <- data.frame( - observed = -log10(sort(sumstats.data$P)), - expected = -log10(ppoints(nSNPs)), - clower = -log10(qbeta(p = (1 - ci) / 2, shape1 = seq(nSNPs), shape2 = rev(seq(nSNPs)))), - cupper = -log10(qbeta(p = (1 + ci) / 2, shape1 = seq(nSNPs), shape2 = rev(seq(nSNPs)))) +plotdata <- tibble( + observed = -log10(sort(sumstats_data$p)), + expected = -log10(ppoints(n_snps)), + clower = -log10(qbeta( + p = (1 - ci) / 2, + shape1 = seq(n_snps), + shape2 = rev(seq(n_snps)) + )), + cupper = -log10(qbeta( + p = (1 + ci) / 2, + shape1 = seq(n_snps), + shape2 = rev(seq(n_snps)) + )) ) ``` @@ -79,40 +91,54 @@ plotdata_sub <- plotdata |> plotdata_sup <- plotdata |> filter(expected > 2) -plotdata_small <- rbind(plotdata_sub, plotdata_sup) +plotdata_small <- bind_rows(plotdata_sub, plotdata_sup) ``` Now we have a much smaller data frame, which should make plotting a lot faster! -### Plotting the data +## Plotting the data Let's make a plot! We want the expected values on the x-axis, and the observed values on the y-axis. Then I'll make a `geom_ribbon()` layer with the upper and lower bounds of the confidence interval. Now, there's a few options. Some people prefer to see individual points (with `geom_point()`), but if you have a million SNPs, I don't think this always makes much sense. You can also use `geom_line()`, but I especially prefer `geom_step()`. This function creates a line between the dots in a stepwise manner, so strictly speaking there are no diagonal lines (but with many points in a region, it may look like a diagonal line). There's two options for the direction of the `geom_step()` function, to go from a point to the next first in vertical direction and then horizontal direction is `"vh"`, the other way around is denoted by `"hv"`. What is most suitable for your plot depends on what you want to look at. I'd recommend to be conservative, so if you're trying to assess inflation, go for `"vh"`, which will visually bias the curve upwards. If you're looking only at the quality of genetic signal in your GWAS, then go for `"hv"`, since that will visually bias the curve to be closer to the null line. If you're not sure, then just go for `geom_line()`. When there is no signal in your GWAS, the expected and observed p-values should overlap perfectly. In that case there'll be a correlation of exactly 1. This is our reference value. In order to indicate this null-line, we can add a line with an intersect at 0 and a slope of 1 with the `geom_abline()` function. This `geom_abline()` will plot a line that exceeds the observed values. I prefer to use a `geom_segment()` so that I can specify the range of the line. I select the largest expected x- and y-value and plot a line from (0,0) to that point, which will perfectly correspond to a line with a slope of 1. Your line with observed p-values should never go below this null line. If it does, it means that the p-values that you measured are higher than could reasonably be expected under a null distrbution. -We'll also add some labels (with the `expression()` and `paste()` functions). You may just copy paste this. I'll add my preferred theme (`theme_norment()`) and I added an empty `theme()` layer. I haven't added anything there yet, but perhaps if there's some unwanted behavior in the theme, you can fix it there. 
+We'll also add some labels written in Markdown (that will be parsed with the `element_markdown()` function from the `{ggtext}` package. I'll add my preferred theme (`theme_norment()`) which looks a lot like the `theme_minimal()`. ```{r} #| label: qqplot -qqplot <- ggplot(plotdata_small, aes(x = expected, y = observed)) + - geom_ribbon(aes(ymax = cupper, ymin = clower), fill = "grey30", alpha = 0.5) + - geom_step(color = norment_colors[["purple"]], size = 1.1, direction = "vh") + - geom_segment(data = . %>% filter(expected == max(expected)), - aes(x = 0, xend = expected, y = 0, yend = expected), - size = 1.25, alpha = 0.5, color = "grey30", lineend = "round") + - labs(x = expression(paste("Expected -log"[10],"(", plain(P),")")), - y = expression(paste("Observed -log"[10],"(", plain(P),")"))) + +qqplot <- plotdata_small |> + ggplot(aes(x = expected, y = observed)) + + geom_ribbon(aes(ymax = cupper, ymin = clower), + fill = "grey30", alpha = 0.5 + ) + + geom_step( + color = norment_colors[["purple"]], + linewidth = 1.1, direction = "vh" + ) + + geom_segment( + data = . %>% filter(expected == max(expected)), + aes(x = 0, xend = expected, y = 0, yend = expected), + linewidth = 1.25, alpha = 0.5, + color = "grey30", lineend = "round" + ) + + labs( + x = str_glue("Expected -log10(P)"), + y = str_glue("Observed -log10(P)") + ) + theme_norment() + - theme() + theme( + axis.title.x = element_markdown(), + axis.title.y = element_markdown() + ) ``` I must issue a small warning now, since these GWAS files can be quite large, plotting all these values can take quite some time, so if you're working on an older MacBook Air (like I am) you may have to be a little patient. Let's see what the plot looks like. ```{r} #| label: print-plot -#| fig-height: 8 -#| fig-width: 8 +#| fig-height: 6 +#| fig-width: 6 print(qqplot) ``` diff --git a/content/blog/2020-plotting-star-destroyers-in-r/index.md b/content/blog/2020-plotting-star-destroyers-in-r/index.md index 1588555..2d8b6ac 100644 --- a/content/blog/2020-plotting-star-destroyers-in-r/index.md +++ b/content/blog/2020-plotting-star-destroyers-in-r/index.md @@ -14,40 +14,42 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- -### Introduction +## Introduction -Most people in my environment know R as the quintessential tool for statistical analysis and data visualization. There's plenty of tutorials and discussions online about how to go about either. But one important aspect that is noticeably lacking is a discussion about R's ability to simulate scenes from Star Wars. Today, we'll set this straight. At the same time we'll also explore how to incorporate a mathematical formula with a set of simple rules into R. This post is an R implementation of a formula published on the *On-line Encyclopedia of Integer Sequences* (OEIS, [sequence A117966](https://oeis.org/A117966)). Numberphile has a video from Brady Haran with Neil Sloane where they discuss the mathematics behind fun graphs, among which this one. It's a great video, you can check it out [**here**](https://youtu.be/o8c4uYnnNnc). +Most people in my environment know R as the quintessential tool for statistical analysis and data visualization. There's plenty of tutorials and discussions online about how to go about either. But one important aspect that is noticeably lacking is a discussion about R's ability to simulate scenes from Star Wars. Today, we'll set this straight. 
At the same time we'll also explore how to incorporate a mathematical formula with a set of simple rules into R. This post is an R implementation of a formula published on the *On-line Encyclopedia of Integer Sequences* (OEIS, [sequence A117966](https://oeis.org/A117966)). Numberphile has a video from Brady Haran with Neil Sloane where they discuss the mathematics behind fun graphs, among which this one. It's a great video, you can check it out [here](https://youtu.be/o8c4uYnnNnc). We'll use a formula called the *balanced ternary enumeration*. The game is to convert a decimal number to base 3, or the ternary. The best way to describe the ternary is to put it next to the binary. Binary (or base 2) is perhaps more widely understood. I don't actually know of any use of the ternary outside of these hypothetical mathematical problems. Octadecimal and hexadecimal are used sometimes. The latter is used for instance in defining colors in computers. -### Let's start with counting +## Let's start with counting Let's start at the very basics. Like primary school basic. In the decimal system, we could from 1 to 9 and then add a digit and start from 1 again (\[1 2 3 ... 8 9 10 11 12 ... 20 21 ...\]) This system likely comes from the fact that humans have 10 fingers. (Interestingly, the Maya used a system with base 20, because they used their toes to count also, and there's a Native American tribe in California that uses base 8 because they use the space between fingers to count.) Computers use binary (base 2) because a pin or electrical signal can either be on or off. This means that there's two values available to denote a number, either 0 or 1. In this system each digit represents the double of the previous, starting from 1. So the first digit represents 1, the second 2, the third 4, the fourth 8 and so on. So the number 5 is made from adding 4 and 1 together, so the binary representation of the number 4 is 100, and 5 is 101. The number 6 is denoted as 110, and so on. In a ternary system, this same principle applies as in the binary system, except digits now increase by a factor of 3. The first digit represents 1, the second 3, the third9 and so on. This also means that there's three possible values to denote a number, 0, 1, or 2. To illustrate how decimal numbers are represented in the binary and ternary system, look at the table below. -| Decimal | Binary | Ternary | +| decimal | binary | ternary | |:-------:|:------:|:-------:| -| 1 | 1 | 1 | -| 2 | 10 | 2 | -| 3 | 11 | 10 | -| 4 | 100 | 11 | -| 5 | 101 | 12 | -| 6 | 110 | 20 | -| 7 | 111 | 21 | -| 8 | 1000 | 22 | -| 9 | 1001 | 100 | -| 10 | 1010 | 101 | -| 11 | 1011 | 102 | -| 12 | 1100 | 110 | -| 13 | 1101 | 111 | -| 14 | 1110 | 112 | -| 15 | 1111 | 120 | +| 1 | 00001 | 001 | +| 2 | 00010 | 002 | +| 3 | 00011 | 010 | +| 4 | 00100 | 011 | +| 5 | 00101 | 012 | +| 6 | 00110 | 020 | +| 7 | 00111 | 021 | +| 8 | 01000 | 022 | +| 9 | 01001 | 100 | +| 10 | 01010 | 101 | +| 11 | 01011 | 102 | +| 12 | 01100 | 110 | +| 13 | 01101 | 111 | +| 14 | 01110 | 112 | +| 15 | 01111 | 120 | | 16 | 10000 | 121 | | 17 | 10001 | 122 | | 18 | 10010 | 200 | @@ -55,14 +57,14 @@ In a ternary system, this same principle applies as in the binary system, except | 20 | 10100 | 202 | | 21 | 10101 | 210 | -### Moving numbers between systems +## Moving numbers between systems -Unlike MATLAB (`dec2bin()`) or Python (`bin()`), R doesn't have a natural built-in function to convert decimal numbers to binary (unless you want to use the weird `intToBits()` function). 
So I wrote a function instead which continually appends the modulus while it loops through the digits of the numbers by dividing it in half continuously. The modulus is what is left after division of number x by number y. Let's take the number 13 as an example. In the first loop, the modulus of 13 when divided by 2 is 1. So that goes in the first position, representing value 1. The next digit in the binary system represents the value 2. So we divide our initial value by 2 (and round it to the lower integer it in case we get decimal points) and take the modulus again. So the modulus of 6 when divided by 2 is 0. So that's the second digit. The next loop will result in another 1 (`$x = 6; \lfloor x/2 \rfloor\ mod\ 2 = 1$`, `$\lfloor x \rfloor$` represents rounding `$x$` to the nearest lower integer, i.e. floor), and then in the final loop, we take half of 3 and take the modulus again (`$x = 3; \lfloor x/2 \rfloor\ mod\ 2 = 1$`). After this, when we floor 0.5, the loop cancels since from here on it will just produce an infinite amount of leading zero's. This function now gives us the value 1101, which corresponds to the table above. The code for this function looks like this: +Unlike MATLAB (`dec2bin()`) or Python (`bin()`), R doesn't have a natural built-in function to convert decimal numbers to binary (unless you want to use the weird `intToBits()` function). So I wrote a function instead which continually appends the modulus while it loops through the digits of the numbers by dividing it in half continuously. The modulus is what is left after division of number x by number y. Let's take the number 13 as an example. In the first loop, the modulus of 13 when divided by 2 is 1. So that goes in the first position, representing value 1. The next digit in the binary system represents the value 2. So we divide our initial value by 2 (and round it down to the nearest integer in case we get decimal points) and take the modulus again. So the modulus of 6 when divided by 2 is 0. So that's the second digit. The next loop will result in another 1 ($x = 6; \lfloor x/2 \rfloor \bmod 2 = 1$, where $\lfloor x \rfloor$ represents rounding $x$ down to the nearest integer, i.e. the floor), and then in the final loop, we take half of 3 and take the modulus again ($x = 3; \lfloor x/2 \rfloor \bmod 2 = 1$). After this, when we floor 0.5, the loop stops since from here on it would just produce an infinite number of leading zeros. This function now gives us the value 1101, which corresponds to the table above. The code for this function looks like this: ``` r -ConvertToBinary <- function(x) { +convert_to_binary <- function(x) { out <- mod <- NULL - while (x > 0 | is.null(mod)) { + while (x > 0 || is.null(mod)) { mod <- x %% 2 out <- paste0(mod, out) x <- floor(x / 2) @@ -74,7 +76,7 @@ ConvertToBinary <- function(x) { Let's run this function: ``` r -ConvertToBinary(13) +convert_to_binary(13) ``` [1] "1101" Now in this function we've hard coded that the base is 2, but this code works for any base up to base 10 with just a simple rewrite of the code. If we go higher, e.g. base 11, we are going to have to start using letters to represent values, which I'm too lazy to implement right now. We can specify the base as an input variable.
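Once the generalized function below is defined, a quick sanity check against the ternary column of the table above could look like this (illustrative call):

``` r
# 13 in base 3 should come back as "111", matching the table above
convert_to_base_n(13, base = 3)
```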
``` r -ConvertToBase <- function(x, base = NULL) { +convert_to_base_n <- function(x, base = NULL) { if (base > 10) { stop("Function not defined for bases higher than 10") } out <- mod <- NULL - while (x > 0 | is.null(mod)) { + while (x > 0 || is.null(mod)) { mod <- x %% base out <- paste0(mod, out) x <- floor(x / base) @@ -99,33 +101,35 @@ ConvertToBase <- function(x, base = NULL) { These functions only accepts a single value, but to get the transformed value for a vector of integers we can use the `map()` function from `{purrr}`. `map()` as default outputs a list, but we can also ask for a character vector. We can then convert a number of values to binary like this: ``` r -example_vector <- c(0:4,10,13,22,50,75,100) -map_chr(example_vector, ConvertToBinary) +example_vector <- c(seq(4), 10, 13, 22, 50, 75, 100) +map_chr(example_vector, convert_to_binary) ``` - [1] "0" "1" "10" "11" "100" "1010" "1101" - [8] "10110" "110010" "1001011" "1100100" + [1] "1" "10" "11" "100" "1010" "1101" "10110" + [8] "110010" "1001011" "1100100" ``` r -map_chr(example_vector, base = 3, ConvertToBase) +map_chr(example_vector, base = 3, convert_to_base_n) ``` - [1] "0" "1" "2" "10" "11" "101" "111" "211" "1212" - [10] "2210" "10201" + [1] "1" "2" "10" "11" "101" "111" "211" "1212" "2210" + [10] "10201" -### The rules of the game +## The rules of the game Okay, so here's the rules of the *balanced ternary enumeration* game We write all integers in base 3, and replace all digits that are 2 with -1 and then sum the outcome. Let's look at the first ten numbers in ternary: ``` r -map_chr(seq(10), base = 3, ConvertToBase) +map_chr(seq(10), base = 3, convert_to_base_n) ``` [1] "1" "2" "10" "11" "12" "20" "21" "22" "100" "101" -So in this sequence, we would replace the second value (`2`) with `-1`, which makes -1. We would also replace the second digit of fifth value (`12`), which makes \[1,-1\], which, when we add these numbers up , makes 2. This because 1 in the first position denotes value 3, minus 1 in the second position (representing 1) makes 2 since `$(3*1) + (1*-1) = 2$`. The next value (`20`) has a 2 in the first position, replace this with -1 makes \[-1,0\]. The first position denotes 3, minus 0 in the second position, makes -3 since `$(3*-1) + (1*0) = -3$`. Applying the same rule to the sixth value gives `$(3*-1) + (1*1)$` which makes -2. The next value has two incidences of the number 2, replacing both with -1 gives `$(3*-1) + (1*-1)$` is equal to -4. Let's skip ahead a few numbers to decimal number 18, which in ternary becomes 200, where the first position represents the number 9. Replacing the 2 in this number gives `$(9*-1) + (3*0) + (1*0)$`, which makes -9. This process is the balanced ternary enumeration. +So in this sequence, we would replace the second value (`2`) with `-1`, which makes -1. We would also replace the second digit of fifth value (`12`), which makes \[1,-1\], which, when we add these numbers up , makes 2. This because 1 in the first position denotes value 3, minus 1 in the second position (representing 1) makes 2 since $(3 * 1) + (1 * -1) = 2$. The next value (`20`) has a 2 in the first position, replace this with -1 makes \[-1,0\]. The first position denotes 3, minus 0 in the second position, makes -3 since $(3 * -1) + (1 * 0) = -3$. -Just as show of proof, we can also apply the same formula to values that don't contain a 2, for instance decimal number 10, which becomes 101 in ternary. The formula for this becomes `$(9*1) + (3*0) + (1*1)$`, which makes again 10. 
+Applying the same rule to the sixth value gives $(3 * -1) + (1 * 1)$, which makes -2. The next value has two instances of the number 2, replacing both with -1 gives $(3 * -1) + (1 * -1)$, which is equal to -4. Let's skip ahead a few numbers to decimal number 18, which in ternary becomes 200, where the first position represents the number 9. Replacing the 2 in this number gives $(9 * -1) + (3 * 0) + (1 * 0)$, which makes -9. This process is the balanced ternary enumeration. + +Just as a show of proof, we can also apply the same formula to values that don't contain a 2, for instance decimal number 10, which becomes 101 in ternary. The formula for this becomes $(9 * 1) + (3 * 0) + (1 * 1)$, which again makes 10. Let's put this sequence together: @@ -133,9 +137,9 @@ Let's put this sequence together: And that's balanced ternary enumeration. -### Coding the formula +## Coding the formula -So obviously we are lazy, and don't want to do this process manually for thousands of values. that's why we're going to code it. For this step I translated some Python code to R syntax. The function I wrote to do one step of balanced ternary enumartion is shown below. The first value is always 0 (since it's a 0 in the first position, hence `$1*0 = 0$`). After this, we can incorporate the two steps (of converting into ternary and the enumeration) into one. The formula for this looks like this: +So obviously we are lazy, and don't want to do this process manually for thousands of values. That's why we're going to code it. For this step I translated some Python code to R syntax. The function I wrote to do one step of balanced ternary enumeration is shown below. The first value is always 0 (since it's a 0 in the first position, hence $1 * 0 = 0$). After this, we can incorporate the two steps (of converting into ternary and the enumeration) into one. The formula for this looks like this: $$ \begin{aligned} a(0) &= 0 \\ a(3n) &= 3 * a(n) \\ a(3n + 1) &= 3 * a(n) + 1 \\ a(3n + 2) &= 3 * a(n) - 1 \end{aligned} $$ -The Python code for this function came from a website that collects mathematical functions and sequences and can be found [here](https://oeis.org/A117966). I've adapted it to work in R. Since 0 will always result in 0, this is hard coded in. Afterwards it is a nested function (I know, we all love it) where it iteratively calls itself until the input to the function is 0 and it stops. At that point we have out balanced ternary. This function only performs the calculation for one value. So getting a sequence means putting it in a `map()` function. +{{< sidenote >}} +The Python code for this function came from a website that collects mathematical functions and +sequences and can be found [here](https://oeis.org/A117966) +{{< /sidenote >}} + +In the function, since 0 will always result in 0, this is hard coded in. Afterwards it is a recursive function (I know, we all love it) where it iteratively calls itself until the input to the function is 0 and it stops. At that point we have our balanced ternary. This function only performs the calculation for one value. So getting a sequence means putting it in a `map()` function. ``` r -BTE <- function(x) { +bte <- function(x) { if (x == 0) { return(0) } if (x %% 3 == 0) { - return(3 * BTE(x / 3)) + return(3 * bte(x / 3)) } else if (x %% 3 == 1) { - return(3 * BTE((x - 1) / 3) + 1) + return(3 * bte((x - 1) / 3) + 1) } else if (x %% 3 == 2) { - return(3 * BTE((x - 2) / 3) - 1) + return(3 * bte((x - 2) / 3) - 1) } } ``` -Let's go through one iteration of this code. Let's say `x <- 3`.
3 modulo 3 is equal to 0, so we enter the first condition. The result of this is 3 multiplied by the outcome of the same function, except the input now is x divided by three, or 3/3, or 1, in our example. This becomes the new input for the function. 1 modulo 3 is equal to 1. So now we enter the second condition. Now the input to the `BTE()` function is 1 minus 1, divided by 3. This is 0, so we return 3 \* 0 + 1, which is equal to 3. +<details> +<summary>Click here to see the function in Python</summary> + +``` python +def bte(x): + if x == 0: + return 0 + if x % 3 == 0: + return 3 * bte(x / 3) + elif x % 3 == 1: + return 3 * bte((x - 1) / 3) + 1 + else: + return 3 * bte((x - 2) / 3) - 1 +``` + +</details>
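As one extra worked value (added here for illustration), take x = 5, whose ternary representation is `12` and which we calculated to be 2 earlier; the recursion unrolls like this:

``` r
# 5 %% 3 == 2, so bte(5) = 3 * bte((5 - 2) / 3) - 1 = 3 * bte(1) - 1
# 1 %% 3 == 1, so bte(1) = 3 * bte(0) + 1 = 1
# hence bte(5) = 3 * 1 - 1 = 2
bte(5)
```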
+ +Let's go through one iteration of this code. Let's say `x <- 3`. 3 modulo 3 is equal to 0, so we enter the first condition. The result of this is 3 multiplied by the outcome of the same function, except the input now is x divided by three, or 3/3, or 1, in our example. This becomes the new input for the function. 1 modulo 3 is equal to 1. So now we enter the second condition. Now the input to the `bte()` function is 1 minus 1, divided by 3. This is 0, so we return 3 \* 0 + 1, which is equal to 3. If we plug the number 3 into the formula, we will get the same result: ``` r -BTE(3) +bte(3) ``` [1] 3 -Let's also look at a few other examples using the `map()` function again. Since the `BTE()` function outputs only integers, I use `map_dbl()`. Let's input a few examples: +Let's also look at a few other examples using the `map()` function again. Since the `bte()` function outputs only integers, I use `map_dbl()`. Let's input a few examples: ``` r -example_vector <- c(0,seq(10),500,1500,9999) -map_dbl(example_vector, BTE) +example_vector <- c(0, seq(10), 500, 1500, 9999) +map_dbl(example_vector, bte) ``` [1] 0 1 -1 3 4 2 -3 -2 -4 9 10 -232 -696 9270 @@ -185,10 +211,10 @@ map_dbl(example_vector, BTE) This corresponds to the values we created earlier, and the larger numbers make sense also. Okay, let's now create an entire sequence. We'll do 59.048 iterations (it'll become clear later why this specific number). We'll save the output in a variable called `starwars_seq` (forgotten yet that this thing started with Star Wars?). ``` r -starwars_seq <- map_dbl(seq(59048), BTE) +starwars_seq <- map_dbl(seq(59048), bte) ``` -### Plotting the Star Destroyers +## Plotting the Star Destroyers Now, when we plot the values as a scatter plot, it'll become maybe a bit clearer how this mathematical formula circles back to our Star Wars scene. @@ -208,30 +234,42 @@ length(starwars_seq) == length(unique(starwars_seq)) [1] TRUE -The first point of each "squadron" of star destroyers starts with a value that is the same in decimal system as it is in balanced ternary enumeration. Remember the length of the `starwars_seq` variable was 59 048? I chose that because it would start plotting a new squadron at x = 59 049. Let's confirm this: +The first point of each "squadron" of star destroyers starts with a value that is the same in decimal system as it is in balanced ternary enumeration. Remember the length of the `starwars_seq` variable was 59 048? I chose that because it would start plotting a new squadron at `x = 59049`. Let's confirm this: ``` r -BTE(59049) +bte(59049) ``` [1] 59049 -There is a range of values where the input of the balanced ternary enumeration is equal to the outcome (or where the outcome is equal to the input divided by two times -1 (`$\frac{-x}{2}$`). There's a clear pattern to these numbers, but I've done enough maths for today, so I'll save it for another time. +There is a range of values where the input of the balanced ternary enumeration is equal to the outcome (or where the outcome is equal to the input divided by two times -1 ($\frac{-x}{2}$). There's a clear pattern to these numbers, but I've done enough maths for today, so I'll save it for another time. Anyway, the plot! It is cool and all, but let's make it look a bit more like Star Wars by changing some style elements. We'll also generate some stars. For fun's sake we'll also add a planet or moon. 
``` r set.seed(1983) + ggplot(data = NULL, aes(x = seq(starwars_seq), y = starwars_seq)) + - geom_point(aes(x = sample(seq(-1e4,length(starwars_seq) + 1e4), 1e3), - y = sample(seq(min(starwars_seq)-1e4,max(starwars_seq) + 1e4), 1e3), - size = sample(runif(length(starwars_seq) + 1e4), 1e3)), - shape = 18, color = "yellow") + - geom_point(aes(x = sample(seq(starwars_seq), 1), - y = sample(starwars_seq, 1)), - shape = 19, size = 12, color = "darkslategray") + + geom_point( + aes( + x = sample(seq(-1e4, length(starwars_seq) + 1e4), 1e3), + y = sample(seq( + min(starwars_seq) - 1e4, + max(starwars_seq) + 1e4 + ), 1e3), + size = sample(runif(length(starwars_seq) + 1e4), 1e3) + ), + shape = 18, color = "yellow" + ) + + geom_point( + aes( + x = sample(seq(starwars_seq), 1), + y = sample(starwars_seq, 1) + ), + shape = 19, size = 12, color = "darkslategray" + ) + geom_point(size = 4, color = "grey90") + - scale_size_continuous(range = c(1e-3,5e-1)) + + scale_size_continuous(range = c(1e-3, 5e-1)) + theme_void() + theme( legend.position = "none", @@ -249,7 +287,7 @@ We can create another plot (nothing related to Star Wars perhaps), that looks ni ``` r ggplot(data = NULL, aes(x = seq(starwars_seq), y = starwars_seq)) + - geom_line() + + geom_line() + theme_minimal() ``` diff --git a/content/blog/2020-plotting-star-destroyers-in-r/index.qmd b/content/blog/2020-plotting-star-destroyers-in-r/index.qmd index 4b749ac..d9e099b 100644 --- a/content/blog/2020-plotting-star-destroyers-in-r/index.qmd +++ b/content/blog/2020-plotting-star-destroyers-in-r/index.qmd @@ -14,14 +14,18 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- -### Introduction -Most people in my environment know R as the quintessential tool for statistical analysis and data visualization. There's plenty of tutorials and discussions online about how to go about either. But one important aspect that is noticeably lacking is a discussion about R's ability to simulate scenes from Star Wars. Today, we'll set this straight. At the same time we'll also explore how to incorporate a mathematical formula with a set of simple rules into R. This post is an R implementation of a formula published on the _On-line Encyclopedia of Integer Sequences_ (OEIS, [sequence A117966](https://oeis.org/A117966)). Numberphile has a video from Brady Haran with Neil Sloane where they discuss the mathematics behind fun graphs, among which this one. It's a great video, you can check it out [**here**](https://youtu.be/o8c4uYnnNnc). +## Introduction + +Most people in my environment know R as the quintessential tool for statistical analysis and data visualization. There's plenty of tutorials and discussions online about how to go about either. But one important aspect that is noticeably lacking is a discussion about R's ability to simulate scenes from Star Wars. Today, we'll set this straight. At the same time we'll also explore how to incorporate a mathematical formula with a set of simple rules into R. This post is an R implementation of a formula published on the _On-line Encyclopedia of Integer Sequences_ (OEIS, [sequence A117966](https://oeis.org/A117966)). Numberphile has a video from Brady Haran with Neil Sloane where they discuss the mathematics behind fun graphs, among which this one. It's a great video, you can check it out [here](https://youtu.be/o8c4uYnnNnc). We'll use a formula called the _balanced ternary enumeration_. The game is to convert a decimal number to base 3, or the ternary. 
The best way to describe the ternary is to put it next to the binary. Binary (or base 2) is perhaps more widely understood. I don't actually know of any use of the ternary outside of these hypothetical mathematical problems. Octadecimal and hexadecimal are used sometimes. The latter is used for instance in defining colors in computers. -### Let's start with counting +## Let's start with counting + Let's start at the very basics. Like primary school basic. In the decimal system, we could from 1 to 9 and then add a digit and start from 1 again ([1 2 3 ... 8 9 10 11 12 ... 20 21 ...]) This system likely comes from the fact that humans have 10 fingers. (Interestingly, the Maya used a system with base 20, because they used their toes to count also, and there's a Native American tribe in California that uses base 8 because they use the space between fingers to count.) Computers use binary (base 2) because a pin or electrical signal can either be on or off. This means that there's two values available to denote a number, either 0 or 1. In this system each digit represents the double of the previous, starting from 1. So the first digit represents 1, the second 2, the third 4, the fourth 8 and so on. So the number 5 is made from adding 4 and 1 together, so the binary representation of the number 4 is 100, and 5 is 101. The number 6 is denoted as 110, and so on. In a ternary system, this same principle applies as in the binary system, except digits now increase by a factor of 3. The first digit represents 1, the second 3, the third9 and so on. This also means that there's three possible values to denote a number, 0, 1, or 2. To illustrate how decimal numbers are represented in the binary and ternary system, look at the table below. @@ -31,17 +35,15 @@ In a ternary system, this same principle applies as in the binary system, except #| message: false library(tidyverse) - -reticulate::use_virtualenv("./.venv") ``` ```{r} #| label: define-funs #| echo: false -ConvertToBinary <- function(x) { +convert_to_binary <- function(x) { out <- mod <- NULL - while (x > 0 | is.null(mod)) { + while (x > 0 || is.null(mod)) { mod <- x %% 2 out <- paste0(mod, out) x <- floor(x / 2) @@ -49,12 +51,12 @@ ConvertToBinary <- function(x) { return(out) } -ConvertToBase <- function(x, base = NULL) { +convert_to_base_n <- function(x, base = NULL) { if (base > 10) { stop("Function not defined for bases higher than 10") } out <- mod <- NULL - while (x > 0 | is.null(mod)) { + while (x > 0 || is.null(mod)) { mod <- x %% base out <- paste0(mod, out) x <- floor(x / base) @@ -69,27 +71,32 @@ ConvertToBase <- function(x, base = NULL) { #| warning: false illust_table <- tibble( - Decimal = seq(1:21), - Binary = map_chr(.x = Decimal, .f = ConvertToBinary), - Ternary = map_chr(.x = Decimal, base = 3, .f = ConvertToBase) + decimal = seq(21), + binary = map_chr(.x = decimal, .f = convert_to_binary), + ternary = map_chr(.x = decimal, base = 3, .f = convert_to_base_n) ) -knitr::kable(illust_table, - align = "ccc", - latex_header_includes = c("\\renewcommand{\\arraystretch}{1}")) |> +illust_table |> + mutate( + binary = str_pad(binary, width = 5, side = "left", pad = "0"), + ternary = str_pad(ternary, width = 3, side = "left", pad = "0") + ) |> + knitr::kable( + align = "ccc" + ) |> kableExtra::kable_styling(position = "center") ``` -### Moving numbers between systems +## Moving numbers between systems -Unlike MATLAB (`dec2bin()`) or Python (`bin()`), R doesn't have a natural built-in function to convert decimal numbers to binary (unless you 
want to use the weird `intToBits()` function). So I wrote a function instead which continually appends the modulus while it loops through the digits of the numbers by dividing it in half continuously. The modulus is what is left after division of number x by number y. Let's take the number 13 as an example. In the first loop, the modulus of 13 when divided by 2 is 1. So that goes in the first position, representing value 1. The next digit in the binary system represents the value 2. So we divide our initial value by 2 (and round it to the lower integer it in case we get decimal points) and take the modulus again. So the modulus of 6 when divided by 2 is 0. So that's the second digit. The next loop will result in another 1 (`$x = 6; \lfloor x/2 \rfloor\ mod\ 2 = 1$`, `$\lfloor x \rfloor$` represents rounding `$x$` to the nearest lower integer, i.e. floor), and then in the final loop, we take half of 3 and take the modulus again (`$x = 3; \lfloor x/2 \rfloor\ mod\ 2 = 1$`). After this, when we floor 0.5, the loop cancels since from here on it will just produce an infinite amount of leading zero's. This function now gives us the value 1101, which corresponds to the table above. The code for this function looks like this: +Unlike MATLAB (`dec2bin()`) or Python (`bin()`), R doesn't have a natural built-in function to convert decimal numbers to binary (unless you want to use the weird `intToBits()` function). So I wrote a function instead which continually appends the modulus while it loops through the digits of the numbers by dividing it in half continuously. The modulus is what is left after division of number x by number y. Let's take the number 13 as an example. In the first loop, the modulus of 13 when divided by 2 is 1. So that goes in the first position, representing value 1. The next digit in the binary system represents the value 2. So we divide our initial value by 2 (and round it to the lower integer it in case we get decimal points) and take the modulus again. So the modulus of 6 when divided by 2 is 0. So that's the second digit. The next loop will result in another 1 ($x = 6; \lfloor x/2 \rfloor\\mod 2 = 1$), $\lfloor x \rfloor$ represents rounding $x$ to the nearest lower integer, i.e. floor), and then in the final loop, we take half of 3 and take the modulus again ($x = 3; \lfloor x/2 \rfloor\mod 2 = 1$). After this, when we floor 0.5, the loop cancels since from here on it will just produce an infinite amount of leading zero's. This function now gives us the value 1101, which corresponds to the table above. The code for this function looks like this: ```{r} #| label: binary-func -ConvertToBinary <- function(x) { +convert_to_binary <- function(x) { out <- mod <- NULL - while (x > 0 | is.null(mod)) { + while (x > 0 || is.null(mod)) { mod <- x %% 2 out <- paste0(mod, out) x <- floor(x / 2) @@ -103,7 +110,7 @@ Let's run this function: ```{r} #| label: bin-13 -ConvertToBinary(13) +convert_to_binary(13) ``` Now in this function we've hard coded that the base is 2, but this code works for any base up to base 10 with just a simple rewrite of the code. If we go higher, e.g. base 11, we are going to have to start using letters to represent values, which I'm too lazy to implement right now. We can specify the base as an input variable. 
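As a side note (not from the original post), base R can go the other way with `strtoi()`, which parses a string in a given base back into a decimal integer; that makes for a quick round-trip check of `convert_to_binary()`:

``` r
# converting 13 to binary and parsing it back should return 13
strtoi(convert_to_binary(13), base = 2)
```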
@@ -111,12 +118,12 @@ Now in this function we've hard coded that the base is 2, but this code works fo ```{r} #| label: anybase-func -ConvertToBase <- function(x, base = NULL) { +convert_to_base_n <- function(x, base = NULL) { if (base > 10) { stop("Function not defined for bases higher than 10") } out <- mod <- NULL - while (x > 0 | is.null(mod)) { + while (x > 0 || is.null(mod)) { mod <- x %% base out <- paste0(mod, out) x <- floor(x / base) @@ -130,24 +137,26 @@ These functions only accepts a single value, but to get the transformed value fo ```{r} #| label: map-example -example_vector <- c(0:4,10,13,22,50,75,100) -map_chr(example_vector, ConvertToBinary) -map_chr(example_vector, base = 3, ConvertToBase) +example_vector <- c(seq(4), 10, 13, 22, 50, 75, 100) +map_chr(example_vector, convert_to_binary) +map_chr(example_vector, base = 3, convert_to_base_n) ``` -### The rules of the game +## The rules of the game Okay, so here's the rules of the _balanced ternary enumeration_ game We write all integers in base 3, and replace all digits that are 2 with -1 and then sum the outcome. Let's look at the first ten numbers in ternary: ```{r} #| label: bte-ex -map_chr(seq(10), base = 3, ConvertToBase) +map_chr(seq(10), base = 3, convert_to_base_n) ``` -So in this sequence, we would replace the second value (`2`) with `-1`, which makes -1. We would also replace the second digit of fifth value (`12`), which makes [1,-1], which, when we add these numbers up , makes 2. This because 1 in the first position denotes value 3, minus 1 in the second position (representing 1) makes 2 since `$(3*1) + (1*-1) = 2$`. The next value (`20`) has a 2 in the first position, replace this with -1 makes [-1,0]. The first position denotes 3, minus 0 in the second position, makes -3 since `$(3*-1) + (1*0) = -3$`. Applying the same rule to the sixth value gives `$(3*-1) + (1*1)$` which makes -2. The next value has two incidences of the number 2, replacing both with -1 gives `$(3*-1) + (1*-1)$` is equal to -4. Let's skip ahead a few numbers to decimal number 18, which in ternary becomes 200, where the first position represents the number 9. Replacing the 2 in this number gives `$(9*-1) + (3*0) + (1*0)$`, which makes -9. This process is the balanced ternary enumeration. +So in this sequence, we would replace the second value (`2`) with `-1`, which makes -1. We would also replace the second digit of fifth value (`12`), which makes [1,-1], which, when we add these numbers up , makes 2. This because 1 in the first position denotes value 3, minus 1 in the second position (representing 1) makes 2 since $(3 * 1) + (1 * -1) = 2$. The next value (`20`) has a 2 in the first position, replace this with -1 makes [-1,0]. The first position denotes 3, minus 0 in the second position, makes -3 since $(3 * -1) + (1 * 0) = -3$. + +Applying the same rule to the sixth value gives $(3 * -1) + (1 * 1)$ which makes -2. The next value has two incidences of the number 2, replacing both with -1 gives $(3 * -1) + (1 * -1)$ is equal to -4. Let's skip ahead a few numbers to decimal number 18, which in ternary becomes 200, where the first position represents the number 9. Replacing the 2 in this number gives $(9 * -1) + (3 * 0) + (1 * 0)$, which makes -9. This process is the balanced ternary enumeration. -Just as show of proof, we can also apply the same formula to values that don't contain a 2, for instance decimal number 10, which becomes 101 in ternary. The formula for this becomes `$(9*1) + (3*0) + (1*1)$`, which makes again 10. 
+Just as show of proof, we can also apply the same formula to values that don't contain a 2, for instance decimal number 10, which becomes 101 in ternary. The formula for this becomes $(9 * 1) + (3 * 0) + (1 * 1)$, which makes again 10. Let's put this sequence together: @@ -155,9 +164,9 @@ Let's put this sequence together: And that's balanced ternary enumeration. -### Coding the formula +## Coding the formula -So obviously we are lazy, and don't want to do this process manually for thousands of values. that's why we're going to code it. For this step I translated some Python code to R syntax. The function I wrote to do one step of balanced ternary enumartion is shown below. The first value is always 0 (since it's a 0 in the first position, hence `$1*0 = 0$`). After this, we can incorporate the two steps (of converting into ternary and the enumeration) into one. The formula for this looks like this: +So obviously we are lazy, and don't want to do this process manually for thousands of values. that's why we're going to code it. For this step I translated some Python code to R syntax. The function I wrote to do one step of balanced ternary enumartion is shown below. The first value is always 0 (since it's a 0 in the first position, hence $1 * 0 = 0$). After this, we can incorporate the two steps (of converting into ternary and the enumeration) into one. The formula for this looks like this: $$ \begin{aligned} @@ -168,57 +177,64 @@ a(3n + 2) &= 3 * a(n) - 1 \end{aligned} $$ -The Python code for this function came from a website that collects mathematical functions and sequences and can be found [here](https://oeis.org/A117966). I've adapted it to work in R. Since 0 will always result in 0, this is hard coded in. Afterwards it is a nested function (I know, we all love it) where it iteratively calls itself until the input to the function is 0 and it stops. At that point we have out balanced ternary. This function only performs the calculation for one value. So getting a sequence means putting it in a `map()` function. +{{{< sidenote >}}} +The Python code for this function came from a website that collects mathematical functions and +sequences and can be found [here](https://oeis.org/A117966) +{{{< /sidenote >}}} -```{python} -#| label: python-bte -#| echo: false - -def BTE(x): - if x == 0: - return 0 - if x % 3 == 0: - return 3*BTE(x/3) - elif x % 3 == 1: - return 3*BTE((x - 1)/3) + 1 - else: - return 3*BTE((x - 2)/3) - 1 -``` +In the function, since 0 will always result in 0, this is hard coded in. Afterwards it is a nested function (I know, we all love it) where it iteratively calls itself until the input to the function is 0 and it stops. At that point we have out balanced ternary. This function only performs the calculation for one value. So getting a sequence means putting it in a `map()` function. ```{r} #| label: bte-func -BTE <- function(x) { +bte <- function(x) { if (x == 0) { return(0) } if (x %% 3 == 0) { - return(3 * BTE(x / 3)) + return(3 * bte(x / 3)) } else if (x %% 3 == 1) { - return(3 * BTE((x - 1) / 3) + 1) + return(3 * bte((x - 1) / 3) + 1) } else if (x %% 3 == 2) { - return(3 * BTE((x - 2) / 3) - 1) + return(3 * bte((x - 2) / 3) - 1) } } ``` -Let's go through one iteration of this code. Let's say `x <- 3`. 3 modulo 3 is equal to 0, so we enter the first condition. The result of this is 3 multiplied by the outcome of the same function, except the input now is x divided by three, or 3/3, or 1, in our example. This becomes the new input for the function. 
1 modulo 3 is equal to 1. So now we enter the second condition. Now the input to the `BTE()` function is 1 minus 1, divided by 3. This is 0, so we return 3 * 0 + 1, which is equal to 3. +```{python} +#| label: python-bte +#| code-fold: true +#| code-summary: "Click here to see the function in Python" +#| eval: false + +def bte(x): + if x == 0: + return 0 + if x % 3 == 0: + return 3 * bte(x / 3) + elif x % 3 == 1: + return 3 * bte((x - 1) / 3) + 1 + else: + return 3 * bte((x - 2) / 3) - 1 +``` + +Let's go through one iteration of this code. Let's say `x <- 3`. 3 modulo 3 is equal to 0, so we enter the first condition. The result of this is 3 multiplied by the outcome of the same function, except the input now is x divided by three, or 3/3, or 1, in our example. This becomes the new input for the function. 1 modulo 3 is equal to 1. So now we enter the second condition. Now the input to the `bte()` function is 1 minus 1, divided by 3. This is 0, so we return 3 * 0 + 1, which is equal to 3. If we plug the number 3 into the formula, we will get the same result: ```{r} #| label: bte-3 -BTE(3) +bte(3) ``` -Let's also look at a few other examples using the `map()` function again. Since the `BTE()` function outputs only integers, I use `map_dbl()`. Let's input a few examples: +Let's also look at a few other examples using the `map()` function again. Since the `bte()` function outputs only integers, I use `map_dbl()`. Let's input a few examples: ```{r} #| label: bte-vector -example_vector <- c(0,seq(10),500,1500,9999) -map_dbl(example_vector, BTE) +example_vector <- c(0, seq(10), 500, 1500, 9999) +map_dbl(example_vector, bte) ``` This corresponds to the values we created earlier, and the larger numbers make sense also. Okay, let's now create an entire sequence. We'll do 59.048 iterations (it'll become clear later why this specific number). We'll save the output in a variable called `starwars_seq` (forgotten yet that this thing started with Star Wars?). @@ -226,10 +242,10 @@ This corresponds to the values we created earlier, and the larger numbers make s ```{r} #| label: bte-starwars -starwars_seq <- map_dbl(seq(59048), BTE) +starwars_seq <- map_dbl(seq(59048), bte) ``` -### Plotting the Star Destroyers +## Plotting the Star Destroyers Now, when we plot the values as a scatter plot, it'll become maybe a bit clearer how this mathematical formula circles back to our Star Wars scene. @@ -249,15 +265,15 @@ It's star destroyers in battle formation! How crazy is that!? It's such an inter length(starwars_seq) == length(unique(starwars_seq)) ``` -The first point of each "squadron" of star destroyers starts with a value that is the same in decimal system as it is in balanced ternary enumeration. Remember the length of the `starwars_seq` variable was 59 048? I chose that because it would start plotting a new squadron at x = 59 049. Let's confirm this: +The first point of each "squadron" of star destroyers starts with a value that is the same in decimal system as it is in balanced ternary enumeration. Remember the length of the `starwars_seq` variable was 59 048? I chose that because it would start plotting a new squadron at `x = 59049`. Let's confirm this: ```{r} #| label: bte-last -BTE(59049) +bte(59049) ``` -There is a range of values where the input of the balanced ternary enumeration is equal to the outcome (or where the outcome is equal to the input divided by two times -1 (`$\frac{-x}{2}$`). There's a clear pattern to these numbers, but I've done enough maths for today, so I'll save it for another time. 
+There is a range of values where the input of the balanced ternary enumeration is equal to the outcome (or where the outcome is equal to the input divided by two times -1 ($\frac{-x}{2}$). There's a clear pattern to these numbers, but I've done enough maths for today, so I'll save it for another time. Anyway, the plot! It is cool and all, but let's make it look a bit more like Star Wars by changing some style elements. We'll also generate some stars. For fun's sake we'll also add a planet or moon. @@ -265,16 +281,28 @@ Anyway, the plot! It is cool and all, but let's make it look a bit more like Sta #| label: plot-pretty set.seed(1983) + ggplot(data = NULL, aes(x = seq(starwars_seq), y = starwars_seq)) + - geom_point(aes(x = sample(seq(-1e4,length(starwars_seq) + 1e4), 1e3), - y = sample(seq(min(starwars_seq)-1e4,max(starwars_seq) + 1e4), 1e3), - size = sample(runif(length(starwars_seq) + 1e4), 1e3)), - shape = 18, color = "yellow") + - geom_point(aes(x = sample(seq(starwars_seq), 1), - y = sample(starwars_seq, 1)), - shape = 19, size = 12, color = "darkslategray") + + geom_point( + aes( + x = sample(seq(-1e4, length(starwars_seq) + 1e4), 1e3), + y = sample(seq( + min(starwars_seq) - 1e4, + max(starwars_seq) + 1e4 + ), 1e3), + size = sample(runif(length(starwars_seq) + 1e4), 1e3) + ), + shape = 18, color = "yellow" + ) + + geom_point( + aes( + x = sample(seq(starwars_seq), 1), + y = sample(starwars_seq, 1) + ), + shape = 19, size = 12, color = "darkslategray" + ) + geom_point(size = 4, color = "grey90") + - scale_size_continuous(range = c(1e-3,5e-1)) + + scale_size_continuous(range = c(1e-3, 5e-1)) + theme_void() + theme( legend.position = "none", @@ -292,6 +320,6 @@ We can create another plot (nothing related to Star Wars perhaps), that looks ni #| label: plot-castle ggplot(data = NULL, aes(x = seq(starwars_seq), y = starwars_seq)) + - geom_line() + + geom_line() + theme_minimal() ``` diff --git a/content/blog/2020-running-an-ica-on-questionnaires/codebook.txt b/content/blog/2020-running-an-ica-on-questionnaires/data/codebook.txt similarity index 100% rename from content/blog/2020-running-an-ica-on-questionnaires/codebook.txt rename to content/blog/2020-running-an-ica-on-questionnaires/data/codebook.txt diff --git a/content/blog/2020-running-an-ica-on-questionnaires/codebook_clean.txt b/content/blog/2020-running-an-ica-on-questionnaires/data/codebook_clean.txt similarity index 100% rename from content/blog/2020-running-an-ica-on-questionnaires/codebook_clean.txt rename to content/blog/2020-running-an-ica-on-questionnaires/data/codebook_clean.txt diff --git a/content/blog/2020-running-an-ica-on-questionnaires/data.csv b/content/blog/2020-running-an-ica-on-questionnaires/data/data.csv similarity index 100% rename from content/blog/2020-running-an-ica-on-questionnaires/data.csv rename to content/blog/2020-running-an-ica-on-questionnaires/data/data.csv diff --git a/content/blog/2020-running-an-ica-on-questionnaires/index.markdown_strict_files/figure-markdown_strict/plot-wmatrix-nolabs-1.png b/content/blog/2020-running-an-ica-on-questionnaires/index.markdown_strict_files/figure-markdown_strict/plot-wmatrix-nolabs-1.png index c3fc090..aecc4aa 100644 Binary files a/content/blog/2020-running-an-ica-on-questionnaires/index.markdown_strict_files/figure-markdown_strict/plot-wmatrix-nolabs-1.png and b/content/blog/2020-running-an-ica-on-questionnaires/index.markdown_strict_files/figure-markdown_strict/plot-wmatrix-nolabs-1.png differ diff --git 
a/content/blog/2020-running-an-ica-on-questionnaires/index.md b/content/blog/2020-running-an-ica-on-questionnaires/index.md index a36a7df..4ff2689 100644 --- a/content/blog/2020-running-an-ica-on-questionnaires/index.md +++ b/content/blog/2020-running-an-ica-on-questionnaires/index.md @@ -15,9 +15,11 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- -### Introduction +## Introduction If there was a statistics magazine and it were to have ads, the ad for independent component analysis (commonly referred to as the ICA) might read something like this: @@ -27,9 +29,13 @@ If there was a statistics magazine and it were to have ads, the ad for independe Obviously, there is more to it than this, but I don't have the time to talk about the statistical properties of ICA. There's many people who can explain this a hell of a lot better than I could anyway. I'm going to assume you've done your homework and already know that ICA is the right strategy to answer your research question and you just want to know how to implement it in R. That's what this tutorial is for. We'll run both a PCA and an ICA and visualize the results. -### Getting the data +## Getting the data -We'll use a dataset called the *Nerdy Personality Attributes Scale* from the [Open-Source Psychometrics Project](https://openpsychometrics.org/tests/OSRI/development/). The dataset can be downloaded from [Kaggle](https://www.kaggle.com/lucasgreenwell/nerdy-personality-attributes-scale-responses/version/1?select=data.csv). In short, it's a questionnaire on "nerdiness". It aims to measure attributes of ones personality that reflect the popular understanding of "nerdiness". It is a series of questions to which participants are supposed to rank themselves on a Likert scale. The questionnaire has a high reliability, but the questionnaire isn't immune to valid criticisms. Here, we'll use this dataset to see if we can identify subtypes of the concept of "nerdiness" in our dataset. +{{< sidenote br="2em" >}} +The dataset is available for download on [Kaggle](https://www.kaggle.com/lucasgreenwell/nerdy-personality-attributes-scale-responses/version/1?select=data.csv) +{{< /sidenote >}} + +We'll use a dataset called the *Nerdy Personality Attributes Scale* from the [Open-Source Psychometrics Project](https://openpsychometrics.org/tests/OSRI/development/). In short, it's a questionnaire on "nerdiness". It aims to measure attributes of ones personality that reflect the popular understanding of "nerdiness". It is a series of questions to which participants are supposed to rank themselves on a Likert scale. The questionnaire has a high reliability, but the questionnaire isn't immune to valid criticisms. Here, we'll use this dataset to see if we can identify subtypes of the concept of "nerdiness" in our dataset. The dataset in question consists of almost 20.000 individuals from 149 different countries. Is there any reliable way to ensure that the tests are filled in correctly? No, definitely not. Does that make it a unreliable dataset for scientific analysis? Probably. Both issues are perhaps slightly confounded by its sheer size, but the dataset serves our goal well, which is to run an ICA in R. We'll use the `{tidyverse}` package and the `{fastICA}` package. @@ -41,24 +47,31 @@ library(fastICA) There are two files we need, one with the actual data (we'll call this `loaddata`), and one with the list of questions (we'll call this `loadcodes`). 
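One small note on the chunk below: it swaps `mutate(id = row_number())` for `rowid_to_column()`. A toy tibble (not the questionnaire data) is enough to convince yourself that the two approaches produce the same integer `id` column, just in a different column position unless you ask for it:

``` r
# toy example (not the questionnaire data): both calls add an integer id 1..n
library(tibble)
library(dplyr)

df <- tibble(x = c(10, 20, 30))

df |> rowid_to_column(var = "id")
df |> mutate(id = row_number(), .before = 1)
```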
``` r -loaddata <- read_delim("data.csv", delim = "\t") |> - mutate(id = row_number()) - -loadcodes <- read_delim("codebook_clean.txt", delim = "\t", col_names = FALSE) |> - rename(qnum = X1, - question = X2) +loaddata <- read_delim("./data/data.csv", delim = "\t") |> + rowid_to_column(var = "id") + +loadcodes <- read_delim("./data/codebook_clean.txt", + delim = "\t", col_names = FALSE +) |> + rename( + qnum = X1, + question = X2 + ) ``` After cleaning, we still have more than 250.000 individual records left. This is great! Now we're going to have to do some preprocessing before we run the ICA. -### Preprocessing of the data +## Preprocessing of the data Next, we want to prune the data. We want to exclude questions that have a low degree of variance and we might want to remove or impute questions with a large number of `NA`s. Our first step is to get the data in a format that's easier to work with, in this case, the long format. We'll select all questions, and put the question number in a column called `question` and the values in a column called `score`. ``` r questdata <- loaddata |> select(starts_with("Q")) |> - pivot_longer(cols = everything(), names_to = "question", values_to = "score") + pivot_longer( + cols = everything(), + names_to = "question", values_to = "score" + ) ``` We'll first find if there's any unanswered questions. These are rows where `score` is `NA`. @@ -79,7 +92,7 @@ The next step is to find if there are questions with insufficient variation in a less15var <- questdata |> group_by(question, score) |> count() |> - group_by(question) |> + group_by(question) |> mutate(perc = n / sum(n)) |> arrange(question, -perc) |> slice(2) |> @@ -92,28 +105,32 @@ print(less15var) # Groups: question [0] # ℹ 4 variables: question , score , n , perc -So no question has too little variance. If there was, I'd remove that question like so: - -``` r -data <- loaddata |> - select(-less15var$question) -``` +So no question has too little variance. If there were, I'd remove those questions from the analysis. Since no question has less than 15% variance, we won't drop any questions and use all of them in the analysis. Next we want to normalize the data. Usually you'd do this to ensure that all answers on a questionnaire are in the same domain. Let's imagine a scenario where some questions are scored from 0 to 10, others are scored from 0 to 5, and a third is scored from 80 to 120. The ICA is then biased towards questions that have larger values, like the third question in the example above. That's why we want to normalize the data. The statistical term for this is z-score normalization. The defining property of z-scored transformed data is that the mean is 0 and the standard deviation is one. The z-score transformation is obtained by subtracting the mean from each individual value and then dividing by the standard deviation. This is implemented in R with the `scale()` function. We'll also check if it worked afterwards. 
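As a small sanity check on that description (using a made-up vector, not the questionnaire), manual z-scoring and `scale()` should agree, with a mean of (numerically) zero and a standard deviation of one:

``` r
# made-up vector: compare manual z-scoring with scale()
x <- c(2, 4, 4, 6, 9)

z_manual <- (x - mean(x)) / sd(x)
z_scale <- as.numeric(scale(x))

all.equal(z_manual, z_scale)
c(mean = mean(z_manual), sd = sd(z_manual))
```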
+{{< sidenote >}} +In this case, the second line with the `select()` function doesn't remove anything, see paragraph above +{{< /sidenote >}} + ``` r -data <- data |> - pivot_longer(starts_with("Q"), names_to = "qnum", values_to = "score") |> +data <- loaddata |> + select(-less15var$question) |> + pivot_longer(starts_with("Q"), + names_to = "qnum", values_to = "score" + ) |> group_by(qnum) |> mutate(score_z = scale(score)) data |> ungroup() |> select(score_z) |> - summarise(mean = mean(score_z), - sd = sd(score_z), - min = min(score_z), - max = max(score_z)) + summarise( + mean = mean(score_z), + sd = sd(score_z), + min = min(score_z), + max = max(score_z) + ) ``` # A tibble: 1 × 4 @@ -123,7 +140,7 @@ data |> Great! Now we have a nice normalized dataset, without any missing values, and with questions that have a low degree of variance removed. All those preparations served to get us to a place where our clustering analyses are actually valid. Now it's time for the fun stuff! -### Run PCA +## Run PCA There's two clustering approaches we'll use. One will actually help us do the other. The ICA algorithm has is unsupervised, but does require us to tell it how many components we want to get out of the algorithm. It's complicated to calculate the ideal number of components up front, but we can use some standards. I usually use a combination of [icasso](https://research.ics.aalto.fi/ica/icasso/) and PCA. We'll first do a principal component analysis (PCA). We can calculate the eigenvalue of each component in the PCA. PCA components are organized in decreasing order of variance explained. The threshold I use is an eigenvalue of 1. The number of PCA components with an eigenvalue larger than 1 is possibly a good number of components to give the ICA. @@ -131,7 +148,10 @@ The PCA is implemented in the `prcomp()` function. This function doesn't accept ``` r mat <- data |> - pivot_wider(names_from = "qnum", values_from = "score_z", id_cols = "id") |> + pivot_wider( + names_from = "qnum", + values_from = "score_z", id_cols = "id" + ) |> select(starts_with("Q")) |> as.matrix() @@ -144,50 +164,54 @@ Then we can calculate a few other parameters. We want to calculate the variance pca_data$var <- pca_data$sdev^2 pca_data$varexpl <- pca_data$var / sum(pca_data$var) -pca_stats <- tibble(sdev = pca_data$sdev, - var = pca_data$var, - varexpl = pca_data$varexpl) +pca_stats <- tibble( + sdev = pca_data$sdev, + var = pca_data$var, + varexpl = pca_data$varexpl +) ``` We can visualize the variance captured in each component by creating a [scree plot](https://en.wikipedia.org/wiki/Scree_plot). This plot shows the components on the x-axis and the variance on the y-axis. Scree plots also typically include a horizontal line to indicate the eigenvalue of 1. Scree plots are typically interpreted based on the "elbow" in the plot, where the variance decreases. This can be a bit subjective. That's where the horizontal line comes in. ``` r -ggplot(pca_stats, aes(x = seq(var), y = var)) + - geom_line(color = "grey30", size = 1) + - geom_point(color = "grey30", size = 2) + +ggplot(pca_stats, aes(x = seq(var), y = var)) + + geom_line(color = "grey30", linewidth = 1) + + geom_point(color = "grey30", size = 2) + geom_hline(yintercept = 1, linetype = "dashed") + - labs(x = "Principal component", - y = "Variance (eigenvalue)") + + labs( + x = "Principal component", + y = "Variance (eigenvalue)" + ) + theme_minimal() ``` - Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0. - ℹ Please use `linewidth` instead. 
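Before counting the components, a quick look at `pca_stats` can be reassuring: with z-scored input the eigenvalues should add up to (roughly) the number of questions, and the cumulative variance explained shows how much the first handful of components capture. A minimal sketch, assuming the `pca_stats` object from the chunk above:

``` r
# sanity checks on the PCA output (assumes pca_stats from above)
sum(pca_stats$var)

cumsum(pca_stats$varexpl) |> head()
```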
- It can be a bit hard to see from the plot how many components have an eigenvalue larger than 1. But we can calulate it. The number of components with an eigenvalue larger than 1 will be the number of independent components we'll request from the ICA. ``` r -nICs <- pca_stats |> +n_ics <- pca_stats |> filter(var > 1) |> nrow() -print(nICs) +print(n_ics) ``` [1] 6 -### Run ICA +## Run ICA Now we're ready to run the ICA! We'll use the `fastICA()` function. The function has a bunch of inputs. We'll pick the parallel algorithm (as opposed to the "deflation" algorithm). In the parallel algorithm the components are extracted simultaneously, with the deflation algorithm they're calculated one at a time. The "fun" option defines the form of the entropy function. I'm not 100% sure what it does. Just set it to `"exp"` and move on. For the `"method"` option there are two options: `"R"` or `"C"`. In the first option, all analyses are run in R, in the second option, all code is run in C, which is slightly faster. I typically use the `"C"` option. The `maxit` option defines the maximum number of iterations to perform. The default is 200, but in complex datasets, it may take a larger number of iterations to converge. That's why I set it to 5000. This process may take a while. If you want to follow along with what the algorithm is doing, you can set the `verbose` option to `TRUE`. ``` r set.seed(2020) -ica_model <- fastICA(mat, n.comp = nICs, alg.typ = "parallel", - fun = "exp", method = "C", maxit = 5000) + +ica_model <- fastICA(mat, + n.comp = n_ics, alg.typ = "parallel", + fun = "exp", method = "C", maxit = 5000 +) ``` -### Create Weight Matrix +## Create Weight Matrix So now we have run the ICA decomposition. The function provides a few outputs: @@ -233,7 +257,7 @@ Then we can also have a quick look at hierarchical clustering. This is a process ``` r weight_matrix <- data.frame(t(ica_model$A)) -names(weight_matrix) <- paste0("IC",seq(weight_matrix)) +names(weight_matrix) <- str_glue("IC{seq(weight_matrix)}") dist_matrix <- dist(as.matrix(weight_matrix)) clust <- hclust(dist_matrix) @@ -244,37 +268,64 @@ plot(clust) What we can see from the dendrogram is that for instance question 16 (*"I gravitate towards introspection"*) and question 17 (*"I am more comfortable interacting online than in person"*) on the left are very similar. Question 3 (*"I like to play RPGs. (Ex. D&D)"*) is somewhat similar, but not as much as question 16 and 17. The same goes for question 1 and 4, question 11 and 19, and so on. +{{< sidenote br="1.5em" >}} +We'll also do some cleaning of the questions that will help us make the plot look nicer later +{{< /sidenote >}} + ``` r codes <- loadcodes |> - filter(str_detect(qnum, "Q")) + filter(str_detect(qnum, "Q")) |> + mutate( + question = str_remove_all(question, "\t"), + question = str_remove(question, "\\.$") + ) ``` Next we'll plot the weight matrix. For that we first create a column with the question number, we'll then merge the matrix with the list of questions so we can plot the actual question asked instead of the question number. Next, we'll put the data frame in a long format. Finally, we'll also reorder the questions to match the order determined by the hierarchical clustering. 
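For reference, the reordering in the next chunk leans on `clust$order`, the left-to-right order of the leaves in the dendrogram; if you want hard groups instead of an ordering, `cutree()` does that. A small sketch, assuming the `clust` object from the `hclust()` call above (the choice of `k = 6` is just an illustration, mirroring the number of ICs):

``` r
# dendrogram leaf order used for reordering below (assumes clust from above)
clust$order

# cut the tree into a hard grouping; k = 6 is an illustrative choice
question_groups <- cutree(clust, k = 6)
table(question_groups)
```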
+{{< sidenote br="4em" >}} +The `%>%` pipe is necessary here because the placeholder functions a bit differently from the R's native `|>` pipe +{{< /sidenote >}} + ``` r weight_matrix_long <- weight_matrix |> mutate(qnum = str_glue("Q{row_number()}")) |> inner_join(codes, by = "qnum") |> - pivot_longer(cols = starts_with("IC"), names_to = "IC", values_to = "loading") %>% - mutate(question = as_factor(question), - question = fct_relevel(question, unique(.$question)[clust$order])) + pivot_longer( + cols = starts_with("IC"), + names_to = "IC", values_to = "loading" + ) %>% + mutate( + question = fct_relevel( + question, + unique(.$question)[clust$order] + ), + ) ``` So let's see what the ICA spit out. We'll make a plot where the questions are shown along the y-axis and where the x-axis shows their loading onto the different independent components. The orientation of the loading between the components does not have meaning, but within the components it can indicate that some questions have opposing effects. We'll reorder the x- and y-axis to order it based on the loading strength. ``` r -ggplot(weight_matrix_long, aes(x = IC, y = question, fill = loading)) + - geom_tile() + - labs(x = NULL, - y = NULL, - fill = NULL) + - scale_fill_gradient2(low = "#FF729F", mid = "black", high = "#7EE8FA", limits = c(-1, 1), - guide = guide_colorbar(barwidth = 0.5, barheight = 15, ticks = FALSE)) + - theme_minimal(base_size = 8) + +weight_matrix_long |> + ggplot(aes(x = IC, y = question, fill = loading)) + + geom_tile() + + labs( + x = NULL, + y = NULL, + fill = NULL + ) + + scale_fill_gradient2( + low = "#FF729F", mid = "black", high = "#7EE8FA", + limits = c(-1, 1), + guide = guide_colorbar( + barwidth = 0.5, + barheight = 15, ticks = FALSE + ) + ) + + theme_minimal(base_size = 10) + theme( legend.position = "right", - panel.grid = element_blank(), - axis.text.x = element_text(size = 6), + panel.grid = element_blank() ) ``` diff --git a/content/blog/2020-running-an-ica-on-questionnaires/index.qmd b/content/blog/2020-running-an-ica-on-questionnaires/index.qmd index 66dbae1..d27ff51 100644 --- a/content/blog/2020-running-an-ica-on-questionnaires/index.qmd +++ b/content/blog/2020-running-an-ica-on-questionnaires/index.qmd @@ -15,9 +15,11 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- -### Introduction +## Introduction If there was a statistics magazine and it were to have ads, the ad for independent component analysis (commonly referred to as the ICA) might read something like this: @@ -27,9 +29,13 @@ If there was a statistics magazine and it were to have ads, the ad for independe Obviously, there is more to it than this, but I don't have the time to talk about the statistical properties of ICA. There's many people who can explain this a hell of a lot better than I could anyway. I'm going to assume you've done your homework and already know that ICA is the right strategy to answer your research question and you just want to know how to implement it in R. That's what this tutorial is for. We'll run both a PCA and an ICA and visualize the results. -### Getting the data +## Getting the data -We'll use a dataset called the _Nerdy Personality Attributes Scale_ from the [Open-Source Psychometrics Project](https://openpsychometrics.org/tests/OSRI/development/). The dataset can be downloaded from [Kaggle](https://www.kaggle.com/lucasgreenwell/nerdy-personality-attributes-scale-responses/version/1?select=data.csv). In short, it's a questionnaire on "nerdiness". 
It aims to measure attributes of ones personality that reflect the popular understanding of "nerdiness". It is a series of questions to which participants are supposed to rank themselves on a Likert scale. The questionnaire has a high reliability, but the questionnaire isn't immune to valid criticisms. Here, we'll use this dataset to see if we can identify subtypes of the concept of "nerdiness" in our dataset. +{{{< sidenote br="2em" >}}} +The dataset is available for download on [Kaggle](https://www.kaggle.com/lucasgreenwell/nerdy-personality-attributes-scale-responses/version/1?select=data.csv) +{{{< /sidenote >}}} + +We'll use a dataset called the _Nerdy Personality Attributes Scale_ from the [Open-Source Psychometrics Project](https://openpsychometrics.org/tests/OSRI/development/). In short, it's a questionnaire on "nerdiness". It aims to measure attributes of ones personality that reflect the popular understanding of "nerdiness". It is a series of questions to which participants are supposed to rank themselves on a Likert scale. The questionnaire has a high reliability, but the questionnaire isn't immune to valid criticisms. Here, we'll use this dataset to see if we can identify subtypes of the concept of "nerdiness" in our dataset. The dataset in question consists of almost 20.000 individuals from 149 different countries. Is there any reliable way to ensure that the tests are filled in correctly? No, definitely not. Does that make it a unreliable dataset for scientific analysis? Probably. Both issues are perhaps slightly confounded by its sheer size, but the dataset serves our goal well, which is to run an ICA in R. We'll use the `{tidyverse}` package and the `{fastICA}` package. @@ -48,17 +54,21 @@ There are two files we need, one with the actual data (we'll call this `loaddata #| message: false #| warning: false -loaddata <- read_delim("data.csv", delim = "\t") |> - mutate(id = row_number()) +loaddata <- read_delim("./data/data.csv", delim = "\t") |> + rowid_to_column(var = "id") -loadcodes <- read_delim("codebook_clean.txt", delim = "\t", col_names = FALSE) |> - rename(qnum = X1, - question = X2) +loadcodes <- read_delim("./data/codebook_clean.txt", + delim = "\t", col_names = FALSE +) |> + rename( + qnum = X1, + question = X2 + ) ``` After cleaning, we still have more than 250.000 individual records left. This is great! Now we're going to have to do some preprocessing before we run the ICA. -### Preprocessing of the data +## Preprocessing of the data Next, we want to prune the data. We want to exclude questions that have a low degree of variance and we might want to remove or impute questions with a large number of `NA`s. Our first step is to get the data in a format that's easier to work with, in this case, the long format. We'll select all questions, and put the question number in a column called `question` and the values in a column called `score`. @@ -67,7 +77,10 @@ Next, we want to prune the data. We want to exclude questions that have a low de questdata <- loaddata |> select(starts_with("Q")) |> - pivot_longer(cols = everything(), names_to = "question", values_to = "score") + pivot_longer( + cols = everything(), + names_to = "question", values_to = "score" + ) ``` We'll first find if there's any unanswered questions. These are rows where `score` is `NA`. 
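The chunk that actually counts those missing answers isn't part of this diff; a minimal version, assuming the `questdata` object from the chunk above, could look like this:

``` r
# count missing answers per question (assumes questdata from above)
questdata |>
  group_by(question) |>
  summarise(n_missing = sum(is.na(score)), .groups = "drop") |>
  arrange(desc(n_missing))
```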
@@ -90,7 +103,7 @@ The next step is to find if there are questions with insufficient variation in a less15var <- questdata |> group_by(question, score) |> count() |> - group_by(question) |> + group_by(question) |> mutate(perc = n / sum(n)) |> arrange(question, -perc) |> slice(2) |> @@ -99,37 +112,39 @@ less15var <- questdata |> print(less15var) ``` -So no question has too little variance. If there was, I'd remove that question like so: - -```{r} -#| label: remove-low-var - -data <- loaddata |> - select(-less15var$question) -``` +So no question has too little variance. If there were, I'd remove those questions from the analysis. Since no question has less than 15% variance, we won't drop any questions and use all of them in the analysis. Next we want to normalize the data. Usually you'd do this to ensure that all answers on a questionnaire are in the same domain. Let's imagine a scenario where some questions are scored from 0 to 10, others are scored from 0 to 5, and a third is scored from 80 to 120. The ICA is then biased towards questions that have larger values, like the third question in the example above. That's why we want to normalize the data. The statistical term for this is z-score normalization. The defining property of z-scored transformed data is that the mean is 0 and the standard deviation is one. The z-score transformation is obtained by subtracting the mean from each individual value and then dividing by the standard deviation. This is implemented in R with the `scale()` function. We'll also check if it worked afterwards. +{{{< sidenote >}}} +In this case, the second line with the `select()` function doesn't remove anything, see paragraph above +{{{< /sidenote >}}} + ```{r} #| label: normalize-data -data <- data |> - pivot_longer(starts_with("Q"), names_to = "qnum", values_to = "score") |> +data <- loaddata |> + select(-less15var$question) |> + pivot_longer(starts_with("Q"), + names_to = "qnum", values_to = "score" + ) |> group_by(qnum) |> mutate(score_z = scale(score)) data |> ungroup() |> select(score_z) |> - summarise(mean = mean(score_z), - sd = sd(score_z), - min = min(score_z), - max = max(score_z)) + summarise( + mean = mean(score_z), + sd = sd(score_z), + min = min(score_z), + max = max(score_z) + ) ``` Great! Now we have a nice normalized dataset, without any missing values, and with questions that have a low degree of variance removed. All those preparations served to get us to a place where our clustering analyses are actually valid. Now it's time for the fun stuff! -### Run PCA +## Run PCA There's two clustering approaches we'll use. One will actually help us do the other. The ICA algorithm has is unsupervised, but does require us to tell it how many components we want to get out of the algorithm. It's complicated to calculate the ideal number of components up front, but we can use some standards. I usually use a combination of [icasso](https://research.ics.aalto.fi/ica/icasso/) and PCA. We'll first do a principal component analysis (PCA). We can calculate the eigenvalue of each component in the PCA. PCA components are organized in decreasing order of variance explained. The threshold I use is an eigenvalue of 1. The number of PCA components with an eigenvalue larger than 1 is possibly a good number of components to give the ICA. @@ -139,7 +154,10 @@ The PCA is implemented in the `prcomp()` function. 
This function doesn't accept #| label: run-pca mat <- data |> - pivot_wider(names_from = "qnum", values_from = "score_z", id_cols = "id") |> + pivot_wider( + names_from = "qnum", + values_from = "score_z", id_cols = "id" + ) |> select(starts_with("Q")) |> as.matrix() @@ -154,9 +172,11 @@ Then we can calculate a few other parameters. We want to calculate the variance pca_data$var <- pca_data$sdev^2 pca_data$varexpl <- pca_data$var / sum(pca_data$var) -pca_stats <- tibble(sdev = pca_data$sdev, - var = pca_data$var, - varexpl = pca_data$varexpl) +pca_stats <- tibble( + sdev = pca_data$sdev, + var = pca_data$var, + varexpl = pca_data$varexpl +) ``` We can visualize the variance captured in each component by creating a [scree plot](https://en.wikipedia.org/wiki/Scree_plot). This plot shows the components on the x-axis and the variance on the y-axis. Scree plots also typically include a horizontal line to indicate the eigenvalue of 1. Scree plots are typically interpreted based on the "elbow" in the plot, where the variance decreases. This can be a bit subjective. That's where the horizontal line comes in. @@ -164,12 +184,14 @@ We can visualize the variance captured in each component by creating a [scree pl ```{r} #| label: plot-var -ggplot(pca_stats, aes(x = seq(var), y = var)) + - geom_line(color = "grey30", size = 1) + - geom_point(color = "grey30", size = 2) + +ggplot(pca_stats, aes(x = seq(var), y = var)) + + geom_line(color = "grey30", linewidth = 1) + + geom_point(color = "grey30", size = 2) + geom_hline(yintercept = 1, linetype = "dashed") + - labs(x = "Principal component", - y = "Variance (eigenvalue)") + + labs( + x = "Principal component", + y = "Variance (eigenvalue)" + ) + theme_minimal() ``` @@ -178,13 +200,13 @@ It can be a bit hard to see from the plot how many components have an eigenvalue ```{r} #| label: get-n-ics -nICs <- pca_stats |> +n_ics <- pca_stats |> filter(var > 1) |> nrow() -print(nICs) +print(n_ics) ``` -### Run ICA +## Run ICA Now we're ready to run the ICA! We'll use the `fastICA()` function. The function has a bunch of inputs. We'll pick the parallel algorithm (as opposed to the "deflation" algorithm). In the parallel algorithm the components are extracted simultaneously, with the deflation algorithm they're calculated one at a time. The "fun" option defines the form of the entropy function. I'm not 100% sure what it does. Just set it to `"exp"` and move on. For the `"method"` option there are two options: `"R"` or `"C"`. In the first option, all analyses are run in R, in the second option, all code is run in C, which is slightly faster. I typically use the `"C"` option. The `maxit` option defines the maximum number of iterations to perform. The default is 200, but in complex datasets, it may take a larger number of iterations to converge. That's why I set it to 5000. This process may take a while. If you want to follow along with what the algorithm is doing, you can set the `verbose` option to `TRUE`. @@ -192,11 +214,14 @@ Now we're ready to run the ICA! We'll use the `fastICA()` function. The function #| label: run-ica set.seed(2020) -ica_model <- fastICA(mat, n.comp = nICs, alg.typ = "parallel", - fun = "exp", method = "C", maxit = 5000) + +ica_model <- fastICA(mat, + n.comp = n_ics, alg.typ = "parallel", + fun = "exp", method = "C", maxit = 5000 +) ``` -### Create Weight Matrix +## Create Weight Matrix So now we have run the ICA decomposition. 
The function provides a few outputs: @@ -226,7 +251,7 @@ Then we can also have a quick look at hierarchical clustering. This is a process #| label: show-hclust weight_matrix <- data.frame(t(ica_model$A)) -names(weight_matrix) <- paste0("IC",seq(weight_matrix)) +names(weight_matrix) <- str_glue("IC{seq(weight_matrix)}") dist_matrix <- dist(as.matrix(weight_matrix)) clust <- hclust(dist_matrix) @@ -235,24 +260,43 @@ plot(clust) What we can see from the dendrogram is that for instance question 16 (_"I gravitate towards introspection"_) and question 17 (_"I am more comfortable interacting online than in person"_) on the left are very similar. Question 3 (_"I like to play RPGs. (Ex. D&D)"_) is somewhat similar, but not as much as question 16 and 17. The same goes for question 1 and 4, question 11 and 19, and so on. +{{{< sidenote br="1.5em" >}}} +We'll also do some cleaning of the questions that will help us make the plot look nicer later +{{{< /sidenote >}}} + ```{r} #| label: select-qs codes <- loadcodes |> - filter(str_detect(qnum, "Q")) + filter(str_detect(qnum, "Q")) |> + mutate( + question = str_remove_all(question, "\t"), + question = str_remove(question, "\\.$") + ) ``` Next we'll plot the weight matrix. For that we first create a column with the question number, we'll then merge the matrix with the list of questions so we can plot the actual question asked instead of the question number. Next, we'll put the data frame in a long format. Finally, we'll also reorder the questions to match the order determined by the hierarchical clustering. +{{{< sidenote br="4em" >}}} +The `%>%` pipe is necessary here because the placeholder functions a bit differently from the R's native `|>` pipe +{{{< /sidenote >}}} + ```{r} #| label: create-plotdata weight_matrix_long <- weight_matrix |> mutate(qnum = str_glue("Q{row_number()}")) |> inner_join(codes, by = "qnum") |> - pivot_longer(cols = starts_with("IC"), names_to = "IC", values_to = "loading") %>% - mutate(question = as_factor(question), - question = fct_relevel(question, unique(.$question)[clust$order])) + pivot_longer( + cols = starts_with("IC"), + names_to = "IC", values_to = "loading" + ) %>% + mutate( + question = fct_relevel( + question, + unique(.$question)[clust$order] + ), + ) ``` So let's see what the ICA spit out. We'll make a plot where the questions are shown along the y-axis and where the x-axis shows their loading onto the different independent components. The orientation of the loading between the components does not have meaning, but within the components it can indicate that some questions have opposing effects. We'll reorder the x- and y-axis to order it based on the loading strength. @@ -261,18 +305,26 @@ So let's see what the ICA spit out. 
We'll make a plot where the questions are sh #| label: plot-wmatrix-nolabs #| out.width: 100% -ggplot(weight_matrix_long, aes(x = IC, y = question, fill = loading)) + - geom_tile() + - labs(x = NULL, - y = NULL, - fill = NULL) + - scale_fill_gradient2(low = "#FF729F", mid = "black", high = "#7EE8FA", limits = c(-1, 1), - guide = guide_colorbar(barwidth = 0.5, barheight = 15, ticks = FALSE)) + - theme_minimal(base_size = 8) + +weight_matrix_long |> + ggplot(aes(x = IC, y = question, fill = loading)) + + geom_tile() + + labs( + x = NULL, + y = NULL, + fill = NULL + ) + + scale_fill_gradient2( + low = "#FF729F", mid = "black", high = "#7EE8FA", + limits = c(-1, 1), + guide = guide_colorbar( + barwidth = 0.5, + barheight = 15, ticks = FALSE + ) + ) + + theme_minimal(base_size = 10) + theme( legend.position = "right", - panel.grid = element_blank(), - axis.text.x = element_text(size = 6), + panel.grid = element_blank() ) ``` diff --git a/content/blog/2021-amsterdam-housing-market/MVA_kwartaalcijfers.xlsx b/content/blog/2021-amsterdam-housing-market/data/MVA_kwartaalcijfers.xlsx similarity index 100% rename from content/blog/2021-amsterdam-housing-market/MVA_kwartaalcijfers.xlsx rename to content/blog/2021-amsterdam-housing-market/data/MVA_kwartaalcijfers.xlsx diff --git a/content/blog/2021-amsterdam-housing-market/data/data_merged.rds b/content/blog/2021-amsterdam-housing-market/data/data_merged.rds new file mode 100644 index 0000000..23f6364 --- /dev/null +++ b/content/blog/2021-amsterdam-housing-market/data/data_merged.rds @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9db1e1602cd03957db7e3dcbd267786036acafdf3da162e70a4277dc8298128d +size 21105 diff --git a/content/blog/2021-amsterdam-housing-market/data_merged.rds b/content/blog/2021-amsterdam-housing-market/data_merged.rds deleted file mode 100644 index 8d36d14..0000000 Binary files a/content/blog/2021-amsterdam-housing-market/data_merged.rds and /dev/null differ diff --git a/content/blog/2021-amsterdam-housing-market/index.md b/content/blog/2021-amsterdam-housing-market/index.md index 5e4c4fd..1851b73 100644 --- a/content/blog/2021-amsterdam-housing-market/index.md +++ b/content/blog/2021-amsterdam-housing-market/index.md @@ -14,11 +14,13 @@ execute: fig.show: hold results: hold dev.args: list(bg = "#EBEBEB") +editor_options: + chunk_output_type: console --- -

-For this post, I decided to hide the code chunks by default to improve legibility. You can click the Show code-button to expand the code. -

+ +{{< standout bg="#acc8d4" >}} +For this post, I decided to hide the code chunks by default to improve legibility. You can click on *Show code* to expand the code. +{{< /standout >}} ### Introduction I'm at the age where a lot of my peers start thinking about buying a home. It is a major conversation topic around the office, and these conversations are not often filled with optimism and excitement. Oslo is a notoriously expensive city to live in, and the housing market is very tough. -Of all the worrying events in 2020, Millenials and Gen Z kids ranked financial instability as the second most common driver of stress, the most common being the welfare of their family ([link](https://www2.deloitte.com/global/en/pages/about-deloitte/articles/millennialsurvey.html)). Today I want to dissect one of the possible causes of this *economic anxiety*: **the housing market**. Now, I'm not an economist nor a financial expert. Nonetheless, I believe I can leverage some of my knowledge of data wrangling and analysis to contribute a small piece to this topic. My insight into the Oslo housing market is fairly limited to so far and there's many nuances I don't fully understand yet, but I do think I have some relevant knowledge of the Dutch housing market. So today I'll dive into some data on the Dutch market, and particularly the Amsterdam housing market. There's countless stories about the Amsterdam housing market and how terrible it is for new buyers, similar to the Oslo housing market. However, in my opinion the Amsterdam housing market is already a few steps ahead of the Oslo market, and hopefully the Amsterdam market can offer some warnings about what the Oslo housing market might look like in a few years without some policy corrections. +{{< sidenote >}} +This conclusion is based on a survey from Deloitte in 2020 where they surveyed millennials and Gen Z kids both before and after the onset of the pandemic +{{< /sidenote >}} + +Of all the worrying events in 2020, millennials and Gen Z kids ranked financial instability as the second most common driver of stress, the most common being the welfare of their family ([source](https://www2.deloitte.com/content/dam/Deloitte/global/Documents/About-Deloitte/deloitte-2020-millennial-survey.pdf)). Today I want to dissect one of the possible causes of this *economic anxiety*: **the housing market**. Now, I'm not an economist nor a financial expert. Nonetheless, I believe I can leverage some of my knowledge of data wrangling and analysis to contribute a small piece to this topic. My insight into the Oslo housing market is fairly limited to so far and there's many nuances I don't fully understand yet, but I do think I have some relevant knowledge of the Dutch housing market. So today I'll dive into some data on the Dutch market, and particularly the Amsterdam housing market. There's countless stories about the Amsterdam housing market and how terrible it is for new buyers, similar to the Oslo housing market. However, in my opinion the Amsterdam housing market is already a few steps ahead of the Oslo market, and hopefully the Amsterdam market can offer some warnings about what the Oslo housing market might look like in a few years without some policy corrections. This will be a fairly basic data analysis and visualization post, I can't claim that this is an exhaustive list and that I didn't miss some nuances, but I'll give it . 
I collected some data from the Amsterdam Real Estate Association ([Makelaars Vereniging Amsterdam; MVA](https://www.mva.nl)) and statistics from the Central Bureau for Statistics ([Centraal Bureau for de Statistiek; CBS](https://www.cbs.nl)). As usual, I'll be using `{tidyverse}` *a lot*. I've also recently started using the `{ggtext}` package to manage the text elements in my plots, inspired by Cédric Scherer ([@CedScherer](https://twitter.com/CedScherer)). I'll use the `{gt}` package to spice up some of the tables, and `{patchwork}` for arranging plots. I'll use the `{cbsodataR}` to download data from the Central Bureau for Statistics ([CBS](https://www.cbs.nl)). @@ -60,27 +67,46 @@ font_add_google(name = "Nunito Sans", family = "nunito-sans") showtext_auto() theme_set(ggthemes::theme_economist(base_family = "nunito-sans") + - theme(rect = element_rect(fill = "#EBEBEB", color = "transparent"), - plot.background = element_rect(fill = "#EBEBEB", color = "transparent"), - panel.background = element_rect(fill = "#EBEBEB", color = "transparent"), - plot.title = element_textbox(margin = margin(0,0,5,0,"pt")), - plot.title.position = "plot", - plot.subtitle = element_textbox(hjust = 0, margin = margin(0,0,15,0,"pt")), - plot.caption = element_textbox(hjust = 1), - plot.caption.position = "plot", - axis.title.y = element_textbox(orientation = "left-rotated", face = "bold", - margin = margin(0,0,5,0,"pt")), - axis.text.y = element_text(hjust = 1), - legend.position = "bottom", - legend.box = "vertical", - legend.text = element_text(size = 10))) + theme( + rect = element_rect(fill = "#EBEBEB", color = "transparent"), + plot.background = element_rect( + fill = "#EBEBEB", + color = "transparent" + ), + panel.background = element_rect( + fill = "#EBEBEB", + color = "transparent" + ), + plot.title = element_textbox( + margin = margin(0, 0, 5, 0, "pt") + ), + plot.title.position = "plot", + plot.subtitle = element_textbox( + hjust = 0, + margin = margin(0, 0, 15, 0, "pt") + ), + plot.caption = element_textbox(hjust = 1), + plot.caption.position = "plot", + axis.title.y = element_textbox( + orientation = "left-rotated", face = "bold", + margin = margin(0, 0, 5, 0, "pt") + ), + axis.text.y = element_text(hjust = 1), + legend.position = "bottom", + legend.box = "vertical", + legend.text = element_text(size = 10) + )) ``` ### Getting the data -The first piece of data I'll use comes from the Amsterdam Real Estate Association. They publish quarterly data on a number of variables about the Amsterdam housing market ([link](https://www.mva.nl/over-de-mva/mva/kwartaalcijfers)), inclusing asking price, final price paid, number of properties put on the market, number of properties sold and a few more back to the first quarter of 2012. *Obviously*, these numbers all come in pdf-format, because the people writing quarterly reports apparently have a *massive hatred* towards people that want to analyze this data. I downloaded the reports, used the online tool [PDFTables](https://pdftables.com) to convert them to Excel, and then stitched the tables together manually. Of course (remember the authors have a *massive hatred* towards us), the formatting of the numbers in the tables weren't consistent between different quarterly reports, so I had to do some cleaning in R. I put each table in a separate sheet and then used functionality from the `{readxl}` package to load each table into a different variable and then do the cleaning per table. This was a bit cumbersome. 
Since this will probably be a very long post, I'll hide the code, but you can see it in the [Rmarkdown file](https://github.com/danielroelfs/danielroelfs.com/blob/main/content/blog/2020-running-an-ica-on-questionnaires/index.Rmd) (at least until I figure out how to get code folding to work on my laptop). +{{< sidenote br="10em" >}} +There are [R packages](https://docs.ropensci.org/tabulizer/) that can parse PDF files, but in my experience they can be clunky. In this case the simplest solution seemed the best, despite requiring some manual work +{{< /sidenote >}} + +The first piece of data I'll use comes from the Amsterdam Real Estate Association. They publish quarterly data on a number of variables about the Amsterdam housing market ([link](https://www.mva.nl/over-de-mva/mva/kwartaalcijfers)), inclusing asking price, final price paid, number of properties put on the market, number of properties sold and a few more back to the first quarter of 2012. *Obviously*, these numbers all come in pdf-format, because the people writing quarterly reports apparently have a *massive hatred* towards people that want to analyze this data. I downloaded the reports, used the online tool [PDFTables](https://pdftables.com) to convert them to Excel, and then stitched the tables together manually. Of course (remember the authors have a *massive hatred* towards us), the formatting of the numbers in the tables weren't consistent between different quarterly reports, so I had to do some cleaning in R. I put each table in a separate sheet and then used functionality from the `{readxl}` package to load each table into a different variable and then do the cleaning per table. This was a bit cumbersome. You can look at the code I used to load and merge the files. It's a bit of a mess: @@ -88,61 +114,135 @@ You can look at the code I used to load and merge the files. 
It's a bit of a mes Show code ``` r -asking_price <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 2) |> - janitor::clean_names() |> - mutate(type_woning = as_factor(type_woning), - across(where(is.double), ~ .x * 1e3), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> - pivot_longer(starts_with("x"), names_to = "date", values_to = "asking_price") - -transaction_price <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 3) |> - janitor::clean_names() |> - mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> - pivot_longer(starts_with("x"), names_to = "date", values_to = "transaction_price") - -price_per_m2 <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 4) |> - janitor::clean_names() |> - mutate(type_woning = as_factor(type_woning), - across(where(is.double), ~ .x * 1e3), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> - pivot_longer(starts_with("x"), names_to = "date", values_to = "price_per_m2") - -n_offered <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 5) |> - janitor::clean_names() |> - mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> - pivot_longer(starts_with("x"), names_to = "date", values_to = "n_offered") - -n_sold <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 6) |> - janitor::clean_names() |> - mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> - pivot_longer(starts_with("x"), names_to = "date", values_to = "n_sold") - -mortgage_months <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 7) |> - janitor::clean_names() |> - mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> - pivot_longer(starts_with("x"), names_to = "date", values_to = "mortgage_months") - -tightness_index <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 8) |> - janitor::clean_names() |> - mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_replace_all(.x, ",", ".")))) |> - pivot_longer(starts_with("x"), names_to = "date", values_to = "tightness_index") - -data_merged <- inner_join(asking_price, transaction_price) |> - inner_join(price_per_m2) |> - inner_join(n_offered) |> - inner_join(n_sold) |> - inner_join(mortgage_months) |> - inner_join(tightness_index) |> - mutate(asking_price = ifelse(asking_price < 1e5, asking_price * 1e3, asking_price), - transaction_price = ifelse(transaction_price < 1e5, transaction_price * 1e3, transaction_price), - price_per_m2 = ifelse(price_per_m2 > 1e4, price_per_m2 / 1e3, price_per_m2)) - -write_rds(data_merged, "data_merged.rds") +asking_price <- readxl::read_xlsx("./data/MVA_kwartaalcijfers.xlsx", + sheet = 2 +) |> + janitor::clean_names() |> + mutate( + type_woning = as_factor(type_woning), + across(where(is.double), ~ .x * 1e3), + across( + where(is.character), + ~ parse_number(str_remove_all(.x, fixed(" "))) + ) + ) |> + pivot_longer(starts_with("x"), + names_to = "date", values_to = "asking_price" + ) + +transaction_price <- readxl::read_xlsx("./data/MVA_kwartaalcijfers.xlsx", + sheet = 3 +) |> + janitor::clean_names() |> + mutate( + type_woning = as_factor(type_woning), + across( + where(is.character), + ~ parse_number(str_remove_all(.x, fixed(" "))) + ) + ) |> + pivot_longer(starts_with("x"), + names_to = "date", 
values_to = "transaction_price" + ) + +price_per_m2 <- readxl::read_xlsx("./data/MVA_kwartaalcijfers.xlsx", + sheet = 4 +) |> + janitor::clean_names() |> + mutate( + type_woning = as_factor(type_woning), + across(where(is.double), ~ .x * 1e3), + across( + where(is.character), + ~ parse_number(str_remove_all(.x, fixed(" "))) + ) + ) |> + pivot_longer(starts_with("x"), + names_to = "date", values_to = "price_per_m2" + ) + +n_offered <- readxl::read_xlsx("./data/MVA_kwartaalcijfers.xlsx", + sheet = 5 +) |> + janitor::clean_names() |> + mutate( + type_woning = as_factor(type_woning), + across( + where(is.character), + ~ parse_number(str_remove_all(.x, fixed(" "))) + ) + ) |> + pivot_longer(starts_with("x"), + names_to = "date", values_to = "n_offered" + ) + +n_sold <- readxl::read_xlsx("./data/MVA_kwartaalcijfers.xlsx", + sheet = 6 +) |> + janitor::clean_names() |> + mutate( + type_woning = as_factor(type_woning), + across( + where(is.character), + ~ parse_number(str_remove_all(.x, fixed(" "))) + ) + ) |> + pivot_longer(starts_with("x"), + names_to = "date", values_to = "n_sold" + ) + +mortgage_months <- readxl::read_xlsx("./data/MVA_kwartaalcijfers.xlsx", + sheet = 7 +) |> + janitor::clean_names() |> + mutate( + type_woning = as_factor(type_woning), + across( + where(is.character), + ~ parse_number(str_remove_all(.x, fixed(" "))) + ) + ) |> + pivot_longer(starts_with("x"), + names_to = "date", values_to = "mortgage_months" + ) + +tightness_index <- readxl::read_xlsx("./data/MVA_kwartaalcijfers.xlsx", + sheet = 8 +) |> + janitor::clean_names() |> + mutate( + type_woning = as_factor(type_woning), + across( + where(is.character), + ~ parse_number(str_replace_all(.x, ",", ".")) + ) + ) |> + pivot_longer(starts_with("x"), + names_to = "date", values_to = "tightness_index" + ) + +data_merged <- inner_join(asking_price, transaction_price) |> + inner_join(price_per_m2) |> + inner_join(n_offered) |> + inner_join(n_sold) |> + inner_join(mortgage_months) |> + inner_join(tightness_index) |> + mutate( + asking_price = ifelse(asking_price < 1e5, + asking_price * 1e3, + asking_price + ), + transaction_price = ifelse(transaction_price < 1e5, + transaction_price * 1e3, + transaction_price + ), + price_per_m2 = ifelse(price_per_m2 > 1e4, + price_per_m2 / 1e3, + price_per_m2 + ) + ) + +write_rds(data_merged, "./data/data_merged.rds") ``` @@ -153,7 +253,7 @@ Let's have a look at the dataset. Show code ``` r -data_merged <- read_rds("data_merged.rds") +data_merged <- read_rds("./data/data_merged.rds") glimpse(data_merged) ``` @@ -178,28 +278,34 @@ From this dataset, I want to create a few new variables. 
I want to create a date Show code ``` r -data <- data_merged |> - rename(type = type_woning) |> - mutate(quarter = str_extract(date, pattern = "x(.*?)e"), - quarter = parse_number(quarter), - year = str_extract(date, pattern = "kw_(.*)"), - year = parse_number(year), - date = as.Date(str_glue("{year}-{(quarter * 3)}-01")), - diff_ask_paid = transaction_price - asking_price, - diff_ask_paid_perc = diff_ask_paid / asking_price, - diff_offered_sold = n_offered - n_sold, - diff_offered_sold_perc = diff_offered_sold / n_offered, - perc_sold = n_sold / n_offered, - type = case_when(str_detect(type,"Totaal") ~ "Total", - str_detect(type,"<= 1970") ~ "Apartments (pre-1970)", - str_detect(type,"> 1970") ~ "Apartments (post-1970)", - str_detect(type,"Tussenwoning") ~ "Terraced house", - str_detect(type,"Hoekwoning") ~ "Corner house", - str_detect(type,"Vrijstaand") ~ "Detached house", - str_detect(type,"2-onder-1-kap") ~ "Semi-detached house"), - type = factor(type, levels = c("Apartments (pre-1970)","Apartments (post-1970)", - "Terraced house","Corner house","Detached house", - "Semi-detached house","Total"))) |> +data <- data_merged |> + rename(type = type_woning) |> + mutate( + quarter = str_extract(date, pattern = "x(.*?)e"), + quarter = parse_number(quarter), + year = str_extract(date, pattern = "kw_(.*)"), + year = parse_number(year), + date = as.Date(str_glue("{year}-{(quarter * 3)}-01")), + diff_ask_paid = transaction_price - asking_price, + diff_ask_paid_perc = diff_ask_paid / asking_price, + diff_offered_sold = n_offered - n_sold, + diff_offered_sold_perc = diff_offered_sold / n_offered, + perc_sold = n_sold / n_offered, + type = case_when( + str_detect(type, "Totaal") ~ "Total", + str_detect(type, "<= 1970") ~ "Apartments (pre-1970)", + str_detect(type, "> 1970") ~ "Apartments (post-1970)", + str_detect(type, "Tussenwoning") ~ "Terraced house", + str_detect(type, "Hoekwoning") ~ "Corner house", + str_detect(type, "Vrijstaand") ~ "Detached house", + str_detect(type, "2-onder-1-kap") ~ "Semi-detached house" + ), + type = factor(type, levels = c( + "Apartments (pre-1970)", "Apartments (post-1970)", + "Terraced house", "Corner house", "Detached house", + "Semi-detached house", "Total" + )) + ) |> glimpse() ``` @@ -224,39 +330,58 @@ data <- data_merged |> $ diff_offered_sold_perc 0.848854962, 0.837431694, 0.850767085, 0.786610… $ perc_sold 0.1511450, 0.1625683, 0.1492329, 0.2133891, 0.1… -The first thing that seems interesting to do is to plot the percentage difference between the asking price and the price paid. This will give us an indication of the trend in overpaying on different types of properties. I'll use a color palette and legend design I again shamelessly stole from Cédric Scherer ([@CedScherer](https://twitter.com/CedScherer)). I'll use a simple line graph to visualize the percentage overpay. +{{< sidenote br="1em" >}} +I'll use a color palette and legend design I shamelessly stole from [Cédric Scherer](https://twitter.com/CedScherer) +{{< /sidenote >}} + +The first thing that seems interesting to do is to plot the percentage difference between the asking price and the price paid. This will give us an indication of the trend in overpaying on different types of properties. Let's use a simple line graph to visualize the percentage overpay.
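As a side note on the chunk above, the quarter-to-date trick is easiest to see on made-up values: quarter 1 becomes a March date, quarter 4 a December date. A minimal sketch (toy values, not the MVA data):

``` r
# toy values: turn a year/quarter pair into the last month of that quarter
tibble::tibble(year = 2021, quarter = 1:4) |>
  dplyr::mutate(date = as.Date(stringr::str_glue("{year}-{quarter * 3}-01")))
```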
Show code ``` r -colors <- c("#019868","#9dd292","#ec0b88","#651eac","#e18a1e","#2b7de5") - -data |> - filter(type != "Total") |> - group_by(type) |> - mutate(n_total = sum(n_sold), - type_label = str_glue("{type} (n={format(n_total, big.mark = \".\", decimal.mark = \",\")})"), - type_label = str_replace(type_label,"\\) \\(", ", ")) |> - arrange(type) |> - mutate(type_label = factor(type_label)) |> - ggplot(aes(x = date, y = diff_ask_paid_perc, color = type_label)) + +colors <- c("#019868", "#9dd292", "#ec0b88", "#651eac", "#e18a1e", "#2b7de5") + +data |> + filter(type != "Total") |> + group_by(type) |> + mutate( + n_total = sum(n_sold), + type_label = str_glue( + "{type} (n={format(n_total, big.mark = \".\", decimal.mark = \",\")})" + ), + type_label = str_replace(type_label, "\\) \\(", ", ") + ) |> + arrange(type) |> + mutate(type_label = factor(type_label)) |> + ggplot(aes(x = date, y = diff_ask_paid_perc, color = type_label)) + geom_hline(yintercept = 0, color = "grey30", size = 1) + - geom_line(size = 1.2, alpha = 0.8, lineend = "round", key_glyph = "point") + - labs(title = "Overbidding has become the new normal", - subtitle = "_Paying as much as 5% over asking price is common the past few years_", - x = NULL, - y = "Percentage difference between\nasking price and price paid", - color = NULL, - caption = "_**Data**: MVA_") + - scale_y_continuous(labels = scales::label_percent()) + - scale_color_manual(values = colors, - guide = guide_legend(title.position = "top", title.hjust = 0.5, nrow = 2, - label.position = "right", - override.aes = list(fill = "transparent", size = 6, alpha = 1))) + - theme(plot.title = element_textbox(size = 20), - axis.title.y = element_textbox(width = grid::unit(2.5, "in")), - legend.key = element_rect(fill = "transparent", color = "transparent")) + geom_line( + size = 1.2, alpha = 0.8, + lineend = "round", key_glyph = "point" + ) + + labs( + title = "Overbidding has become the new normal", + subtitle = "_Paying as much as 5% over asking price is common the past few years_", + x = NULL, + y = "Percentage difference between\nasking price and price paid", + color = NULL, + caption = "_**Data**: MVA_" + ) + + scale_y_continuous(labels = scales::label_percent()) + + scale_color_manual( + values = colors, + guide = guide_legend( + title.position = "top", title.hjust = 0.5, nrow = 2, + label.position = "right", + override.aes = list(fill = "transparent", size = 6, alpha = 1) + ) + ) + + theme( + plot.title = element_textbox(size = 20), + axis.title.y = element_textbox(width = grid::unit(2.5, "in")), + legend.key = element_rect(fill = "transparent", color = "transparent") + ) ```
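For what it's worth, the `scales::label_percent()` call above returns a labelling function rather than labels; a one-liner shows what it does to a few overpay-style fractions:

``` r
# label_percent() returns a function that formats proportions as percentages
scales::label_percent()(c(-0.06, 0, 0.05))
```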
@@ -269,27 +394,40 @@ Prior to 2014, most properties in Amsterdam were sold at about 6% below asking p Show code ``` r -data |> - filter(type != "Total", - date == max(date)) |> - select(type, diff_ask_paid_perc, diff_ask_paid, transaction_price, n_sold) |> - arrange(-diff_ask_paid_perc) |> - gt() |> - cols_align(columns = "type", align = "left") |> - fmt_percent(columns = "diff_ask_paid_perc") |> - fmt_currency(columns = "diff_ask_paid", currency = "EUR", decimals = 0, sep_mark = " ") |> - fmt_currency(columns = "transaction_price", currency = "EUR", decimals = 0, sep_mark = " ") |> - fmt_number(columns = "n_sold", sep_mark = " ", drop_trailing_zeros = TRUE) |> +data |> + filter( + type != "Total", + date == max(date) + ) |> + select(type, diff_ask_paid_perc, diff_ask_paid, transaction_price, n_sold) |> + arrange(-diff_ask_paid_perc) |> + gt() |> + cols_align(columns = "type", align = "left") |> + fmt_percent(columns = "diff_ask_paid_perc") |> + fmt_currency( + columns = "diff_ask_paid", currency = "EUR", + decimals = 0, sep_mark = " " + ) |> + fmt_currency( + columns = "transaction_price", currency = "EUR", + decimals = 0, sep_mark = " " + ) |> + fmt_number( + columns = "n_sold", sep_mark = " ", + drop_trailing_zeros = TRUE + ) |> cols_label( type = html("
Property type"),
    diff_ask_paid_perc = html("Percentage overpay"),
    diff_ask_paid = html("Mean overpay"),
    transaction_price = html("Mean transaction price"),
    n_sold = html("Number of properties<br>sold in period")
-  ) |>
-  tab_header(title = html("Difference between<br>asking price and price paid"),
-             subtitle = html("Data from the first quarter of 2021")) |>
-  tab_source_note(source_note = md("_**Data**: MVA_")) |>
+  ) |>
+  tab_header(
+    title = html("Difference between<br>asking price and price paid"),
+    subtitle = html("Data from the first quarter of 2021
") + ) |> + tab_source_note(source_note = md("_**Data**: MVA_")) |> tab_options(table.background.color = "#EBEBEB") |> gtsave("overpay-table.png", expand = 0) ``` @@ -302,42 +440,69 @@ What contributed to this price increase? A simple supply-and-demand plays a part Show code ``` r -data |> - filter(type != "Total") |> - group_by(type) |> - mutate(n_total = sum(n_sold), - type_label = str_glue("{type} (n={format(n_total, big.mark = \".\", decimal.mark = \",\")})"), - type_label = str_replace(type_label,"\\) \\(", ", ")) |> - arrange(type) |> - mutate(type_label = factor(type_label)) |> - ggplot(aes(x = date, y = tightness_index, color = type_label)) + - geom_rect(data = tibble(), - aes(xmin = as.Date(-Inf), xmax = as.Date(Inf), ymin = c(0,5,10), ymax = c(5,10,Inf), - fill = c("Sellers market", "Balanced market", "Buyers market")), - color = "transparent", alpha = 0.2, key_glyph = "point", inherit.aes = FALSE) + +data |> + filter(type != "Total") |> + group_by(type) |> + mutate( + n_total = sum(n_sold), + type_label = str_glue( + "{type} (n={format(n_total, big.mark = \".\", decimal.mark = \",\")})" + ), + type_label = str_replace(type_label, "\\) \\(", ", ") + ) |> + arrange(type) |> + mutate(type_label = factor(type_label)) |> + ggplot(aes(x = date, y = tightness_index, color = type_label)) + + geom_rect( + data = tibble(), + aes( + xmin = as.Date(-Inf), xmax = as.Date(Inf), + ymin = c(0, 5, 10), ymax = c(5, 10, Inf), + fill = c("Sellers market", "Balanced market", "Buyers market") + ), + color = "transparent", alpha = 0.2, key_glyph = "point", inherit.aes = FALSE + ) + geom_hline(yintercept = 0, color = "grey30", size = 1) + - geom_line(size = 1.2, alpha = 0.8, lineend = "round", key_glyph = "point") + - labs(title = "Amsterdam has had a sellers market for nearly 5 years", - x = NULL, - y = "Indicator of _\"density\"_ on the housing market", - color = NULL, - fill = "Type of market:", - caption = "_**Data**: MVA_") + - scale_x_date(expand = c(0,0)) + - scale_y_continuous(trans = "reverse", expand = c(0,0)) + - scale_color_manual(values = colors, - guide = guide_legend(title.position = "top", title.hjust = 0.5, nrow = 2, - label.position = "right", order = 1, - override.aes = list(fill = "transparent", size = 6, alpha = 1))) + - scale_fill_manual(values = c("#F5E000","#00B300","#D1000E"), - limits = c("Buyers market","Balanced market","Sellers market"), - guide = guide_legend(order = 2, override.aes = list(shape = 21, size = 6, alpha = 1, stroke = 0))) + + geom_line(size = 1.2, alpha = 0.8, lineend = "round", key_glyph = "point") + + labs( + title = "Amsterdam has had a sellers market for nearly 5 years", + x = NULL, + y = "Indicator of _\"density\"_ on the housing market", + color = NULL, + fill = "Type of market:", + caption = "_**Data**: MVA_" + ) + + scale_x_date(expand = c(0, 0)) + + scale_y_continuous(trans = "reverse", expand = c(0, 0)) + + scale_color_manual( + values = colors, + guide = guide_legend( + title.position = "top", title.hjust = 0.5, nrow = 2, + label.position = "right", order = 1, + override.aes = list(fill = "transparent", size = 6, alpha = 1) + ) + ) + + scale_fill_manual( + values = c("#F5E000", "#00B300", "#D1000E"), + limits = c("Buyers market", "Balanced market", "Sellers market"), + guide = guide_legend( + order = 2, + override.aes = list( + shape = 21, size = 6, + alpha = 1, stroke = 0 + ) + ) + ) + coord_cartesian(clip = "off") + - theme(plot.title = element_textbox(size = 16), - plot.subtitle = element_textbox(size = 10), - axis.title.y = 
element_textbox(orientation = "left-rotated", - width = grid::unit(2, "in")), - legend.key = element_rect(fill = "transparent")) + theme( + plot.title = element_textbox(size = 16), + plot.subtitle = element_textbox(size = 10), + axis.title.y = element_textbox( + orientation = "left-rotated", + width = grid::unit(2, "in") + ), + legend.key = element_rect(fill = "transparent") + ) ``` @@ -346,36 +511,53 @@ data |> So, there's a lot of competition among buyers, and people looking to sell their houses can expect to be paid more than they anticipated. Dozens of buyers compete for the same properties, driving up the price. The figure below shows the percentage of properties sold compared to the number of properties offered. It's clear that after the 2008 housing bubble crisis, the housing market was still recovering in 2012 and 2013. However, since 2016, more apartments were sold than were put on the market. This means that the number of properties available for the growing number of people wanting to move to Amsterdam is decreasing. This decreases supply in a time with increasing demand, thus pushing the prices higher twice over. -

-A new phenomenon that entered the scene a little while ago may indicate how skewed the market is. People wanting to buy a house in a certain neighborhood will put notes and letters in the mailboxes of people living in that area saying "Nice house! Are you interested in selling it to me?". This is now not an uncommon strategy to find a house. Some people living in popular neighborshoods are inundated with notes from agencies facilitating these practices. See also a news item by RTL. -

+{{< standout bg="#acc8d4" >}} +A new phenomenon that entered the scene a little while ago may indicate how skewed the market is. People wanting to buy a house in a certain neighborhood will put notes and letters in the mailboxes of people living in that area saying *"Nice house! Are you interested in selling it to me?"*. This is now not an uncommon strategy to find a house. Some people living in popular neighborhoods are inundated with notes from agencies facilitating these practices. See also a [news item by RTL](https://www.rtlnieuws.nl/geld-en-werk/artikel/19001/droomhuis-gezien-maar-niet-te-koop-stop-dan-een-briefje-de-bus) +{{< /standout >}} +
Show code ``` r -data |> - filter(type != "Total") |> - ggplot(aes(x = date, y = perc_sold, fill = type)) + - geom_col(key_glyph = "point") + +data |> + filter(type != "Total") |> + ggplot(aes(x = date, y = perc_sold, fill = type)) + + geom_col(key_glyph = "point") + geom_hline(yintercept = 1, linetype = "dashed", color = "grey20") + - labs(title = "Some years, more houses are sold than are being put on the market", - x = NULL, - y = "Percentage of offered houses sold", - fill = NULL, - caption = "_**Data**: MVA_") + + labs( + title = "Some years, more houses are sold than are being put on the market", + x = NULL, + y = "Percentage of offered houses sold", + fill = NULL, + caption = "_**Data**: MVA_" + ) + coord_cartesian(clip = "off") + - scale_y_continuous(labels = scales::label_percent(), expand = c(0,NA), n.breaks = 4) + - scale_fill_manual(values = colors, - guide = guide_legend(title.position = "top", title.hjust = 0.5, nrow = 2, - label.position = "right", - override.aes = list(shape = 21, size = 6, alpha = 1, stroke = 0))) + - theme(plot.title = element_textbox(size = 20, - width = grid::unit(6,"in")), - plot.subtitle = element_textbox(size = 10), - strip.text.x = element_markdown(face = "bold", padding = margin(10,0,5,0,"pt")), - strip.background = element_rect(fill = "transparent"), - legend.key = element_rect(fill = "transparent")) + - facet_wrap(~ type, strip.position = "top", scales = "free_x") + scale_y_continuous( + labels = scales::label_percent(), + expand = c(0, NA), n.breaks = 4 + ) + + scale_fill_manual( + values = colors, + guide = guide_legend( + title.position = "top", title.hjust = 0.5, nrow = 2, + label.position = "right", + override.aes = list(shape = 21, size = 6, alpha = 1, stroke = 0) + ) + ) + + theme( + plot.title = element_textbox( + size = 20, + width = grid::unit(6, "in") + ), + plot.subtitle = element_textbox(size = 10), + strip.text.x = element_markdown( + face = "bold", + padding = margin(10, 0, 5, 0, "pt") + ), + strip.background = element_rect(fill = "transparent"), + legend.key = element_rect(fill = "transparent") + ) + + facet_wrap(~type, strip.position = "top", scales = "free_x") ```
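As a small aside, the shaded bands in the tightness-index figure a little further up correspond to fixed cut-offs (roughly: an index below 5 indicates a sellers market, 5-10 a balanced market, and above 10 a buyers market). A minimal sketch of how each quarter could be labelled with those same cut-offs, again assuming the `data` object from before:

``` r
# label each quarter with the market type implied by the tightness index,
# using the same cut-offs as the shaded bands in the tightness-index figure
data |>
  filter(type != "Total") |>
  mutate(
    market_type = cut(
      tightness_index,
      breaks = c(0, 5, 10, Inf),
      labels = c("Sellers market", "Balanced market", "Buyers market"),
      right = FALSE
    )
  ) |>
  count(type, market_type)
```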
@@ -389,55 +571,88 @@ This adds fuel to the fire. I guess I'm trying to show that there are a number o ``` r decades <- tibble( - x1 = seq(1920,2020,10), + x1 = seq(1920, 2020, 10), x2 = x1 + 10 -) |> - slice(seq(1,1e2,2)) |> +) |> + slice(seq(1, 1e2, 2)) |> filter(x2 < 2030) -hist_data_homes <- cbsodataR::cbs_get_data("82235NED") |> - janitor::clean_names() |> - rename(stock_end = eindstand_voorraad_8) |> - mutate(year = parse_number(perioden), - stock_end = stock_end * 1e3, - diff = stock_end - lag(stock_end)) +hist_data_homes <- cbsodataR::cbs_get_data("82235NED") |> + janitor::clean_names() |> + rename(stock_end = eindstand_voorraad_8) |> + mutate( + year = parse_number(perioden), + stock_end = stock_end * 1e3, + diff = stock_end - lag(stock_end) + ) -total_hist_plot <- hist_data_homes |> +total_hist_plot <- hist_data_homes |> ggplot(aes(x = year, y = stock_end)) + - geom_rect(data = tibble(), aes(xmin = 1940.01, xmax = 1945, ymin = -Inf, ymax = Inf), - fill = "red", alpha = 0.2, inherit.aes = FALSE) + - geom_text(data = tibble(), aes(x = 1942.5, y = max(hist_data_homes$stock_end, na.rm = TRUE), vjust = 0, label = "WWII"), size = 3, - fontface = "bold", family = "nunito-sans", inherit.aes = FALSE) + - geom_rect(data = decades, aes(xmin = x1, xmax = x2, ymin = -Inf, ymax = Inf), - fill = "grey30", alpha = 0.2, color = "transparent", inherit.aes = FALSE) + + geom_rect( + data = tibble(), aes(xmin = 1940.01, xmax = 1945, ymin = -Inf, ymax = Inf), + fill = "red", alpha = 0.2, inherit.aes = FALSE + ) + + geom_text( + data = tibble(), + aes(x = 1942.5, y = max(hist_data_homes$stock_end, na.rm = TRUE)), + label = "WWII", fontface = "bold", family = "nunito-sans", + vjust = 0, size = 3, inherit.aes = FALSE + ) + + geom_rect( + data = decades, aes(xmin = x1, xmax = x2, ymin = -Inf, ymax = Inf), + fill = "grey30", alpha = 0.2, color = "transparent", inherit.aes = FALSE + ) + geom_line(color = "darkred", size = 1.5, lineend = "round") + - labs(x = NULL, - y = "Total number of homes") + - scale_x_continuous(expand = c(0,0), breaks = c(decades$x1,decades$x2,2020)) + + labs( + x = NULL, + y = "Total number of homes" + ) + + scale_x_continuous( + expand = c(0, 0), + breaks = c(decades$x1, decades$x2, 2020) + ) + scale_y_continuous(labels = scales::label_number()) + coord_cartesian(clip = "off") -diff_hist_plot <- hist_data_homes |> +diff_hist_plot <- hist_data_homes |> ggplot(aes(x = year, y = diff)) + geom_hline(yintercept = 0.5, color = "grey30", size = 1) + - geom_rect(data = tibble(), aes(xmin = 1940.01, xmax = 1945, ymin = -Inf, ymax = Inf), - fill = "red", alpha = 0.2, inherit.aes = FALSE) + - geom_text(data = tibble(), aes(x = 1942.5, y = max(hist_data_homes$diff, na.rm = TRUE), vjust = 0, label = "WWII"), size = 3, - fontface = "bold", family = "nunito-sans", inherit.aes = FALSE) + - geom_rect(data = decades, aes(xmin = x1, xmax = x2, ymin = -Inf, ymax = Inf), - fill = "grey30", alpha = 0.2, color = "transparent", inherit.aes = FALSE) + + geom_rect( + data = tibble(), aes(xmin = 1940.01, xmax = 1945, ymin = -Inf, ymax = Inf), + fill = "red", alpha = 0.2, inherit.aes = FALSE + ) + + geom_text( + data = tibble(), aes( + x = 1942.5, + y = max(hist_data_homes$diff, na.rm = TRUE), + vjust = 0, label = "WWII" + ), + size = 3, fontface = "bold", + family = "nunito-sans", inherit.aes = FALSE + ) + + geom_rect( + data = decades, aes(xmin = x1, xmax = x2, ymin = -Inf, ymax = Inf), + fill = "grey30", alpha = 0.2, color = "transparent", inherit.aes = FALSE + ) + geom_line(color = "grey30", alpha = 0.5, 
size = 1.5, lineend = "round") + geom_smooth(color = "darkred", size = 1.5, se = FALSE) + - labs(x = NULL, - y = "Net homes added per year") + - scale_x_continuous(expand = c(0,0), breaks = c(decades$x1,decades$x2,2020)) + + labs( + x = NULL, + y = "Net homes added per year" + ) + + scale_x_continuous( + expand = c(0, 0), + breaks = c(decades$x1, decades$x2, 2020) + ) + scale_y_continuous(labels = scales::label_number_auto()) + coord_cartesian(clip = "off") total_hist_plot / diff_hist_plot + - plot_annotation(title = "Number of houses available", - subtitle = "Data covers development in the Netherlands nationwide", - caption = "**Data**: CBS") & + plot_annotation( + title = "Number of houses available", + subtitle = "Data covers development in the Netherlands nationwide", + caption = "**Data**: CBS" + ) & theme(plot.title = element_textbox(size = 20)) ``` @@ -449,6 +664,10 @@ The figure displays data from from the CBS through the `{cbsodataR}` package. It It's also important to mention that between the 60s and 70s, the Netherlands started building entire cities from scratch on newly-reclaimed land from the sea. The province of [Flevoland](https://en.wikipedia.org/wiki/Flevoland) is currently home to about 415 000 people, and until the early 50s this province was non-existent, the current land was at the bottom of a bay of the North sea (called the Zuiderzee or "Southern Sea" in English). Other than a general enthusiasm for building, I think this contributed considerably to the increase in the number of homes added in the 60s and 70s. This new province has good access to Amsterdam, and if it weren't for this new piece of land, the shortage might have peaked earlier. +{{< sidenote >}} +There's actually quite a few more, but I'll focus on the ones I can quantify at the moment +{{< /sidenote >}} + But that's not all, there's a few other features that contribute to the gridlock. See, not only young people are on the market to buy apartments in Amsterdam. There's a thriving market for investors looking to take advantage of the rising prices in the Amsterdam housing market (source: [Algemeen Dagblad](https://www.ad.nl/wonen/beleggers-verwerven-massaal-koophuizen-voor-verhuur-in-amsterdam~afedc50c6/)). According to the Dutch central bank, about 1 in 5 properties are sold to investors, who are mostly looking to convert it to a rental property or flip it for a profit. I couldn't find the data the Dutch central bank relied on, but I found something else. The Central Bureau for Statistics collects data on the number of "mutations" among properties. The "mutation" in this case refers to the change of purpose of a property, e.g. if a house meant for sale is bought and then transformed to a rental property by either private individuals or corporations, or vice versa. I collected this data from the governmental yearly report on the housing market of 2020 ("*Staat van de Woningmarkt - Jaarrapportage 2020*", [link](https://www.rijksoverheid.nl/documenten/rapporten/2020/06/15/staat-van-de-woningmarkt-jaarrapportage-2020)). Instead of per quarter, this data is reported per year. Unfortunately, the data on housing mutations in the report (from 2020 mind you) only goes until 2017. It's important to note that these numbers are country-wide, not specific for Amsterdam. That means there could be some other factors in play. Many of these trends are present across the country, but they're massively amplified in the larger cities. 
The data was contained in a pdf that wasn't easily machine-readable, so I had to manually copy the numbers into a tibble, which was great...
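For reference, this is roughly how the raw text of the report could be pulled in programmatically with the `{pdftools}` package; the file name below is just a placeholder, and the extracted lines would still need a fair amount of manual parsing, which is why I ended up copying the numbers by hand anyway:

``` r
library(pdftools)
library(stringr)

# extract the raw text of the report, one character string per page
# (the file name is a placeholder for the downloaded report)
report_text <- pdf_text("staat-van-de-woningmarkt-jaarrapportage-2020.pdf")

# split each page into individual lines for further cleaning
report_lines <- str_split(report_text, "\n")
```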
@@ -456,7 +675,8 @@ But that's not all, there's a few other features that contribute to the gridlock ``` r house_mutations_in <- tribble( - ~year, ~buy_to_rent_corp, ~buy_to_rent_other, ~rent_corp_to_buy, ~rent_corp_to_rent_other, ~rent_other_to_buy, ~rent_other_to_rent_corp, + ~year, ~buy_to_rent_corp, ~buy_to_rent_other, ~rent_corp_to_buy, + ~rent_corp_to_rent_other, ~rent_other_to_buy, ~rent_other_to_rent_corp, 2012, 900, 58000, 14600, 5500, 50600, 4900, 2013, 800, 62200, 15200, 11500, 50900, 6000, 2014, 1000, 62200, 15400, 9300, 59900, 39000, @@ -465,19 +685,28 @@ house_mutations_in <- tribble( 2017, 1600, 98900, 6400, 11000, 7300, 9000 ) -house_mutations <- house_mutations_in |> - pivot_longer(cols = -year, names_to = "mutation", values_to = "n_mutations") |> - mutate(from = str_extract(mutation, "(.*?)_to"), - from = str_remove_all(from, "_to"), - to = str_extract(mutation, "to_(.*?)$"), - to = str_remove_all(to, "to_")) |> - group_by(year) |> - mutate(total_mutations = sum(n_mutations), - perc_mutations = n_mutations / total_mutations, - across(c(from,to), ~ case_when(str_detect(.x,"buy") ~ "buy", - str_detect(.x,"rent_corp") ~ "rent (corporation)", - str_detect(.x,"rent_other") ~ "rent (other)")), - mutation_label = str_glue("From {from} to {to}")) |> +house_mutations <- house_mutations_in |> + pivot_longer( + cols = -year, + names_to = "mutation", values_to = "n_mutations" + ) |> + mutate( + from = str_extract(mutation, "(.*?)_to"), + from = str_remove_all(from, "_to"), + to = str_extract(mutation, "to_(.*?)$"), + to = str_remove_all(to, "to_") + ) |> + group_by(year) |> + mutate( + total_mutations = sum(n_mutations), + perc_mutations = n_mutations / total_mutations, + across(c(from, to), ~ case_when( + str_detect(.x, "buy") ~ "buy", + str_detect(.x, "rent_corp") ~ "rent (corporation)", + str_detect(.x, "rent_other") ~ "rent (other)" + )), + mutation_label = str_glue("From {from} to {to}") + ) |> ungroup() ``` @@ -489,46 +718,77 @@ So not every year there's the same number of "mutations" (transformations of pur Show code ``` r -mutations_n_plot <- house_mutations |> - arrange(from,to) |> - mutate(mutation_label = fct_inorder(mutation_label)) |> - ggplot(aes(x = year, y = n_mutations, alluvium = mutation_label, fill = mutation_label)) + +mutations_n_plot <- house_mutations |> + arrange(from, to) |> + mutate(mutation_label = fct_inorder(mutation_label)) |> + ggplot(aes( + x = year, y = n_mutations, + alluvium = mutation_label, fill = mutation_label + )) + geom_point(shape = 21, color = "transparent", size = NA) + ggalluvial::geom_flow(width = 0, show.legend = FALSE) + - labs(x = NULL, - y = "Number of mutations", - fill = NULL) + - scale_x_continuous(expand = c(0,0)) + - scale_y_continuous(labels = scales::label_number_auto(), expand = c(0,0)) + - scale_fill_manual(values = rev(colors), - guide = guide_legend(title.position = "top", title.hjust = 0.5, nrow = 2, - label.position = "right", - override.aes = list(color = "transparent", size = 6, alpha = 1, stroke = 0))) + + labs( + x = NULL, + y = "Number of mutations", + fill = NULL + ) + + scale_x_continuous(expand = c(0, 0)) + + scale_y_continuous( + labels = scales::label_number_auto(), + expand = c(0, 0) + ) + + scale_fill_manual( + values = rev(colors), + guide = guide_legend( + title.position = "top", title.hjust = 0.5, nrow = 2, + label.position = "right", + override.aes = list( + color = "transparent", size = 6, + alpha = 1, stroke = 0 + ) + ) + ) + theme(legend.key = element_rect(fill = "transparent")) -mutations_perc_plot <- 
house_mutations |> - arrange(from,to) |> - mutate(mutation_label = fct_inorder(mutation_label)) |> - ggplot(aes(x = year, y = perc_mutations, alluvium = mutation_label, fill = mutation_label)) + +mutations_perc_plot <- house_mutations |> + arrange(from, to) |> + mutate(mutation_label = fct_inorder(mutation_label)) |> + ggplot(aes( + x = year, y = perc_mutations, + alluvium = mutation_label, fill = mutation_label + )) + geom_point(shape = 21, color = "transparent", size = NA) + ggalluvial::geom_flow(width = 0, show.legend = FALSE) + - labs(x = NULL, - y = "Percentage of total mutations per quarter", - fill = NULL) + - scale_x_continuous(expand = c(0,0)) + - scale_y_continuous(labels = scales::label_percent(), expand = c(0,0)) + - scale_fill_manual(values = rev(colors), - guide = guide_legend(title.position = "top", title.hjust = 0.5, nrow = 2, - label.position = "right", - override.aes = list(color = "transparent", size = 6, alpha = 1, stroke = 0))) + - theme(axis.title.y = element_textbox(width = grid::unit(2,"in")), - legend.key = element_rect(fill = "transparent")) + labs( + x = NULL, + y = "Percentage of total mutations per quarter", + fill = NULL + ) + + scale_x_continuous(expand = c(0, 0)) + + scale_y_continuous(labels = scales::label_percent(), expand = c(0, 0)) + + scale_fill_manual( + values = rev(colors), + guide = guide_legend( + title.position = "top", title.hjust = 0.5, nrow = 2, + label.position = "right", + override.aes = list( + color = "transparent", size = 6, + alpha = 1, stroke = 0 + ) + ) + ) + + theme( + axis.title.y = element_textbox(width = grid::unit(2, "in")), + legend.key = element_rect(fill = "transparent") + ) mutations_n_plot / mutations_perc_plot + - plot_annotation(title = "Shift to a renters market", - subtitle = "_A lot more houses meant for sale are being transformed into rental properties than vice versa_", - caption = "_**Data**: CBS_") + - plot_layout(guides = 'collect') & + plot_annotation( + title = "Shift to a renters market", + subtitle = "_A lot more houses meant for sale are being transformed into rental properties than vice versa_", + caption = "_**Data**: CBS_" + ) + + plot_layout(guides = "collect") & theme(plot.title = element_textbox(size = 20)) ``` @@ -544,51 +804,65 @@ We can look at the net number of houses added to the rental market by adding up Show code ``` r -net_house_mutations <- house_mutations |> - mutate(across(c(from,to), ~ case_when(str_detect(.x, "rent") ~ "rent", - str_detect(.x, "buy") ~ "buy"))) |> - group_by(year) |> - filter(from != to) |> - mutate(total_mutations = sum(n_mutations)) |> - group_by(year, from, to) |> - summarise(n_mutations = sum(n_mutations), - total_mutations = first(total_mutations)) |> - pivot_wider(id_cols = c(year,total_mutations), names_from = c(from,to), values_from = n_mutations, - names_sep = "_to_") |> - mutate(net_buy_to_rent = buy_to_rent - rent_to_buy, - perc_buy_to_rent = buy_to_rent / total_mutations) - -net_mutation_plot <- net_house_mutations |> - ggplot(aes(x = year, y = net_buy_to_rent)) + +net_house_mutations <- house_mutations |> + mutate(across(c(from, to), ~ case_when( + str_detect(.x, "rent") ~ "rent", + str_detect(.x, "buy") ~ "buy" + ))) |> + group_by(year) |> + filter(from != to) |> + mutate(total_mutations = sum(n_mutations)) |> + group_by(year, from, to) |> + summarise( + n_mutations = sum(n_mutations), + total_mutations = first(total_mutations) + ) |> + pivot_wider( + id_cols = c(year, total_mutations), + names_from = c(from, to), + values_from = n_mutations, + names_sep = 
"_to_" + ) |> + mutate( + net_buy_to_rent = buy_to_rent - rent_to_buy, + perc_buy_to_rent = buy_to_rent / total_mutations + ) + +net_mutation_plot <- net_house_mutations |> + ggplot(aes(x = year, y = net_buy_to_rent)) + geom_hline(yintercept = 0, color = "grey30", size = 1) + - #geom_ribbon(aes(ymin = 0, ymax = net_buy_to_rent), fill = "grey30", alpha = 0.2) + geom_line(color = "darkred", size = 1.5, lineend = "round") + - #geom_textbox(data = tibble(), aes(x = 2014, y = 4e4, - # label = "Net number of properties meant for sale withdrawn from the market"), - # family = "nunito-sans", size = 4, fill = "transparent", - # maxwidth = grid::unit(2,"in"), hjust = 0.5, vjust = 0) + - #geom_curve(data = tibble(), aes(x = 2014, y = 4e4, xend = 2016.5, yend = 2.5e4), - # curvature = 0.3, size = 0.75, arrow = arrow(length = unit(2,"mm")), - # lineend = "round") + - labs(x = NULL, - y = "**Net number of properties changed from properties meant for sale to rental properties**") + - scale_y_continuous(labels = scales::label_number_auto(), limits = c(-3e4, NA), n.breaks = 5) + - theme(axis.title.y = element_textbox(width = grid::unit(3.5,"in"))) - -perc_mutation_plot <- net_house_mutations |> - ggplot(aes(x = year, y = perc_buy_to_rent)) + + labs( + x = NULL, + y = "**Net number of properties changed from properties meant for sale to rental properties**" + ) + + scale_y_continuous( + labels = scales::label_number_auto(), + limits = c(-3e4, NA), n.breaks = 5 + ) + + theme(axis.title.y = element_textbox(width = grid::unit(3.5, "in"))) + +perc_mutation_plot <- net_house_mutations |> + ggplot(aes(x = year, y = perc_buy_to_rent)) + geom_hline(yintercept = 0.5, color = "grey30", size = 1) + geom_line(color = "darkred", size = 1.5, lineend = "round") + - labs(x = NULL, - y = "**Percentage of mutations that changed properties meant for sale to rental properties**") + - scale_y_continuous(labels = scales::label_percent(), limits = c(0,1), expand = c(0,0)) + + labs( + x = NULL, + y = "**Percentage of mutations that changed properties meant for sale to rental properties**" + ) + + scale_y_continuous( + labels = scales::label_percent(), + limits = c(0, 1), expand = c(0, 0) + ) + coord_cartesian(clip = "off") + - theme(axis.title.y = element_textbox(width = grid::unit(3.5,"in"))) - -net_mutation_plot + perc_mutation_plot + - plot_annotation(title = "Major net shift towards rental market", - subtitle = "_A lot more houses meant for sale are being transformed into rental properties than vice versa_", - caption = "_**Data**: CBS_") & + theme(axis.title.y = element_textbox(width = grid::unit(3.5, "in"))) + +net_mutation_plot + perc_mutation_plot + + plot_annotation( + title = "Major net shift towards rental market", + subtitle = "_A lot more houses meant for sale are being transformed into rental properties than vice versa_", + caption = "_**Data**: CBS_" + ) & theme(plot.title = element_textbox(size = 20)) ``` @@ -602,97 +876,123 @@ So in 2017 nearly 90 000 houses were mutated from sale to rental properties. 
In Show code ``` r -house_mutations |> - filter(year == max(year)) |> - select(from, to, n_mutations, perc_mutations) |> - mutate(across(c(from,to), ~ str_to_sentence(.x))) |> - arrange(-perc_mutations) |> - gt() |> - fmt_number(columns = "n_mutations", sep_mark = " ", drop_trailing_zeros = TRUE) |> - fmt_percent(columns = "perc_mutations") |> +house_mutations |> + filter(year == max(year)) |> + select(from, to, n_mutations, perc_mutations) |> + mutate(across(c(from, to), ~ str_to_sentence(.x))) |> + arrange(-perc_mutations) |> + gt() |> + fmt_number( + columns = "n_mutations", sep_mark = " ", + drop_trailing_zeros = TRUE + ) |> + fmt_percent(columns = "perc_mutations") |> grand_summary_rows( columns = "n_mutations", fns = list(total = "sum"), - formatter = fmt_number, + fmt = ~ fmt_number(.), sep_mark = " ", - drop_trailing_zeros = TRUE) |> + drop_trailing_zeros = TRUE + ) |> grand_summary_rows( columns = "perc_mutations", fns = list(total = "sum"), - formatter = fmt_percent, - missing_text = "") |> + fmt = ~ fmt_percent(.), + missing_text = "" + ) |> cols_label( from = html("
From
"), to = html("
To
"), n_mutations = html("
Number of mutations
"), perc_mutations = html("
Percentage of total mutations
") - ) |> - tab_header(title = html("

Mutations in the Dutch housing market

"), - subtitle = html("

Data from the year 2017

")) |> - tab_source_note(source_note = md("_**Data**: CBS_")) |> - tab_options(table.background.color = "#EBEBEB", - grand_summary_row.text_transform = "capitalize") |> + ) |> + tab_header( + title = html( + "

Mutations in the Dutch housing market

" + ), + subtitle = html( + "

Data from the year 2017

" + ) + ) |> + tab_source_note(source_note = md("_**Data**: CBS_")) |> + tab_options( + table.background.color = "#EBEBEB", + grand_summary_row.text_transform = "capitalize" + ) |> gtsave("mutations-table.png", expand = 0) ```
- Warning: Since gt v0.9.0, the `formatter` argument (and associated `...`) has been - deprecated. - • Please use the `fmt` argument to provide formatting directives. - This warning is displayed once every 8 hours. - So what's the result of all these phenomena? The figure below shows the housing price index for Amsterdam. The first quarter of 2012 was taken as the reference point (100%). In the past 8(!) years, properties had on average nearly doubled in price.
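To put that "nearly doubled" into real terms, here is a quick back-of-the-envelope check using the figures quoted further down in the post (a roughly 107% nominal increase in asking prices since 2012 and roughly 8.5% cumulative inflation over the same period):

``` r
# rough real (inflation-adjusted) price increase implied by the numbers in the text
nominal_increase <- 1.07
cumulative_inflation <- 0.085

(1 + nominal_increase) / (1 + cumulative_inflation) - 1
#> [1] 0.9078341
```

So even after accounting for inflation, prices still rose by roughly 90% in real terms.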
Show code ``` r -asking_start <- data |> - filter(type != "Total", - date == min(date)) |> - rename(asking_start = asking_price) |> +asking_start <- data |> + filter( + type != "Total", + date == min(date) + ) |> + rename(asking_start = asking_price) |> select(type, asking_start) -data_asking_index <- data |> - filter(type != "Total") |> - left_join(asking_start) |> +data_asking_index <- data |> + filter(type != "Total") |> + left_join(asking_start) |> mutate(asking_index = asking_price / asking_start) -data_asking_index |> - ggplot(aes(x = date, y = asking_index, color = type, group = type)) + - geom_line(size = 2, alpha = 0.15, lineend = "round", show.legend = FALSE) + +data_asking_index |> + ggplot(aes(x = date, y = asking_index, color = type, group = type)) + + geom_line(size = 2, alpha = 0.15, lineend = "round", show.legend = FALSE) + geom_smooth(se = FALSE, show.legend = FALSE) + - geom_point(size = NA) + - labs(title = "Price development over the past decade", - subtitle = "_On average, properties doubled in value since 2012_", - x = NULL, - y = "Price development relative to early 2012", - color = NULL, - caption = "_**Data**: MVA_") + - scale_y_continuous(labels = scales::label_percent()) + - scale_color_manual(values = colors, - guide = guide_legend(title.position = "top", title.hjust = 0.5, nrow = 2, - label.position = "right", - override.aes = list(fill = "transparent", size = 6, alpha = 1))) + - theme(plot.title = element_textbox(size = 20), - axis.title.y = element_textbox(width = grid::unit(2.5, "in")), - legend.key = element_rect(fill = "transparent", color = "transparent")) + geom_point(size = NA) + + labs( + title = "Price development over the past decade", + subtitle = "_On average, properties doubled in value since 2012_", + x = NULL, + y = "Price development relative to early 2012", + color = NULL, + caption = "_**Data**: MVA_" + ) + + scale_y_continuous(labels = scales::label_percent()) + + scale_color_manual( + values = colors, + guide = guide_legend( + title.position = "top", title.hjust = 0.5, nrow = 2, + label.position = "right", + override.aes = list(fill = "transparent", size = 6, alpha = 1) + ) + ) + + theme( + plot.title = element_textbox(size = 20), + axis.title.y = element_textbox(width = grid::unit(2.5, "in")), + legend.key = element_rect(fill = "transparent", color = "transparent") + ) ```
-Adjusted for the number of properties offered, the average asking price in Amsterdam has increased by 107%. That means that prices since 2012 have (on average) doubled. For apartments, this increase was 110%. Inflation on the other hand was only about 8.5% since 2012. So young people wanting to buy a house in Amsterdam need to bring nearly twice as much money to the table as they did just a decade ago. The money they saved and the salaries they earned have by no means kept up with the increasing price though. +{{< sidenote >}} +Inflation since 2012 was about 8.5% which should be subtracted from these values +{{< /sidenote >}} + +Adjusted for the number of properties offered, the average asking price in Amsterdam has increased by 107%. That means that prices since 2012 have (on average) roughly doubled. For apartments, this increase was 110%. So young people wanting to buy a house in Amsterdam need to bring nearly twice as much money to the table as they did just a decade ago. The money they saved and the salaries they earned have by no means kept up with the increasing price though. I cannot know this for sure, because the data I presented here doesn't contain the answer, but perhaps these factors combined play a role in the financial uncertainty that millenials experience. How do we fix it? I'm not sure. This is a really complicated issue. But a few things might (in my opinion) help: -1. **Build more (sustainable) durable houses** -2. **Prohibit sale of properties to individuals who don't plan to live there themselves for at least 2 or 3 years** -3. **Increase transparency when buying or selling a house through mandatory public offerings to prevent scooping by investors** -4. **Rent control to make *"mutating"* houses less profitable** -5. **Increased socialized housing for poor and low-income families** +- **Build more (sustainable) durable houses** + +- **Prohibit sale of properties to individuals who don't plan to live there themselves for at least 2 or 3 years** + +- **Increase transparency when buying or selling a house through mandatory public offerings to prevent scooping by investors** + +- **Rent control to make *"mutating"* houses less profitable** + +- **Increased socialized housing for poor and low-income families** Perhaps if we implement a few of these strategies we can make the housing market a little fairer and perhaps ensure that future generations will have the same opportunities current home-owners have. diff --git a/content/blog/2021-amsterdam-housing-market/index.qmd b/content/blog/2021-amsterdam-housing-market/index.qmd index bb897de..41a3c2b 100644 --- a/content/blog/2021-amsterdam-housing-market/index.qmd +++ b/content/blog/2021-amsterdam-housing-market/index.qmd @@ -14,6 +14,8 @@ execute: fig.show: hold results: hold dev.args: list(bg = "#EBEBEB") +editor_options: + chunk_output_type: console --- ```{css} @@ -21,7 +23,7 @@ execute: #| echo: FALSE body { - background-color: #EBEBEB; /* was #EFEFEF */ + background-color: #EBEBEB; } table { @@ -37,13 +39,19 @@ img { } ``` -

For this post, I decided to hide the code chunks by default to improve legibility. You can click the Show code-button to expand the code.

+{{{< standout bg="#acc8d4" >}}} +For this post, I decided to hide the code chunks by default to improve legibility. You can click on _Show code_ to expand the code. +{{{< /standout >}}} ### Introduction -I'm at the age where a lot of my peers start thinking about buying a home. It is a major conversation topic around the office, and these conversations are not often filled with optimism and excitement. Oslo is a notoriously expensive city to live in, and the housing market is very tough. +I'm at the age where a lot of my peers start thinking about buying a home. It is a major conversation topic around the office, and these conversations are not often filled with optimism and excitement. Oslo is a notoriously expensive city to live in, and the housing market is very tough. -Of all the worrying events in 2020, Millenials and Gen Z kids ranked financial instability as the second most common driver of stress, the most common being the welfare of their family ([link](https://www2.deloitte.com/global/en/pages/about-deloitte/articles/millennialsurvey.html)). Today I want to dissect one of the possible causes of this _economic anxiety_: **the housing market**. Now, I'm not an economist nor a financial expert. Nonetheless, I believe I can leverage some of my knowledge of data wrangling and analysis to contribute a small piece to this topic. My insight into the Oslo housing market is fairly limited to so far and there's many nuances I don't fully understand yet, but I do think I have some relevant knowledge of the Dutch housing market. So today I'll dive into some data on the Dutch market, and particularly the Amsterdam housing market. There's countless stories about the Amsterdam housing market and how terrible it is for new buyers, similar to the Oslo housing market. However, in my opinion the Amsterdam housing market is already a few steps ahead of the Oslo market, and hopefully the Amsterdam market can offer some warnings about what the Oslo housing market might look like in a few years without some policy corrections. +{{{< sidenote >}}} +This conclusion is based on a survey from Deloitte in 2020 where they surveyed millennials and Gen Z kids both before and after the onset of the pandemic +{{{< /sidenote >}}} + +Of all the worrying events in 2020, millennials and Gen Z kids ranked financial instability as the second most common driver of stress, the most common being the welfare of their family ([source](https://www2.deloitte.com/content/dam/Deloitte/global/Documents/About-Deloitte/deloitte-2020-millennial-survey.pdf)). Today I want to dissect one of the possible causes of this _economic anxiety_: **the housing market**. Now, I'm not an economist nor a financial expert. Nonetheless, I believe I can leverage some of my knowledge of data wrangling and analysis to contribute a small piece to this topic. My insight into the Oslo housing market is fairly limited to so far and there's many nuances I don't fully understand yet, but I do think I have some relevant knowledge of the Dutch housing market. So today I'll dive into some data on the Dutch market, and particularly the Amsterdam housing market. There's countless stories about the Amsterdam housing market and how terrible it is for new buyers, similar to the Oslo housing market. However, in my opinion the Amsterdam housing market is already a few steps ahead of the Oslo market, and hopefully the Amsterdam market can offer some warnings about what the Oslo housing market might look like in a few years without some policy corrections. 
This will be a fairly basic data analysis and visualization post. I can't claim that this is an exhaustive list or that I didn't miss some nuances, but I'll give it a try. I collected some data from the Amsterdam Real Estate Association ([Makelaars Vereniging Amsterdam; MVA](https://www.mva.nl)) and statistics from the Central Bureau for Statistics ([Centraal Bureau voor de Statistiek; CBS](https://www.cbs.nl)). As usual, I'll be using `{tidyverse}` _a lot_. I've also recently started using the `{ggtext}` package to manage the text elements in my plots, inspired by Cédric Scherer ([@CedScherer](https://twitter.com/CedScherer)). I'll use the `{gt}` package to spice up some of the tables, and `{patchwork}` for arranging plots. I'll use the `{cbsodataR}` package to download data from the Central Bureau for Statistics ([CBS](https://www.cbs.nl)). @@ -64,25 +72,44 @@ font_add_google(name = "Nunito Sans", family = "nunito-sans") showtext_auto() theme_set(ggthemes::theme_economist(base_family = "nunito-sans") + - theme(rect = element_rect(fill = "#EBEBEB", color = "transparent"), - plot.background = element_rect(fill = "#EBEBEB", color = "transparent"), - panel.background = element_rect(fill = "#EBEBEB", color = "transparent"), - plot.title = element_textbox(margin = margin(0,0,5,0,"pt")), - plot.title.position = "plot", - plot.subtitle = element_textbox(hjust = 0, margin = margin(0,0,15,0,"pt")), - plot.caption = element_textbox(hjust = 1), - plot.caption.position = "plot", - axis.title.y = element_textbox(orientation = "left-rotated", face = "bold", - margin = margin(0,0,5,0,"pt")), - axis.text.y = element_text(hjust = 1), - legend.position = "bottom", - legend.box = "vertical", - legend.text = element_text(size = 10))) + theme( + rect = element_rect(fill = "#EBEBEB", color = "transparent"), + plot.background = element_rect( + fill = "#EBEBEB", + color = "transparent" + ), + panel.background = element_rect( + fill = "#EBEBEB", + color = "transparent" + ), + plot.title = element_textbox( + margin = margin(0, 0, 5, 0, "pt") + ), + plot.title.position = "plot", + plot.subtitle = element_textbox( + hjust = 0, + margin = margin(0, 0, 15, 0, "pt") + ), + plot.caption = element_textbox(hjust = 1), + plot.caption.position = "plot", + axis.title.y = element_textbox( + orientation = "left-rotated", face = "bold", + margin = margin(0, 0, 5, 0, "pt") + ), + axis.text.y = element_text(hjust = 1), + legend.position = "bottom", + legend.box = "vertical", + legend.text = element_text(size = 10) + )) ``` ### Getting the data -The first piece of data I'll use comes from the Amsterdam Real Estate Association. They publish quarterly data on a number of variables about the Amsterdam housing market ([link](https://www.mva.nl/over-de-mva/mva/kwartaalcijfers)), inclusing asking price, final price paid, number of properties put on the market, number of properties sold and a few more back to the first quarter of 2012. _Obviously_, these numbers all come in pdf-format, because the people writing quarterly reports apparently have a _massive hatred_ towards people that want to analyze this data. I downloaded the reports, used the online tool [PDFTables](https://pdftables.com) to convert them to Excel, and then stitched the tables together manually. Of course (remember the authors have a _massive hatred_ towards us), the formatting of the numbers in the tables weren't consistent between different quarterly reports, so I had to do some cleaning in R.
I put each table in a separate sheet and then used functionality from the `{readxl}` package to load each table into a different variable and then do the cleaning per table. This was a bit cumbersome. Since this will probably be a very long post, I'll hide the code, but you can see it in the [Rmarkdown file](https://github.com/danielroelfs/danielroelfs.com/blob/main/content/blog/2020-running-an-ica-on-questionnaires/index.Rmd) (at least until I figure out how to get code folding to work on my laptop). +{{{< sidenote br="10em" >}}} +There are [R packages](https://docs.ropensci.org/tabulizer/) that can parse PDF files, but in my experience they can be clunky. In this case the simplest solution seemed the best, despite requiring some manual work +{{{< /sidenote >}}} + +The first piece of data I'll use comes from the Amsterdam Real Estate Association. They publish quarterly data on a number of variables about the Amsterdam housing market ([link](https://www.mva.nl/over-de-mva/mva/kwartaalcijfers)), inclusing asking price, final price paid, number of properties put on the market, number of properties sold and a few more back to the first quarter of 2012. _Obviously_, these numbers all come in pdf-format, because the people writing quarterly reports apparently have a _massive hatred_ towards people that want to analyze this data. I downloaded the reports, used the online tool [PDFTables](https://pdftables.com) to convert them to Excel, and then stitched the tables together manually. Of course (remember the authors have a _massive hatred_ towards us), the formatting of the numbers in the tables weren't consistent between different quarterly reports, so I had to do some cleaning in R. I put each table in a separate sheet and then used functionality from the `{readxl}` package to load each table into a different variable and then do the cleaning per table. This was a bit cumbersome. You can look at the code I used to load and merge the files. It's a bit of a mess: @@ -92,61 +119,135 @@ You can look at the code I used to load and merge the files. 
It's a bit of a mes #| code-fold: true #| code-summary: "Show code" -asking_price <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 2) |> - janitor::clean_names() |> - mutate(type_woning = as_factor(type_woning), - across(where(is.double), ~ .x * 1e3), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> - pivot_longer(starts_with("x"), names_to = "date", values_to = "asking_price") - -transaction_price <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 3) |> - janitor::clean_names() |> - mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> - pivot_longer(starts_with("x"), names_to = "date", values_to = "transaction_price") - -price_per_m2 <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 4) |> - janitor::clean_names() |> - mutate(type_woning = as_factor(type_woning), - across(where(is.double), ~ .x * 1e3), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> - pivot_longer(starts_with("x"), names_to = "date", values_to = "price_per_m2") - -n_offered <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 5) |> - janitor::clean_names() |> - mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> - pivot_longer(starts_with("x"), names_to = "date", values_to = "n_offered") - -n_sold <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 6) |> - janitor::clean_names() |> - mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> - pivot_longer(starts_with("x"), names_to = "date", values_to = "n_sold") - -mortgage_months <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 7) |> - janitor::clean_names() |> - mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> - pivot_longer(starts_with("x"), names_to = "date", values_to = "mortgage_months") - -tightness_index <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 8) |> - janitor::clean_names() |> - mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_replace_all(.x, ",", ".")))) |> - pivot_longer(starts_with("x"), names_to = "date", values_to = "tightness_index") - -data_merged <- inner_join(asking_price, transaction_price) |> - inner_join(price_per_m2) |> - inner_join(n_offered) |> - inner_join(n_sold) |> - inner_join(mortgage_months) |> - inner_join(tightness_index) |> - mutate(asking_price = ifelse(asking_price < 1e5, asking_price * 1e3, asking_price), - transaction_price = ifelse(transaction_price < 1e5, transaction_price * 1e3, transaction_price), - price_per_m2 = ifelse(price_per_m2 > 1e4, price_per_m2 / 1e3, price_per_m2)) - -write_rds(data_merged, "data_merged.rds") +asking_price <- readxl::read_xlsx("./data/MVA_kwartaalcijfers.xlsx", + sheet = 2 +) |> + janitor::clean_names() |> + mutate( + type_woning = as_factor(type_woning), + across(where(is.double), ~ .x * 1e3), + across( + where(is.character), + ~ parse_number(str_remove_all(.x, fixed(" "))) + ) + ) |> + pivot_longer(starts_with("x"), + names_to = "date", values_to = "asking_price" + ) + +transaction_price <- readxl::read_xlsx("./data/MVA_kwartaalcijfers.xlsx", + sheet = 3 +) |> + janitor::clean_names() |> + mutate( + type_woning = as_factor(type_woning), + across( + where(is.character), + ~ parse_number(str_remove_all(.x, fixed(" "))) + ) + ) |> + 
pivot_longer(starts_with("x"), + names_to = "date", values_to = "transaction_price" + ) + +price_per_m2 <- readxl::read_xlsx("./data/MVA_kwartaalcijfers.xlsx", + sheet = 4 +) |> + janitor::clean_names() |> + mutate( + type_woning = as_factor(type_woning), + across(where(is.double), ~ .x * 1e3), + across( + where(is.character), + ~ parse_number(str_remove_all(.x, fixed(" "))) + ) + ) |> + pivot_longer(starts_with("x"), + names_to = "date", values_to = "price_per_m2" + ) + +n_offered <- readxl::read_xlsx("./data/MVA_kwartaalcijfers.xlsx", + sheet = 5 +) |> + janitor::clean_names() |> + mutate( + type_woning = as_factor(type_woning), + across( + where(is.character), + ~ parse_number(str_remove_all(.x, fixed(" "))) + ) + ) |> + pivot_longer(starts_with("x"), + names_to = "date", values_to = "n_offered" + ) + +n_sold <- readxl::read_xlsx("./data/MVA_kwartaalcijfers.xlsx", + sheet = 6 +) |> + janitor::clean_names() |> + mutate( + type_woning = as_factor(type_woning), + across( + where(is.character), + ~ parse_number(str_remove_all(.x, fixed(" "))) + ) + ) |> + pivot_longer(starts_with("x"), + names_to = "date", values_to = "n_sold" + ) + +mortgage_months <- readxl::read_xlsx("./data/MVA_kwartaalcijfers.xlsx", + sheet = 7 +) |> + janitor::clean_names() |> + mutate( + type_woning = as_factor(type_woning), + across( + where(is.character), + ~ parse_number(str_remove_all(.x, fixed(" "))) + ) + ) |> + pivot_longer(starts_with("x"), + names_to = "date", values_to = "mortgage_months" + ) + +tightness_index <- readxl::read_xlsx("./data/MVA_kwartaalcijfers.xlsx", + sheet = 8 +) |> + janitor::clean_names() |> + mutate( + type_woning = as_factor(type_woning), + across( + where(is.character), + ~ parse_number(str_replace_all(.x, ",", ".")) + ) + ) |> + pivot_longer(starts_with("x"), + names_to = "date", values_to = "tightness_index" + ) + +data_merged <- inner_join(asking_price, transaction_price) |> + inner_join(price_per_m2) |> + inner_join(n_offered) |> + inner_join(n_sold) |> + inner_join(mortgage_months) |> + inner_join(tightness_index) |> + mutate( + asking_price = ifelse(asking_price < 1e5, + asking_price * 1e3, + asking_price + ), + transaction_price = ifelse(transaction_price < 1e5, + transaction_price * 1e3, + transaction_price + ), + price_per_m2 = ifelse(price_per_m2 > 1e4, + price_per_m2 / 1e3, + price_per_m2 + ) + ) + +write_rds(data_merged, "./data/data_merged.rds") ``` Let's have a look at the dataset. @@ -156,7 +257,7 @@ Let's have a look at the dataset. #| code-fold: true #| code-summary: "Show code" -data_merged <- read_rds("data_merged.rds") +data_merged <- read_rds("./data/data_merged.rds") glimpse(data_merged) ``` @@ -168,32 +269,42 @@ From this dataset, I want to create a few new variables. 
I want to create a date #| code-fold: true #| code-summary: "Show code" -data <- data_merged |> - rename(type = type_woning) |> - mutate(quarter = str_extract(date, pattern = "x(.*?)e"), - quarter = parse_number(quarter), - year = str_extract(date, pattern = "kw_(.*)"), - year = parse_number(year), - date = as.Date(str_glue("{year}-{(quarter * 3)}-01")), - diff_ask_paid = transaction_price - asking_price, - diff_ask_paid_perc = diff_ask_paid / asking_price, - diff_offered_sold = n_offered - n_sold, - diff_offered_sold_perc = diff_offered_sold / n_offered, - perc_sold = n_sold / n_offered, - type = case_when(str_detect(type,"Totaal") ~ "Total", - str_detect(type,"<= 1970") ~ "Apartments (pre-1970)", - str_detect(type,"> 1970") ~ "Apartments (post-1970)", - str_detect(type,"Tussenwoning") ~ "Terraced house", - str_detect(type,"Hoekwoning") ~ "Corner house", - str_detect(type,"Vrijstaand") ~ "Detached house", - str_detect(type,"2-onder-1-kap") ~ "Semi-detached house"), - type = factor(type, levels = c("Apartments (pre-1970)","Apartments (post-1970)", - "Terraced house","Corner house","Detached house", - "Semi-detached house","Total"))) |> +data <- data_merged |> + rename(type = type_woning) |> + mutate( + quarter = str_extract(date, pattern = "x(.*?)e"), + quarter = parse_number(quarter), + year = str_extract(date, pattern = "kw_(.*)"), + year = parse_number(year), + date = as.Date(str_glue("{year}-{(quarter * 3)}-01")), + diff_ask_paid = transaction_price - asking_price, + diff_ask_paid_perc = diff_ask_paid / asking_price, + diff_offered_sold = n_offered - n_sold, + diff_offered_sold_perc = diff_offered_sold / n_offered, + perc_sold = n_sold / n_offered, + type = case_when( + str_detect(type, "Totaal") ~ "Total", + str_detect(type, "<= 1970") ~ "Apartments (pre-1970)", + str_detect(type, "> 1970") ~ "Apartments (post-1970)", + str_detect(type, "Tussenwoning") ~ "Terraced house", + str_detect(type, "Hoekwoning") ~ "Corner house", + str_detect(type, "Vrijstaand") ~ "Detached house", + str_detect(type, "2-onder-1-kap") ~ "Semi-detached house" + ), + type = factor(type, levels = c( + "Apartments (pre-1970)", "Apartments (post-1970)", + "Terraced house", "Corner house", "Detached house", + "Semi-detached house", "Total" + )) + ) |> glimpse() ``` -The first thing that seems interesting to do is to plot the percentage difference between the asking price and the price paid. This will give us an indication of the trend in overpaying on different types of properties. I'll use a color palette and legend design I again shamelessly stole from Cédric Scherer ([@CedScherer](https://twitter.com/CedScherer)). I'll use a simple line graph to visualize the percentage overpay. +{{{< sidenote br="1em" >}}} +I'll use a color palette and legend design I shamelessly stole from [Cédric Scherer](https://twitter.com/CedScherer) +{{{< /sidenote >}}} + +The first thing that seems interesting to do is to plot the percentage difference between the asking price and the price paid. This will give us an indication of the trend in overpaying on different types of properties. Let's use a simple line graph to visualize the percentage overpay. 
```{r} #| label: plot-overpay @@ -201,33 +312,48 @@ The first thing that seems interesting to do is to plot the percentage differenc #| code-fold: true #| code-summary: "Show code" -colors <- c("#019868","#9dd292","#ec0b88","#651eac","#e18a1e","#2b7de5") - -data |> - filter(type != "Total") |> - group_by(type) |> - mutate(n_total = sum(n_sold), - type_label = str_glue("{type} (n={format(n_total, big.mark = \".\", decimal.mark = \",\")})"), - type_label = str_replace(type_label,"\\) \\(", ", ")) |> - arrange(type) |> - mutate(type_label = factor(type_label)) |> - ggplot(aes(x = date, y = diff_ask_paid_perc, color = type_label)) + +colors <- c("#019868", "#9dd292", "#ec0b88", "#651eac", "#e18a1e", "#2b7de5") + +data |> + filter(type != "Total") |> + group_by(type) |> + mutate( + n_total = sum(n_sold), + type_label = str_glue( + "{type} (n={format(n_total, big.mark = \".\", decimal.mark = \",\")})" + ), + type_label = str_replace(type_label, "\\) \\(", ", ") + ) |> + arrange(type) |> + mutate(type_label = factor(type_label)) |> + ggplot(aes(x = date, y = diff_ask_paid_perc, color = type_label)) + geom_hline(yintercept = 0, color = "grey30", size = 1) + - geom_line(size = 1.2, alpha = 0.8, lineend = "round", key_glyph = "point") + - labs(title = "Overbidding has become the new normal", - subtitle = "_Paying as much as 5% over asking price is common the past few years_", - x = NULL, - y = "Percentage difference between\nasking price and price paid", - color = NULL, - caption = "_**Data**: MVA_") + - scale_y_continuous(labels = scales::label_percent()) + - scale_color_manual(values = colors, - guide = guide_legend(title.position = "top", title.hjust = 0.5, nrow = 2, - label.position = "right", - override.aes = list(fill = "transparent", size = 6, alpha = 1))) + - theme(plot.title = element_textbox(size = 20), - axis.title.y = element_textbox(width = grid::unit(2.5, "in")), - legend.key = element_rect(fill = "transparent", color = "transparent")) + geom_line( + size = 1.2, alpha = 0.8, + lineend = "round", key_glyph = "point" + ) + + labs( + title = "Overbidding has become the new normal", + subtitle = "_Paying as much as 5% over asking price is common the past few years_", + x = NULL, + y = "Percentage difference between\nasking price and price paid", + color = NULL, + caption = "_**Data**: MVA_" + ) + + scale_y_continuous(labels = scales::label_percent()) + + scale_color_manual( + values = colors, + guide = guide_legend( + title.position = "top", title.hjust = 0.5, nrow = 2, + label.position = "right", + override.aes = list(fill = "transparent", size = 6, alpha = 1) + ) + ) + + theme( + plot.title = element_textbox(size = 20), + axis.title.y = element_textbox(width = grid::unit(2.5, "in")), + legend.key = element_rect(fill = "transparent", color = "transparent") + ) ``` Prior to 2014, most properties in Amsterdam were sold at about 6% below asking price, in the last quarter of 2020, that trend had changed to more than 3.5% above asking. The variance is obviously larger for detached and semi-detached houses, since those are both expensive and scarce in Amsterdam, and are thus also bought and sold less often. The table below shows the stats for the first quarter of 2021. Apart from semi-detached houses, most other properties were sold at 4% over asking or more. The types of properties most accessible to first-time buyers are obviously the apartments. Either the older type in the inner city, or the newer apartments in the suburbs. 
People buying pre-1970 apartments spent more than €28 000 over the asking price and people buying apartments built after 1970 spent on average more than €18 000 over the asking price. @@ -238,27 +364,40 @@ Prior to 2014, most properties in Amsterdam were sold at about 6% below asking p #| code-fold: true #| code-summary: "Show code" -data |> - filter(type != "Total", - date == max(date)) |> - select(type, diff_ask_paid_perc, diff_ask_paid, transaction_price, n_sold) |> - arrange(-diff_ask_paid_perc) |> - gt() |> - cols_align(columns = "type", align = "left") |> - fmt_percent(columns = "diff_ask_paid_perc") |> - fmt_currency(columns = "diff_ask_paid", currency = "EUR", decimals = 0, sep_mark = " ") |> - fmt_currency(columns = "transaction_price", currency = "EUR", decimals = 0, sep_mark = " ") |> - fmt_number(columns = "n_sold", sep_mark = " ", drop_trailing_zeros = TRUE) |> +data |> + filter( + type != "Total", + date == max(date) + ) |> + select(type, diff_ask_paid_perc, diff_ask_paid, transaction_price, n_sold) |> + arrange(-diff_ask_paid_perc) |> + gt() |> + cols_align(columns = "type", align = "left") |> + fmt_percent(columns = "diff_ask_paid_perc") |> + fmt_currency( + columns = "diff_ask_paid", currency = "EUR", + decimals = 0, sep_mark = " " + ) |> + fmt_currency( + columns = "transaction_price", currency = "EUR", + decimals = 0, sep_mark = " " + ) |> + fmt_number( + columns = "n_sold", sep_mark = " ", + drop_trailing_zeros = TRUE + ) |> cols_label( type = html("
Property type
"), diff_ask_paid_perc = html("
Percentage overpay
"), diff_ask_paid = html("
Mean overpay
"), transaction_price = html("
Mean transaction price
"), n_sold = html("
Number of properties
sold in period
") - ) |> - tab_header(title = html("

Difference between
asking price and price paid

"), - subtitle = html("

Data from the first quarter of 2021

")) |> - tab_source_note(source_note = md("_**Data**: MVA_")) |> + ) |> + tab_header( + title = html("

Difference between
asking price and price paid

"), + subtitle = html("

Data from the first quarter of 2021

") + ) |> + tab_source_note(source_note = md("_**Data**: MVA_")) |> tab_options(table.background.color = "#EBEBEB") |> gtsave("overpay-table.png", expand = 0) ``` @@ -273,47 +412,76 @@ What contributed to this price increase? A simple supply-and-demand plays a part #| code-fold: true #| code-summary: "Show code" -data |> - filter(type != "Total") |> - group_by(type) |> - mutate(n_total = sum(n_sold), - type_label = str_glue("{type} (n={format(n_total, big.mark = \".\", decimal.mark = \",\")})"), - type_label = str_replace(type_label,"\\) \\(", ", ")) |> - arrange(type) |> - mutate(type_label = factor(type_label)) |> - ggplot(aes(x = date, y = tightness_index, color = type_label)) + - geom_rect(data = tibble(), - aes(xmin = as.Date(-Inf), xmax = as.Date(Inf), ymin = c(0,5,10), ymax = c(5,10,Inf), - fill = c("Sellers market", "Balanced market", "Buyers market")), - color = "transparent", alpha = 0.2, key_glyph = "point", inherit.aes = FALSE) + +data |> + filter(type != "Total") |> + group_by(type) |> + mutate( + n_total = sum(n_sold), + type_label = str_glue( + "{type} (n={format(n_total, big.mark = \".\", decimal.mark = \",\")})" + ), + type_label = str_replace(type_label, "\\) \\(", ", ") + ) |> + arrange(type) |> + mutate(type_label = factor(type_label)) |> + ggplot(aes(x = date, y = tightness_index, color = type_label)) + + geom_rect( + data = tibble(), + aes( + xmin = as.Date(-Inf), xmax = as.Date(Inf), + ymin = c(0, 5, 10), ymax = c(5, 10, Inf), + fill = c("Sellers market", "Balanced market", "Buyers market") + ), + color = "transparent", alpha = 0.2, key_glyph = "point", inherit.aes = FALSE + ) + geom_hline(yintercept = 0, color = "grey30", size = 1) + - geom_line(size = 1.2, alpha = 0.8, lineend = "round", key_glyph = "point") + - labs(title = "Amsterdam has had a sellers market for nearly 5 years", - x = NULL, - y = "Indicator of _\"density\"_ on the housing market", - color = NULL, - fill = "Type of market:", - caption = "_**Data**: MVA_") + - scale_x_date(expand = c(0,0)) + - scale_y_continuous(trans = "reverse", expand = c(0,0)) + - scale_color_manual(values = colors, - guide = guide_legend(title.position = "top", title.hjust = 0.5, nrow = 2, - label.position = "right", order = 1, - override.aes = list(fill = "transparent", size = 6, alpha = 1))) + - scale_fill_manual(values = c("#F5E000","#00B300","#D1000E"), - limits = c("Buyers market","Balanced market","Sellers market"), - guide = guide_legend(order = 2, override.aes = list(shape = 21, size = 6, alpha = 1, stroke = 0))) + + geom_line(size = 1.2, alpha = 0.8, lineend = "round", key_glyph = "point") + + labs( + title = "Amsterdam has had a sellers market for nearly 5 years", + x = NULL, + y = "Indicator of _\"density\"_ on the housing market", + color = NULL, + fill = "Type of market:", + caption = "_**Data**: MVA_" + ) + + scale_x_date(expand = c(0, 0)) + + scale_y_continuous(trans = "reverse", expand = c(0, 0)) + + scale_color_manual( + values = colors, + guide = guide_legend( + title.position = "top", title.hjust = 0.5, nrow = 2, + label.position = "right", order = 1, + override.aes = list(fill = "transparent", size = 6, alpha = 1) + ) + ) + + scale_fill_manual( + values = c("#F5E000", "#00B300", "#D1000E"), + limits = c("Buyers market", "Balanced market", "Sellers market"), + guide = guide_legend( + order = 2, + override.aes = list( + shape = 21, size = 6, + alpha = 1, stroke = 0 + ) + ) + ) + coord_cartesian(clip = "off") + - theme(plot.title = element_textbox(size = 16), - plot.subtitle = element_textbox(size = 10), - 
axis.title.y = element_textbox(orientation = "left-rotated", - width = grid::unit(2, "in")), - legend.key = element_rect(fill = "transparent")) + theme( + plot.title = element_textbox(size = 16), + plot.subtitle = element_textbox(size = 10), + axis.title.y = element_textbox( + orientation = "left-rotated", + width = grid::unit(2, "in") + ), + legend.key = element_rect(fill = "transparent") + ) ``` So, there's a lot of competition among buyers, and people looking to sell their houses can expect to be paid more than they anticipated. Dozens of buyers compete for the same properties, driving up the price. The figure below shows the percentage of properties sold compared to the number of properties offered. It's clear that after the 2008 housing bubble crisis, the housing market was still recovering in 2012 and 2013. However, since 2016, more apartments were sold than were put on the market. This means that the number of properties available for the growing number of people wanting to move to Amsterdam is decreasing. This decreases supply at a time of increasing demand, thus pushing prices higher twice over. -

A new phenomenon that entered the scene a little while ago may indicate how skewed the market is. People wanting to buy a house in a certain neighborhood will put notes and letters in the mailboxes of people living in that area saying "Nice house! Are you interested in selling it to me?". This is no longer an uncommon way to find a house. Some people living in popular neighborhoods are inundated with notes from agencies facilitating these practices. See also a news item by RTL.

+{{{< standout bg="#acc8d4" >}}} +A new phenomenon that entered the scene a little while ago may indicate how skewed the market is. People wanting to buy a house in a certain neighborhood will put notes and letters in the mailboxes of people living in that area saying _"Nice house! Are you interested in selling it to me?"_. This is now not an uncommon strategy to find a house. Some people living in popular neighborshoods are inundated with notes from agencies facilitating these practices. See also a [news item by RTL](https://www.rtlnieuws.nl/geld-en-werk/artikel/19001/droomhuis-gezien-maar-niet-te-koop-stop-dan-een-briefje-de-bus) +{{{< /standout >}}} ```{r} #| label: plot-n-sales @@ -323,29 +491,45 @@ So, there's a lot of competition among buyers, and people looking to sell their #| code-fold: true #| code-summary: "Show code" -data |> - filter(type != "Total") |> - ggplot(aes(x = date, y = perc_sold, fill = type)) + - geom_col(key_glyph = "point") + +data |> + filter(type != "Total") |> + ggplot(aes(x = date, y = perc_sold, fill = type)) + + geom_col(key_glyph = "point") + geom_hline(yintercept = 1, linetype = "dashed", color = "grey20") + - labs(title = "Some years, more houses are sold than are being put on the market", - x = NULL, - y = "Percentage of offered houses sold", - fill = NULL, - caption = "_**Data**: MVA_") + + labs( + title = "Some years, more houses are sold than are being put on the market", + x = NULL, + y = "Percentage of offered houses sold", + fill = NULL, + caption = "_**Data**: MVA_" + ) + coord_cartesian(clip = "off") + - scale_y_continuous(labels = scales::label_percent(), expand = c(0,NA), n.breaks = 4) + - scale_fill_manual(values = colors, - guide = guide_legend(title.position = "top", title.hjust = 0.5, nrow = 2, - label.position = "right", - override.aes = list(shape = 21, size = 6, alpha = 1, stroke = 0))) + - theme(plot.title = element_textbox(size = 20, - width = grid::unit(6,"in")), - plot.subtitle = element_textbox(size = 10), - strip.text.x = element_markdown(face = "bold", padding = margin(10,0,5,0,"pt")), - strip.background = element_rect(fill = "transparent"), - legend.key = element_rect(fill = "transparent")) + - facet_wrap(~ type, strip.position = "top", scales = "free_x") + scale_y_continuous( + labels = scales::label_percent(), + expand = c(0, NA), n.breaks = 4 + ) + + scale_fill_manual( + values = colors, + guide = guide_legend( + title.position = "top", title.hjust = 0.5, nrow = 2, + label.position = "right", + override.aes = list(shape = 21, size = 6, alpha = 1, stroke = 0) + ) + ) + + theme( + plot.title = element_textbox( + size = 20, + width = grid::unit(6, "in") + ), + plot.subtitle = element_textbox(size = 10), + strip.text.x = element_markdown( + face = "bold", + padding = margin(10, 0, 5, 0, "pt") + ), + strip.background = element_rect(fill = "transparent"), + legend.key = element_rect(fill = "transparent") + ) + + facet_wrap(~type, strip.position = "top", scales = "free_x") ``` This adds fuel to the fire. I guess I'm trying to show that there are a number of factors stacked against young buyers. Any money you have saved up you need to spend to outbid the massive competition. The massive competition means buyers only have a handful of feasible options. Again, because of the massive competition, the chance they actually get to buy one of those options is low. The number of options is not steady either, the number properties sold has overtaken the number of properties being put on the market. There is a dwindling reserve of properties. 
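That dwindling reserve can be made concrete with a quick back-of-the-envelope calculation. As a rough sketch, assuming `data` still carries the `n_offered` and `n_sold` counts used in the plots above, the running gap between what was listed and what was sold in each segment could be tracked like this:

``` r
# Rough sketch: cumulative gap between properties offered and properties sold,
# per property type; assumes `data` has the `n_offered` and `n_sold` columns
library(dplyr)

supply_gap <- data |>
  filter(type != "Total") |>
  group_by(type) |>
  arrange(date, .by_group = TRUE) |>
  mutate(
    net_added = n_offered - n_sold,  # negative when more homes sell than are listed
    running_reserve = cumsum(net_added)  # cumulative change in the available pool
  ) |>
  ungroup()
```

A persistently negative `running_reserve` would mean that, over the whole period, more properties were sold than were newly offered, which is exactly the shrinking pool described above. It's only a proxy, since unsold listings carry over from one quarter to the next.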
Combine this with more and more young people wanting to move to the bigger cities and you have a perfect cocktail for a congested housing market. Building new properties can counteract this, but over the last years the Netherlands has actually slowed building new properties. @@ -360,55 +544,88 @@ This adds fuel to the fire. I guess I'm trying to show that there are a number o #| code-summary: "Show code" decades <- tibble( - x1 = seq(1920,2020,10), + x1 = seq(1920, 2020, 10), x2 = x1 + 10 -) |> - slice(seq(1,1e2,2)) |> +) |> + slice(seq(1, 1e2, 2)) |> filter(x2 < 2030) -hist_data_homes <- cbsodataR::cbs_get_data("82235NED") |> - janitor::clean_names() |> - rename(stock_end = eindstand_voorraad_8) |> - mutate(year = parse_number(perioden), - stock_end = stock_end * 1e3, - diff = stock_end - lag(stock_end)) +hist_data_homes <- cbsodataR::cbs_get_data("82235NED") |> + janitor::clean_names() |> + rename(stock_end = eindstand_voorraad_8) |> + mutate( + year = parse_number(perioden), + stock_end = stock_end * 1e3, + diff = stock_end - lag(stock_end) + ) -total_hist_plot <- hist_data_homes |> +total_hist_plot <- hist_data_homes |> ggplot(aes(x = year, y = stock_end)) + - geom_rect(data = tibble(), aes(xmin = 1940.01, xmax = 1945, ymin = -Inf, ymax = Inf), - fill = "red", alpha = 0.2, inherit.aes = FALSE) + - geom_text(data = tibble(), aes(x = 1942.5, y = max(hist_data_homes$stock_end, na.rm = TRUE), vjust = 0, label = "WWII"), size = 3, - fontface = "bold", family = "nunito-sans", inherit.aes = FALSE) + - geom_rect(data = decades, aes(xmin = x1, xmax = x2, ymin = -Inf, ymax = Inf), - fill = "grey30", alpha = 0.2, color = "transparent", inherit.aes = FALSE) + + geom_rect( + data = tibble(), aes(xmin = 1940.01, xmax = 1945, ymin = -Inf, ymax = Inf), + fill = "red", alpha = 0.2, inherit.aes = FALSE + ) + + geom_text( + data = tibble(), + aes(x = 1942.5, y = max(hist_data_homes$stock_end, na.rm = TRUE)), + label = "WWII", fontface = "bold", family = "nunito-sans", + vjust = 0, size = 3, inherit.aes = FALSE + ) + + geom_rect( + data = decades, aes(xmin = x1, xmax = x2, ymin = -Inf, ymax = Inf), + fill = "grey30", alpha = 0.2, color = "transparent", inherit.aes = FALSE + ) + geom_line(color = "darkred", size = 1.5, lineend = "round") + - labs(x = NULL, - y = "Total number of homes") + - scale_x_continuous(expand = c(0,0), breaks = c(decades$x1,decades$x2,2020)) + + labs( + x = NULL, + y = "Total number of homes" + ) + + scale_x_continuous( + expand = c(0, 0), + breaks = c(decades$x1, decades$x2, 2020) + ) + scale_y_continuous(labels = scales::label_number()) + coord_cartesian(clip = "off") -diff_hist_plot <- hist_data_homes |> +diff_hist_plot <- hist_data_homes |> ggplot(aes(x = year, y = diff)) + geom_hline(yintercept = 0.5, color = "grey30", size = 1) + - geom_rect(data = tibble(), aes(xmin = 1940.01, xmax = 1945, ymin = -Inf, ymax = Inf), - fill = "red", alpha = 0.2, inherit.aes = FALSE) + - geom_text(data = tibble(), aes(x = 1942.5, y = max(hist_data_homes$diff, na.rm = TRUE), vjust = 0, label = "WWII"), size = 3, - fontface = "bold", family = "nunito-sans", inherit.aes = FALSE) + - geom_rect(data = decades, aes(xmin = x1, xmax = x2, ymin = -Inf, ymax = Inf), - fill = "grey30", alpha = 0.2, color = "transparent", inherit.aes = FALSE) + + geom_rect( + data = tibble(), aes(xmin = 1940.01, xmax = 1945, ymin = -Inf, ymax = Inf), + fill = "red", alpha = 0.2, inherit.aes = FALSE + ) + + geom_text( + data = tibble(), aes( + x = 1942.5, + y = max(hist_data_homes$diff, na.rm = TRUE), + vjust = 0, label = 
"WWII" + ), + size = 3, fontface = "bold", + family = "nunito-sans", inherit.aes = FALSE + ) + + geom_rect( + data = decades, aes(xmin = x1, xmax = x2, ymin = -Inf, ymax = Inf), + fill = "grey30", alpha = 0.2, color = "transparent", inherit.aes = FALSE + ) + geom_line(color = "grey30", alpha = 0.5, size = 1.5, lineend = "round") + geom_smooth(color = "darkred", size = 1.5, se = FALSE) + - labs(x = NULL, - y = "Net homes added per year") + - scale_x_continuous(expand = c(0,0), breaks = c(decades$x1,decades$x2,2020)) + + labs( + x = NULL, + y = "Net homes added per year" + ) + + scale_x_continuous( + expand = c(0, 0), + breaks = c(decades$x1, decades$x2, 2020) + ) + scale_y_continuous(labels = scales::label_number_auto()) + coord_cartesian(clip = "off") total_hist_plot / diff_hist_plot + - plot_annotation(title = "Number of houses available", - subtitle = "Data covers development in the Netherlands nationwide", - caption = "**Data**: CBS") & + plot_annotation( + title = "Number of houses available", + subtitle = "Data covers development in the Netherlands nationwide", + caption = "**Data**: CBS" + ) & theme(plot.title = element_textbox(size = 20)) ``` @@ -417,6 +634,10 @@ The figure displays data from from the CBS through the `{cbsodataR}` package. It It's also important to mention that between the 60s and 70s, the Netherlands started building entire cities from scratch on newly-reclaimed land from the sea. The province of [Flevoland](https://en.wikipedia.org/wiki/Flevoland) is currently home to about 415 000 people, and until the early 50s this province was non-existent, the current land was at the bottom of a bay of the North sea (called the Zuiderzee or "Southern Sea" in English). Other than a general enthusiasm for building, I think this contributed considerably to the increase in the number of homes added in the 60s and 70s. This new province has good access to Amsterdam, and if it weren't for this new piece of land, the shortage might have peaked earlier. +{{{< sidenote >}}} +There's actually quite a few more, but I'll focus on the ones I can quantify at the moment +{{{< /sidenote >}}} + But that's not all, there's a few other features that contribute to the gridlock. See, not only young people are on the market to buy apartments in Amsterdam. There's a thriving market for investors looking to take advantage of the rising prices in the Amsterdam housing market (source: [Algemeen Dagblad](https://www.ad.nl/wonen/beleggers-verwerven-massaal-koophuizen-voor-verhuur-in-amsterdam~afedc50c6/)). According to the Dutch central bank, about 1 in 5 properties are sold to investors, who are mostly looking to convert it to a rental property or flip it for a profit. I couldn't find the data the Dutch central bank relied on, but I found something else. The Central Bureau for Statistics collects data on the number of "mutations" among properties. The "mutation" in this case refers to the change of purpose of a property, e.g. if a house meant for sale is bought and then transformed to a rental property by either private individuals or corporations, or vice versa. I collected this data from the governmental yearly report on the housing market of 2020 ("_Staat van de Woningmarkt - Jaarrapportage 2020_", [link](https://www.rijksoverheid.nl/documenten/rapporten/2020/06/15/staat-van-de-woningmarkt-jaarrapportage-2020)). Instead of per quarter, this data is reported per year. Unfortunately, the data on housing mutations in the report (from 2020 mind you) only goes until 2017. 
It's important to note that these numbers are country-wide, not specific for Amsterdam. That means there could be some other factors in play. Many of these trends are present across the country, but they're massively amplified in the larger cities. The data was contained in a pdf that wasn't easily machine-readable, so I had to manually copy the numbers into a tibble, which was great... ```{r} @@ -425,7 +646,8 @@ But that's not all, there's a few other features that contribute to the gridlock #| code-summary: "Show code" house_mutations_in <- tribble( - ~year, ~buy_to_rent_corp, ~buy_to_rent_other, ~rent_corp_to_buy, ~rent_corp_to_rent_other, ~rent_other_to_buy, ~rent_other_to_rent_corp, + ~year, ~buy_to_rent_corp, ~buy_to_rent_other, ~rent_corp_to_buy, + ~rent_corp_to_rent_other, ~rent_other_to_buy, ~rent_other_to_rent_corp, 2012, 900, 58000, 14600, 5500, 50600, 4900, 2013, 800, 62200, 15200, 11500, 50900, 6000, 2014, 1000, 62200, 15400, 9300, 59900, 39000, @@ -434,19 +656,28 @@ house_mutations_in <- tribble( 2017, 1600, 98900, 6400, 11000, 7300, 9000 ) -house_mutations <- house_mutations_in |> - pivot_longer(cols = -year, names_to = "mutation", values_to = "n_mutations") |> - mutate(from = str_extract(mutation, "(.*?)_to"), - from = str_remove_all(from, "_to"), - to = str_extract(mutation, "to_(.*?)$"), - to = str_remove_all(to, "to_")) |> - group_by(year) |> - mutate(total_mutations = sum(n_mutations), - perc_mutations = n_mutations / total_mutations, - across(c(from,to), ~ case_when(str_detect(.x,"buy") ~ "buy", - str_detect(.x,"rent_corp") ~ "rent (corporation)", - str_detect(.x,"rent_other") ~ "rent (other)")), - mutation_label = str_glue("From {from} to {to}")) |> +house_mutations <- house_mutations_in |> + pivot_longer( + cols = -year, + names_to = "mutation", values_to = "n_mutations" + ) |> + mutate( + from = str_extract(mutation, "(.*?)_to"), + from = str_remove_all(from, "_to"), + to = str_extract(mutation, "to_(.*?)$"), + to = str_remove_all(to, "to_") + ) |> + group_by(year) |> + mutate( + total_mutations = sum(n_mutations), + perc_mutations = n_mutations / total_mutations, + across(c(from, to), ~ case_when( + str_detect(.x, "buy") ~ "buy", + str_detect(.x, "rent_corp") ~ "rent (corporation)", + str_detect(.x, "rent_other") ~ "rent (other)" + )), + mutation_label = str_glue("From {from} to {to}") + ) |> ungroup() ``` @@ -460,46 +691,77 @@ So not every year there's the same number of "mutations" (transformations of pur #| code-fold: true #| code-summary: "Show code" -mutations_n_plot <- house_mutations |> - arrange(from,to) |> - mutate(mutation_label = fct_inorder(mutation_label)) |> - ggplot(aes(x = year, y = n_mutations, alluvium = mutation_label, fill = mutation_label)) + +mutations_n_plot <- house_mutations |> + arrange(from, to) |> + mutate(mutation_label = fct_inorder(mutation_label)) |> + ggplot(aes( + x = year, y = n_mutations, + alluvium = mutation_label, fill = mutation_label + )) + geom_point(shape = 21, color = "transparent", size = NA) + ggalluvial::geom_flow(width = 0, show.legend = FALSE) + - labs(x = NULL, - y = "Number of mutations", - fill = NULL) + - scale_x_continuous(expand = c(0,0)) + - scale_y_continuous(labels = scales::label_number_auto(), expand = c(0,0)) + - scale_fill_manual(values = rev(colors), - guide = guide_legend(title.position = "top", title.hjust = 0.5, nrow = 2, - label.position = "right", - override.aes = list(color = "transparent", size = 6, alpha = 1, stroke = 0))) + + labs( + x = NULL, + y = "Number of mutations", + fill = NULL + ) + + 
scale_x_continuous(expand = c(0, 0)) + + scale_y_continuous( + labels = scales::label_number_auto(), + expand = c(0, 0) + ) + + scale_fill_manual( + values = rev(colors), + guide = guide_legend( + title.position = "top", title.hjust = 0.5, nrow = 2, + label.position = "right", + override.aes = list( + color = "transparent", size = 6, + alpha = 1, stroke = 0 + ) + ) + ) + theme(legend.key = element_rect(fill = "transparent")) -mutations_perc_plot <- house_mutations |> - arrange(from,to) |> - mutate(mutation_label = fct_inorder(mutation_label)) |> - ggplot(aes(x = year, y = perc_mutations, alluvium = mutation_label, fill = mutation_label)) + +mutations_perc_plot <- house_mutations |> + arrange(from, to) |> + mutate(mutation_label = fct_inorder(mutation_label)) |> + ggplot(aes( + x = year, y = perc_mutations, + alluvium = mutation_label, fill = mutation_label + )) + geom_point(shape = 21, color = "transparent", size = NA) + ggalluvial::geom_flow(width = 0, show.legend = FALSE) + - labs(x = NULL, - y = "Percentage of total mutations per quarter", - fill = NULL) + - scale_x_continuous(expand = c(0,0)) + - scale_y_continuous(labels = scales::label_percent(), expand = c(0,0)) + - scale_fill_manual(values = rev(colors), - guide = guide_legend(title.position = "top", title.hjust = 0.5, nrow = 2, - label.position = "right", - override.aes = list(color = "transparent", size = 6, alpha = 1, stroke = 0))) + - theme(axis.title.y = element_textbox(width = grid::unit(2,"in")), - legend.key = element_rect(fill = "transparent")) + labs( + x = NULL, + y = "Percentage of total mutations per quarter", + fill = NULL + ) + + scale_x_continuous(expand = c(0, 0)) + + scale_y_continuous(labels = scales::label_percent(), expand = c(0, 0)) + + scale_fill_manual( + values = rev(colors), + guide = guide_legend( + title.position = "top", title.hjust = 0.5, nrow = 2, + label.position = "right", + override.aes = list( + color = "transparent", size = 6, + alpha = 1, stroke = 0 + ) + ) + ) + + theme( + axis.title.y = element_textbox(width = grid::unit(2, "in")), + legend.key = element_rect(fill = "transparent") + ) mutations_n_plot / mutations_perc_plot + - plot_annotation(title = "Shift to a renters market", - subtitle = "_A lot more houses meant for sale are being transformed into rental properties than vice versa_", - caption = "_**Data**: CBS_") + - plot_layout(guides = 'collect') & + plot_annotation( + title = "Shift to a renters market", + subtitle = "_A lot more houses meant for sale are being transformed into rental properties than vice versa_", + caption = "_**Data**: CBS_" + ) + + plot_layout(guides = "collect") & theme(plot.title = element_textbox(size = 20)) ``` @@ -515,53 +777,66 @@ We can look at the net number of houses added to the rental market by adding up #| code-fold: true #| code-summary: "Show code" -net_house_mutations <- house_mutations |> - mutate(across(c(from,to), ~ case_when(str_detect(.x, "rent") ~ "rent", - str_detect(.x, "buy") ~ "buy"))) |> - group_by(year) |> - filter(from != to) |> - mutate(total_mutations = sum(n_mutations)) |> - group_by(year, from, to) |> - summarise(n_mutations = sum(n_mutations), - total_mutations = first(total_mutations)) |> - pivot_wider(id_cols = c(year,total_mutations), names_from = c(from,to), values_from = n_mutations, - names_sep = "_to_") |> - mutate(net_buy_to_rent = buy_to_rent - rent_to_buy, - perc_buy_to_rent = buy_to_rent / total_mutations) - -net_mutation_plot <- net_house_mutations |> - ggplot(aes(x = year, y = net_buy_to_rent)) + +net_house_mutations <- 
house_mutations |> + mutate(across(c(from, to), ~ case_when( + str_detect(.x, "rent") ~ "rent", + str_detect(.x, "buy") ~ "buy" + ))) |> + group_by(year) |> + filter(from != to) |> + mutate(total_mutations = sum(n_mutations)) |> + group_by(year, from, to) |> + summarise( + n_mutations = sum(n_mutations), + total_mutations = first(total_mutations) + ) |> + pivot_wider( + id_cols = c(year, total_mutations), + names_from = c(from, to), + values_from = n_mutations, + names_sep = "_to_" + ) |> + mutate( + net_buy_to_rent = buy_to_rent - rent_to_buy, + perc_buy_to_rent = buy_to_rent / total_mutations + ) + +net_mutation_plot <- net_house_mutations |> + ggplot(aes(x = year, y = net_buy_to_rent)) + geom_hline(yintercept = 0, color = "grey30", size = 1) + - #geom_ribbon(aes(ymin = 0, ymax = net_buy_to_rent), fill = "grey30", alpha = 0.2) + geom_line(color = "darkred", size = 1.5, lineend = "round") + - #geom_textbox(data = tibble(), aes(x = 2014, y = 4e4, - # label = "Net number of properties meant for sale withdrawn from the market"), - # family = "nunito-sans", size = 4, fill = "transparent", - # maxwidth = grid::unit(2,"in"), hjust = 0.5, vjust = 0) + - #geom_curve(data = tibble(), aes(x = 2014, y = 4e4, xend = 2016.5, yend = 2.5e4), - # curvature = 0.3, size = 0.75, arrow = arrow(length = unit(2,"mm")), - # lineend = "round") + - labs(x = NULL, - y = "**Net number of properties changed from properties meant for sale to rental properties**") + - scale_y_continuous(labels = scales::label_number_auto(), limits = c(-3e4, NA), n.breaks = 5) + - theme(axis.title.y = element_textbox(width = grid::unit(3.5,"in"))) - -perc_mutation_plot <- net_house_mutations |> - ggplot(aes(x = year, y = perc_buy_to_rent)) + + labs( + x = NULL, + y = "**Net number of properties changed from properties meant for sale to rental properties**" + ) + + scale_y_continuous( + labels = scales::label_number_auto(), + limits = c(-3e4, NA), n.breaks = 5 + ) + + theme(axis.title.y = element_textbox(width = grid::unit(3.5, "in"))) + +perc_mutation_plot <- net_house_mutations |> + ggplot(aes(x = year, y = perc_buy_to_rent)) + geom_hline(yintercept = 0.5, color = "grey30", size = 1) + geom_line(color = "darkred", size = 1.5, lineend = "round") + - labs(x = NULL, - y = "**Percentage of mutations that changed properties meant for sale to rental properties**") + - scale_y_continuous(labels = scales::label_percent(), limits = c(0,1), expand = c(0,0)) + + labs( + x = NULL, + y = "**Percentage of mutations that changed properties meant for sale to rental properties**" + ) + + scale_y_continuous( + labels = scales::label_percent(), + limits = c(0, 1), expand = c(0, 0) + ) + coord_cartesian(clip = "off") + - theme(axis.title.y = element_textbox(width = grid::unit(3.5,"in"))) - -net_mutation_plot + perc_mutation_plot + - plot_annotation(title = "Major net shift towards rental market", - subtitle = "_A lot more houses meant for sale are being transformed into rental properties than vice versa_", - caption = "_**Data**: CBS_") & + theme(axis.title.y = element_textbox(width = grid::unit(3.5, "in"))) + +net_mutation_plot + perc_mutation_plot + + plot_annotation( + title = "Major net shift towards rental market", + subtitle = "_A lot more houses meant for sale are being transformed into rental properties than vice versa_", + caption = "_**Data**: CBS_" + ) & theme(plot.title = element_textbox(size = 20)) - ``` So in 2017 nearly 90 000 houses were mutated from sale to rental properties. 
In that same year, about 62 000 new homes were built (source: [CBS](https://www.cbs.nl/nl-nl/nieuws/2018/04/hoogste-aantal-nieuwbouwwoningen-in-acht-jaar)). Not all of those 62 000 homes were meant for sale; a share was added to the rental market. But even if all those new homes had been meant for sale, the sale market would still have shrunk due to the (net) ~90 000 properties that were transformed and thereby removed from the sale market. @@ -572,36 +847,49 @@ So in 2017 nearly 90 000 houses were mutated from sale to rental properties. In #| code-fold: true #| code-summary: "Show code" -house_mutations |> - filter(year == max(year)) |> - select(from, to, n_mutations, perc_mutations) |> - mutate(across(c(from,to), ~ str_to_sentence(.x))) |> - arrange(-perc_mutations) |> - gt() |> - fmt_number(columns = "n_mutations", sep_mark = " ", drop_trailing_zeros = TRUE) |> - fmt_percent(columns = "perc_mutations") |> +house_mutations |> + filter(year == max(year)) |> + select(from, to, n_mutations, perc_mutations) |> + mutate(across(c(from, to), ~ str_to_sentence(.x))) |> + arrange(-perc_mutations) |> + gt() |> + fmt_number( + columns = "n_mutations", sep_mark = " ", + drop_trailing_zeros = TRUE + ) |> + fmt_percent(columns = "perc_mutations") |> grand_summary_rows( columns = "n_mutations", fns = list(total = "sum"), - formatter = fmt_number, + fmt = ~ fmt_number(.), sep_mark = " ", - drop_trailing_zeros = TRUE) |> + drop_trailing_zeros = TRUE + ) |> grand_summary_rows( columns = "perc_mutations", fns = list(total = "sum"), - formatter = fmt_percent, - missing_text = "") |> + fmt = ~ fmt_percent(.), + missing_text = "" + ) |> cols_label( from = html("
From
"), to = html("
To
"), n_mutations = html("
Number of mutations
"), perc_mutations = html("
Percentage of total mutations
") - ) |> - tab_header(title = html("

Mutations in the Dutch housing market

"), - subtitle = html("

Data from the year 2017

")) |> - tab_source_note(source_note = md("_**Data**: CBS_")) |> - tab_options(table.background.color = "#EBEBEB", - grand_summary_row.text_transform = "capitalize") |> + ) |> + tab_header( + title = html( + "

Mutations in the Dutch housing market

" + ), + subtitle = html( + "

Data from the year 2017

" + ) + ) |> + tab_source_note(source_note = md("_**Data**: CBS_")) |> + tab_options( + table.background.color = "#EBEBEB", + grand_summary_row.text_transform = "capitalize" + ) |> gtsave("mutations-table.png", expand = 0) ``` @@ -616,48 +904,65 @@ So what's the result of all these phenomena? The figure below shows the housing #| code-fold: true #| code-summary: "Show code" -asking_start <- data |> - filter(type != "Total", - date == min(date)) |> - rename(asking_start = asking_price) |> +asking_start <- data |> + filter( + type != "Total", + date == min(date) + ) |> + rename(asking_start = asking_price) |> select(type, asking_start) -data_asking_index <- data |> - filter(type != "Total") |> - left_join(asking_start) |> +data_asking_index <- data |> + filter(type != "Total") |> + left_join(asking_start) |> mutate(asking_index = asking_price / asking_start) -data_asking_index |> - ggplot(aes(x = date, y = asking_index, color = type, group = type)) + - geom_line(size = 2, alpha = 0.15, lineend = "round", show.legend = FALSE) + +data_asking_index |> + ggplot(aes(x = date, y = asking_index, color = type, group = type)) + + geom_line(size = 2, alpha = 0.15, lineend = "round", show.legend = FALSE) + geom_smooth(se = FALSE, show.legend = FALSE) + - geom_point(size = NA) + - labs(title = "Price development over the past decade", - subtitle = "_On average, properties doubled in value since 2012_", - x = NULL, - y = "Price development relative to early 2012", - color = NULL, - caption = "_**Data**: MVA_") + - scale_y_continuous(labels = scales::label_percent()) + - scale_color_manual(values = colors, - guide = guide_legend(title.position = "top", title.hjust = 0.5, nrow = 2, - label.position = "right", - override.aes = list(fill = "transparent", size = 6, alpha = 1))) + - theme(plot.title = element_textbox(size = 20), - axis.title.y = element_textbox(width = grid::unit(2.5, "in")), - legend.key = element_rect(fill = "transparent", color = "transparent")) - + geom_point(size = NA) + + labs( + title = "Price development over the past decade", + subtitle = "_On average, properties doubled in value since 2012_", + x = NULL, + y = "Price development relative to early 2012", + color = NULL, + caption = "_**Data**: MVA_" + ) + + scale_y_continuous(labels = scales::label_percent()) + + scale_color_manual( + values = colors, + guide = guide_legend( + title.position = "top", title.hjust = 0.5, nrow = 2, + label.position = "right", + override.aes = list(fill = "transparent", size = 6, alpha = 1) + ) + ) + + theme( + plot.title = element_textbox(size = 20), + axis.title.y = element_textbox(width = grid::unit(2.5, "in")), + legend.key = element_rect(fill = "transparent", color = "transparent") + ) ``` -Adjusted for the number of properties offered, the average asking price in Amsterdam has increased by `r data_asking_index |> filter(date == max(date)) %$% weighted.mean(asking_index, n_offered) %>% subtract(.,1) |> scales::percent()`. That means that prices since 2012 have (on average) doubled. For apartments, this increase was `r data_asking_index |> filter(str_detect(type,"Apartments"), date == max(date)) %$% weighted.mean(asking_index, n_offered) %>% subtract(.,1) |> scales::percent()`. Inflation on the other hand was only about 8.5% since 2012. So young people wanting to buy a house in Amsterdam need to bring nearly twice as much money to the table as they did just a decade ago. The money they saved and the salaries they earned have by no means kept up with the increasing price though. 
+{{{< sidenote >}}} +Inflation since 2012 was about 8.5% which should be subtracted from these values +{{{< /sidenote >}}} + +Adjusted for the number of properties offered, the average asking price in Amsterdam has increased by `r data_asking_index |> filter(date == max(date)) %$% weighted.mean(asking_index, n_offered) %>% subtract(.,1) |> scales::percent()`. That means that prices since 2012 have (on average) roughly doubled. For apartments, this increase was `r data_asking_index |> filter(str_detect(type,"Apartments"), date == max(date)) %$% weighted.mean(asking_index, n_offered) %>% subtract(.,1) |> scales::percent()`. So young people wanting to buy a house in Amsterdam need to bring nearly twice as much money to the table as they did just a decade ago. The money they saved and the salaries they earned have by no means kept up with the increasing price though. I cannot know this for sure, because the data I presented here doesn't contain the answer, but perhaps these factors combined play a role in the financial uncertainty that millenials experience. How do we fix it? I'm not sure. This is a really complicated issue. But a few things might (in my opinion) help: -1. **Build more (sustainable) durable houses** -1. **Prohibit sale of properties to individuals who don't plan to live there themselves for at least 2 or 3 years** -1. **Increase transparency when buying or selling a house through mandatory public offerings to prevent scooping by investors** -1. **Rent control to make _"mutating"_ houses less profitable** -1. **Increased socialized housing for poor and low-income families** +- **Build more (sustainable) durable houses** + +- **Prohibit sale of properties to individuals who don't plan to live there themselves for at least 2 or 3 years** + +- **Increase transparency when buying or selling a house through mandatory public offerings to prevent scooping by investors** + +- **Rent control to make _"mutating"_ houses less profitable** + +- **Increased socialized housing for poor and low-income families** Perhaps if we implement a few of these strategies we can make the housing market a little fairer and perhaps ensure that future generations will have the same opportunities current home-owners have. diff --git a/content/blog/2021-amsterdam-housing-market/mutations-table.png b/content/blog/2021-amsterdam-housing-market/mutations-table.png index 3488a5c..a489091 100644 Binary files a/content/blog/2021-amsterdam-housing-market/mutations-table.png and b/content/blog/2021-amsterdam-housing-market/mutations-table.png differ diff --git a/content/blog/2021-comparison-fa-pca-ica/sobar-72.csv b/content/blog/2021-comparison-fa-pca-ica/data/sobar-72.csv similarity index 100% rename from content/blog/2021-comparison-fa-pca-ica/sobar-72.csv rename to content/blog/2021-comparison-fa-pca-ica/data/sobar-72.csv diff --git a/content/blog/2021-comparison-fa-pca-ica/index.md b/content/blog/2021-comparison-fa-pca-ica/index.md index 00f9c7c..c2935b5 100644 --- a/content/blog/2021-comparison-fa-pca-ica/index.md +++ b/content/blog/2021-comparison-fa-pca-ica/index.md @@ -14,13 +14,19 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- This is just a very quick blog post outlining some of the commonalities and differences between factor analysis (FA), principal component analysis (PCA), and independent component analysis (ICA). 
I was inspired to write some of this down through some confusion caused in the lab by SPSS' apparent dual usage of the term "factor analysis" and "principal components". A few of my colleagues who use SPSS showed me the following screen: -{{< image src="spss-screenshot.png" alt="spss screenshot" >}} +{{< figure src="spss-screenshot.png" alt="spss screenshot" >}} -This screen shows up when you click `Analyze` -\> `Dimension Reduction` -\> `Factor`, which then opens a window called "Factor Analysis: Extraction" which lets you pick "Principal components" as a method. To put the (apparent) PCA method as a sub-section of factor analysis is misleading at best, and straight-up erronious at worst. The other options for the method here is "Principal axis factoring" (which is closer to traditional factor analysis) and "Maximum likelihood" ([source for screenshot and SPSS interface](https://stats.idre.ucla.edu/spss/seminars/efa-spss/)). If you're wondering if you're the first one to be confused by SPSS' choice to present this in such a way, you're not ([link](https://stats.stackexchange.com/questions/1576/what-are-the-differences-between-factor-analysis-and-principal-component-analysi), [link](https://stats.stackexchange.com/questions/24781/interpreting-discrepancies-between-r-and-spss-with-exploratory-factor-analysis)). +{{< sidenote br="2em" >}} +If you're wondering if you're the first one to be confused by SPSS' choice to present this in such a way, you're not ([link](https://stats.stackexchange.com/questions/1576/what-are-the-differences-between-factor-analysis-and-principal-component-analysi), [link](https://stats.stackexchange.com/questions/24781/interpreting-discrepancies-between-r-and-spss-with-exploratory-factor-analysis)). +{{< /sidenote >}} + +This screen shows up when you click `Analyze` -\> `Dimension Reduction` -\> `Factor`, which then opens a window called "Factor Analysis: Extraction" which lets you pick "Principal components" as a method. To put the (apparent) PCA method as a sub-section of factor analysis is misleading at best, and straight-up erronious at worst. The other options for the method here is "Principal axis factoring" (which is closer to traditional factor analysis) and "Maximum likelihood" ([source for screenshot and SPSS interface](https://stats.idre.ucla.edu/spss/seminars/efa-spss/)). Standard disclaimer: I'm not a statistician, and I'm definitely not confident enough to go in-depth into the mathematics of the different algorithms. Instead, I'll just run three common latent variable modeling/clustering methods, and show the difference in results when applied to the same data. Where-ever I feel confident, I will also elaborate on the underlying mathematical principles and concepts. It's a short post, and I'm sure there's levels of nuance and complexity I've missed. Please let me know if you think I've committed a major oversight. @@ -41,8 +47,8 @@ theme_set(theme_minimal()) We'll load the data into R, clean up the variable names, convert the outcome variable (`ca_cervix`) to a factor. We'll have a look at the dataset using some functions from `{skimr}`. This function will give us summary statistics and basic histograms of the different variables. 
``` r -data <- read_csv("sobar-72.csv") |> - janitor::clean_names() |> +data <- read_csv("./data/sobar-72.csv") |> + janitor::clean_names() |> mutate(ca_cervix = as_factor(ca_cervix)) skim_summ <- skimr::skim_with(base = skimr::sfl()) @@ -96,8 +102,8 @@ Data summary Now let's only select the variables containing the risk factors (called features from here on). We'll also scale all the features to have a mean of 0 and a standard deviation of 1 using the `scale()` function. We can check what the new variable looks like using the `summary()` function. ``` r -data_features <- data |> - select(-ca_cervix) |> +data_features <- data |> + select(-ca_cervix) |> mutate(across(everything(), ~ scale(.x))) summary(data_features$attitude_consistency) @@ -114,17 +120,23 @@ summary(data_features$attitude_consistency) So now we have a data frame with 72 entries and 19 normalized columns, each representing a feature that may or may not predict cervical cancer. We can create a correlation matrix to visualize the degree of correlation between the different features. For this we simply run `cor()` on the data frame with the features, transform the output to a data frame in long format and then visualize it using `ggplot()` and `geom_tile()`. ``` r -cor(data_features) |> - as_tibble() %>% - mutate(feature_y = names(.)) |> - pivot_longer(cols = -feature_y, names_to = "feature_x", values_to = "correlation") |> - mutate(feature_y = fct_rev(feature_y)) |> - ggplot(aes(x = feature_x, y = feature_y, fill = correlation)) + - geom_tile() + - labs(x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "berlin", limits = c(-1,1)) + - coord_fixed() + +cor(data_features) |> + as_tibble() |> + mutate(feature_y = names(data_features)) |> + pivot_longer( + cols = -feature_y, + names_to = "feature_x", + values_to = "correlation" + ) |> + mutate(feature_y = fct_rev(feature_y)) |> + ggplot(aes(x = feature_x, y = feature_y, fill = correlation)) + + geom_tile() + + labs( + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "berlin", limits = c(-1, 1)) + + coord_fixed() + theme(axis.text.x = element_text(hjust = 1, angle = 30)) ``` @@ -138,6 +150,10 @@ Let's now dive into the approaches we'll use. We'll discuss three here, and if t ## Selecting the number of components +{{< sidenote br="6em" >}} +Icasso is implemented in MATLAB, which I won't bore you with here, but it seems someone created a [Python port](https://github.com/Teekuningas/icasso) also +{{< /sidenote >}} + For factor analysis and ICA we need to provide the number of factors we want to extract. We'll use the same number of factors/components throughout this tutorial (also for PCA). Selection of the optimal number of factors/components is a fairly arbitrary process which I won't go into now. In short, before writing this I ran PCA and a tool called [icasso](https://research.ics.aalto.fi/ica/icasso/). *Icasso* runs an ICA algorithm a number of times and provides a number of parameters on the stability of the clusters at different thresholds, this approach is very data-driven. A more common and easier way to get some data-driven insight into the optimal number of clusters is using the "elbow"-method using PCA, eigenvalues of the components, and the cumulative variance explained by the components (we'll show those later). However, in the end, you should also look at the weight matrix of the different cluster thresholds and it becomes a fairly arbitrary process. 
In this case, the *icasso* showed that 7 components was good (but not the best), the weight matrix looked okay, and 7 components explained more than 80% of the variance in the PCA. I went for 7 components in the end also because it served the purpose of this blogpost quite well, but I think different thresholds are valid and you could make a case for different ones based on the data, based on the interpretation of the weight matrix, etc. This process is a bit of an art and a science combined. ``` r @@ -146,12 +162,18 @@ n_comps <- 7 ## Factor Analysis -So, let's run the factor analysis. Technically speaking, factor analysis isn't a clustering method but rather a latent variable modeling method [source](https://stats.stackexchange.com/questions/241726/understanding-exploratory-factor-analysis-some-points-for-clarification). The primary aim of a factor analysis is the reconstruction of correlations/covariances between variables. Maximizing explained variance of the factors is only a secondary aim and we'll get to why that is relevant in the PCA section. +So, let's run the factor analysis. Technically speaking, factor analysis isn't a clustering method but rather a latent variable modeling method ([source](https://stats.stackexchange.com/questions/241726/understanding-exploratory-factor-analysis-some-points-for-clarification)). The primary aim of a factor analysis is the reconstruction of correlations/covariances between variables. Maximizing explained variance of the factors is only a secondary aim and we'll get to why that is relevant in the PCA section. We've established we want 7 factors. The factor analysis method is implemented in R through the `factanal()` function (see [here](https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/factanal%20for%20the%20documentation). This function applies a common factor model using the maximum likelihood method. We'll simply provide it with our data frame, the number of factors we want to extract, and we'll ask to provide Bartlett's weighted least-squares scores as well. We can apply a "rotation" to the factors to make them reduce the complexity of the factor loadings and make them easier to interpret, here we'll use the `varimax` option. Varimax maximizes the sum of the variances of the squared loadings. Then we'll print the model to see the results (and sort so the loadings are ordered). The output may be a bit long. ``` r -fa_model <- factanal(data_features, factors = n_comps, scores = "Bartlett", rotation = "varimax") +fa_model <- factanal( + data_features, + factors = n_comps, + scores = "Bartlett", + rotation = "varimax" +) + print(fa_model, sort = TRUE) ``` @@ -234,19 +256,26 @@ print(fa_model, sort = TRUE) We may be tempted to immediately look at the *p*-value at the end of the output. This *p*-value denotes whether the assumption of perfect fit can be rejected. If this *p*-value is below 0.05 or 0.01 we can reject the hypothesis of perfect fit, meaning that we could probably try a different method or try a different number of factors. In this case, the *p*-value is larger than 0.05 so we cannot reject the hypothesis of perfect fit. -The "Loadings" section of the results show a make-shift weight matrix, but in order to further interpret these results, let's create a plot showing the weight matrix. We'll get the results from the factor analysis model we created earlier using the `tidy()` function from `{broom}` and convert it to long format. We'll then create a weight matrix much in the same way we did earlier. 
+The "Loadings" section of the results show a make-shift weight matrix, but in order to further interpret these results, let's create a plot showing the weight matrix. We'll get the results from the factor analysis model we created earlier using the `tidy()` function from the `{broom}` package and convert it to long format. We'll then create a weight matrix much in the same way we did earlier. ``` r -fa_weight_matrix <- broom::tidy(fa_model) |> - pivot_longer(starts_with("fl"), names_to = "factor", values_to = "loading") - -fa_loading_plot <- ggplot(fa_weight_matrix, aes(x = factor, y = variable, fill = loading)) + - geom_tile() + - labs(title = "FA loadings", - x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "cork", limits = c(-1,1)) + - coord_fixed(ratio = 1/2) +fa_weight_matrix <- broom::tidy(fa_model) |> + pivot_longer(starts_with("fl"), + names_to = "factor", values_to = "loading" + ) + +fa_loading_plot <- ggplot( + fa_weight_matrix, + aes(x = factor, y = variable, fill = loading) +) + + geom_tile() + + labs( + title = "FA loadings", + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "cork", limits = c(-1, 1)) + + coord_fixed(ratio = 1 / 2) print(fa_loading_plot) ``` @@ -258,24 +287,29 @@ Here we can more easily see that there's two strong clusters in Factor 1 and Fac Lastly, I think it would be interesting to see how the different factors relate to each other. We'll take the Bartlett's scores and correlate them with each other much like before and create a correlation matrix like before. ``` r -fa_model$scores |> - cor() |> - data.frame() |> - rownames_to_column("factor_x") |> - pivot_longer(cols = -factor_x, names_to = "factor_y", values_to = "correlation") |> - ggplot(aes(x = factor_x, y = factor_y, fill = correlation)) + +fa_model$scores |> + cor() |> + data.frame() |> + rownames_to_column("factor_x") |> + pivot_longer( + cols = -factor_x, + names_to = "factor_y", values_to = "correlation" + ) |> + ggplot(aes(x = factor_x, y = factor_y, fill = correlation)) + geom_tile() + - geom_text(aes(label = round(correlation,4)), color = "white") + - labs(title = "Correlation between FA scores", - x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "berlin", limits = c(-1,1)) + + geom_text(aes(label = round(correlation, 4)), color = "white") + + labs( + title = "Correlation between FA scores", + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "berlin", limits = c(-1, 1)) + coord_equal() ``` -We can see little correlation but certainly a few non-zero correlations also. In particular Factor 5 and Factor 7 seem to correlate to some extent at least and there are a few others with some minor correlations, e.g. Factor 1 and Factor 3, as well as Factor 3 and Factor 6. We'll compare this later with the other algorithms. +We can see little correlation but certainly a few non-zero correlations also. In particular Factor 5 and Factor 7 seem to correlate to some extent at least and there are a few others with some minor correlations, e.g. Factor 1 and Factor 3, as well as Factor 5 and Factor 6. We'll compare this later with the other algorithms. ## Principal Component Analysis @@ -285,7 +319,11 @@ Principal component analysis can reliably be classfied as a clustering method (a I can recommend this paper as a great primer to PCA methods. It goes over a few concepts very relevant for PCA methods as well as clustering methods in general. -Now, let's run the PCA. 
In R there's two functions built-in to run a principal component analysis, here we'll use the `prcomp()`function (the other being `princomp()`, but `prcomp()` is preferred for reasons beyond the scope of this post). The `prcomp()` function contains a few options we can play with, but in my experience it works fine out of the box if you've normalized the data beforehand ([link to documentation](https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/prcomp)). So we'll simply provide the data frame with the features. In addition, we'll also calculate the variance explained by each component. We do that by simply taking the standard deviation calculated by the PCA and squaring it, we'll save it in a new field called `variance`. +{{< sidenote br="1em" >}} +For some more info on the difference between `prcomp()` and `princomp()` see [this](https://stats.stackexchange.com/questions/20101/what-is-the-difference-between-r-functions-prcomp-and-princomp) thread on CrossValidated +{{< /sidenote >}} + +Now, let's run the PCA. In R there's two functions built-in to run a principal component analysis, here we'll use the `prcomp()` function (the other being `princomp()`, but `prcomp()` is preferred for reasons beyond the scope of this post). The `prcomp()` function contains a few options we can play with, but in my experience it works fine out of the box if you've normalized the data beforehand ([link to documentation](https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/prcomp)). So we'll simply provide the data frame with the features. In addition, we'll also calculate the variance explained by each component. We do that by simply taking the standard deviation calculated by the PCA and squaring it, we'll save it in a new field called `variance`. ``` r pc_model <- prcomp(data_features) @@ -296,40 +334,46 @@ pc_model$variance <- pc_model$sdev^2 Next we can make a simple scree plot using the variance we calculate above. We'll create a plot with the principal components on the x-axis and the eigenvalue on the y-axis. The scree plot is a very popular plot to visualize features of a PCA. In this plot the elbow is quite clearly at principal component 3, but as discussed, the scree plot is not the best nor the only way to determine the optimal number of components. In the code below I also added a calculation for the cumulative variance (`cum_variance`) which showed that a little more than 80% of the variance is captured in the first 7 components, while the first 3 components combined capture only 56%. ``` r -pc_model$variance |> +pc_model$variance |> as_tibble() |> - rename(eigenvalue = value) |> - rownames_to_column("comp") |> - mutate(comp = parse_number(comp), - cum_variance = cumsum(eigenvalue)/sum(eigenvalue)) |> - ggplot(aes(x = comp, y = eigenvalue)) + + rename(eigenvalue = value) |> + rownames_to_column("comp") |> + mutate( + comp = parse_number(comp), + cum_variance = cumsum(eigenvalue) / sum(eigenvalue) + ) |> + ggplot(aes(x = comp, y = eigenvalue)) + geom_hline(yintercept = 1) + - geom_line(size = 1) + + geom_line(linewidth = 1) + geom_point(size = 3) ``` - Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0. - ℹ Please use `linewidth` instead. - We'll also create a weight matrix again, based on the rotation from the PCA. We'll create the weight matrix much in the same way as before. 
A PCA by its very nature returns an equal number of components as the number of variables put in, however, we're interested in just the first 7 components, so we'll select just those using the `filter()` function. ``` r -pc_weight_matrix <- pc_model$rotation |> - data.frame() |> - rownames_to_column("variable") |> - pivot_longer(starts_with("PC"), names_to = "prin_comp", values_to = "loading") - -pca_loading_plot <- pc_weight_matrix |> - filter(parse_number(prin_comp) <= n_comps) |> - ggplot(aes(x = reorder(prin_comp, parse_number(prin_comp)), y = variable, fill = loading)) + - geom_tile() + - labs(title = "PCA loadings", - x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "cork", limits = c(-1,1)) + - coord_fixed(ratio = 1/2) +pc_weight_matrix <- pc_model$rotation |> + data.frame() |> + rownames_to_column("variable") |> + pivot_longer(starts_with("PC"), + names_to = "prin_comp", values_to = "loading" + ) + +pca_loading_plot <- pc_weight_matrix |> + filter(parse_number(prin_comp) <= n_comps) |> + ggplot(aes( + x = reorder(prin_comp, parse_number(prin_comp)), + y = variable, fill = loading + )) + + geom_tile() + + labs( + title = "PCA loadings", + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "cork", limits = c(-1, 1)) + + coord_fixed(ratio = 1 / 2) print(pca_loading_plot) ``` @@ -341,20 +385,27 @@ One thing that immediately jumps out is that PC1 and PC2 are nearly identical to We can also make a correlation matrix for the different principal components. We'll use the `x` field and otherwise create the correlation matrix the same way as before: ``` r -pc_model$x |> - cor() |> - data.frame() |> - rownames_to_column("comp_x") |> - pivot_longer(cols = starts_with("PC"), names_to = "comp_y", values_to = "correlation") |> - filter(parse_number(comp_x) <= n_comps, - parse_number(comp_y) <= n_comps) |> - ggplot(aes(x = comp_x, y = comp_y, fill = correlation)) + +pc_model$x |> + cor() |> + data.frame() |> + rownames_to_column("comp_x") |> + pivot_longer( + cols = starts_with("PC"), + names_to = "comp_y", values_to = "correlation" + ) |> + filter( + parse_number(comp_x) <= n_comps, + parse_number(comp_y) <= n_comps + ) |> + ggplot(aes(x = comp_x, y = comp_y, fill = correlation)) + geom_tile() + - geom_text(aes(label = round(correlation,4)), color = "white") + - labs(title = "Correlation between PCs", - x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "berlin", limits = c(-1,1)) + + geom_text(aes(label = round(correlation, 4)), color = "white") + + labs( + title = "Correlation between PCs", + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "berlin", limits = c(-1, 1)) + coord_equal() ``` @@ -387,10 +438,10 @@ pc_manual <- as.matrix(data_features) %*% eigenvector Let's look at the scree plot: ``` r -tibble(eigenvalues) |> - ggplot(aes(x = seq_along(eigenvalues), y = eigenvalues)) + +tibble(eigenvalues) |> + ggplot(aes(x = seq_along(eigenvalues), y = eigenvalues)) + geom_hline(yintercept = 1) + - geom_line(size = 1) + + geom_line(linewidth = 1) + geom_point(size = 3) ``` @@ -399,9 +450,9 @@ tibble(eigenvalues) |> Looks identical to the previous one. Let's also look at the correlation matrix between the principal components. ``` r -data.frame(pc_manual) |> - cor() %>% - round(., 4) +data.frame(pc_manual) |> + cor() |> + round(x = _, digits = 4) ``` X1 X2 X3 X4 X5 X6 X7 X8 X9 X10 X11 X12 X13 X14 X15 X16 X17 X18 X19 @@ -429,6 +480,10 @@ Yup, also still 0 across the board. 
Calculating a PCA by just using matrix manip ## Independent Component Analysis +{{< sidenote br="14em" >}} +If you're feeling particularly *mathematical*, you can read find a paper [here](https://www.iiis.org/CDs2017/CD2017Spring/papers/ZA832BA.pdf) that compares different algorithms +{{< /sidenote >}} + While PCA attempts to find components explaining the maximum degree of covariance or correlation, an ICA attemps to find components with maximum statistical independence. There's very complicated nuance here where PCA components are orthogonal and uncorrelated to each other and ICA components are merely statistically independent, which is a very subtle nuance. In practice, it'll mean that ICA components also have a practically zero correlation. The main difference is in how the components are obtained. Like with factor analysis, most ICA algorithms require you to provide a number of components up front. The FastICA algorithm we'll use here is a version of an ICA implementation. There's also InfoMax and JADE to name two other implementations. I couldn't tell you the difference between these ICA algorithms. The FastICA is implemented in R through the `fastICA()` function and the eponymous `{fastICA}` package ([link to documentation](https://www.rdocumentation.org/packages/fastICA/versions/1.2-2/topics/fastICA)). ``` r @@ -438,18 +493,26 @@ ica_model <- fastICA(data_features, n.comp = n_comps) Let's create a weight matrix again. The output from the `fastICA()` doesn't provide useful names, but the [documentation](https://www.rdocumentation.org/packages/fastICA/versions/1.2-2/topics/fastICA) provides sufficient information. To create the weight matrix we take the `A` field, transpose it, get the right names to the right places and then create the plot like we've done several times now. ``` r -ica_weight_matrix <- data.frame(t(ica_model$A)) |> +ica_weight_matrix <- data.frame(t(ica_model$A)) |> rename_with(~ str_glue("IC{seq(.)}")) |> mutate(variable = names(data_features)) |> - pivot_longer(cols = starts_with("IC"), names_to = "ic", values_to = "loading") - -ica_loading_plot <- ggplot(ica_weight_matrix, aes(x = ic, y = variable, fill = loading)) + - geom_tile() + - labs(title = "ICA loadings", - x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "cork", limits = c(-1,1)) + - coord_fixed(ratio = 1/2) + pivot_longer( + cols = starts_with("IC"), + names_to = "ic", values_to = "loading" + ) + +ica_loading_plot <- ggplot( + ica_weight_matrix, + aes(x = ic, y = variable, fill = loading) +) + + geom_tile() + + labs( + title = "ICA loadings", + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "cork", limits = c(-1, 1)) + + coord_fixed(ratio = 1 / 2) print(ica_loading_plot) ``` @@ -461,18 +524,23 @@ The FastICA method doesn't rank the components based on variance like factor ana Again, we can also visualize the correlation between the different components. 
``` r -ica_model$S |> - cor() |> - data.frame() |> - rownames_to_column("comp_x") |> - pivot_longer(cols = starts_with("X"), names_to = "comp_y", values_to = "correlation") |> - ggplot(aes(x = comp_x, y = comp_y, fill = correlation)) + +ica_model$S |> + cor() |> + data.frame() |> + rownames_to_column("comp_x") |> + pivot_longer( + cols = starts_with("X"), + names_to = "comp_y", values_to = "correlation" + ) |> + ggplot(aes(x = comp_x, y = comp_y, fill = correlation)) + geom_tile() + - geom_text(aes(label = round(correlation,4)), color = "white") + - labs(title = "Correlation between ICs", - x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "berlin", limits = c(-1,1)) + + geom_text(aes(label = round(correlation, 4)), color = "white") + + labs( + title = "Correlation between ICs", + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "berlin", limits = c(-1, 1)) + coord_equal() ``` @@ -484,34 +552,38 @@ Okay, let's now compare the three approaches and put the loading matrices side b ``` r all_weight_matrices <- bind_rows( - fa_weight_matrix |> - rename(comp = factor) |> + fa_weight_matrix |> + rename(comp = factor) |> mutate(alg = "FA"), - pc_weight_matrix |> - rename(comp = prin_comp) |> - mutate(alg = "PCA"), - ica_weight_matrix |> - rename(comp = ic) |> + pc_weight_matrix |> + rename(comp = prin_comp) |> + mutate(alg = "PCA"), + ica_weight_matrix |> + rename(comp = ic) |> mutate(alg = "ICA") ) -all_weight_matrices |> - filter(parse_number(comp) <= n_comps) |> - mutate(alg = str_glue("{alg} loadings"), - alg = as_factor(alg)) |> +all_weight_matrices |> + filter(parse_number(comp) <= n_comps) |> + mutate( + alg = str_glue("{alg} loadings"), + alg = as_factor(alg) + ) |> ggplot(aes(x = comp, y = variable, fill = loading)) + - geom_tile() + - labs(x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "cork", limits = c(-1,1)) + - facet_wrap(~ alg, scales = "free_x") + geom_tile() + + labs( + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "cork", limits = c(-1, 1)) + + facet_wrap(~alg, scales = "free_x") ``` Here we can most clearly see the overlap between the three methods, Factor 1, PC1, and IC 6 capture essentially the same information. The same goes for Factor 2, PC2, and IC 4. Other than that we can see that the other components vary quite markedly. I wouldn't be comfortable calling any of the other components "fairly similar" to another. You could see how some variables load together with multiple methods, but then the components those are captured in also have other information or miss information. We already discussed IC7 consisting mostly of a single variable, and a similar thing happens with Factor 4, but for a different variable. -### BONUS: Hierarchical clustering +## BONUS: Hierarchical clustering I'll quickly go over hierarchical clustering too, it's simple and easy to interpret. Hierarchical clustering works by taking your variables and clustering them first into two groups, then three, then four, and so on. It looks at similarity and a concept called "Euclidian distance" (other methods are available) between the variables and determines how to separate. Essentially, hierarchical clustering works by iteratively separating variables into groups until every variable is on its own. It does so rather aggressively, with the previous methods it's possible for a variable to be part of two clusters, with hierarchical clustering it's part of a single cluster only. 
This approach makes it an easy way to see how variables cluster together at different thresholds. @@ -540,7 +612,7 @@ hclust_weight_matrix <- cutree(hclust_model, k = n_comps) The `cutree()` function assigns a cluster to each of the variables. It looks at the tree and determines where to cut the tree to get the desired number of branches and then tells you the composition of the branches. We can recreate this ourselves by simply adding a `geom_hline()` to the dendrogram. With some trial and error, it seems like cutting the tree at `y = 10.5` will give us 7 clusters. ``` r -ggdendro::ggdendrogram(hclust_model) + +ggdendro::ggdendrogram(hclust_model) + geom_hline(yintercept = 10.5, color = "firebrick") ``` @@ -549,16 +621,20 @@ ggdendro::ggdendrogram(hclust_model) + Let's look at how the clusters are made up according to the hierarchical clustering model. This isn't really a weight matrix, but rather a definition of the clusters. The "loadings" here are not numerical, but rather 1 or 0. ``` r -hclust_weight_matrix %>% - data.frame() |> - janitor::clean_names() |> - rename(cluster = x) |> - rownames_to_column("variable") |> - ggplot(aes(x = as_factor(cluster), y = variable, fill = as_factor(cluster))) + - geom_tile() + - labs(x = NULL, - y = NULL, - fill = "cluster") + +hclust_weight_matrix |> + data.frame() |> + rename(cluster = hclust_weight_matrix) |> + rownames_to_column("variable") |> + ggplot(aes( + x = as_factor(cluster), y = variable, + fill = as_factor(cluster) + )) + + geom_tile() + + labs( + x = NULL, + y = NULL, + fill = "cluster" + ) + scico::scale_fill_scico_d(palette = "batlow") ``` diff --git a/content/blog/2021-comparison-fa-pca-ica/index.qmd b/content/blog/2021-comparison-fa-pca-ica/index.qmd index 8cc0c05..07d13e8 100644 --- a/content/blog/2021-comparison-fa-pca-ica/index.qmd +++ b/content/blog/2021-comparison-fa-pca-ica/index.qmd @@ -14,13 +14,19 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- This is just a very quick blog post outlining some of the commonalities and differences between factor analysis (FA), principal component analysis (PCA), and independent component analysis (ICA). I was inspired to write some of this down through some confusion caused in the lab by SPSS' apparent dual usage of the term "factor analysis" and "principal components". A few of my colleagues who use SPSS showed me the following screen: -{{{< image src="spss-screenshot.png" alt="spss screenshot" >}}} +{{{< figure src="spss-screenshot.png" alt="spss screenshot" >}}} -This screen shows up when you click `Analyze` -> `Dimension Reduction` -> `Factor`, which then opens a window called "Factor Analysis: Extraction" which lets you pick "Principal components" as a method. To put the (apparent) PCA method as a sub-section of factor analysis is misleading at best, and straight-up erronious at worst. The other options for the method here is "Principal axis factoring" (which is closer to traditional factor analysis) and "Maximum likelihood" ([source for screenshot and SPSS interface](https://stats.idre.ucla.edu/spss/seminars/efa-spss/)). If you're wondering if you're the first one to be confused by SPSS' choice to present this in such a way, you're not ([link](https://stats.stackexchange.com/questions/1576/what-are-the-differences-between-factor-analysis-and-principal-component-analysi), [link](https://stats.stackexchange.com/questions/24781/interpreting-discrepancies-between-r-and-spss-with-exploratory-factor-analysis)). 
+{{{< sidenote br="2em" >}}} +If you're wondering if you're the first one to be confused by SPSS' choice to present this in such a way, you're not ([link](https://stats.stackexchange.com/questions/1576/what-are-the-differences-between-factor-analysis-and-principal-component-analysi), [link](https://stats.stackexchange.com/questions/24781/interpreting-discrepancies-between-r-and-spss-with-exploratory-factor-analysis)). +{{{< /sidenote >}}} + +This screen shows up when you click `Analyze` -> `Dimension Reduction` -> `Factor`, which then opens a window called "Factor Analysis: Extraction" which lets you pick "Principal components" as a method. To put the (apparent) PCA method as a sub-section of factor analysis is misleading at best, and straight-up erronious at worst. The other options for the method here is "Principal axis factoring" (which is closer to traditional factor analysis) and "Maximum likelihood" ([source for screenshot and SPSS interface](https://stats.idre.ucla.edu/spss/seminars/efa-spss/)). Standard disclaimer: I'm not a statistician, and I'm definitely not confident enough to go in-depth into the mathematics of the different algorithms. Instead, I'll just run three common latent variable modeling/clustering methods, and show the difference in results when applied to the same data. Where-ever I feel confident, I will also elaborate on the underlying mathematical principles and concepts. It's a short post, and I'm sure there's levels of nuance and complexity I've missed. Please let me know if you think I've committed a major oversight. @@ -47,8 +53,8 @@ We'll load the data into R, clean up the variable names, convert the outcome var #| label: load-data #| message: false -data <- read_csv("sobar-72.csv") |> - janitor::clean_names() |> +data <- read_csv("./data/sobar-72.csv") |> + janitor::clean_names() |> mutate(ca_cervix = as_factor(ca_cervix)) skim_summ <- skimr::skim_with(base = skimr::sfl()) @@ -60,8 +66,8 @@ Now let's only select the variables containing the risk factors (called features ```{r} #| label: scale-data -data_features <- data |> - select(-ca_cervix) |> +data_features <- data |> + select(-ca_cervix) |> mutate(across(everything(), ~ scale(.x))) summary(data_features$attitude_consistency) @@ -72,17 +78,23 @@ So now we have a data frame with 72 entries and 19 normalized columns, each repr ```{r} #| label: corr-matrix-original -cor(data_features) |> - as_tibble() %>% - mutate(feature_y = names(.)) |> - pivot_longer(cols = -feature_y, names_to = "feature_x", values_to = "correlation") |> - mutate(feature_y = fct_rev(feature_y)) |> - ggplot(aes(x = feature_x, y = feature_y, fill = correlation)) + - geom_tile() + - labs(x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "berlin", limits = c(-1,1)) + - coord_fixed() + +cor(data_features) |> + as_tibble() |> + mutate(feature_y = names(data_features)) |> + pivot_longer( + cols = -feature_y, + names_to = "feature_x", + values_to = "correlation" + ) |> + mutate(feature_y = fct_rev(feature_y)) |> + ggplot(aes(x = feature_x, y = feature_y, fill = correlation)) + + geom_tile() + + labs( + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "berlin", limits = c(-1, 1)) + + coord_fixed() + theme(axis.text.x = element_text(hjust = 1, angle = 30)) ``` @@ -94,6 +106,10 @@ Let's now dive into the approaches we'll use. 
We'll discuss three here, and if t ## Selecting the number of components +{{{< sidenote br="6em" >}}} +Icasso is implemented in MATLAB, which I won't bore you with here, but it seems someone created a [Python port](https://github.com/Teekuningas/icasso) also +{{{< /sidenote >}}} + For factor analysis and ICA we need to provide the number of factors we want to extract. We'll use the same number of factors/components throughout this tutorial (also for PCA). Selection of the optimal number of factors/components is a fairly arbitrary process which I won't go into now. In short, before writing this I ran PCA and a tool called [icasso](https://research.ics.aalto.fi/ica/icasso/). _Icasso_ runs an ICA algorithm a number of times and provides a number of parameters on the stability of the clusters at different thresholds, this approach is very data-driven. A more common and easier way to get some data-driven insight into the optimal number of clusters is using the "elbow"-method using PCA, eigenvalues of the components, and the cumulative variance explained by the components (we'll show those later). However, in the end, you should also look at the weight matrix of the different cluster thresholds and it becomes a fairly arbitrary process. In this case, the _icasso_ showed that 7 components was good (but not the best), the weight matrix looked okay, and 7 components explained more than 80% of the variance in the PCA. I went for 7 components in the end also because it served the purpose of this blogpost quite well, but I think different thresholds are valid and you could make a case for different ones based on the data, based on the interpretation of the weight matrix, etc. This process is a bit of an art and a science combined. ```{r} @@ -105,34 +121,47 @@ n_comps <- 7 ## Factor Analysis -So, let's run the factor analysis. Technically speaking, factor analysis isn't a clustering method but rather a latent variable modeling method [source](https://stats.stackexchange.com/questions/241726/understanding-exploratory-factor-analysis-some-points-for-clarification). The primary aim of a factor analysis is the reconstruction of correlations/covariances between variables. Maximizing explained variance of the factors is only a secondary aim and we'll get to why that is relevant in the PCA section. +So, let's run the factor analysis. Technically speaking, factor analysis isn't a clustering method but rather a latent variable modeling method ([source](https://stats.stackexchange.com/questions/241726/understanding-exploratory-factor-analysis-some-points-for-clarification)). The primary aim of a factor analysis is the reconstruction of correlations/covariances between variables. Maximizing explained variance of the factors is only a secondary aim and we'll get to why that is relevant in the PCA section. We've established we want 7 factors. The factor analysis method is implemented in R through the `factanal()` function (see [here](https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/factanal for the documentation). This function applies a common factor model using the maximum likelihood method. We'll simply provide it with our data frame, the number of factors we want to extract, and we'll ask to provide Bartlett's weighted least-squares scores as well. We can apply a "rotation" to the factors to make them reduce the complexity of the factor loadings and make them easier to interpret, here we'll use the `varimax` option. Varimax maximizes the sum of the variances of the squared loadings. 
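In case it helps to see that criterion written out, it's small enough to compute by hand. A minimal sketch (illustration only, nothing later in the post depends on it; note that `stats::varimax()` additionally applies Kaiser normalization by default, which I'm ignoring here):

```{r}
#| label: varimax-criterion-sketch
#| eval: false

# Raw varimax criterion for a loadings matrix L (rows = variables, columns =
# factors): the sum over factors of the variance of the squared loadings
varimax_criterion <- function(L) {
  sum(apply(L^2, 2, function(sq) mean(sq^2) - mean(sq)^2))
}

# e.g. varimax_criterion(loadings(fa_model)) once the model below has been fitted
```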
Then we'll print the model to see the results (and sort so the loadings are ordered). The output may be a bit long. ```{r} #| label: fa -fa_model <- factanal(data_features, factors = n_comps, scores = "Bartlett", rotation = "varimax") +fa_model <- factanal( + data_features, + factors = n_comps, + scores = "Bartlett", + rotation = "varimax" +) + print(fa_model, sort = TRUE) ``` We may be tempted to immediately look at the _p_-value at the end of the output. This _p_-value denotes whether the assumption of perfect fit can be rejected. If this _p_-value is below 0.05 or 0.01 we can reject the hypothesis of perfect fit, meaning that we could probably try a different method or try a different number of factors. In this case, the _p_-value is larger than 0.05 so we cannot reject the hypothesis of perfect fit. -The "Loadings" section of the results show a make-shift weight matrix, but in order to further interpret these results, let's create a plot showing the weight matrix. We'll get the results from the factor analysis model we created earlier using the `tidy()` function from `{broom}` and convert it to long format. We'll then create a weight matrix much in the same way we did earlier. +The "Loadings" section of the results show a make-shift weight matrix, but in order to further interpret these results, let's create a plot showing the weight matrix. We'll get the results from the factor analysis model we created earlier using the `tidy()` function from the `{broom}` package and convert it to long format. We'll then create a weight matrix much in the same way we did earlier. ```{r} #| label: fa-weight-matrix -fa_weight_matrix <- broom::tidy(fa_model) |> - pivot_longer(starts_with("fl"), names_to = "factor", values_to = "loading") +fa_weight_matrix <- broom::tidy(fa_model) |> + pivot_longer(starts_with("fl"), + names_to = "factor", values_to = "loading" + ) -fa_loading_plot <- ggplot(fa_weight_matrix, aes(x = factor, y = variable, fill = loading)) + - geom_tile() + - labs(title = "FA loadings", - x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "cork", limits = c(-1,1)) + - coord_fixed(ratio = 1/2) +fa_loading_plot <- ggplot( + fa_weight_matrix, + aes(x = factor, y = variable, fill = loading) +) + + geom_tile() + + labs( + title = "FA loadings", + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "cork", limits = c(-1, 1)) + + coord_fixed(ratio = 1 / 2) print(fa_loading_plot) ``` @@ -144,22 +173,27 @@ Lastly, I think it would be interesting to see how the different factors relate ```{r} #| label: fa-corr-matrix -fa_model$scores |> - cor() |> - data.frame() |> - rownames_to_column("factor_x") |> - pivot_longer(cols = -factor_x, names_to = "factor_y", values_to = "correlation") |> - ggplot(aes(x = factor_x, y = factor_y, fill = correlation)) + +fa_model$scores |> + cor() |> + data.frame() |> + rownames_to_column("factor_x") |> + pivot_longer( + cols = -factor_x, + names_to = "factor_y", values_to = "correlation" + ) |> + ggplot(aes(x = factor_x, y = factor_y, fill = correlation)) + geom_tile() + - geom_text(aes(label = round(correlation,4)), color = "white") + - labs(title = "Correlation between FA scores", - x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "berlin", limits = c(-1,1)) + + geom_text(aes(label = round(correlation, 4)), color = "white") + + labs( + title = "Correlation between FA scores", + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "berlin", limits = c(-1, 1)) + coord_equal() ``` -We can see little correlation but certainly a 
few non-zero correlations also. In particular Factor 5 and Factor 7 seem to correlate to some extent at least and there are a few others with some minor correlations, e.g. Factor 1 and Factor 3, as well as Factor 3 and Factor 6. We'll compare this later with the other algorithms. +We can see little correlation but certainly a few non-zero correlations also. In particular Factor 5 and Factor 7 seem to correlate to some extent at least and there are a few others with some minor correlations, e.g. Factor 1 and Factor 3, as well as Factor 5 and Factor 6. We'll compare this later with the other algorithms. ## Principal Component Analysis @@ -170,7 +204,11 @@ Principal component analysis can reliably be classfied as a clustering method (a I can recommend this paper as a great primer to PCA methods. It goes over a few concepts very relevant for PCA methods as well as clustering methods in general. -Now, let's run the PCA. In R there's two functions built-in to run a principal component analysis, here we'll use the `prcomp()`function (the other being `princomp()`, but `prcomp()` is preferred for reasons beyond the scope of this post). The `prcomp()` function contains a few options we can play with, but in my experience it works fine out of the box if you've normalized the data beforehand ([link to documentation](https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/prcomp)). So we'll simply provide the data frame with the features. In addition, we'll also calculate the variance explained by each component. We do that by simply taking the standard deviation calculated by the PCA and squaring it, we'll save it in a new field called `variance`. +{{{< sidenote br="1em" >}}} +For some more info on the difference between `prcomp()` and `princomp()` see [this](https://stats.stackexchange.com/questions/20101/what-is-the-difference-between-r-functions-prcomp-and-princomp) thread on CrossValidated +{{{< /sidenote >}}} + +Now, let's run the PCA. In R there's two functions built-in to run a principal component analysis, here we'll use the `prcomp()` function (the other being `princomp()`, but `prcomp()` is preferred for reasons beyond the scope of this post). The `prcomp()` function contains a few options we can play with, but in my experience it works fine out of the box if you've normalized the data beforehand ([link to documentation](https://www.rdocumentation.org/packages/stats/versions/3.6.2/topics/prcomp)). So we'll simply provide the data frame with the features. In addition, we'll also calculate the variance explained by each component. We do that by simply taking the standard deviation calculated by the PCA and squaring it, we'll save it in a new field called `variance`. ```{r} #| label: pca @@ -185,15 +223,17 @@ Next we can make a simple scree plot using the variance we calculate above. We'l ```{r} #| label: pca-scree -pc_model$variance |> +pc_model$variance |> as_tibble() |> - rename(eigenvalue = value) |> - rownames_to_column("comp") |> - mutate(comp = parse_number(comp), - cum_variance = cumsum(eigenvalue)/sum(eigenvalue)) |> - ggplot(aes(x = comp, y = eigenvalue)) + + rename(eigenvalue = value) |> + rownames_to_column("comp") |> + mutate( + comp = parse_number(comp), + cum_variance = cumsum(eigenvalue) / sum(eigenvalue) + ) |> + ggplot(aes(x = comp, y = eigenvalue)) + geom_hline(yintercept = 1) + - geom_line(size = 1) + + geom_line(linewidth = 1) + geom_point(size = 3) ``` @@ -202,20 +242,27 @@ We'll also create a weight matrix again, based on the rotation from the PCA. 
We' ```{r} #| label: pca-weight-matrix -pc_weight_matrix <- pc_model$rotation |> - data.frame() |> - rownames_to_column("variable") |> - pivot_longer(starts_with("PC"), names_to = "prin_comp", values_to = "loading") - -pca_loading_plot <- pc_weight_matrix |> - filter(parse_number(prin_comp) <= n_comps) |> - ggplot(aes(x = reorder(prin_comp, parse_number(prin_comp)), y = variable, fill = loading)) + - geom_tile() + - labs(title = "PCA loadings", - x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "cork", limits = c(-1,1)) + - coord_fixed(ratio = 1/2) +pc_weight_matrix <- pc_model$rotation |> + data.frame() |> + rownames_to_column("variable") |> + pivot_longer(starts_with("PC"), + names_to = "prin_comp", values_to = "loading" + ) + +pca_loading_plot <- pc_weight_matrix |> + filter(parse_number(prin_comp) <= n_comps) |> + ggplot(aes( + x = reorder(prin_comp, parse_number(prin_comp)), + y = variable, fill = loading + )) + + geom_tile() + + labs( + title = "PCA loadings", + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "cork", limits = c(-1, 1)) + + coord_fixed(ratio = 1 / 2) print(pca_loading_plot) ``` @@ -227,20 +274,27 @@ We can also make a correlation matrix for the different principal components. We ```{r} #| label: pca-corr-matrix -pc_model$x |> - cor() |> - data.frame() |> - rownames_to_column("comp_x") |> - pivot_longer(cols = starts_with("PC"), names_to = "comp_y", values_to = "correlation") |> - filter(parse_number(comp_x) <= n_comps, - parse_number(comp_y) <= n_comps) |> - ggplot(aes(x = comp_x, y = comp_y, fill = correlation)) + +pc_model$x |> + cor() |> + data.frame() |> + rownames_to_column("comp_x") |> + pivot_longer( + cols = starts_with("PC"), + names_to = "comp_y", values_to = "correlation" + ) |> + filter( + parse_number(comp_x) <= n_comps, + parse_number(comp_y) <= n_comps + ) |> + ggplot(aes(x = comp_x, y = comp_y, fill = correlation)) + geom_tile() + - geom_text(aes(label = round(correlation,4)), color = "white") + - labs(title = "Correlation between PCs", - x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "berlin", limits = c(-1,1)) + + geom_text(aes(label = round(correlation, 4)), color = "white") + + labs( + title = "Correlation between PCs", + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "berlin", limits = c(-1, 1)) + coord_equal() ``` @@ -275,10 +329,10 @@ Let's look at the scree plot: ```{r} #| label: pca-manual-scree -tibble(eigenvalues) |> - ggplot(aes(x = seq_along(eigenvalues), y = eigenvalues)) + +tibble(eigenvalues) |> + ggplot(aes(x = seq_along(eigenvalues), y = eigenvalues)) + geom_hline(yintercept = 1) + - geom_line(size = 1) + + geom_line(linewidth = 1) + geom_point(size = 3) ``` @@ -290,33 +344,42 @@ Looks identical to the previous one. 
Let's also look at the correlation matrix b #| eval: false data.frame(pc_manual) |> - rename_with(~ str_glue("PC{parse_number(.x)}")) |> - rownames_to_column("participant") |> - pivot_longer(starts_with("PC"), names_to = "prin_comp", values_to = "loading") |> - filter(parse_number(prin_comp) <= n_comps) |> - ggplot(aes(x = reorder(prin_comp, parse_number(prin_comp)), - y = parse_number(participant), fill = loading)) + - geom_tile() + - labs(title = "Individual loadings", - x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "berlin") + - coord_fixed(1/5) + rename_with(~ str_glue("PC{parse_number(.x)}")) |> + rownames_to_column("participant") |> + pivot_longer(starts_with("PC"), + names_to = "prin_comp", values_to = "loading" + ) |> + filter(parse_number(prin_comp) <= n_comps) |> + ggplot(aes( + x = reorder(prin_comp, parse_number(prin_comp)), + y = parse_number(participant), fill = loading + )) + + geom_tile() + + labs( + title = "Individual loadings", + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "berlin") + + coord_fixed(1 / 5) ``` ```{r} #| label: pca-manual-corr-matrix -data.frame(pc_manual) |> - cor() %>% - round(., 4) +data.frame(pc_manual) |> + cor() |> + round(x = _, digits = 4) ``` Yup, also still 0 across the board. Calculating a PCA by just using matrix manipulations isn't too complicated, and it's a fun exercise, but I'd recommend just sticking to the `prcomp()` function, since it's a lot simpler and offer better functionality. Now, let's move on to the independent component analysis. - ## Independent Component Analysis +{{{< sidenote br="14em" >}}} +If you're feeling particularly _mathematical_, you can read find a paper [here](https://www.iiis.org/CDs2017/CD2017Spring/papers/ZA832BA.pdf) that compares different algorithms +{{{< /sidenote >}}} + While PCA attempts to find components explaining the maximum degree of covariance or correlation, an ICA attemps to find components with maximum statistical independence. There's very complicated nuance here where PCA components are orthogonal and uncorrelated to each other and ICA components are merely statistically independent, which is a very subtle nuance. In practice, it'll mean that ICA components also have a practically zero correlation. The main difference is in how the components are obtained. Like with factor analysis, most ICA algorithms require you to provide a number of components up front. The FastICA algorithm we'll use here is a version of an ICA implementation. There's also InfoMax and JADE to name two other implementations. I couldn't tell you the difference between these ICA algorithms. The FastICA is implemented in R through the `fastICA()` function and the eponymous `{fastICA}` package ([link to documentation](https://www.rdocumentation.org/packages/fastICA/versions/1.2-2/topics/fastICA)). ```{r} @@ -330,18 +393,26 @@ Let's create a weight matrix again. 
The output from the `fastICA()` doesn't prov ```{r} #| label: ica-weight-matrix -ica_weight_matrix <- data.frame(t(ica_model$A)) |> +ica_weight_matrix <- data.frame(t(ica_model$A)) |> rename_with(~ str_glue("IC{seq(.)}")) |> mutate(variable = names(data_features)) |> - pivot_longer(cols = starts_with("IC"), names_to = "ic", values_to = "loading") - -ica_loading_plot <- ggplot(ica_weight_matrix, aes(x = ic, y = variable, fill = loading)) + - geom_tile() + - labs(title = "ICA loadings", - x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "cork", limits = c(-1,1)) + - coord_fixed(ratio = 1/2) + pivot_longer( + cols = starts_with("IC"), + names_to = "ic", values_to = "loading" + ) + +ica_loading_plot <- ggplot( + ica_weight_matrix, + aes(x = ic, y = variable, fill = loading) +) + + geom_tile() + + labs( + title = "ICA loadings", + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "cork", limits = c(-1, 1)) + + coord_fixed(ratio = 1 / 2) print(ica_loading_plot) ``` @@ -353,18 +424,23 @@ Again, we can also visualize the correlation between the different components. ```{r} #| label: ica-corr-matrix -ica_model$S |> - cor() |> - data.frame() |> - rownames_to_column("comp_x") |> - pivot_longer(cols = starts_with("X"), names_to = "comp_y", values_to = "correlation") |> - ggplot(aes(x = comp_x, y = comp_y, fill = correlation)) + +ica_model$S |> + cor() |> + data.frame() |> + rownames_to_column("comp_x") |> + pivot_longer( + cols = starts_with("X"), + names_to = "comp_y", values_to = "correlation" + ) |> + ggplot(aes(x = comp_x, y = comp_y, fill = correlation)) + geom_tile() + - geom_text(aes(label = round(correlation,4)), color = "white") + - labs(title = "Correlation between ICs", - x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "berlin", limits = c(-1,1)) + + geom_text(aes(label = round(correlation, 4)), color = "white") + + labs( + title = "Correlation between ICs", + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "berlin", limits = c(-1, 1)) + coord_equal() ``` @@ -377,32 +453,36 @@ Okay, let's now compare the three approaches and put the loading matrices side b #| out.width: 100% all_weight_matrices <- bind_rows( - fa_weight_matrix |> - rename(comp = factor) |> + fa_weight_matrix |> + rename(comp = factor) |> mutate(alg = "FA"), - pc_weight_matrix |> - rename(comp = prin_comp) |> - mutate(alg = "PCA"), - ica_weight_matrix |> - rename(comp = ic) |> + pc_weight_matrix |> + rename(comp = prin_comp) |> + mutate(alg = "PCA"), + ica_weight_matrix |> + rename(comp = ic) |> mutate(alg = "ICA") ) -all_weight_matrices |> - filter(parse_number(comp) <= n_comps) |> - mutate(alg = str_glue("{alg} loadings"), - alg = as_factor(alg)) |> +all_weight_matrices |> + filter(parse_number(comp) <= n_comps) |> + mutate( + alg = str_glue("{alg} loadings"), + alg = as_factor(alg) + ) |> ggplot(aes(x = comp, y = variable, fill = loading)) + - geom_tile() + - labs(x = NULL, - y = NULL) + - scico::scale_fill_scico(palette = "cork", limits = c(-1,1)) + - facet_wrap(~ alg, scales = "free_x") + geom_tile() + + labs( + x = NULL, + y = NULL + ) + + scico::scale_fill_scico(palette = "cork", limits = c(-1, 1)) + + facet_wrap(~alg, scales = "free_x") ``` Here we can most clearly see the overlap between the three methods, Factor 1, PC1, and IC 6 capture essentially the same information. The same goes for Factor 2, PC2, and IC 4. Other than that we can see that the other components vary quite markedly. 
I wouldn't be comfortable calling any of the other components "fairly similar" to another. You could see how some variables load together with multiple methods, but then the components those are captured in also have other information or miss information. We already discussed IC7 consisting mostly of a single variable, and a similar thing happens with Factor 4, but for a different variable. -### BONUS: Hierarchical clustering +## BONUS: Hierarchical clustering I'll quickly go over hierarchical clustering too, it's simple and easy to interpret. Hierarchical clustering works by taking your variables and clustering them first into two groups, then three, then four, and so on. It looks at similarity and a concept called "Euclidian distance" (other methods are available) between the variables and determines how to separate. Essentially, hierarchical clustering works by iteratively separating variables into groups until every variable is on its own. It does so rather aggressively, with the previous methods it's possible for a variable to be part of two clusters, with hierarchical clustering it's part of a single cluster only. This approach makes it an easy way to see how variables cluster together at different thresholds. @@ -437,7 +517,7 @@ The `cutree()` function assigns a cluster to each of the variables. It looks at ```{r} #| label: dendrogram-w-line -ggdendro::ggdendrogram(hclust_model) + +ggdendro::ggdendrogram(hclust_model) + geom_hline(yintercept = 10.5, color = "firebrick") ``` @@ -446,21 +526,23 @@ Let's look at how the clusters are made up according to the hierarchical cluster ```{r} #| label: hclust-weight-matrix -hclust_weight_matrix %>% - data.frame() |> - janitor::clean_names() |> - rename(cluster = x) |> - rownames_to_column("variable") |> - ggplot(aes(x = as_factor(cluster), y = variable, fill = as_factor(cluster))) + - geom_tile() + - labs(x = NULL, - y = NULL, - fill = "cluster") + +hclust_weight_matrix |> + data.frame() |> + rename(cluster = hclust_weight_matrix) |> + rownames_to_column("variable") |> + ggplot(aes( + x = as_factor(cluster), y = variable, + fill = as_factor(cluster) + )) + + geom_tile() + + labs( + x = NULL, + y = NULL, + fill = "cluster" + ) + scico::scale_fill_scico_d(palette = "batlow") ``` We can again see that the same two clusters show up. Cluster 6 and 7 resemble those of Factor 2 and Factor 1, PC2 and PC1, and IC4 and IC6 respectively. If you want to learn more about what you can do with the `hlcust()` and associated functions, you can check out [this webpage](https://uc-r.github.io/hc_clustering). Okay I'd like to leave it at that. This blogpost is long enough. Again, I had to simplify and take some shortcuts, if you think I made mistakes in that effort, please let me know and I'll fix it as well as I can! 
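P.S. if you want a quick numerical check of the overlap described above (cluster 6 and 7 lining up with Factor 2 and Factor 1, PC2 and PC1, and IC4 and IC6), a cross-tabulation of each variable's hierarchical cluster against the factor it loads on most strongly does the trick. A sketch, assuming the `fa_weight_matrix` and `hclust_weight_matrix` objects from the earlier chunks (and the tidyverse loaded, as elsewhere in this post):

```{r}
#| label: hclust-vs-fa-crosstab
#| eval: false

# For each variable, keep the factor with the largest absolute loading,
# then tabulate it against the cluster assignment returned by cutree()
fa_top_factor <- fa_weight_matrix |>
  group_by(variable) |>
  slice_max(abs(loading), n = 1) |>
  ungroup() |>
  select(variable, top_factor = factor)

fa_top_factor |>
  mutate(hclust_cluster = hclust_weight_matrix[variable]) |>
  count(top_factor, hclust_cluster) |>
  pivot_wider(
    names_from = hclust_cluster,
    values_from = n, values_fill = 0
  )
```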
- - diff --git a/content/blog/2021-easy-map-norway/areal.csv b/content/blog/2021-easy-map-norway/data/areal.csv similarity index 100% rename from content/blog/2021-easy-map-norway/areal.csv rename to content/blog/2021-easy-map-norway/data/areal.csv diff --git a/content/blog/2021-easy-map-norway/data/norway_hospitals.rds b/content/blog/2021-easy-map-norway/data/norway_hospitals.rds new file mode 100644 index 0000000..d13a32c --- /dev/null +++ b/content/blog/2021-easy-map-norway/data/norway_hospitals.rds @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10db380d65beafdbc632a6a0ac27d7c714e2a426f5693635726a4f1a929e50ef +size 601 diff --git a/content/blog/2021-easy-map-norway/splmaps_data.RData b/content/blog/2021-easy-map-norway/data/splmaps_data.RData similarity index 100% rename from content/blog/2021-easy-map-norway/splmaps_data.RData rename to content/blog/2021-easy-map-norway/data/splmaps_data.RData diff --git a/content/blog/2021-easy-map-norway/index.markdown_strict_files/figure-markdown_strict/age-plot-1.png b/content/blog/2021-easy-map-norway/index.markdown_strict_files/figure-markdown_strict/age-plot-1.png index 03b39ea..6514024 100644 Binary files a/content/blog/2021-easy-map-norway/index.markdown_strict_files/figure-markdown_strict/age-plot-1.png and b/content/blog/2021-easy-map-norway/index.markdown_strict_files/figure-markdown_strict/age-plot-1.png differ diff --git a/content/blog/2021-easy-map-norway/index.markdown_strict_files/figure-markdown_strict/plot-kommune-faceted-1.png b/content/blog/2021-easy-map-norway/index.markdown_strict_files/figure-markdown_strict/plot-kommune-faceted-1.png index f8ce1ce..3e39cf3 100644 Binary files a/content/blog/2021-easy-map-norway/index.markdown_strict_files/figure-markdown_strict/plot-kommune-faceted-1.png and b/content/blog/2021-easy-map-norway/index.markdown_strict_files/figure-markdown_strict/plot-kommune-faceted-1.png differ diff --git a/content/blog/2021-easy-map-norway/index.md b/content/blog/2021-easy-map-norway/index.md index 0249baf..16b83d6 100644 --- a/content/blog/2021-easy-map-norway/index.md +++ b/content/blog/2021-easy-map-norway/index.md @@ -9,40 +9,35 @@ tags: - ggplot - map - norway -editor_options: - chunk_output_type: console execute: fig.retina: 2 fig.align: center fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- -

-As of March 2023 {fhimaps} and {splmaps} are no longer supported due to budget cuts at the institution supporting them. The amazing developers (led by Richard Aubrey White and Chi Zhang) have moved the data and functionality to a new package called {csmaps}. The code below is updated to reflect the changes. -

- - - +{{< standout bg="#acc8d4" >}} +As of March 2023 {fhimaps} and {splmaps} are no longer supported due to budget cuts at the institution supporting them. The amazing developers (led by [Richard Aubrey White](https://www.rwhite.no) and [Chi Zhang](https://andreaczhang.github.io)) have moved the data and functionality to a new package called `{csmaps}`. The code below is updated to reflect the changes. +{{< /standout >}} Every now and then you discover a discover a much simpler solution to a problem you spent a lot of time solving. This recently happened to me on the topic of creating a map of Norway in R. In this post, I want to go through the process of what I learned. Previously, I used a JSON file and the `{geojsonio}` package to create a map of Norway and its fylker (counties) in particular. This was a very flexible and robust way of going about this, but also quite cumbersome. This method relies on a high-quality JSON file, meaning, a file that is detailed enough to display everything nicely, but not too detailed that it takes a ton of time and computing power to create a single plot. While I'll still use this method if I need to create a map for places other than Norway, I think I've found a better and easier solution for plotting Norway and the fylker and kommuner in the [`{csmaps}`](https://www.csids.no/csmaps/) package. -The `{csmaps}` package is created by the *Consortium for Statistics in Disease Surveillance* team. It's part of a series of packages (which they refer to as the "csverse"), which includes a package containing basic health surveillance data ([`{csdata}`](https://www.csids.no/csdata/)), one for real-time analysis in disease surveillance ([`{sc9}`](https://www.csids.no/sc9/)) and a few more. Here I'll dive into the `{csmaps}` package with some help from the `{csdata}` package. I'll also use the `{ggmap}` package to help with some other data and plotting. It's perhaps important to note that `{ggmap}` does contain a map of Norway as a whole, but not of the fylker and kommuner (municipalities), hence the usefulness of the `{csmaps}` package, which contains both. I'll also use `{tidyverse}` and `{ggtext}` as I always do. I won't load `{csmaps}` with the `library()` function, but will use the `::` operator instead since it'll make it easier to navigate the different datasets included. +{{< sidenote >}} +The group has [a bunch of packages](https://www.csids.no) for data science within public health +{{< /sidenote >}} + +The `{csmaps}` package is created by the [*Consortium for Statistics in Disease Surveillance*](https://www.csids.no) team. It's part of a series of packages (which they refer to as the "csverse"), which includes a package containing basic health surveillance data ([`{csdata}`](https://www.csids.no/csdata/)), one for real-time analysis in disease surveillance ([`{sc9}`](https://www.csids.no/sc9/)) and a few more. Here I'll dive into the `{csmaps}` package with some help from the `{csdata}` package. I'll also use the `{ggmap}` package to help with some other data and plotting. It's perhaps important to note that `{ggmap}` does contain a map of Norway as a whole, but not of the fylker and kommuner (municipalities), hence the usefulness of the `{csmaps}` package, which contains both. I'll also use `{tidyverse}` and `{ggtext}` as I always do. I won't load `{csmaps}` with the `library()` function, but will use the `::` operator instead since it'll make it easier to navigate the different datasets included. 
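As a minimal sketch of what that looks like in practice (using a dataset name that shows up later in this post): in RStudio, typing `csmaps::` followed by Tab lists everything the package exports, and a map can be pulled straight from the namespace without attaching the package.

``` r
# Grab a map data frame directly from the {csmaps} namespace; no library() call needed
oslo_map <- csmaps::nor_county_map_b2020_insert_oslo_dt
```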
``` r library(tidyverse) @@ -54,10 +49,10 @@ library(ggmap) So let's have a look at what's included. You'll see that nearly all maps come in either a `data.table` format or an `sf` format. Here I'll use only the data frames, since they're a lot easier to work with. The maps in `sf` format can be useful elsewhere, but I think for most purposes it's easier and more intuitive to work with data frames. ``` r -data(package = "csmaps") |> - pluck("results") |> - as_tibble() |> - select(Item, Title) |> +data(package = "csmaps") |> + pluck("results") |> + as_tibble() |> + select(Item, Title) |> print(n = 18) ``` @@ -66,22 +61,25 @@ A comprehensive version of this list is also included in the [reference](https:/ So let's have a look at one of those maps. For instance the one with the new fylker from 2020 with an inset of Oslo. ``` r -map_df <- nor_county_map_b2020_insert_oslo_dt |> +map_df <- nor_county_map_b2020_insert_oslo_dt |> glimpse() ``` Rows: 4,493 Columns: 5 - $ long 5.823860, 5.969413, 6.183042, 6.302433, 6.538059, 6.693511, 6.657289, 6.767480, 7.096287, 7.110080, 7.214667, 7.098724, 7.048585, 6.991842, 6.962960, 7.022880, 6.882833, 6.932169, 6.865142, 6.824533, 6.847579, 6.939172, 6.881039, 6.903689, 6.560838, 6.608553, 6.518096, 6.572750, 6.622347, 6.436069, 6.497499, 6.614721, 6.587698, 6.631556, 6.560330, 6.382453, 6.324024, 6.350390, 6.239081, 6.039468, 5.996479, 5.875390, 5.666733, 5.489157, 5.601538, 5.576495, 5.633912, 5.686111, 5.565115, 5.581353, 5.769811, 5.741457, 5.786016, 5.851127, 5.893869, 5.979000, 5.965561, 6.175315, 6.281425, 6.116682, 6.086432, 6.108257, 6.072709, 5.995039, 6.036212, 6.090791, 5.949121, 5.889756, 5.977865, 6.174015, 6.022552, 6.192079, 6.562320, 6.360672, 6.157856, 6.113738, 6.273343, 6.289766, 6.217630, 6.226502, 6.186815, 6.201218, 6.176712, 6.102012, 6.118100, 6.085544, 6.020944, 6.212991, 6.210265, 6.270777, 6.598666, 6.270651, 6.368525, 6.312591, 6.218008, 6.119585, 6.014455, 5.977… - $ lat 59.64576, 59.57897, 59.58607, 59.76743, 59.84285, 59.82295, 59.70174, 59.70049, 59.78278, 59.70060, 59.67269, 59.65129, 59.58683, 59.59703, 59.51645, 59.48120, 59.42930, 59.40027, 59.36840, 59.26988, 59.18267, 59.18192, 59.14459, 59.11340, 58.90995, 58.88007, 58.76741, 58.76528, 58.69668, 58.64933, 58.60992, 58.62509, 58.49277, 58.44801, 58.32740, 58.27794, 58.29827, 58.32613, 58.32347, 58.37897, 58.47691, 58.46442, 58.55017, 58.75369, 58.87953, 58.95905, 58.89528, 58.92810, 58.98060, 59.03644, 58.96494, 58.85185, 58.93869, 58.92934, 58.96797, 58.93705, 58.90389, 58.83516, 58.84096, 58.87579, 58.90089, 58.92352, 58.90352, 58.97113, 58.99728, 58.98579, 59.02345, 59.07371, 59.12151, 59.13993, 59.14356, 59.25835, 59.32107, 59.31982, 59.26062, 59.30281, 59.29761, 59.31837, 59.39100, 59.32360, 59.31884, 59.34543, 59.31699, 59.32179, 59.37375, 59.32646, 59.34218, 59.44488, 59.40925, 59.50584, 59.55886, 59.51752, 59.64394, 59.64117, 59.47384, 59.40548, 59.41009, 59.35… - $ order 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67, 68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80, 81, 82, 83, 84, 85, 86, 87, 88, 89, 90, 91, 92, 93, 94, 95, 96, 97, 98, 99, 100, 101, 102, 103, 104, 105, 106, 107, 108, 109, 110, 111, 112, 113, 114, 115, 116, 117, 118, 119, 120, 121, 122, 123, 124, 125, 126, 127, 128, 129, 130, 131, 132, 133, 134, 135, 136, 137, 138, 139, 140, 141, 
142, 143, 144, 145, 146, 147, 148, 149, 150, 151, 152, 153, 154, 155, 156, 157, 158, 159, 160, 161, 162, 163, 164, 165, 166, 167, 168, 169, 170, 171, 172, 173, 174, 175, 176, 177, 178, 179, 180, 181, 182, 183, 184, 185, 186, 187, 188, 189, 190, 191, 192, 193, 194, 195, 196, 197, 198, 199, 200, 201, 202, 203, 204, 205, 206, 207, 208, 209, 210, 211, 212, 213, 214, 215, 216, 217… - $ group 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.2, 11.2, 11.2, 11.2, 11.2, 11.2, 11.2, 11.2, 11.2, 11.2, 11.2, 11.2, 11.2, 11.3, 11.3, 11.3, 11.3, 11.3, 11.3, 11.4, 11.4, 11.… - $ location_code "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11", "county_nor11",… + $ long 5.823860, 5.969413, 6.183042, 6.302433, 6.538059, 6.6935… + $ lat 59.64576, 59.57897, 59.58607, 59.76743, 59.84285, 59.822… + $ order 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1… + $ group 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11.1, 11… + $ location_code "county_nor11", "county_nor11", "county_nor11", "county_… Immediately you can see that there's a lot of rows, each representing a point on the map. A data frame with a larger number of rows would be more detailed (i.e. less straight lines, more detail in curvatures of borders etc.). Let's create a very simple map. We'll use the `geom_polygon` to turn our data frame into a map. The location of the points are given in longitudes and latitudes (like x- and y-coordinates), the group serves to make sure lines are drawn correctly (try running the code below without `group = group` and see what happens). The `location_code` denotes the county number (which isn't from 1 to 11, but instead uses some other standard format matching numbers in other government datasets). 
Let's see the simplest map: ``` r -ggplot(map_df, aes(x = long, y = lat, group = group, fill = location_code)) + +ggplot(map_df, aes( + x = long, y = lat, + group = group, fill = location_code +)) + geom_polygon() ``` @@ -90,7 +88,7 @@ ggplot(map_df, aes(x = long, y = lat, group = group, fill = location_code)) + Now let's convert the awkward county numbers to the actual names of the fylker. The `{csdata}` package has a data frame with codes and names for all the kommuner, fylker, and even regions (Øst, Vest, Nord-Norge etc.). We're only interested in the fylker here, so we'll select the unique location codes and the corresponding location names. ``` r -county_names <- csdata::nor_locations_names(border = 2020) |> +county_names <- csdata::nor_locations_names(border = 2020) |> filter(str_detect(location_code, "^county")) |> distinct(location_code, location_name) @@ -110,33 +108,58 @@ print(county_names) 10: county_nor46 Vestland 11: county_nor30 Viken -Now let's also create a nice color palette to give each fylke a nicer color than the default ggplot colors. We'll create a named vector to match each fylke with a color from the *batlow* palette by [Fabio Crameri](https://www.fabiocrameri.ch/colourmaps/). +{{< sidenote >}} +The [`{scico}`](https://github.com/thomasp85/scico) package contains a collection of color palettes that are great for accessibility +{{< /sidenote >}} + +Now let's also create a nice color palette to give each fylke a nicer color than the default ggplot colors. We'll create a named vector to match each fylke with a color from the *batlow* palette by [Fabio Crameri](https://www.fabiocrameri.ch/colourmaps/) implemented in the `{scico}` package. ``` r -county_colors <- setNames(scico::scico(n = nrow(county_names), palette = "batlow"), - nm = county_names$location_name) +county_colors <- setNames( + scico::scico( + n = nrow(county_names), + palette = "batlow" + ), + nm = county_names$location_name +) ``` Let's see what we can make now. We'll add the county names to the large data frame containing the longitudes and latitudes and then create a plot again. I'll also add some other style elements, such as a labels to the x- and y-axes, circles instead of squares for the legend and a map projection. For Norway especially I think a conic map projection works well since the northern fylker are so massive and the southern fylker are more dense, so adding a conic projection with a cone tangent of 40 degrees makes it a bit more perceptionally balanced (`lat0` refers to the cone tangent, the details are complicated but a higher cone tangent results a greater distortion in favor of southern points). 
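To see the effect of the projection in isolation, here is a stripped-down sketch (assuming the `map_df` object from above) before the fully styled map below:

``` r
# Bare-bones comparison: the same polygons with and without the conic projection
p <- ggplot(map_df, aes(x = long, y = lat, group = group)) +
  geom_polygon(fill = "grey70", color = "white")

p                                               # plain longitude/latitude
p + coord_map(projection = "conic", lat0 = 40)  # conic projection, cone tangent at 40 degrees
```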
``` r -map_df |> - left_join(county_names, by = "location_code") |> - ggplot(aes(x = long, y = lat, fill = location_name, group = group)) + - geom_polygon(key_glyph = "point") + - labs(x = NULL, - y = NULL, - fill = NULL) + - scale_x_continuous(labels = scales::label_number(suffix = "\u00b0W")) + - scale_y_continuous(labels = scales::label_number(suffix = "\u00b0N")) + - scale_fill_manual(values = county_colors, - guide = guide_legend(override.aes = list(shape = 21, size = 4))) + - coord_map(projection = "conic", lat0 = 40) + +map_df |> + left_join(county_names, by = "location_code") |> + ggplot(aes( + x = long, y = lat, + fill = location_name, group = group + )) + + geom_polygon(key_glyph = "point") + + labs( + x = NULL, + y = NULL, + fill = NULL + ) + + scale_x_continuous( + labels = scales::label_number(suffix = "\u00b0W") + ) + + scale_y_continuous( + labels = scales::label_number(suffix = "\u00b0N") + ) + + scale_fill_manual( + values = county_colors, + guide = guide_legend(override.aes = list(shape = 21, size = 4)) + ) + + coord_map(projection = "conic", lat0 = 40) + theme_minimal() + - theme(legend.position = c(0.9,0.2), - legend.text = element_text(size = 5), - legend.key.height = unit(10,"pt"), - legend.background = element_rect(fill = "white", color = "transparent")) + theme( + legend.position = c(0.9, 0.2), + legend.text = element_text(size = 5), + legend.key.height = unit(10, "pt"), + legend.background = element_rect( + fill = "white", + color = "transparent" + ) + ) ``` @@ -156,12 +179,20 @@ str_glue("Range across latitude: {str_c(range(map_df$lat), collapse = ', ')}") Let's also combine the map with some actual data. The `{csdata}` package contains some simple data on age distribution in kommuner and fylker. Let's take the data from the different fylker in the last available year and see what proportion of the total population is younger than 18. 
``` r -age_data <- csdata::nor_population_by_age_cats(border = 2020, cats = list(c(0:18))) |> - filter(str_detect(location_code, "^county"), - calyear == max(calyear)) |> - pivot_wider(id_cols = location_code, names_from = age, values_from = pop_jan1_n) |> +age_data <- csdata::nor_population_by_age_cats( + border = 2020, + cats = list(seq(18)) +) |> + filter( + str_detect(location_code, "^county"), + calyear == max(calyear) + ) |> + pivot_wider( + id_cols = location_code, + names_from = age, values_from = pop_jan1_n + ) |> janitor::clean_names() |> - rename(age_0_18 = x000_018) |> + rename(age_0_18 = x001_018) |> mutate(proportion = age_0_18 / total) ``` @@ -169,36 +200,60 @@ Let's create a map without the Oslo inset, combine it with the age distribution ``` r nor_county_map_b2020_default_dt |> - left_join(age_data, by = "location_code") |> + left_join(age_data, by = "location_code") |> ggplot(aes(x = long, y = lat, group = group)) + - geom_polygon(data = map_data("world") |> filter(region != "Norway"), - fill = "grey80") + - geom_polygon(aes(fill = proportion), key_glyph = "point") + + geom_polygon( + data = map_data("world") |> filter(region != "Norway"), + fill = "grey80" + ) + + geom_polygon(aes(fill = proportion), key_glyph = "point") + labs(fill = "Proportion of the population younger than 18") + - scico::scale_fill_scico(palette = "devon", limits = c(0.15, 0.31), - labels = scales::percent_format(accuracy = 1), - guide = guide_colorbar(title.position = "top", title.hjust = 0.5, - barwidth = 10, barheight = 0.5, ticks = FALSE)) + - coord_map(projection = "conic", lat0 = 60, - xlim = c(-8,40), ylim = c(57, 70)) + + scico::scale_fill_scico( + palette = "devon", limits = c(0.15, 0.31), + labels = scales::percent_format(accuracy = 1), + guide = guide_colorbar( + title.position = "top", title.hjust = 0.5, + barwidth = 10, barheight = 0.5, ticks = FALSE + ) + ) + + coord_map( + projection = "conic", lat0 = 60, + xlim = c(-8, 40), ylim = c(57, 70) + ) + theme_void() + - theme(plot.background = element_rect(fill = "#A2C0F4", color = "transparent"), - legend.direction = "horizontal", - legend.position = c(0.8, 0.1), - legend.title = element_text(size = 8), - legend.text = element_text(size = 6)) + theme( + plot.background = element_rect( + fill = "#A2C0F4", + color = "transparent" + ), + legend.direction = "horizontal", + legend.position = c(0.8, 0.1), + legend.title = element_text(size = 8), + legend.text = element_text(size = 6) + ) ``` ## Geocoding +{{< sidenote br="3em" >}} +The Google Maps API requires a personal API key, you can read how to obtain and register an API key [here](https://github.com/dkahle/ggmap) +{{< /sidenote >}} + The `{ggmap}` package also has an incredibly useful function called `mutate_geocode()` which transforms a string with an address or description in character format to longitude and latitude. Since `{ggmap}` uses the Google Maps API, it works similarly to typing in a description in Google Maps. So an approximation of the location will (most likely) get you the right result (e.g. with "Hospital Lillehammer"). Note that `mutate_geocode` uses `lon` instead of `long` as column name for longitude. Just to avoid confusion, I'll rename the column to `long`. ``` r -hospitals_df <- tibble(location = c("Ullevål Sykehus, Oslo","Haukeland universitetssjukehus, Bergen","St. 
Olav, Trondheim", - "Universitetssykehuset Nord-Norge, Tromsø","Stavanger Universitetssjukehus","Sørlandet Hospital Kristiansand", "Hospital Lillehammer")) |> - mutate_geocode(location) |> +hospitals_df <- tibble(location = c( + "Ullevål Sykehus, Oslo", + "Haukeland universitetssjukehus, Bergen", + "St. Olav, Trondheim", + "Universitetssykehuset Nord-Norge, Tromsø", + "Stavanger Universitetssjukehus", + "Sørlandet Hospital Kristiansand", + "Hospital Lillehammer" +)) |> + mutate_geocode(location) |> rename(long = lon) ``` @@ -224,27 +279,47 @@ Now let's put these on top of the map. We'll use the same map we used earlier. W ``` r set.seed(21) -map_df |> - left_join(county_names, by = "location_code") |> - ggplot(aes(x = long, y = lat, fill = location_name, group = group)) + - geom_polygon(key_glyph = "point") + - geom_segment(data = hospitals_df |> filter(str_detect(location, "Oslo")), - aes(x = long, y = lat, xend = 19.5, yend = 62), inherit.aes = FALSE) + - geom_point(data = hospitals_df, aes(x = long, y = lat), inherit.aes = FALSE, - shape = 18, color = "firebrick", size = 4, show.legend = FALSE) + - ggrepel::geom_label_repel(data = hospitals_df, aes(x = long, y = lat, label = location), - size = 2, alpha = 0.75, label.size = 0, inherit.aes = FALSE) + - labs(x = NULL, - y = NULL, - fill = NULL) + - scale_fill_manual(values = county_colors, - guide = guide_legend(override.aes = list(size = 4, shape = 21, - color = "transparent"))) + - coord_map(projection = "conic", lat0 = 60) + +map_df |> + left_join(county_names, by = "location_code") |> + ggplot(aes( + x = long, y = lat, + fill = location_name, group = group + )) + + geom_polygon(key_glyph = "point") + + geom_segment( + data = hospitals_df |> filter(str_detect(location, "Oslo")), + aes(x = long, y = lat, xend = 19.5, yend = 62), + inherit.aes = FALSE + ) + + geom_point( + data = hospitals_df, aes(x = long, y = lat), + inherit.aes = FALSE, + shape = 18, color = "firebrick", + size = 4, show.legend = FALSE + ) + + ggrepel::geom_label_repel( + data = hospitals_df, aes(x = long, y = lat, label = location), + size = 2, alpha = 0.75, label.size = 0, inherit.aes = FALSE + ) + + labs( + x = NULL, + y = NULL, + fill = NULL + ) + + scale_fill_manual( + values = county_colors, + guide = guide_legend(override.aes = list( + size = 4, shape = 21, + color = "transparent" + )) + ) + + coord_map(projection = "conic", lat0 = 60) + theme_void() + - theme(legend.position = c(0.2,0.7), - legend.text = element_text(size = 5), - legend.key.height = unit(10,"pt")) + theme( + legend.position = c(0.2, 0.7), + legend.text = element_text(size = 5), + legend.key.height = unit(10, "pt") + ) ``` @@ -254,97 +329,125 @@ map_df |> Let's take it a step further and now look at how we can combine our map with data that didn't come directly from the FHI. Instead I downloaded some data from the Norwegian Statistics Bureau ([Statistisk sentralbyrå, SSB](https://www.ssb.no)) on land use in the kommuner ([link](https://www.ssb.no/en/natur-og-miljo/areal/statistikk/arealbruk-og-arealressurser)). This came in the form of a semi-colon separated .csv file. 
``` r -area_use <- read_delim("areal.csv", delim = ";", skip = 2) |> +area_use <- read_delim("./data/areal.csv", + delim = ";", skip = 2 +) |> janitor::clean_names() print(area_use) ``` # A tibble: 356 × 20 - region area_2021_residential_areas area_2021_recreational_facilities area_2021_built_up_areas_for_agriculture_and_fishing area_2021_industrial_commercial_and_service_areas area_2021_education_and_day_care_facilities area_2021_health_and_social_welfare_institutions area_2021_cultural_and_religious_activities area_2021_transport_telecommunications_and_technical_infrastructure area_2021_emergency_and_defence_services area_2021_green_areas_and_sports_facilities area_2021_unclassified_built_up_areas_and_related_land area_2021_agricultural_land area_2021_forest area_2021_open_firm_ground area_2021_wetland area_2021_bare_rock_gravel_and_blockfields area_2021_permanent_snow_and_glaciers area_2021_inland_waters area_2021_unclassified_undeveloped_areas - - 1 3001 Halden 9.61 1.85 2.51 2.89 0.29 0.21 0.22 11.4 0.03 0.84 1.51 61.6 480. 2.88 16.2 1.3 0 48.7 0 - 2 3002 Moss 10.4 1.86 1.59 3.92 0.32 0.21 0.14 7.7 0.26 1.57 1.42 33.6 60.0 4.09 0.21 0.48 0 10.1 0 - 3 3003 Sarpsborg 14.0 2.95 3.49 4.39 0.48 0.28 0.19 12.2 0.01 1.87 1.52 78.1 241. 8.22 1.7 1.55 0 33.5 0 - 4 3004 Fredrikstad 19.9 5.14 3.05 6.24 0.62 0.36 0.26 11.8 0.27 2.84 2.49 67.4 139. 8.91 0.23 15.4 0 8.59 0 - 5 3005 Drammen 18.4 1.17 1.26 4.59 0.63 0.35 0.23 11.6 0.01 3.41 2.54 26.0 226. 2.59 5 1.17 0 13.2 0 - 6 3006 Kongsberg 7.68 2.23 2.38 2.23 0.36 0.11 0.15 11.8 0.35 1.86 1.4 40.2 616. 15.4 32.1 18.8 0 39.6 0 - 7 3007 Ringerike 11 3.99 4.18 5.19 0.37 0.29 0.11 21.4 0.36 1.27 2.12 77.7 1191. 47.9 50.8 3.21 0 134. 0 - 8 3011 Hvaler 2.31 5.79 0.65 0.34 0.04 0.02 0.03 1.96 0 0.29 0.19 4.76 34.0 5.65 0.16 33.5 0 0.21 0 - 9 3012 Aremark 0.64 0.73 0.8 0.31 0.02 0.01 0.03 3.35 0 0.16 0.14 20.9 245. 0.94 7.24 0.02 0 38.6 0 - 10 3013 Marker 1.38 0.58 1.68 0.52 0.03 0.02 0.05 5.38 0.01 0.14 0.34 40.0 305. 1.71 10.5 0.04 0 46.0 0 + region area_2021_residentia…¹ area_2021_recreation…² area_2021_built_up_a…³ + + 1 3001 Ha… 9.61 1.85 2.51 + 2 3002 Mo… 10.4 1.86 1.59 + 3 3003 Sa… 14.0 2.95 3.49 + 4 3004 Fr… 19.9 5.14 3.05 + 5 3005 Dr… 18.4 1.17 1.26 + 6 3006 Ko… 7.68 2.23 2.38 + 7 3007 Ri… 11 3.99 4.18 + 8 3011 Hv… 2.31 5.79 0.65 + 9 3012 Ar… 0.64 0.73 0.8 + 10 3013 Ma… 1.38 0.58 1.68 # ℹ 346 more rows + # ℹ abbreviated names: ¹​area_2021_residential_areas, + # ²​area_2021_recreational_facilities, + # ³​area_2021_built_up_areas_for_agriculture_and_fishing + # ℹ 16 more variables: area_2021_industrial_commercial_and_service_areas , + # area_2021_education_and_day_care_facilities , + # area_2021_health_and_social_welfare_institutions , … You can see there's 356 rows, each representing a different kommune in Norway. The columns here represent the surface area (in km2) with different designations (e.g. forest, health services, agriculture etc.). All data here is from 2021. Now, kommunes have different sizes, so I want to get the designations of interest as percentages of total area in the kommune. Here I assumed that the sum of all designations is equal to the total size of each kommune. I also want to extract the kommune number, since we'll use that to merge this data frame with the map later. The kommune number needs to be 4 digits, so we need to add a leading 0 in some instances. Then we'll create the `location_code` column which will match the `location_code` column in the data frame from `{csmaps}`. 
Then we'll calculate the percentage land use for different designations. Here I'm just interested in *"bare rock, gravel, and blockfields"*, *"wetland"*, *"forest"*, and *"open firm ground"*.

``` r
-area_use <- area_use |>
-  mutate(total_area = rowSums(across(where(is.numeric))),
-         kommune_code = parse_number(region),
-         kommune_code = format(kommune_code, digits = 4),
-         kommune_code = str_replace_all(kommune_code, " ", "0"),
-         location_code = str_glue("municip_nor{kommune_code}"),
-         perc_rocks = area_2021_bare_rock_gravel_and_blockfields / total_area,
-         perc_wetland = area_2021_wetland / total_area,
-         perc_forest = area_2021_forest / total_area,
-         perc_open_ground = area_2021_open_firm_ground / total_area) |>
-  arrange(kommune_code) |>
+area_use <- area_use |>
+  mutate(
+    total_area = rowSums(across(where(is.numeric))),
+    kommune_code = parse_number(region),
+    kommune_code = format(kommune_code, digits = 4),
+    kommune_code = str_replace_all(kommune_code, " ", "0"),
+    location_code = str_glue("municip_nor{kommune_code}"),
+    perc_rocks = area_2021_bare_rock_gravel_and_blockfields / total_area,
+    perc_wetland = area_2021_wetland / total_area,
+    perc_forest = area_2021_forest / total_area,
+    perc_open_ground = area_2021_open_firm_ground / total_area
+  ) |>
+  arrange(kommune_code) |>
   glimpse()
```

    Rows: 356
    Columns: 27
- [full-width glimpse() listing of all 27 columns, region through perc_open_ground, replaced by the truncated listing below]
+ $ region "0…
+ $ area_2021_residential_areas 51…
+ $ area_2021_recreational_facilities 1.…
+ $ area_2021_built_up_areas_for_agriculture_and_fishing 0.…
+ $ area_2021_industrial_commercial_and_service_areas 10…
+ $ area_2021_education_and_day_care_facilities 3.…
+ $ area_2021_health_and_social_welfare_institutions 1.…
+ $ area_2021_cultural_and_religious_activities 0.…
+ $ area_2021_transport_telecommunications_and_technical_infrastructure 36…
+ $ area_2021_emergency_and_defence_services 0.…
+ $ area_2021_green_areas_and_sports_facilities 9.…
+ $ area_2021_unclassified_built_up_areas_and_related_land 8.…
+ $ area_2021_agricultural_land 9.…
+ $ area_2021_forest 27…
+ $ area_2021_open_firm_ground 7.…
+ $ area_2021_wetland 4.…
+ $ area_2021_bare_rock_gravel_and_blockfields 0.…
+ $ area_2021_permanent_snow_and_glaciers 0.…
+ $ area_2021_inland_waters 28…
+ $ area_2021_unclassified_undeveloped_areas 0,…
+ $ total_area 45…
+ $ kommune_code "0…
+ $ location_code "…
+ $ perc_rocks 0.…
+ $ perc_wetland 0.…
+ $ perc_forest 0.…
+ $ perc_open_ground 0.…

Then the next step is very similar to what we've done before.
We'll use `left_join` to merge the data frame containing the land use variables with the data frame containing the map with the kommune borders. I want to plot the four designations of interest in one figure, so I'll transform the plot to a long format using `pivot_longer`. Then I'll create a new label with nicer descriptions of the designations, and then the rest is similar to before. We'll facet the plot based on the designation: ``` r -nor_municip_map_b2020_split_dt |> - left_join(area_use, by = "location_code") |> - pivot_longer(cols = starts_with("perc"), names_to = "land_type", values_to = "percentage") |> - mutate(land_type_label = case_when(str_detect(land_type, "rocks") ~ "Bare rock, gravel and rockfields", - str_detect(land_type, "wetland") ~ "Wetland", - str_detect(land_type, "forest") ~ "Forest", - str_detect(land_type, "open_ground") ~ "Open firm ground")) |> - ggplot(aes(x = long, y = lat, group = group, fill = percentage)) + - geom_polygon() + +nor_municip_map_b2020_split_dt |> + left_join(area_use, by = "location_code") |> + pivot_longer( + cols = starts_with("perc"), + names_to = "land_type", values_to = "percentage" + ) |> + mutate(land_type_label = case_when( + str_detect(land_type, "rocks") ~ "Bare rock, gravel and rockfields", + str_detect(land_type, "wetland") ~ "Wetland", + str_detect(land_type, "forest") ~ "Forest", + str_detect(land_type, "open_ground") ~ "Open firm ground" + )) |> + ggplot(aes( + x = long, y = lat, + group = group, fill = percentage + )) + + geom_polygon() + labs(fill = "Percentage") + - scico::scale_fill_scico(palette = "acton", labels = scales::label_percent(), limits = c(0,1), - guide = guide_colorbar(barheight = 0.5, barwidth = 12, - ticks = FALSE, direction = "horizontal", - title.position = "top", title.hjust = 0.5)) + - facet_wrap(~ land_type_label) + - coord_map(projection = "conic", lat0 = 60) + + scico::scale_fill_scico( + palette = "acton", + labels = scales::label_percent(), + limits = c(0, 1), + guide = guide_colorbar( + barheight = 0.5, barwidth = 12, + ticks = FALSE, direction = "horizontal", + title.position = "top", title.hjust = 0.5 + ) + ) + + facet_wrap(~land_type_label) + + coord_map(projection = "conic", lat0 = 60) + theme_void() + - theme(legend.position = "bottom", - strip.text.x = element_textbox_simple(size = rel(1.25), halign = 0.5, - margin = margin(10,0,10,0, "pt"))) + theme( + legend.position = "bottom", + strip.text.x = element_textbox_simple( + size = rel(1.25), halign = 0.5, + margin = margin(10, 0, 10, 0, "pt") + ) + ) ``` @@ -353,45 +456,53 @@ nor_municip_map_b2020_split_dt |> The last thing I want to show is a map of Oslo! `{csmaps}` contains a detailed map of the bydeler that we will use. Now, these bydeler are again coded and `{csdata}` (since it's update) now contains a [data frame](https://www.csids.no/csmaps/articles/customization.html#add-location-name-for-ward-and-population-for-oslo-map) with the corresponding names for all geography levels (fylker, municipalities, bydeler, etc.). We could get our data from there, but we also need something to visualize so we'll scrape a Wikipedia article for Oslo's bydeler which contains a table with the bydel numbers, the names, and some data we can use for visualization. We'll extract the table from the website using `{rvest}`, do some data wrangling and prepare it for merging into the data frame with the map. I won't go into the wrangling much here, we're interested mainly in the plotting of the data right now. 
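One wrangling detail in the chunk below that's easy to miss: the population figures in the Wikipedia table use a space as thousands separator (e.g. "58 671"), which the post handles by stripping the blanks before converting to numeric. An alternative sketch, assuming the separator really is a plain space character, is to let `readr::parse_number()` handle it via the locale:

``` r
library(readr)

# "58 671" -> 58671: treat the space as a grouping mark instead of stripping it by hand
parse_number("58 671", locale = locale(grouping_mark = " "))
#> [1] 58671
```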
+{{< sidenote >}} +I won't go into much detail on web scraping here, you can see one of the [other posts](https://danielroelfs.com/blog/) for more details +{{< /sidenote >}} + ``` r -bydel_data <- "https://en.wikipedia.org/wiki/List_of_boroughs_of_Oslo" |> - rvest::read_html() |> - rvest::html_table() |> - pluck(1) |> - janitor::clean_names() |> - mutate(inhabitants = str_remove_all(residents, "[[:blank:]]"), - inhabitants = as.numeric(inhabitants), - area = str_remove_all(area, "km2"), - area = str_replace_all(area, ",", "."), - area = str_squish(area), - area = as.numeric(area), - pop_density = inhabitants / area) |> - arrange(number) |> - mutate(bydel_nr = format(number, digits = 2), - bydel_nr = str_replace_all(bydel_nr, " ", "0"), - location_code = str_glue("wardoslo_nor0301{bydel_nr}")) +bydel_data <- "https://en.wikipedia.org/wiki/List_of_boroughs_of_Oslo" |> + rvest::read_html() |> + rvest::html_table() |> + pluck(1) |> + janitor::clean_names() |> + mutate( + inhabitants = str_remove_all(residents, "[[:blank:]]"), + inhabitants = as.numeric(inhabitants), + area = str_remove_all(area, "km2"), + area = str_replace_all(area, ",", "."), + area = str_squish(area), + area = as.numeric(area), + pop_density = inhabitants / area + ) |> + arrange(number) |> + mutate( + bydel_nr = format(number, digits = 2), + bydel_nr = str_replace_all(bydel_nr, " ", "0"), + location_code = str_glue("wardoslo_nor0301{bydel_nr}") + ) print(bydel_data) ``` # A tibble: 15 × 8 - borough residents area number inhabitants pop_density bydel_nr location_code - - 1 Gamle Oslo 58 671 7.5 1 58671 7823. 01 wardoslo_nor030101 - 2 Grünerløkka 62 423 4.8 2 62423 13005. 02 wardoslo_nor030102 - 3 Sagene 45 089 3.1 3 45089 14545. 03 wardoslo_nor030103 - 4 St. Hanshaugen 38 945 3.6 4 38945 10818. 04 wardoslo_nor030104 - 5 Frogner 59 269 8.3 5 59269 7141. 05 wardoslo_nor030105 - 6 Ullern 34 596 9.4 6 34596 3680. 06 wardoslo_nor030106 - 7 Vestre Aker 50 157 16.6 7 50157 3022. 07 wardoslo_nor030107 - 8 Nordre Aker 52 327 13.6 8 52327 3848. 08 wardoslo_nor030108 - 9 Bjerke 33 422 7.7 9 33422 4341. 09 wardoslo_nor030109 - 10 Grorud 27 707 8.2 10 27707 3379. 10 wardoslo_nor030110 - 11 Stovner 33 316 8.2 11 33316 4063. 11 wardoslo_nor030111 - 12 Alna 49 801 13.7 12 49801 3635. 12 wardoslo_nor030112 - 13 Østensjø 50 806 12.2 13 50806 4164. 13 wardoslo_nor030113 - 14 Nordstrand 52 459 16.9 14 52459 3104. 14 wardoslo_nor030114 - 15 Søndre Nordstrand 39 066 18.4 15 39066 2123. 15 wardoslo_nor030115 + borough residents area number inhabitants pop_density bydel_nr location_code + + 1 Gamle … 58 671 7.5 1 58671 7823. 01 wardoslo_nor… + 2 Grüner… 62 423 4.8 2 62423 13005. 02 wardoslo_nor… + 3 Sagene 45 089 3.1 3 45089 14545. 03 wardoslo_nor… + 4 St. Ha… 38 945 3.6 4 38945 10818. 04 wardoslo_nor… + 5 Frogner 59 269 8.3 5 59269 7141. 05 wardoslo_nor… + 6 Ullern 34 596 9.4 6 34596 3680. 06 wardoslo_nor… + 7 Vestre… 50 157 16.6 7 50157 3022. 07 wardoslo_nor… + 8 Nordre… 52 327 13.6 8 52327 3848. 08 wardoslo_nor… + 9 Bjerke 33 422 7.7 9 33422 4341. 09 wardoslo_nor… + 10 Grorud 27 707 8.2 10 27707 3379. 10 wardoslo_nor… + 11 Stovner 33 316 8.2 11 33316 4063. 11 wardoslo_nor… + 12 Alna 49 801 13.7 12 49801 3635. 12 wardoslo_nor… + 13 Østens… 50 806 12.2 13 50806 4164. 13 wardoslo_nor… + 14 Nordst… 52 459 16.9 14 52459 3104. 14 wardoslo_nor… + 15 Søndre… 39 066 18.4 15 39066 2123. 
15 wardoslo_nor… `{csmaps}` also provides a very useful data frame containing the geographical center or best location to put a label to avoid overlap and make it as clear as possible which label corresponds to which bydel. So we'll merge those two together. @@ -404,19 +515,34 @@ Then we'll create the final plot. This will be more-or-less identical to what we ``` r oslo_ward_map_b2020_default_dt |> - left_join(bydel_data, by = "location_code") |> + left_join(bydel_data, by = "location_code") |> ggplot(aes(x = long, y = lat, group = group)) + - geom_polygon(aes(color = pop_density, fill = pop_density)) + - geom_label(data = bydel_centres, aes(label = borough, group = 1), alpha = 0.5, label.size = 0) + - labs(fill = "No of inhabitants per km2") + - scico::scale_color_scico(palette = "turku", limits = c(0,1.5e4), guide = "none") + - scico::scale_fill_scico(palette = "turku", limits = c(0,1.5e4), labels = scales::number_format(), - guide = guide_colorbar(title.position = "top", title.hjust = 0.5, - barwidth = 15, barheight = 0.75, ticks = FALSE)) + - theme_void() + - theme(legend.position = "bottom", - legend.title = element_markdown(), - legend.direction = "horizontal") + geom_polygon(aes(color = pop_density, fill = pop_density)) + + geom_label( + data = bydel_centres, aes(label = borough, group = 1), + alpha = 0.5, label.size = 0 + ) + + labs(fill = "No of inhabitants per km2") + + scico::scale_color_scico( + palette = "turku", + limits = c(0, 1.5e4), + guide = "none" + ) + + scico::scale_fill_scico( + palette = "turku", + limits = c(0, 1.5e4), + labels = scales::number_format(), + guide = guide_colorbar( + title.position = "top", title.hjust = 0.5, + barwidth = 15, barheight = 0.75, ticks = FALSE + ) + ) + + theme_void() + + theme( + legend.position = "bottom", + legend.title = element_markdown(), + legend.direction = "horizontal" + ) ``` @@ -429,11 +555,11 @@ An example of when you might use the `sf` data format is in interactive maps. He library(leaflet) map_sf <- csmaps::nor_county_map_b2020_default_sf |> - sf::st_as_sf() |> + sf::st_as_sf() |> left_join(county_names, by = "location_code") -map_sf |> - leaflet() |> +map_sf |> + leaflet() |> addProviderTiles(providers$Esri.WorldStreetMap) |> addPolygons( fillColor = unname(county_colors), @@ -442,7 +568,8 @@ map_sf |> fillOpacity = 0.75, highlightOptions = highlightOptions( color = "#333", bringToFront = TRUE, - weight = 2, opacity = 1) + weight = 2, opacity = 1 + ) ) ``` @@ -453,3 +580,7 @@ map_sf |> In addition, I also replaced all usage of the NORMENT internal `{normentR}` package with the `{scico}` package which is available on CRAN. **EDIT (2023-06-18)**: Updated the blogpost to reflect the deprecation of the `{splmaps}` and `{spldata}` packages. Replaced them with the replacement package `{csmaps}` available on CRAN. + + + + diff --git a/content/blog/2021-easy-map-norway/index.qmd b/content/blog/2021-easy-map-norway/index.qmd index c52efc6..efd2443 100644 --- a/content/blog/2021-easy-map-norway/index.qmd +++ b/content/blog/2021-easy-map-norway/index.qmd @@ -9,52 +9,38 @@ tags: - ggplot - map - norway -editor_options: - chunk_output_type: console execute: fig.retina: 2 fig.align: center fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- - ```{css} #| label: style #| echo: FALSE -p.announcement { - border-radius: 5px; - background-color: #acc8d4; - padding: 1em; -} - -p.announcement code { +p.stand-out-paragraph code { background-color: #93b8c8; } ``` -

-As of March 2023 {fhimaps} and {splmaps} are no longer supported due to budget cuts at the institution supporting them. The amazing developers (led by Richard Aubrey White and Chi Zhang) have moved the data and functionality to a new package called {csmaps}. The code below is updated to reflect the changes. -

- -```{r} -#| label: knitr-opts -#| include: false - -options(width = 999) -``` - - - - +{{{< standout bg="#acc8d4" >}}} +As of March 2023 {fhimaps} and {splmaps} are no longer supported due to budget cuts at the institution supporting them. The amazing developers (led by [Richard Aubrey White](https://www.rwhite.no) and [Chi Zhang](https://andreaczhang.github.io)) have moved the data and functionality to a new package called `{csmaps}`. The code below is updated to reflect the changes. +{{{< /standout >}}} Every now and then you discover a discover a much simpler solution to a problem you spent a lot of time solving. This recently happened to me on the topic of creating a map of Norway in R. In this post, I want to go through the process of what I learned. Previously, I used a JSON file and the `{geojsonio}` package to create a map of Norway and its fylker (counties) in particular. This was a very flexible and robust way of going about this, but also quite cumbersome. This method relies on a high-quality JSON file, meaning, a file that is detailed enough to display everything nicely, but not too detailed that it takes a ton of time and computing power to create a single plot. While I'll still use this method if I need to create a map for places other than Norway, I think I've found a better and easier solution for plotting Norway and the fylker and kommuner in the [`{csmaps}`](https://www.csids.no/csmaps/) package. -The `{csmaps}` package is created by the _Consortium for Statistics in Disease Surveillance_ team. It's part of a series of packages (which they refer to as the "csverse"), which includes a package containing basic health surveillance data ([`{csdata}`](https://www.csids.no/csdata/)), one for real-time analysis in disease surveillance ([`{sc9}`](https://www.csids.no/sc9/)) and a few more. Here I'll dive into the `{csmaps}` package with some help from the `{csdata}` package. I'll also use the `{ggmap}` package to help with some other data and plotting. It's perhaps important to note that `{ggmap}` does contain a map of Norway as a whole, but not of the fylker and kommuner (municipalities), hence the usefulness of the `{csmaps}` package, which contains both. I'll also use `{tidyverse}` and `{ggtext}` as I always do. I won't load `{csmaps}` with the `library()` function, but will use the `::` operator instead since it'll make it easier to navigate the different datasets included. +{{{< sidenote >}}} +The group has [a bunch of packages](https://www.csids.no) for data science within public health +{{{< /sidenote >}}} + +The `{csmaps}` package is created by the [_Consortium for Statistics in Disease Surveillance_](https://www.csids.no) team. It's part of a series of packages (which they refer to as the "csverse"), which includes a package containing basic health surveillance data ([`{csdata}`](https://www.csids.no/csdata/)), one for real-time analysis in disease surveillance ([`{sc9}`](https://www.csids.no/sc9/)) and a few more. Here I'll dive into the `{csmaps}` package with some help from the `{csdata}` package. I'll also use the `{ggmap}` package to help with some other data and plotting. It's perhaps important to note that `{ggmap}` does contain a map of Norway as a whole, but not of the fylker and kommuner (municipalities), hence the usefulness of the `{csmaps}` package, which contains both. I'll also use `{tidyverse}` and `{ggtext}` as I always do. 
I won't load `{csmaps}` with the `library()` function, but will use the `::` operator instead since it'll make it easier to navigate the different datasets included. ```{r} #| label: pkgs @@ -72,10 +58,10 @@ So let's have a look at what's included. You'll see that nearly all maps come in #| label: print-map-list #| eval: false -data(package = "csmaps") |> - pluck("results") |> - as_tibble() |> - select(Item, Title) |> +data(package = "csmaps") |> + pluck("results") |> + as_tibble() |> + select(Item, Title) |> print(n = 18) ``` @@ -86,7 +72,7 @@ So let's have a look at one of those maps. For instance the one with the new fyl ```{r} #| label: load-map -map_df <- nor_county_map_b2020_insert_oslo_dt |> +map_df <- nor_county_map_b2020_insert_oslo_dt |> glimpse() ``` @@ -95,7 +81,10 @@ Immediately you can see that there's a lot of rows, each representing a point on ```{r} #| label: minimal-plot -ggplot(map_df, aes(x = long, y = lat, group = group, fill = location_code)) + +ggplot(map_df, aes( + x = long, y = lat, + group = group, fill = location_code +)) + geom_polygon() ``` @@ -104,21 +93,30 @@ Now let's convert the awkward county numbers to the actual names of the fylker. ```{r} #| label: fylke-names -county_names <- csdata::nor_locations_names(border = 2020) |> +county_names <- csdata::nor_locations_names(border = 2020) |> filter(str_detect(location_code, "^county")) |> distinct(location_code, location_name) print(county_names) ``` -Now let's also create a nice color palette to give each fylke a nicer color than the default ggplot colors. We'll create a named vector to match each fylke with a color from the _batlow_ palette by [Fabio Crameri](https://www.fabiocrameri.ch/colourmaps/). +{{{< sidenote >}}} +The [`{scico}`](https://github.com/thomasp85/scico) package contains a collection of color palettes that are great for accessibility +{{{< /sidenote >}}} + +Now let's also create a nice color palette to give each fylke a nicer color than the default ggplot colors. We'll create a named vector to match each fylke with a color from the _batlow_ palette by [Fabio Crameri](https://www.fabiocrameri.ch/colourmaps/) implemented in the `{scico}` package. ```{r} #| label: fylke-colors #| message: false -county_colors <- setNames(scico::scico(n = nrow(county_names), palette = "batlow"), - nm = county_names$location_name) +county_colors <- setNames( + scico::scico( + n = nrow(county_names), + palette = "batlow" + ), + nm = county_names$location_name +) ``` Let's see what we can make now. We'll add the county names to the large data frame containing the longitudes and latitudes and then create a plot again. I'll also add some other style elements, such as a labels to the x- and y-axes, circles instead of squares for the legend and a map projection. For Norway especially I think a conic map projection works well since the northern fylker are so massive and the southern fylker are more dense, so adding a conic projection with a cone tangent of 40 degrees makes it a bit more perceptionally balanced (`lat0` refers to the cone tangent, the details are complicated but a higher cone tangent results a greater distortion in favor of southern points). @@ -126,23 +124,39 @@ Let's see what we can make now. 
We'll add the county names to the large data fra ```{r} #| label: simple-plot -map_df |> - left_join(county_names, by = "location_code") |> - ggplot(aes(x = long, y = lat, fill = location_name, group = group)) + - geom_polygon(key_glyph = "point") + - labs(x = NULL, - y = NULL, - fill = NULL) + - scale_x_continuous(labels = scales::label_number(suffix = "\u00b0W")) + - scale_y_continuous(labels = scales::label_number(suffix = "\u00b0N")) + - scale_fill_manual(values = county_colors, - guide = guide_legend(override.aes = list(shape = 21, size = 4))) + - coord_map(projection = "conic", lat0 = 40) + +map_df |> + left_join(county_names, by = "location_code") |> + ggplot(aes( + x = long, y = lat, + fill = location_name, group = group + )) + + geom_polygon(key_glyph = "point") + + labs( + x = NULL, + y = NULL, + fill = NULL + ) + + scale_x_continuous( + labels = scales::label_number(suffix = "\u00b0W") + ) + + scale_y_continuous( + labels = scales::label_number(suffix = "\u00b0N") + ) + + scale_fill_manual( + values = county_colors, + guide = guide_legend(override.aes = list(shape = 21, size = 4)) + ) + + coord_map(projection = "conic", lat0 = 40) + theme_minimal() + - theme(legend.position = c(0.9,0.2), - legend.text = element_text(size = 5), - legend.key.height = unit(10,"pt"), - legend.background = element_rect(fill = "white", color = "transparent")) + theme( + legend.position = c(0.9, 0.2), + legend.text = element_text(size = 5), + legend.key.height = unit(10, "pt"), + legend.background = element_rect( + fill = "white", + color = "transparent" + ) + ) ``` ## Norway with Scandinavia @@ -162,12 +176,20 @@ Let's also combine the map with some actual data. The `{csdata}` package contain ```{r} #| label: get-age-data -age_data <- csdata::nor_population_by_age_cats(border = 2020, cats = list(c(0:18))) |> - filter(str_detect(location_code, "^county"), - calyear == max(calyear)) |> - pivot_wider(id_cols = location_code, names_from = age, values_from = pop_jan1_n) |> +age_data <- csdata::nor_population_by_age_cats( + border = 2020, + cats = list(seq(18)) +) |> + filter( + str_detect(location_code, "^county"), + calyear == max(calyear) + ) |> + pivot_wider( + id_cols = location_code, + names_from = age, values_from = pop_jan1_n + ) |> janitor::clean_names() |> - rename(age_0_18 = x000_018) |> + rename(age_0_18 = x001_018) |> mutate(proportion = age_0_18 / total) ``` @@ -177,37 +199,61 @@ Let's create a map without the Oslo inset, combine it with the age distribution #| label: age-plot nor_county_map_b2020_default_dt |> - left_join(age_data, by = "location_code") |> + left_join(age_data, by = "location_code") |> ggplot(aes(x = long, y = lat, group = group)) + - geom_polygon(data = map_data("world") |> filter(region != "Norway"), - fill = "grey80") + - geom_polygon(aes(fill = proportion), key_glyph = "point") + + geom_polygon( + data = map_data("world") |> filter(region != "Norway"), + fill = "grey80" + ) + + geom_polygon(aes(fill = proportion), key_glyph = "point") + labs(fill = "Proportion of the population younger than 18") + - scico::scale_fill_scico(palette = "devon", limits = c(0.15, 0.31), - labels = scales::percent_format(accuracy = 1), - guide = guide_colorbar(title.position = "top", title.hjust = 0.5, - barwidth = 10, barheight = 0.5, ticks = FALSE)) + - coord_map(projection = "conic", lat0 = 60, - xlim = c(-8,40), ylim = c(57, 70)) + + scico::scale_fill_scico( + palette = "devon", limits = c(0.15, 0.31), + labels = scales::percent_format(accuracy = 1), + guide = guide_colorbar( + 
title.position = "top", title.hjust = 0.5, + barwidth = 10, barheight = 0.5, ticks = FALSE + ) + ) + + coord_map( + projection = "conic", lat0 = 60, + xlim = c(-8, 40), ylim = c(57, 70) + ) + theme_void() + - theme(plot.background = element_rect(fill = "#A2C0F4", color = "transparent"), - legend.direction = "horizontal", - legend.position = c(0.8, 0.1), - legend.title = element_text(size = 8), - legend.text = element_text(size = 6)) + theme( + plot.background = element_rect( + fill = "#A2C0F4", + color = "transparent" + ), + legend.direction = "horizontal", + legend.position = c(0.8, 0.1), + legend.title = element_text(size = 8), + legend.text = element_text(size = 6) + ) ``` ## Geocoding +{{{< sidenote br="3em" >}}} +The Google Maps API requires a personal API key, you can read how to obtain and register an API key [here](https://github.com/dkahle/ggmap) +{{{< /sidenote >}}} + The `{ggmap}` package also has an incredibly useful function called `mutate_geocode()` which transforms a string with an address or description in character format to longitude and latitude. Since `{ggmap}` uses the Google Maps API, it works similarly to typing in a description in Google Maps. So an approximation of the location will (most likely) get you the right result (e.g. with "Hospital Lillehammer"). Note that `mutate_geocode` uses `lon` instead of `long` as column name for longitude. Just to avoid confusion, I'll rename the column to `long`. ```{r} #| label: get-cities-locations #| eval: false -hospitals_df <- tibble(location = c("Ullevål Sykehus, Oslo","Haukeland universitetssjukehus, Bergen","St. Olav, Trondheim", - "Universitetssykehuset Nord-Norge, Tromsø","Stavanger Universitetssjukehus","Sørlandet Hospital Kristiansand", "Hospital Lillehammer")) |> - mutate_geocode(location) |> +hospitals_df <- tibble(location = c( + "Ullevål Sykehus, Oslo", + "Haukeland universitetssjukehus, Bergen", + "St. Olav, Trondheim", + "Universitetssykehuset Nord-Norge, Tromsø", + "Stavanger Universitetssjukehus", + "Sørlandet Hospital Kristiansand", + "Hospital Lillehammer" +)) |> + mutate_geocode(location) |> rename(long = lon) ``` @@ -216,14 +262,14 @@ hospitals_df <- tibble(location = c("Ullevål Sykehus, Oslo","Haukeland universi #| echo: false #| eval: false -write_rds(hospitals_df, "norway_hospitals.rds") +write_rds(hospitals_df, "./data/norway_hospitals.rds") ``` ```{r} #| label: load-cities-geo #| echo: false -hospitals_df <- read_rds("norway_hospitals.rds") +hospitals_df <- read_rds("./data/norway_hospitals.rds") ``` This is the list of coordinates it gave us: @@ -241,27 +287,47 @@ Now let's put these on top of the map. We'll use the same map we used earlier. 
W set.seed(21) -map_df |> - left_join(county_names, by = "location_code") |> - ggplot(aes(x = long, y = lat, fill = location_name, group = group)) + - geom_polygon(key_glyph = "point") + - geom_segment(data = hospitals_df |> filter(str_detect(location, "Oslo")), - aes(x = long, y = lat, xend = 19.5, yend = 62), inherit.aes = FALSE) + - geom_point(data = hospitals_df, aes(x = long, y = lat), inherit.aes = FALSE, - shape = 18, color = "firebrick", size = 4, show.legend = FALSE) + - ggrepel::geom_label_repel(data = hospitals_df, aes(x = long, y = lat, label = location), - size = 2, alpha = 0.75, label.size = 0, inherit.aes = FALSE) + - labs(x = NULL, - y = NULL, - fill = NULL) + - scale_fill_manual(values = county_colors, - guide = guide_legend(override.aes = list(size = 4, shape = 21, - color = "transparent"))) + - coord_map(projection = "conic", lat0 = 60) + +map_df |> + left_join(county_names, by = "location_code") |> + ggplot(aes( + x = long, y = lat, + fill = location_name, group = group + )) + + geom_polygon(key_glyph = "point") + + geom_segment( + data = hospitals_df |> filter(str_detect(location, "Oslo")), + aes(x = long, y = lat, xend = 19.5, yend = 62), + inherit.aes = FALSE + ) + + geom_point( + data = hospitals_df, aes(x = long, y = lat), + inherit.aes = FALSE, + shape = 18, color = "firebrick", + size = 4, show.legend = FALSE + ) + + ggrepel::geom_label_repel( + data = hospitals_df, aes(x = long, y = lat, label = location), + size = 2, alpha = 0.75, label.size = 0, inherit.aes = FALSE + ) + + labs( + x = NULL, + y = NULL, + fill = NULL + ) + + scale_fill_manual( + values = county_colors, + guide = guide_legend(override.aes = list( + size = 4, shape = 21, + color = "transparent" + )) + ) + + coord_map(projection = "conic", lat0 = 60) + theme_void() + - theme(legend.position = c(0.2,0.7), - legend.text = element_text(size = 5), - legend.key.height = unit(10,"pt")) + theme( + legend.position = c(0.2, 0.7), + legend.text = element_text(size = 5), + legend.key.height = unit(10, "pt") + ) ``` @@ -273,7 +339,9 @@ Let's take it a step further and now look at how we can combine our map with dat #| label: load-area-use #| message: false -area_use <- read_delim("areal.csv", delim = ";", skip = 2) |> +area_use <- read_delim("./data/areal.csv", + delim = ";", skip = 2 +) |> janitor::clean_names() print(area_use) @@ -284,17 +352,19 @@ You can see there's 356 rows, each representing a different kommune in Norway. 
T ```{r} #| label: wrangle-area-use -area_use <- area_use |> - mutate(total_area = rowSums(across(where(is.numeric))), - kommune_code = parse_number(region), - kommune_code = format(kommune_code, digits = 4), - kommune_code = str_replace_all(kommune_code, " ", "0"), - location_code = str_glue("municip_nor{kommune_code}"), - perc_rocks = area_2021_bare_rock_gravel_and_blockfields / total_area, - perc_wetland = area_2021_wetland / total_area, - perc_forest = area_2021_forest / total_area, - perc_open_ground = area_2021_open_firm_ground / total_area) |> - arrange(kommune_code) |> +area_use <- area_use |> + mutate( + total_area = rowSums(across(where(is.numeric))), + kommune_code = parse_number(region), + kommune_code = format(kommune_code, digits = 4), + kommune_code = str_replace_all(kommune_code, " ", "0"), + location_code = str_glue("municip_nor{kommune_code}"), + perc_rocks = area_2021_bare_rock_gravel_and_blockfields / total_area, + perc_wetland = area_2021_wetland / total_area, + perc_forest = area_2021_forest / total_area, + perc_open_ground = area_2021_open_firm_ground / total_area + ) |> + arrange(kommune_code) |> glimpse() ``` @@ -303,51 +373,77 @@ Then the next step is very similar to what we've done before. We'll use `left_jo ```{r} #| label: plot-kommune-faceted -nor_municip_map_b2020_split_dt |> - left_join(area_use, by = "location_code") |> - pivot_longer(cols = starts_with("perc"), names_to = "land_type", values_to = "percentage") |> - mutate(land_type_label = case_when(str_detect(land_type, "rocks") ~ "Bare rock, gravel and rockfields", - str_detect(land_type, "wetland") ~ "Wetland", - str_detect(land_type, "forest") ~ "Forest", - str_detect(land_type, "open_ground") ~ "Open firm ground")) |> - ggplot(aes(x = long, y = lat, group = group, fill = percentage)) + - geom_polygon() + +nor_municip_map_b2020_split_dt |> + left_join(area_use, by = "location_code") |> + pivot_longer( + cols = starts_with("perc"), + names_to = "land_type", values_to = "percentage" + ) |> + mutate(land_type_label = case_when( + str_detect(land_type, "rocks") ~ "Bare rock, gravel and rockfields", + str_detect(land_type, "wetland") ~ "Wetland", + str_detect(land_type, "forest") ~ "Forest", + str_detect(land_type, "open_ground") ~ "Open firm ground" + )) |> + ggplot(aes( + x = long, y = lat, + group = group, fill = percentage + )) + + geom_polygon() + labs(fill = "Percentage") + - scico::scale_fill_scico(palette = "acton", labels = scales::label_percent(), limits = c(0,1), - guide = guide_colorbar(barheight = 0.5, barwidth = 12, - ticks = FALSE, direction = "horizontal", - title.position = "top", title.hjust = 0.5)) + - facet_wrap(~ land_type_label) + - coord_map(projection = "conic", lat0 = 60) + + scico::scale_fill_scico( + palette = "acton", + labels = scales::label_percent(), + limits = c(0, 1), + guide = guide_colorbar( + barheight = 0.5, barwidth = 12, + ticks = FALSE, direction = "horizontal", + title.position = "top", title.hjust = 0.5 + ) + ) + + facet_wrap(~land_type_label) + + coord_map(projection = "conic", lat0 = 60) + theme_void() + - theme(legend.position = "bottom", - strip.text.x = element_textbox_simple(size = rel(1.25), halign = 0.5, - margin = margin(10,0,10,0, "pt"))) + theme( + legend.position = "bottom", + strip.text.x = element_textbox_simple( + size = rel(1.25), halign = 0.5, + margin = margin(10, 0, 10, 0, "pt") + ) + ) ``` ## Oslo The last thing I want to show is a map of Oslo! `{csmaps}` contains a detailed map of the bydeler that we will use. 
Now, these bydeler are again coded and `{csdata}` (since it's update) now contains a [data frame](https://www.csids.no/csmaps/articles/customization.html#add-location-name-for-ward-and-population-for-oslo-map) with the corresponding names for all geography levels (fylker, municipalities, bydeler, etc.). We could get our data from there, but we also need something to visualize so we'll scrape a Wikipedia article for Oslo's bydeler which contains a table with the bydel numbers, the names, and some data we can use for visualization. We'll extract the table from the website using `{rvest}`, do some data wrangling and prepare it for merging into the data frame with the map. I won't go into the wrangling much here, we're interested mainly in the plotting of the data right now. +{{{< sidenote >}}} +I won't go into much detail on web scraping here, you can see one of the [other posts](https://danielroelfs.com/blog/) for more details +{{{< /sidenote >}}} + ```{r} #| label: scrape-bydel-names -bydel_data <- "https://en.wikipedia.org/wiki/List_of_boroughs_of_Oslo" |> - rvest::read_html() |> - rvest::html_table() |> - pluck(1) |> - janitor::clean_names() |> - mutate(inhabitants = str_remove_all(residents, "[[:blank:]]"), - inhabitants = as.numeric(inhabitants), - area = str_remove_all(area, "km2"), - area = str_replace_all(area, ",", "."), - area = str_squish(area), - area = as.numeric(area), - pop_density = inhabitants / area) |> - arrange(number) |> - mutate(bydel_nr = format(number, digits = 2), - bydel_nr = str_replace_all(bydel_nr, " ", "0"), - location_code = str_glue("wardoslo_nor0301{bydel_nr}")) +bydel_data <- "https://en.wikipedia.org/wiki/List_of_boroughs_of_Oslo" |> + rvest::read_html() |> + rvest::html_table() |> + pluck(1) |> + janitor::clean_names() |> + mutate( + inhabitants = str_remove_all(residents, "[[:blank:]]"), + inhabitants = as.numeric(inhabitants), + area = str_remove_all(area, "km2"), + area = str_replace_all(area, ",", "."), + area = str_squish(area), + area = as.numeric(area), + pop_density = inhabitants / area + ) |> + arrange(number) |> + mutate( + bydel_nr = format(number, digits = 2), + bydel_nr = str_replace_all(bydel_nr, " ", "0"), + location_code = str_glue("wardoslo_nor0301{bydel_nr}") + ) print(bydel_data) ``` @@ -367,19 +463,34 @@ Then we'll create the final plot. 
This will be more-or-less identical to what we #| label: plot-oslo oslo_ward_map_b2020_default_dt |> - left_join(bydel_data, by = "location_code") |> + left_join(bydel_data, by = "location_code") |> ggplot(aes(x = long, y = lat, group = group)) + - geom_polygon(aes(color = pop_density, fill = pop_density)) + - geom_label(data = bydel_centres, aes(label = borough, group = 1), alpha = 0.5, label.size = 0) + - labs(fill = "No of inhabitants per km2") + - scico::scale_color_scico(palette = "turku", limits = c(0,1.5e4), guide = "none") + - scico::scale_fill_scico(palette = "turku", limits = c(0,1.5e4), labels = scales::number_format(), - guide = guide_colorbar(title.position = "top", title.hjust = 0.5, - barwidth = 15, barheight = 0.75, ticks = FALSE)) + - theme_void() + - theme(legend.position = "bottom", - legend.title = element_markdown(), - legend.direction = "horizontal") + geom_polygon(aes(color = pop_density, fill = pop_density)) + + geom_label( + data = bydel_centres, aes(label = borough, group = 1), + alpha = 0.5, label.size = 0 + ) + + labs(fill = "No of inhabitants per km2") + + scico::scale_color_scico( + palette = "turku", + limits = c(0, 1.5e4), + guide = "none" + ) + + scico::scale_fill_scico( + palette = "turku", + limits = c(0, 1.5e4), + labels = scales::number_format(), + guide = guide_colorbar( + title.position = "top", title.hjust = 0.5, + barwidth = 15, barheight = 0.75, ticks = FALSE + ) + ) + + theme_void() + + theme( + legend.position = "bottom", + legend.title = element_markdown(), + legend.direction = "horizontal" + ) ``` @@ -394,11 +505,11 @@ An example of when you might use the `sf` data format is in interactive maps. He library(leaflet) map_sf <- csmaps::nor_county_map_b2020_default_sf |> - sf::st_as_sf() |> + sf::st_as_sf() |> left_join(county_names, by = "location_code") -map_sf |> - leaflet() |> +map_sf |> + leaflet() |> addProviderTiles(providers$Esri.WorldStreetMap) |> addPolygons( fillColor = unname(county_colors), @@ -407,7 +518,8 @@ map_sf |> fillOpacity = 0.75, highlightOptions = highlightOptions( color = "#333", bringToFront = TRUE, - weight = 2, opacity = 1) + weight = 2, opacity = 1 + ) ) ``` @@ -416,8 +528,8 @@ map_sf |> #| echo: false #| eval: false -map <- map_sf |> - leaflet() |> +map <- map_sf |> + leaflet() |> addProviderTiles(providers$Esri.WorldStreetMap) |> addPolygons( fillColor = unname(county_colors), @@ -426,10 +538,11 @@ map <- map_sf |> fillOpacity = 0.75, highlightOptions = highlightOptions( color = "#333", bringToFront = TRUE, - weight = 2, opacity = 1) + weight = 2, opacity = 1 + ) ) -htmlwidgets::saveWidget(map, str_glue("{here::here('content','blog','2021-easy-map-norway')}/leafMap.html")) +htmlwidgets::saveWidget(map, "leafMap.html") ```
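If you want the polygons to identify themselves, `addPolygons()` also takes a `label` argument. A minimal sketch, assuming the `map_sf` and `county_colors` objects from above (so the `location_name` column is available after the join):

``` r
library(leaflet)

# same polygons as above, but with a hover label showing the county name
map_sf |>
  leaflet() |>
  addProviderTiles(providers$Esri.WorldStreetMap) |>
  addPolygons(
    fillColor = unname(county_colors),
    weight = 1, opacity = 0.5, color = "#333",
    fillOpacity = 0.75,
    label = ~location_name
  )
```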
@@ -437,4 +550,8 @@ htmlwidgets::saveWidget(map, str_glue("{here::here('content','blog','2021-easy-m **EDIT (2022-09-04)**: Updated the blogpost to replace usage of the retiring `{fhimaps}` and `{fhidata}` packages with the newer `{splmaps}` and `{spldata}` packages from FHI. The `{fhidata}` package included a dataset on vaccination rates in Norway, but since this isn't incorporated in the new `{spldata}` package I replaced that plot with a plot about age distribution. In addition, I also replaced all usage of the NORMENT internal `{normentR}` package with the `{scico}` package which is available on CRAN. -**EDIT (2023-06-18)**: Updated the blogpost to reflect the deprecation of the `{splmaps}` and `{spldata}` packages. Replaced them with the replacement package `{csmaps}` available on CRAN. \ No newline at end of file +**EDIT (2023-06-18)**: Updated the blogpost to reflect the deprecation of the `{splmaps}` and `{spldata}` packages. Replaced them with the replacement package `{csmaps}` available on CRAN. + + + + diff --git a/content/blog/2021-easy-map-norway/leafMap.html b/content/blog/2021-easy-map-norway/leafMap.html index adfc58d..43b2e78 100644 --- a/content/blog/2021-easy-map-norway/leafMap.html +++ b/content/blog/2021-easy-map-norway/leafMap.html @@ -4917,4 +4917,4 @@ - + \ No newline at end of file diff --git a/content/blog/2021-easy-map-norway/norway_hospitals.rds b/content/blog/2021-easy-map-norway/norway_hospitals.rds deleted file mode 100644 index 3ac40fd..0000000 Binary files a/content/blog/2021-easy-map-norway/norway_hospitals.rds and /dev/null differ diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/all_events.json b/content/blog/2022-dutch-performance-olympic-speed-skating/data/all_events.json similarity index 100% rename from content/blog/2022-dutch-performance-olympic-speed-skating/all_events.json rename to content/blog/2022-dutch-performance-olympic-speed-skating/data/all_events.json diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/index.markdown_strict_files/figure-markdown_strict/events-timeline-1.png b/content/blog/2022-dutch-performance-olympic-speed-skating/index.markdown_strict_files/figure-markdown_strict/events-timeline-1.png index c354b49..760e71d 100644 Binary files a/content/blog/2022-dutch-performance-olympic-speed-skating/index.markdown_strict_files/figure-markdown_strict/events-timeline-1.png and b/content/blog/2022-dutch-performance-olympic-speed-skating/index.markdown_strict_files/figure-markdown_strict/events-timeline-1.png differ diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/index.md b/content/blog/2022-dutch-performance-olympic-speed-skating/index.md index b7ef3d0..9f23057 100644 --- a/content/blog/2022-dutch-performance-olympic-speed-skating/index.md +++ b/content/blog/2022-dutch-performance-olympic-speed-skating/index.md @@ -14,6 +14,8 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- The 2022 Winter Olympics started last week. I'm don't usually follow sports (of any kind) religiously during a regular year, but I make an exception for the Winter Olympics. In particular the speed skating events I'll watch live as much as time allows me. It's no secret the Netherlands is a speed skating nation ([although some international TV commentators don't quite grasp why](https://www.washingtonpost.com/news/early-lead/wp/2018/02/11/nbcs-katie-couric-is-in-hot-water-with-the-dutch-who-really-dont-skate-everywhere/)). 
Is it fun to watch a sport where you have a high chance of winning? Yes, of course! Is it still exciting? Yes, absolutely! Being the favorites brings a certain pressure that is thrilling. Dutch qualifying games to determine which athletes get to go to the Olympics are [always very competitive too](https://www.nytimes.com/2022/02/01/sports/olympics/netherlands-speedskating-beijing-2022.html), so it's exciting to see if they can deal with the pressure and which international surprises might threaten their "favorites" status. Watching speed skating events definitely gets my heart pumping faster. @@ -36,9 +38,13 @@ showtext_auto() -Before we can do anything, we need to find a nice dataset. What was quite surprising to me, it was rather tough to find a good dataset on Olympic events and results. Wikipedia of course has all the data one could want, but it's not always structured in an organized way, which makes it hard to scrape programatically. I looked at the IOC, ISU (International Skating Union), and news agencies, but the best (i.e. most complete and most organized) resource I could was a website called [Olympian Database](https://www.olympiandatabase.com/). The website looks rather outdated and the html is fairly outdated too, but we can work with this. The website has a [main page for speed skating](https://www.olympiandatabase.com/index.php?id=6934&L=1), and then we can iteratively go through the games and events to scrape every individual webpage. +{{< sidenote br="10em" >}} +It looks like the website is (once again) maintained by a small group of enthusiasts, the unsung heroes of the internet +{{< /sidenote >}} -Before we've used the `{rvest}` package to scrape websites, but since then I've actually gotten really fond of using Python for web scraping with the `Selenium` library, and then parsing the html with the `BeautifulSoup` library. So what we'll do first is scrape and parse the [main table](https://www.olympiandatabase.com/index.php?id=6934&L=1). This will give us the links to the speed skating events at each Winter Olympic Game. This will give us a list of all events that were part of that tournament, then we'll go one level deeper and scrape the table there. This table contains the final placements (and in case of team events, the results from the last rounds), the athlete, country, and a comment (Olympic records, disqualifications, etc.). We'll run through each game, and each event iteratively, save the data in an individual json file, and then at the end merge the individual json files into a single large json which we can then parse in R. While running this script I found [one instance](https://www.olympiandatabase.com/index.php?id=11769&L=1) where the header data and some variables were missing, which made machine reading this page very difficult, so when the script got to that instance I filled in the data manually. +Before we can do anything, we need to find a nice dataset. Somewhat to my surprise, it was rather tough to find a good dataset on Olympic events and results. Wikipedia of course has all the data one could want, but it's not always structured in an organized way, which makes it hard to scrape programmatically. I looked at the IOC, ISU (International Skating Union), and news agencies, but the best (i.e. most complete and most organized) resource I could find was a website called [Olympian Database](https://www.olympiandatabase.com/). The website looks rather outdated and the HTML is fairly outdated too, but we can work with this.
The website has a [main page for speed skating](https://www.olympiandatabase.com/index.php?id=6934&L=1), and then we can iteratively go through the games and events to scrape every individual webpage. + +Previously we've used the `{rvest}` package to scrape websites, but since then I've gotten really fond of using Python for web scraping with the `Selenium` library, and then parsing the HTML with the `BeautifulSoup` library. So what we'll do first is scrape and parse the [main table](https://www.olympiandatabase.com/index.php?id=6934&L=1), which gives us the links to the speed skating events at each Winter Olympic Game. Each of those pages lists the events that were part of that tournament, so we'll go one level deeper and scrape the table there. This table contains the final placements (and, in the case of team events, the results from the last rounds), the athlete, country, and a comment (Olympic records, disqualifications, etc.). We'll run through each game and each event iteratively, save the data in an individual JSON file, and then at the end merge the individual JSON files into a single large JSON which we can then parse in R. While running this script I found [one instance](https://www.olympiandatabase.com/index.php?id=11769&L=1) where the header data and some variables were missing, which made machine reading this page very difficult, so when the script got to that instance I filled in the data manually.
Show code @@ -60,88 +66,94 @@ import glob verbose = True -base_url = 'https://www.olympiandatabase.com' -parent_url = f'{base_url}/index.php?id=6934&L=1' +base_url = "https://www.olympiandatabase.com" +parent_url = f"{base_url}/index.php?id=6934&L=1" -out_path = './event_files' +out_path = "./data/event_files" # -- Get website ------------------------ options = webdriver.ChromeOptions() -options.add_argument('--headless') +options.add_argument("--headless") driver = webdriver.Chrome(options=options) driver.get(parent_url) html_source = driver.page_source -soup = BeautifulSoup(html_source, 'html.parser') +soup = BeautifulSoup(html_source, "html.parser") # -- Get list of games ------------------------ -parent_table = soup.find_all('table', attrs={'class': 'frame_space'})[-1] +parent_table = soup.find_all("table", attrs={"class": "frame_space"})[-1] game_links = [] -for link in parent_table.find_all('a'): - game_links.append(link.get('href')) +for link in parent_table.find_all("a"): + game_links.append(link.get("href")) -game_links = [i for i in game_links if not re.compile(r'http://.*$').match(i)] +game_links = [i for i in game_links if not re.compile(r"http://.*$").match(i)] game_links = game_links[:-1] # -- Get list of events per game ------------------------ for i in game_links: - - driver.get(f'{base_url}{i}') + driver.get(f"{base_url}{i}") html_source = driver.page_source - soup = BeautifulSoup(html_source, 'html.parser') - - event_table = soup.find_all('table', attrs={'class': 'frame_space'})[-1] - + soup = BeautifulSoup(html_source, "html.parser") + + event_table = soup.find_all("table", attrs={"class": "frame_space"})[-1] + event_links = [] - for link in event_table.find_all('a'): - if link.find(text=re.compile('0 m|Combined|Mass|Team')): - event_links.append(link.get('href')) - - event_links = [j for j in event_links if not re.compile(r'/index.php\?id=13738&L=1').match(j)] - + for link in event_table.find_all("a"): + if link.find(text=re.compile("0 m|Combined|Mass|Team")): + event_links.append(link.get("href")) + + event_links = [ + j for j in event_links if not re.compile(r"/index.php\?id=13738&L=1").match(j) + ] + for j in event_links: - - driver.get(f'{base_url}{j}') + driver.get(f"{base_url}{j}") html_source = driver.page_source - soup = BeautifulSoup(html_source, 'html.parser') - - id = re.search('id=(.*)&', j).group(1) - if id != '11769': - title = soup.find('h1').text - year = re.search('Speed Skating (.*) Winter Olympics', title).group(1).split()[-1] - distance = re.search('\'s (.*) -', title).group(1) - sex = re.search('^(.*)\'s', title).group(1).split()[0] - tab = pd.read_html(f'{base_url}{j}', match='Final')[0] + soup = BeautifulSoup(html_source, "html.parser") + + id = re.search("id=(.*)&", j).group(1) + if id != "11769": + title = soup.find("h1").text + year = ( + re.search("Speed Skating (.*) Winter Olympics", title) + .group(1) + .split()[-1] + ) + distance = re.search("'s (.*) -", title).group(1) + sex = re.search("^(.*)'s", title).group(1).split()[0] + tab = pd.read_html(f"{base_url}{j}", match="Final")[0] else: - year = '1994' - distance = '5000 m' - sex = 'Men' - title = f'{sex}\'s {distance} - Speed Skating Lillehammer {year} Winter Olympics' - tab = pd.read_html(f'{base_url}{j}')[2] - + year = "1994" + distance = "5000 m" + sex = "Men" + title = ( + f"{sex}'s {distance} - Speed Skating Lillehammer {year} Winter Olympics" + ) + tab = pd.read_html(f"{base_url}{j}")[2] + if verbose: - print(f'Extracting data for the {sex}\'s {distance} from {year}') - + 
print(f"Extracting data for the {sex}'s {distance} from {year}") + # Write to json out_data = { - 'title': title, - 'year': int(year), - 'distance': distance, - 'sex': sex, - 'table': tab.to_json(), - 'id': int(id) + "title": title, + "year": int(year), + "distance": distance, + "sex": sex, + "table": tab.to_json(), + "id": int(id), } - + file_name = f'{year}_{distance.lower().replace(" ", "")}_{sex.lower()}.json' - with open(f'{out_path}/{file_name}', 'w') as file_out: + with open(f"{out_path}/{file_name}", "w") as file_out: json.dump(out_data, file_out, indent=4) - + pass - + # -- Quit browser ------------------------ driver.quit() @@ -149,7 +161,7 @@ driver.quit() # -- Merge json files ------------------------- if verbose: - print('Merging json files') + print("Merging json files") json_file_list = [] for file in os.listdir(out_path): @@ -158,122 +170,129 @@ for file in os.listdir(out_path): # -- Define function to merge json files ------------------------ -out_name = "./all_events.json" +out_name = "./data/all_events.json" result = [] -for f in glob.glob(f'{out_path}/*.json'): +for f in glob.glob(f"{out_path}/*.json"): with open(f, "rb") as infile: result.append(json.load(infile)) -with open(out_name, 'w') as outfile: - json.dump(result, outfile, indent=4) +with open(out_name, "w") as outfile: + json.dump(result, outfile, indent=4) ```
-I said before that the data is neatly organized, which is true except for a few instances. The individual events are simple tables with a ranking and time for each athlete. It's a bit more complicated for the team pursuits, since team pursuit events are a direct competition with qualifying rounds and knock-out rounds, the table is a bit more complicated. In this case we're just interested in the final ranking (so we dismiss the semi- and quarter-finals). The final ranking is split across two columns, so we stitch those together. For some reason the men's team pursuit from 2018 lists only the medal winners, and not in the same format as the other team pursuit events. One advantage here is that they list individual skaters too, but since this is the only time indivdual skaters are listed among the team pursuits it's still not very useful. It just meant we have to create another few lines in "if else" statement to parse the json. In the html, the podium places aren't denoted with a numeric list, but rather with a gold, silver, and bronze badge. Since the python script doesn't parse those, we add those back here (except for the 1928 Men's 10.000 m event, which was canceled due to bad weather). +I said before that the data is neatly organized, which is true except for a few instances. The individual events are simple tables with a ranking and time for each athlete. It's a bit more complicated for the team pursuits, since team pursuit events are a direct competition with qualifying rounds and knock-out rounds, the table is a bit more complicated. In this case we're just interested in the final ranking (so we dismiss the semi- and quarter-finals). The final ranking is split across two columns, so we stitch those together. For some reason the men's team pursuit from 2018 lists only the medal winners, and not in the same format as the other team pursuit events. One advantage here is that they list individual skaters too, but since this is the only time indivdual skaters are listed among the team pursuits it's still not very useful. It just meant we have to create another few lines in the `if else` statement to parse the JSON. In the HTML, the podium places aren't denoted with a numeric list, but rather with a gold, silver, and bronze badge. Since the Python script doesn't parse those, we add those back here (except for the 1928 Men's 10.000 m event, which was canceled due to bad weather).
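To make the two-column stitching concrete before diving into the full function below, here's a toy version of the idea with made-up values (not the real scraped table): the right-hand block of columns is renamed and stacked underneath the left-hand block with `bind_rows()`.

``` r
library(dplyr)

# hypothetical stand-in for a results table split across two column blocks
two_col <- tibble(
  ranking = c("1.", "2."), country = c("NED", "KOR"), time = c("3:41", "3:43"),
  ranking2 = c("3.", "4."), country2 = c("NZL", "CAN"), time2 = c("3:45", "3:46")
)

bind_rows(
  two_col |> select(ranking, country, time),
  two_col |> select(ranking2, country2, time2) |>
    rename_with(~ c("ranking", "country", "time"))
)
```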
Show code ``` r parse_json <- function(json) { - - t_df <- jsonlite::fromJSON(json) |> - as_tibble() |> - unnest() |> - janitor::clean_names() %>% + t_df <- jsonlite::fromJSON(json) |> + as_tibble() |> + unnest() |> + janitor::clean_names() %>% slice(seq(3, nrow(.) - 2)) - + if (str_detect(json, "Men's Team pursuit 2018")) { - - t_df_out <- t_df |> - filter(is.na(x0)) |> - rename(ranking = x0, - athlete = x1, - country = x3, - time = x4, - comment = x5) |> - mutate(ranking = rep(seq(3), each = 4), - ranking = str_glue("{ranking}.")) |> + t_df_out <- t_df |> + filter(is.na(x0)) |> + rename( + ranking = x0, + athlete = x1, + country = x3, + time = x4, + comment = x5 + ) |> + mutate( + ranking = rep(seq(3), each = 4), + ranking = str_glue("{ranking}.") + ) |> fill(country, time, comment) |> - group_by(ranking) |> + group_by(ranking) |> mutate(athlete = toString(athlete)) |> - ungroup() |> - distinct() |> + ungroup() |> + distinct() |> select(-x2) - } else if (str_detect(json, "Men's Team pursuit|Women's Team pursuit")) { - - t_df_tp <- t_df |> - rename(ranking = x0, - country = x1, - time = x3, - comment = x4, - ranking2 = x5, - country2 = x6, - time2 = x8, - comment2 = x9) |> - select(seq(10), - -c(x2,x7)) |> + t_df_tp <- t_df |> + rename( + ranking = x0, + country = x1, + time = x3, + comment = x4, + ranking2 = x5, + country2 = x6, + time2 = x8, + comment2 = x9 + ) |> + select( + seq(10), + -c(x2, x7) + ) |> slice(seq(0, min(which(nchar(ranking) > 3)) - 1)) - - t_df_out <- bind_rows(t_df_tp |> - select(seq(4)), - t_df_tp |> - select(seq(5,last_col())) |> - rename_with( ~ c("ranking","country","time","comment"))) - + + t_df_out <- bind_rows( + t_df_tp |> + select(seq(4)), + t_df_tp |> + select(seq(5, last_col())) |> + rename_with(~ c("ranking", "country", "time", "comment")) + ) } else { - - t_df <- t_df |> - rename(ranking = x0, - athlete = x1, - country = x3, - time = x4, - comment = x5) |> + t_df <- t_df |> + rename( + ranking = x0, + athlete = x1, + country = x3, + time = x4, + comment = x5 + ) |> select(-x2) - + if (str_detect(json, "Men's 10000 m 1928", negate = TRUE)) { t_df[seq(3), "ranking"] <- str_glue("{seq(3)}.") } - + t_df_out <- t_df - } - + return(t_df_out) } ```
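A quick way to spot-check the parser before mapping it over everything is to feed it a single table string. A sketch, assuming the merged JSON from the Python script and the tidyverse loaded as above:

``` r
# parse one scraped event table and eyeball the result
single_table <- jsonlite::fromJSON("./data/all_events.json")$table[[1]]
parse_json(single_table) |> head()
```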
-Okay, now that we have the function to parse the json file, let's look at some R code. We'll load the json file using the `{jsonlite}` package, and then parse each json string using the `map()` function from `{purrr}`. +Okay, now that we have the function to parse the JSON file, let's look at some R code. We'll load the JSON file using the `{jsonlite}` package, and then parse each JSON string using the `map()` function from `{purrr}`. Then when this data is parsed, we'll wrangle the nested data frames into one long data frame and tidy it up. Tied places are denoted using a single dash, which we want to get rid of. Then we'll fill the missing place numbers using the `fill()` function. However, there were also a number of people who either did not finish, did not start, or were disqualified, and so they don't have a ranking. These instances are denoted in the `time` column with a "dnf", "dns", or "dq". Since those are the only times a lowercase d appears there, we can use this feature to replace the ranking with a missing value. We'll then also add the comment from the `time` column to the `comment` column. There are also some artifacts which we can easily remove: since the `country` column uses IOC 3-letter abbreviations, any entry there that's longer than 3 characters can go. -Then we'll also create two vectors that contain the breaks we'll use later for the visualizations. Until 1992 both summer and winter olympic games were held in the same year. However, since 1994 they moved the Olympic Winter Games up 2 years to get the alternating schedule we have today. The Olympic Games were also not held during the second world war. I want to account for that so I create a vector with each unique entry in the `year` column. I also want a neatly organized ordering of the events, so I create a vector called `event_lims` that saves stores this preferred ordering. +Then we'll also create two vectors that contain the breaks we'll use later for the visualizations. Until 1992 both the Summer and Winter Olympic Games were held in the same year. However, since 1994 they moved the Olympic Winter Games up two years to get the alternating schedule we have today. The Olympic Games were also not held during World War II; to account for that I create a vector with each unique entry in the `year` column. I also want a neatly organized ordering of the events, so I create a vector called `event_lims` that stores this preferred ordering.
Show code ``` r -data_load <- jsonlite::fromJSON("./all_events.json") |> +data_load <- jsonlite::fromJSON("./data/all_events.json") |> mutate(df = map(table, ~ parse_json(.x))) -data <- data_load |> - select(-table) |> - unnest(df) |> - group_by(year, distance, sex) |> - mutate(ranking = ifelse(str_detect(ranking, "-"), NA, ranking)) |> - fill(ranking) |> - ungroup() |> - mutate(ranking = parse_number(ranking), - ranking = ifelse(str_detect(time, "d"), NA, ranking), - comment = ifelse(str_detect(time, "d"), time, comment), - time = parse_number(time)) |> - filter(nchar(country) < 4) |> - arrange(year) |> +data <- data_load |> + select(-table) |> + unnest(df) |> + group_by(year, distance, sex) |> + mutate(ranking = ifelse(str_detect(ranking, "-"), NA, ranking)) |> + fill(ranking) |> + ungroup() |> + mutate( + ranking = parse_number(ranking), + ranking = ifelse(str_detect(time, "d"), NA, ranking), + comment = ifelse(str_detect(time, "d"), time, comment), + time = parse_number(time) + ) |> + filter(nchar(country) < 4) |> + arrange(year) |> glimpse() ``` @@ -298,42 +317,63 @@ data <- data_load |> ``` r game_years <- unique(data$year) -event_lims <- c("500 m", "1000 m", "1500 m", "3000 m", "5000 m", "10000 m", "Combined", "Team pursuit", "Mass Start") +event_lims <- c("500 m", "1000 m", "1500 m", "3000 m", "5000 m", "10000 m", "Combined", "Team pursuit", "Mass Start") ```
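As a quick check on the wrangling (a sketch against the `data` frame we just built), the rows without a ranking should all carry one of the dnf/dns/dq comments:

``` r
data |>
  filter(is.na(ranking)) |>
  count(comment, sort = TRUE)
```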
+{{< sidenote br="4em" >}} +If you happen to get here by searching for "Gantt chart in ggplot", you can find an actual tutorial for that [here](https://jtr13.github.io/cc19/gantt-charts.html) +{{< /sidenote >}} + Then we can finally create some plots. Not all speed skating events were present from the start in 1924. Back then only men competed in Olympic speed skating, the women's program started in 1960. Here we'll create something that looks a bit like a Gantt chart. We'll use a `geom_segment()` to visualize the timeline and since there's a few events which have only been on the program once we'll use a `geom_point()` for those since `geom_segment()` requires a begin and end point that are different. Since this is just a casual visualization for illustrative purposes we can take some creative liberty and experiment a bit with the design. That's why I chose to remove the grid lines and axes, make the lines fairly big and added the individual distances as a label on top of the lines. I also made the text quite large and moved the labels slightly up. The first year an event was held is shown slightly below the line.
Show code ``` r -data |> - select(year, distance, sex) |> +data |> + select(year, distance, sex) |> distinct() |> - mutate(distance = fct_relevel(distance, ~ event_lims)) |> - group_by(distance, sex) |> - arrange(year) |> - summarise(first_year = min(year), - last_year = max(year)) |> - ggplot(aes(x = first_year, y = distance)) + - geom_segment(aes(xend = last_year, yend = distance, color = distance), - linewidth = 8, lineend = "round", alpha = 0.4) + - geom_point(data = . %>% filter(first_year == last_year), - aes(color = distance), - size = 8, alpha = 0.5) + - geom_text(aes(x = first_year, label = first_year), - color = "#333333", size = 3, family = "custom", nudge_y = -0.25) + - geom_text(aes(x = 2018, label = distance), - size = 10, color = "grey30", fontface = "bold", - family = "custom", hjust = 1, nudge_y = 0.2) + + mutate(distance = fct_relevel(distance, ~event_lims)) |> + group_by(distance, sex) |> + arrange(year) |> + summarise( + first_year = min(year), + last_year = max(year) + ) |> + ggplot(aes(x = first_year, y = distance)) + + geom_segment( + aes( + xend = last_year, yend = distance, + color = distance + ), + linewidth = 8, lineend = "round", alpha = 0.4 + ) + + geom_point( + data = . %>% filter(first_year == last_year), + aes(color = distance), + size = 8, alpha = 0.5 + ) + + geom_text(aes(x = first_year, label = first_year), + color = "#333333", size = 3, + family = "custom", nudge_y = -0.25 + ) + + geom_text(aes(x = 2018, label = distance), + size = 10, color = "grey30", fontface = "bold", + family = "custom", hjust = 1, nudge_y = 0.2 + ) + scale_y_discrete(limits = rev(event_lims)) + scico::scale_color_scico_d(guide = "none") + - facet_wrap(~ sex, scales = "free", strip.position = "top") + - theme_void(base_family = "custom") + - theme(text = element_text(color = "#333333"), - strip.text = element_text(face = "bold", size = 42)) + facet_wrap(~sex, + scales = "free_y", + strip.position = "top" + ) + + theme_void(base_family = "custom") + + theme( + text = element_text(color = "#333333"), + strip.text = element_text(face = "bold", size = 42) + ) ```
@@ -348,33 +388,55 @@ Now, let's dive into the medals. First let's create a simple barplot with the to Show code ``` r -data |> - filter(year >= 1960, - ranking %in% seq(3)) |> - mutate(country_long = countrycode::countrycode(country, origin = "ioc", destination = "country.name"), - country_long = case_when(str_detect(country, "URS") ~ "Soviet Union", - str_detect(country, "GDR") ~ "East Germany", - str_detect(country, "FRG") ~ "West Germany", - str_detect(country, "OAR") ~ "Olympic Athletes from Russia", - TRUE ~ country_long), - country_label = str_glue("{country_long} ({country})")) |> - count(country_label, sort = TRUE) |> - mutate(highlight_col = ifelse(str_detect(country_label, "NED"), "#FF9B00", "grey")) |> - ggplot(aes(x = n, y = reorder(country_label, n))) + - geom_col(aes(fill = highlight_col)) + - geom_vline(xintercept = 0, linewidth = 1) + - geom_richtext(data = tibble(), aes(x = 24, y = 15, - label = "Total number of medals won per country since 1960"), - family = "custom", size = 7, fontface = "bold", hjust = 0, - label.padding = unit(0.75,"lines"), label.color = NA) + - geom_richtext(data = tibble(), aes(x = 24, y = 13, - label = "The Netherlands has won more than twice as many medals as the runner-up"), - family = "custom", size = 4, hjust = 0, - label.padding = unit(0.75,"lines"), label.color = NA) + - labs(x = NULL, - y = NULL) + - scale_x_continuous(expand = expansion(add = c(0,9)), position = "top") + - scale_fill_identity() + +data |> + filter( + year >= 1960, + ranking %in% seq(3) + ) |> + mutate( + country_long = countrycode::countrycode( + country, + origin = "ioc", + destination = "country.name" + ), + country_long = case_when( + str_detect(country, "URS") ~ "Soviet Union", + str_detect(country, "GDR") ~ "East Germany", + str_detect(country, "FRG") ~ "West Germany", + str_detect(country, "OAR") ~ "Olympic Athletes from Russia", + TRUE ~ country_long + ), + country_label = str_glue("{country_long} ({country})") + ) |> + count(country_label, sort = TRUE) |> + mutate(highlight_col = ifelse( + str_detect(country_label, "NED"), "#FF9B00", "grey" + )) |> + ggplot(aes(x = n, y = reorder(country_label, n))) + + geom_col(aes(fill = highlight_col)) + + geom_vline(xintercept = 0, linewidth = 1) + + geom_richtext( + data = tibble(), aes( + x = 24, y = 15, + label = "Total number of medals won per country since 1960" + ), + family = "custom", size = 7, fontface = "bold", hjust = 0, + label.padding = unit(0.75, "lines"), label.color = NA + ) + + geom_richtext( + data = tibble(), aes( + x = 24, y = 13, + label = "The Netherlands has won more than twice as many medals as the runner-up" + ), + family = "custom", size = 4, hjust = 0, + label.padding = unit(0.75, "lines"), label.color = NA + ) + + labs( + x = NULL, + y = NULL + ) + + scale_x_continuous(expand = expansion(add = c(0, 9)), position = "top") + + scale_fill_identity() + theme_minimal(base_family = "custom") + theme(panel.grid.major.y = element_blank()) ``` @@ -383,7 +445,7 @@ data |> -As you can see, the Netherlands has earned by far the most medals since 1960 than any other country. In fact, it's earned more medals than number two and three combined. Now, news agencies have reported on the total number of medals, and numbers may slightly differ between reports. This is the number reported by the source, and unless I made some errors in scraping, parsing, or wrangling the data I'll stand by it. 
However, differences of 3 or 4 medals won't change the message that the Netherlands is absolutely dominant in this area of the Winter Olympics. +As you can see, the Netherlands has earned by far the most medals since 1960 than any other country. In fact, it's earned more medals than number two and three combined. Now, news agencies have reported on the total number of medals, and numbers may slightly differ between reports. This is the number reported by the source, unless I made some errors in scraping, parsing, or wrangling the data. Even if, differences of 3 or 4 medals won't change the message that the Netherlands is absolutely dominant in this area of the Winter Olympics. Let's look at how this distribution is spread out across the different Olympic events. We'll start in 1960 since that's when the women's tournament was added and I consider that the proper start of the Winter Olympics. Since 1960 we've had 16 Winter Olympics (the 17th is currently underway). Since not all games had the same number of medals (events were added at different years), I'll calculate the percentage of medals won per year. @@ -391,37 +453,51 @@ Let's look at how this distribution is spread out across the different Olympic e Show code ``` r -data |> - filter(ranking %in% seq(3), - year >= 1960) |> - group_by(year) |> - mutate(total_medals = n()) |> - group_by(year, country) |> - summarise(medals_won = n(), - total_medals = first(total_medals)) |> - mutate(perc_won = medals_won / total_medals, - perc_label = str_glue("{round(perc_won * 100)}%"), - highlight_col = ifelse(country == "NED", "#FF9B00", "grey"), - country = tidytext::reorder_within(country, perc_won, year)) |> - slice_max(perc_won, n = 5) |> - ggplot(aes(x = perc_won, y = country)) + - geom_col(aes(fill = highlight_col)) + - geom_text(aes(label = perc_label), family = "custom", - size = 2, hjust = 0, nudge_x = 0.01) + - labs(title = "**Most medals won per country per Olympic Game**", - subtitle = "The Netherlands has won the largest proportion (shared in 1994 and 2002) of speed skating medals **every Game since 1994**", - x = "Percentage of all medals won", - y = NULL) + +data |> + filter( + ranking %in% seq(3), + year >= 1960 + ) |> + group_by(year) |> + mutate(total_medals = n()) |> + group_by(year, country) |> + summarise( + medals_won = n(), + total_medals = first(total_medals) + ) |> + mutate( + perc_won = medals_won / total_medals, + perc_label = str_glue("{round(perc_won * 100)}%"), + highlight_col = ifelse(country == "NED", "#FF9B00", "grey"), + country = tidytext::reorder_within(country, perc_won, year) + ) |> + slice_max(perc_won, n = 5) |> + ggplot(aes(x = perc_won, y = country)) + + geom_col(aes(fill = highlight_col)) + + geom_text(aes(label = perc_label), + family = "custom", + size = 2, hjust = 0, nudge_x = 0.01 + ) + + labs( + title = "**Most medals won per country per Olympic Game**", + subtitle = "The Netherlands has won the largest proportion (shared in 1994 and 2002) of speed skating medals **every Game since 1994**", + x = "Percentage of all medals won", + y = NULL + ) + tidytext::scale_y_reordered() + - scale_x_continuous(limits = c(0, 0.7), - labels = scales::label_percent()) + + scale_x_continuous( + limits = c(0, 0.7), + labels = scales::label_percent() + ) + scale_fill_identity() + - facet_wrap(~ year, scales = "free_y") + + facet_wrap(~year, scales = "free_y") + theme_minimal(base_family = "custom") + - theme(plot.title = element_markdown(size = 26), - plot.subtitle = element_markdown(size = 12), - strip.text = 
element_text(size = 16, face = "bold"), - panel.grid.major.y = element_blank()) + theme( + plot.title = element_markdown(size = 26), + plot.subtitle = element_markdown(size = 12), + strip.text = element_text(size = 16, face = "bold"), + panel.grid.major.y = element_blank() + ) ``` @@ -430,52 +506,74 @@ As you can see from the plot, of the 16 Olympic Games since 1960, the Netherlands has earned the largest share of medals 9 times (56.25%). It has topped the list every game since 1994 (shared in 1994 and 2002). In 2014, the Netherlands took home 64% of all medals. Due to ISU and IOC qualification rules it's technically impossible for a single country to take home all medals (e.g. a country cannot send three teams for the team pursuit). So there might be 36 medals available (from 12 events), but a single country can only take home 32. Since I didn't want to go into the (sometimes changing) qualification rules since 1960 I made it simple and calculated based on the total number of medals, not the total number available to a country. -But of course, not all medals are created equal. In Olympic rankings or medal tables, the order is determined by the number of gold medals first, then silver, then bronze. Total number of medals does not matter here. So a country with 2 gold medals and no other metal will be ranked above a country with 1 gold medal, 10 silver, and 15 bronze medals. So the Netherlands can win a lot of medals, but for rankins the color matters too. So let's create a metal table. Again, we'll only look at results from 1960. We'll calculate the number of medals each country won, then we'll fill in the blank spaces with the amazing `complete()` function. Since not all medals are equal, we'll add a multiplier and then calculate a theoretical score (where gold counts 10 times stronger than a silver etc.). Then we'll look at the top 10 countries and use `geom_point()` to create a table. +But of course, not all medals are created equal. In Olympic rankings or medal tables, the order is determined by the number of gold medals first, then silver, then bronze. Total number of medals does not matter here. So a country with 2 gold medals and no other medals will be ranked above a country with 1 gold medal, 10 silver, and 15 bronze medals. So the Netherlands can win a lot of medals, but for rankings the color matters too. So let's create a medal table. Again, we'll only look at results from 1960. We'll calculate the number of medals each country won, then we'll fill in the blank spaces with the amazing `complete()` function. Since not all medals are equal, we'll add a multiplier and then calculate a theoretical score (where gold counts 10 times stronger than a silver etc.). Then we'll look at the top 10 countries and use `geom_point()` to create a table.
Show code ``` r -data |> - filter(year >= 1960, - ranking %in% seq(3)) |> - group_by(country, ranking) |> - summarise(n_medals = n()) |> - ungroup() |> - complete(country, ranking) |> - replace_na(list(n_medals = 0)) |> - mutate(country_long = countrycode::countrycode(country, origin = "ioc", destination = "country.name"), - country_long = case_when(str_detect(country, "URS") ~ "Soviet Union", - str_detect(country, "GDR") ~ "East Germany", - TRUE ~ country_long), - country_label = str_glue("{country_long} ({country})"), - ranking_color = case_when(ranking == 1 ~ "#F8CC46", - ranking == 2 ~ "#DFDFE7", - ranking == 3 ~ "#D8B581"), - rank_mult = case_when(ranking == 1 ~ 100, - ranking == 2 ~ 10, - ranking == 3 ~ 1), - rank_score = n_medals * rank_mult) |> - group_by(country) |> - mutate(country_rank = sum(rank_score)) |> +data |> + filter( + year >= 1960, + ranking %in% seq(3) + ) |> + group_by(country, ranking) |> + summarise(n_medals = n()) |> + ungroup() |> + complete(country, ranking) |> + replace_na(list(n_medals = 0)) |> + mutate( + country_long = countrycode::countrycode( + country, + origin = "ioc", destination = "country.name" + ), + country_long = case_when( + str_detect(country, "URS") ~ "Soviet Union", + str_detect(country, "GDR") ~ "East Germany", + TRUE ~ country_long + ), + country_label = str_glue("{country_long} ({country})"), + ranking_color = case_when( + ranking == 1 ~ "#F8CC46", + ranking == 2 ~ "#DFDFE7", + ranking == 3 ~ "#D8B581" + ), + rank_mult = case_when( + ranking == 1 ~ 100, + ranking == 2 ~ 10, + ranking == 3 ~ 1 + ), + rank_score = n_medals * rank_mult + ) |> + group_by(country) |> + mutate(country_rank = sum(rank_score)) |> ungroup() |> - slice_max(country_rank, n = 30) |> - ggplot(aes(x = ranking, y = reorder(country_label, country_rank), - fill = ranking_color, alpha = n_medals)) + - geom_point(shape = 21, size = 10, stroke = 0, show.legend = FALSE) + + slice_max(country_rank, n = 30) |> + ggplot(aes( + x = ranking, y = reorder(country_label, country_rank), + fill = ranking_color, alpha = n_medals + )) + + geom_point( + shape = 21, size = 10, + stroke = 0, show.legend = FALSE + ) + geom_text(aes(label = n_medals), alpha = 1, family = "custom") + - labs(title = "**Medal table since 1960**", - subtitle = "Ten countries with the highest total ranking", - x = NULL, - y = NULL) + + labs( + title = "**Medal table since 1960**", + subtitle = "Ten countries with the highest total ranking", + x = NULL, + y = NULL + ) + scale_x_discrete(position = "top") + - scale_fill_identity() + - coord_fixed(ratio = 1/2) + - theme_void(base_family = "custom") + - theme(plot.title.position = "plot", - plot.title = element_markdown(size = 26), - plot.subtitle = element_markdown(size = 13), - axis.text.y = element_text(hjust = 1)) + scale_fill_identity() + + coord_fixed(ratio = 1 / 2) + + theme_void(base_family = "custom") + + theme( + plot.title.position = "plot", + plot.title = element_markdown(size = 26), + plot.subtitle = element_markdown(size = 13), + axis.text.y = element_text(hjust = 1) + ) ```
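If you prefer the strict IOC-style ordering over the weighted score used above, you can count the medals per colour and sort lexicographically. A sketch against the same `data` frame:

``` r
data |>
  filter(year >= 1960, ranking %in% seq(3)) |>
  count(country, ranking) |>
  tidyr::pivot_wider(
    names_from = ranking, values_from = n,
    names_prefix = "rank_", values_fill = 0
  ) |>
  arrange(desc(rank_1), desc(rank_2), desc(rank_3)) |>
  head(10)
```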
@@ -488,83 +586,109 @@ To show that a country is dominant in a particular competition it helps to show Show code ``` r -data |> - filter(year >= 1960, - ranking %in% seq(3)) |> - group_by(year, distance, sex, country) |> - count(year, distance, sex, country, name = "medals_won") |> - filter(medals_won == 3) |> - mutate(sweeps = medals_won / 3) |> - ggplot(aes(x = year, y = sweeps, fill = country)) + - geom_col(key_glyph = "point") + +data |> + filter( + year >= 1960, + ranking %in% seq(3) + ) |> + group_by(year, distance, sex, country) |> + count(year, distance, sex, country, name = "medals_won") |> + filter(medals_won == 3) |> + mutate(sweeps = medals_won / 3) |> + ggplot(aes(x = year, y = sweeps, fill = country)) + + geom_col(key_glyph = "point") + geom_hline(yintercept = 0) + - labs(title = "**Podium sweeps since 1960**", - subtitle = "The Netherlands had 7 out 13 podium sweeps (winning gold, silver, **and** bronze in a single event),
including 4 at the 2014 Olympics in Sochi", - x = NULL, - y = "Number of podium sweeps", - fill = NULL) + + labs( + title = "**Podium sweeps since 1960**", + subtitle = "The Netherlands had 7 out 13 podium sweeps (winning gold, silver, **and** bronze in a single event),
including 4 at the 2014 Olympics in Sochi", + x = NULL, + y = "Number of podium sweeps", + fill = NULL + ) + scale_x_continuous(limits = c(1960, NA), breaks = game_years) + - scale_y_continuous(expand = expansion(mult = c(0,0.05))) + - scico::scale_fill_scico_d(palette = "batlow", guide = guide_legend( - override.aes = c(shape = 21, size = 4) - )) + - theme_minimal(base_family = "custom") + - theme(plot.title = element_markdown(size = 26), - legend.text = element_text(size = 10), - legend.key.height = unit(0.75, "lines"), - plot.subtitle = element_markdown(size = 13, lineheight = 0.5), - axis.text.x = element_text(size = 7), - axis.text.y = element_text(size = 12), - axis.title.y = element_text(size = 12), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank()) + scale_y_continuous(expand = expansion(mult = c(0, 0.05))) + + scico::scale_fill_scico_d( + palette = "batlow", + guide = guide_legend(override.aes = c(shape = 21, size = 4)) + ) + + theme_minimal(base_family = "custom") + + theme( + plot.title = element_markdown(size = 26), + legend.text = element_text(size = 10), + legend.key.height = unit(0.75, "lines"), + plot.subtitle = element_markdown(size = 13, lineheight = 0.5), + axis.text.x = element_text(size = 7), + axis.text.y = element_text(size = 12), + axis.title.y = element_text(size = 12), + panel.grid.major.x = element_blank(), + panel.grid.minor = element_blank() + ) ``` -As you might gather, from this and the previous plot, the Winter Olympic Games from 2014 were a very good year for the Dutch speed skating team. That one year the Netherlands had *four* podium sweeps. For one of these four podium sweeps (the [women's 1500 m](https://en.wikipedia.org/wiki/Speed_skating_at_the_2014_Winter_Olympics_–_Women%27s_1500_metres)) the fourth place was also a Dutch women (Marrit Leenstra), a first in the Winter Olympics. +{{< sidenote >}} +For one of these four podium sweeps (the [2014 Women's 1500 m](https://en.wikipedia.org/wiki/Speed_skating_at_the_2014_Winter_Olympics_–_Women%27s_1500_metres)) the fourth place was also Dutch (Marrit Leenstra), a historic first +{{< /sidenote >}} + +As you might gather, from this and the previous plot, the Winter Olympic Games from 2014 were a very good year for the Dutch speed skating team. That one year the Netherlands had *four* podium sweeps.
Show code ``` r -data |> - mutate(distance = fct_relevel(distance, ~ event_lims)) |> - filter(str_detect(comment, "OR"), - distance != "Combined") |> - group_by(distance, sex) |> - arrange(year) |> - mutate(no = row_number()) |> - ggplot(aes(x = year, y = no, color = distance)) + - geom_vline(xintercept = c(1940, 1944), linetype = "dashed", color = "grey92") + - geom_step(linewidth = 1.5, alpha = 0.4, show.legend = FALSE) + +data |> + mutate(distance = fct_relevel(distance, ~event_lims)) |> + filter( + str_detect(comment, "OR"), + distance != "Combined" + ) |> + group_by(distance, sex) |> + arrange(year) |> + mutate(no = row_number()) |> + ggplot(aes(x = year, y = no, color = distance)) + + geom_vline( + xintercept = c(1940, 1944), + linetype = "dashed", color = "grey92" + ) + + geom_step(linewidth = 1.5, alpha = 0.4, show.legend = FALSE) + geom_point(size = 4, alpha = 0.75, stroke = 0) + - ggrepel::geom_text_repel(data = . |> filter(no == max(no)), - aes(label = country), show.legend = FALSE, seed = 2, - color = "#333333", size = 4, - family = "custom", fontface = "bold") + - labs(title = "**Olympic Records over the years**", - subtitle = "The Netherlands hold 4/6 olympic records with the men, and 3/6 records with the women.
+ ggrepel::geom_text_repel( + data = . %>% filter(no == max(no)), + aes(label = country), show.legend = FALSE, seed = 2, + color = "#333333", size = 4, + family = "custom", fontface = "bold" + ) + + labs( + title = "**Olympic Records over the years**", + subtitle = "The Netherlands hold 4/6 olympic records with the men, and 3/6 records with the women.
Current record holder indicated with the IOC abbreviation", - x = "Winter Games", - y = NULL, - color = NULL) + - scale_x_continuous(breaks = game_years, - labels = str_replace(game_years, "^19|^20", "'")) + + x = "Winter Games", + y = NULL, + color = NULL + ) + + scale_x_continuous( + breaks = game_years, + labels = str_replace(game_years, "^19|^20", "'") + ) + scale_y_continuous(breaks = NULL) + - scico::scale_color_scico_d(guide = guide_legend(override.aes = c(size = 4, alpha = 1))) + - facet_wrap(~ sex, nrow = 2, strip.position = "left") + - theme_minimal(base_family = "custom") + - theme(text = element_text(color = "#333333"), - legend.position = c(0.2, 0.25), - legend.key.height = unit(0.75, "lines"), - plot.title = element_markdown(size = 26), - plot.subtitle = element_markdown(size = 13), - strip.text = element_text(size = 24, face = "bold"), - panel.grid.major.y = element_blank(), - panel.grid.minor = element_blank()) + scico::scale_color_scico_d( + guide = guide_legend(override.aes = c(size = 4, alpha = 1)) + ) + + facet_wrap(~sex, nrow = 2, strip.position = "left") + + theme_minimal(base_family = "custom") + + theme( + text = element_text(color = "#333333"), + legend.position = c(0.2, 0.25), + legend.key.height = unit(0.75, "lines"), + plot.title = element_markdown(size = 26), + plot.subtitle = element_markdown(size = 13), + strip.text = element_text(size = 24, face = "bold"), + panel.grid.major.y = element_blank(), + panel.grid.minor = element_blank() + ) ```
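For a plain table of who currently holds each record in the scraped data (a sketch; ties within a year will show up as extra rows):

``` r
data |>
  filter(str_detect(comment, "OR"), distance != "Combined") |>
  group_by(sex, distance) |>
  slice_max(year, n = 1) |>
  ungroup() |>
  select(sex, distance, year, country, time) |>
  arrange(sex, distance)
```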
@@ -575,15 +699,17 @@ Next, I want to highlight one athlete in particular. The Dutch team is a powerho Show code ``` r -data_wust <- data |> - filter(str_detect(athlete, "Ireen") | - str_detect(title, "Women's Team pursuit") & - country == "NED") |> - add_row(tibble(year = 2022, - distance = "1500 m", - sex = "Women", - ranking = 1, - comment = "OR")) |> +data_wust <- data |> + filter(str_detect(athlete, "Ireen") | + str_detect(title, "Women's Team pursuit") & + country == "NED") |> + add_row(tibble( + year = 2022, + distance = "1500 m", + sex = "Women", + ranking = 1, + comment = "OR" + )) |> glimpse() ``` @@ -608,37 +734,51 @@ So Ireen participated in 18 events across 5 Olympic Games. She participated in a Show code ``` r -data_wust |> - group_by(year, medals_won = ranking %in% seq(3)) |> - mutate(medals_won = ifelse(medals_won, "medals_won", "medals_not_won")) |> - count() |> - ungroup() |> - complete(year, medals_won, fill = list(n = 0)) |> - pivot_wider(names_from = medals_won, values_from = n) |> - mutate(total_events = medals_won + medals_not_won, - perc_won = medals_won / total_events, - perc_won_label = str_glue("{round(perc_won * 100,1)}%"), - perc_won_label = ifelse(year == 2022, str_glue("{perc_won_label}*"), perc_won_label), - year = as_factor(year), - year = fct_rev(year)) |> - ggplot(aes(x = perc_won, y = year)) + +data_wust |> + group_by(year, medals_won = ranking %in% seq(3)) |> + mutate(medals_won = ifelse(medals_won, "medals_won", "medals_not_won")) |> + count() |> + ungroup() |> + complete(year, medals_won, fill = list(n = 0)) |> + pivot_wider(names_from = medals_won, values_from = n) |> + mutate( + total_events = medals_won + medals_not_won, + perc_won = medals_won / total_events, + perc_won_label = str_glue("{round(perc_won * 100,1)}%"), + perc_won_label = ifelse( + year == 2022, str_glue("{perc_won_label}*"), perc_won_label + ), + year = as_factor(year), + year = fct_rev(year) + ) |> + ggplot(aes(x = perc_won, y = year)) + geom_segment(aes(x = 0, xend = perc_won, yend = year), - linewidth = 10, lineend = "round", color = "#FF9B00") + - geom_text(aes(label = perc_won_label), size = 4, family = "custom", hjust = 1) + - labs(title = "**Will Ireen win a medal if she shows up?**", - subtitle = "Of all the events Ireen Wüst participated in, how often did she win a medal (of any color)?
2022 Olympics is still ongoing, shown is win rate **so far***", - caption = "*As of time of writing (09/02/2022)", - x = NULL, - y = NULL) + - scale_x_continuous(breaks = NULL, - expand = expansion(add = c(0,0.05))) + - coord_fixed(1/12) + - theme_minimal(base_family = "custom") + - theme(plot.title.position = "plot", - plot.title = element_markdown(size = 26), - plot.subtitle = element_markdown(size = 13), - axis.text.y = element_markdown(size = 13), - panel.grid.major.y = element_blank()) + linewidth = 10, lineend = "round", color = "#FF9B00" + ) + + geom_text(aes(label = perc_won_label), + size = 4, + family = "custom", hjust = 1 + ) + + labs( + title = "**Will Ireen win a medal if she shows up?**", + subtitle = "Of all the events Ireen Wüst participated in, how often did she win a medal (of any color)?
2022 Olympics is still ongoing, shown is win rate **so far***", + caption = "*As of time of writing (09/02/2022)", + x = NULL, + y = NULL + ) + + scale_x_continuous( + breaks = NULL, + expand = expansion(add = c(0, 0.05)) + ) + + coord_fixed(1 / 12) + + theme_minimal(base_family = "custom") + + theme( + plot.title.position = "plot", + plot.title = element_markdown(size = 26), + plot.subtitle = element_markdown(size = 13), + axis.text.y = element_markdown(size = 13), + panel.grid.major.y = element_blank() + ) ``` @@ -647,45 +787,68 @@ data_wust |> With the caveat that Ireen has only participated in one event in 2022 (as of time of writing, 9/2/2022), there has been one instance where she took home a medal on every single event she participated in. The Sochi Olympics in 2014 were successful for the Dutch team and for Ireen Wüst individually too. -Finally, we can also visualize the individual medals she won. Again, I'll take some artistic liberty here by creating a sort-of bar plot, but instead with `geom_points()` in the shape and color of the medals. +Finally, we can also visualize the individual medals she won. Again, I'll take some artistic liberty here by creating a sort-of bar plot, but instead with `geom_point()`'s in the shape and color of the medals.
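For a single headline number (again a sketch, assuming the `data_wust` frame from above), the overall medal rate across all events she entered:

``` r
data_wust |>
  summarise(
    events = n(),
    medals = sum(ranking %in% seq(3)),
    hit_rate = medals / events
  )
```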
Show code ``` r -data_wust |> - filter(ranking %in% seq(3)) |> - mutate(ranking_color = case_when(ranking == 1 ~ "#F8CC46", - ranking == 2 ~ "#DFDFE7", - ranking == 3 ~ "#D8B581"), - label = str_glue("{sex}'s {distance}")) |> - group_by(year) |> - arrange(ranking) |> - mutate(y = row_number()) |> - ggplot(aes(x = year, y = -y)) + - geom_point(aes(color = ranking_color), size = 12) + - geom_text(aes(label = label), size = 4, family = "custom", hjust = 0.1) + - geom_richtext(data = tibble(), aes(x = 2004.5, y = -3.5, - label = "**Medals earned by Ireen Wüst**"), - family = "custom", size = 8, hjust = 0, label.color = NA) + - geom_richtext(data = tibble(), aes(x = 2004.5, y = -4.2, - label = "Ireen Wüst earned **12 medals*** (of which 6 gold) across
5 Olympic games, the first Winter Olympian in history
to reach this milestone"), - family = "custom", size = 4, hjust = 0, label.color = NA, - lineheight = 1) + - labs(x = NULL, - y = NULL, - caption = "*As of time of writing (09/02/2022)") + - scale_x_continuous(breaks = c(game_years, 2022), position = "top", - expand = expansion(add = c(1,2.5))) + - scale_y_continuous(breaks = FALSE, - expand = expansion(add = c(0.5, 0.5))) + - scale_color_identity() + +data_wust |> + filter(ranking %in% seq(3)) |> + mutate( + ranking_color = case_when( + ranking == 1 ~ "#F8CC46", + ranking == 2 ~ "#DFDFE7", + ranking == 3 ~ "#D8B581" + ), + label = str_glue("{sex}'s {distance}") + ) |> + group_by(year) |> + arrange(ranking) |> + mutate(y = row_number()) |> + ggplot(aes(x = year, y = -y)) + + geom_point(aes(color = ranking_color), size = 12) + + geom_text(aes(label = label), + size = 4, + family = "custom", hjust = 0.1 + ) + + geom_richtext( + data = tibble(), aes( + x = 2004.5, y = -3.5, + label = "**Medals earned by Ireen Wüst**" + ), + family = "custom", size = 8, hjust = 0, label.color = NA + ) + + geom_richtext( + data = tibble(), aes( + x = 2004.5, y = -4.2, + label = "Ireen Wüst earned **12 medals*** (of which 6 gold) across
5 Olympic games, the first Winter Olympian in history
to reach this milestone" + ), + family = "custom", size = 4, hjust = 0, label.color = NA, + lineheight = 1 + ) + + labs( + x = NULL, + y = NULL, + caption = "*As of time of writing (09/02/2022)" + ) + + scale_x_continuous( + breaks = c(game_years, 2022), position = "top", + expand = expansion(add = c(1, 2.5)) + ) + + scale_y_continuous( + breaks = FALSE, + expand = expansion(add = c(0.5, 0.5)) + ) + + scale_color_identity() + coord_fixed(ratio = 2) + - theme_minimal(base_family = "custom") + - theme(plot.title = element_markdown(size = 26), - plot.subtitle = element_markdown(size = 13), - axis.text.x = element_markdown(size = 13), - panel.grid = element_blank()) + theme_minimal(base_family = "custom") + + theme( + plot.title = element_markdown(size = 26), + plot.subtitle = element_markdown(size = 13), + axis.text.x = element_markdown(size = 13), + panel.grid = element_blank() + ) ```
diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/index.qmd b/content/blog/2022-dutch-performance-olympic-speed-skating/index.qmd index 2294ff4..16303b6 100644 --- a/content/blog/2022-dutch-performance-olympic-speed-skating/index.qmd +++ b/content/blog/2022-dutch-performance-olympic-speed-skating/index.qmd @@ -14,6 +14,8 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- The 2022 Winter Olympics started last week. I don't usually follow sports (of any kind) religiously during a regular year, but I make an exception for the Winter Olympics. In particular the speed skating events I'll watch live as much as time allows me. It's no secret the Netherlands is a speed skating nation ([although some international TV commentators don't quite grasp why](https://www.washingtonpost.com/news/early-lead/wp/2018/02/11/nbcs-katie-couric-is-in-hot-water-with-the-dutch-who-really-dont-skate-everywhere/)). Is it fun to watch a sport where you have a high chance of winning? Yes, of course! Is it still exciting? Yes, absolutely! Being the favorites brings a certain pressure that is thrilling. Dutch qualifying games to determine which athletes get to go to the Olympics are [always very competitive too](https://www.nytimes.com/2022/02/01/sports/olympics/netherlands-speedskating-beijing-2022.html), so it's exciting to see if they can deal with the pressure and which international surprises might threaten their "favorites" status. Watching speed skating events definitely gets my heart pumping faster. @@ -36,9 +38,13 @@ font_add_google(name = "Yanone Kaffeesatz", family = "custom") showtext_auto() ``` -Before we can do anything, we need to find a nice dataset. What was quite surprising to me, it was rather tough to find a good dataset on Olympic events and results. Wikipedia of course has all the data one could want, but it's not always structured in an organized way, which makes it hard to scrape programatically. I looked at the IOC, ISU (International Skating Union), and news agencies, but the best (i.e. most complete and most organized) resource I could was a website called [Olympian Database](https://www.olympiandatabase.com/). The website looks rather outdated and the html is fairly outdated too, but we can work with this. The website has a [main page for speed skating](https://www.olympiandatabase.com/index.php?id=6934&L=1), and then we can iteratively go through the games and events to scrape every individual webpage. +{{{< sidenote br="10em" >}}} +It looks like the website is (once again) maintained by a small group of enthusiasts, the unsung heroes of the internet +{{{< /sidenote >}}} -Before we've used the `{rvest}` package to scrape websites, but since then I've actually gotten really fond of using Python for web scraping with the `Selenium` library, and then parsing the html with the `BeautifulSoup` library. So what we'll do first is scrape and parse the [main table ](https://www.olympiandatabase.com/index.php?id=6934&L=1). This will give us the links to the speed skating events at each Winter Olympic Game. This will give us a list of all events that were part of that tournament, then we'll go one level deeper and scrape the table there. This table contains the final placements (and in case of team events, the results from the last rounds), the athlete, country, and a comment (Olympic records, disqualifications, etc.).
We'll run through each game, and each event iteratively, save the data in an individual json file, and then at the end merge the individual json files into a single large json which we can then parse in R. While running this script I found [one instance](https://www.olympiandatabase.com/index.php?id=11769&L=1) where the header data and some variables were missing, which made machine reading this page very difficult, so when the script got to that instance I filled in the data manually. +Before we can do anything, we need to find a nice dataset. What was quite surprising to me was that it was rather tough to find a good dataset on Olympic events and results. Wikipedia of course has all the data one could want, but it's not always structured in an organized way, which makes it hard to scrape programmatically. I looked at the IOC, ISU (International Skating Union), and news agencies, but the best (i.e. most complete and most organized) resource I could find was a website called [Olympian Database](https://www.olympiandatabase.com/). The website looks rather outdated and the HTML is fairly outdated too, but we can work with this. The website has a [main page for speed skating](https://www.olympiandatabase.com/index.php?id=6934&L=1), and then we can iteratively go through the games and events to scrape every individual webpage. + +Previously we've used the `{rvest}` package to scrape websites, but since then I've actually gotten really fond of using Python for web scraping with the `Selenium` library, and then parsing the HTML with the `BeautifulSoup` library. So what we'll do first is scrape and parse the [main table](https://www.olympiandatabase.com/index.php?id=6934&L=1). This will give us the links to the speed skating events at each Winter Olympic Game. Following each of those links then gives us a list of all events that were part of that tournament, and then we'll go one level deeper and scrape the table there. This table contains the final placements (and in case of team events, the results from the last rounds), the athlete, country, and a comment (Olympic records, disqualifications, etc.). We'll run through each game, and each event iteratively, save the data in an individual JSON file, and then at the end merge the individual JSON files into a single large JSON which we can then parse in R. While running this script I found [one instance](https://www.olympiandatabase.com/index.php?id=11769&L=1) where the header data and some variables were missing, which made machine reading this page very difficult, so when the script got to that instance I filled in the data manually. ```{python, code=readLines("./scrape_data.py")} #| label: py-scrape-script @@ -47,7 +53,7 @@ Before we've used the `{rvest}` package to scrape websites, but since then I've #| code-summary: "Show code" ``` -I said before that the data is neatly organized, which is true except for a few instances. The individual events are simple tables with a ranking and time for each athlete. It's a bit more complicated for the team pursuits, since team pursuit events are a direct competition with qualifying rounds and knock-out rounds, the table is a bit more complicated. In this case we're just interested in the final ranking (so we dismiss the semi- and quarter-finals). The final ranking is split across two columns, so we stitch those together. For some reason the men's team pursuit from 2018 lists only the medal winners, and not in the same format as the other team pursuit events.
One advantage here is that they list individual skaters too, but since this is the only time indivdual skaters are listed among the team pursuits it's still not very useful. It just meant we have to create another few lines in "if else" statement to parse the json. In the html, the podium places aren't denoted with a numeric list, but rather with a gold, silver, and bronze badge. Since the python script doesn't parse those, we add those back here (except for the 1928 Men's 10.000 m event, which was canceled due to bad weather). +I said before that the data is neatly organized, which is true except for a few instances. The individual events are simple tables with a ranking and time for each athlete. The team pursuits are a bit more complicated: since team pursuit events are a direct competition with qualifying rounds and knock-out rounds, the table is more involved. In this case we're just interested in the final ranking (so we dismiss the semi- and quarter-finals). The final ranking is split across two columns, so we stitch those together. For some reason the men's team pursuit from 2018 lists only the medal winners, and not in the same format as the other team pursuit events. One advantage here is that they list individual skaters too, but since this is the only time individual skaters are listed among the team pursuits it's still not very useful. It just meant we had to create another few lines in the `if else` statement to parse the JSON. In the HTML, the podium places aren't denoted with a numeric list, but rather with a gold, silver, and bronze badge. Since the Python script doesn't parse those, we add those back here (except for the 1928 Men's 10.000 m event, which was canceled due to bad weather). ```{r} #| label: define-parsing-function @@ -56,79 +62,84 @@ I said before that the data is neatly organized, which is true except for a few #| code-summary: "Show code" parse_json <- function(json) { - - t_df <- jsonlite::fromJSON(json) |> - as_tibble() |> - unnest() |> - janitor::clean_names() %>% + t_df <- jsonlite::fromJSON(json) |> + as_tibble() |> + unnest() |> + janitor::clean_names() %>% slice(seq(3, nrow(.)
- 2)) - + if (str_detect(json, "Men's Team pursuit 2018")) { - - t_df_out <- t_df |> - filter(is.na(x0)) |> - rename(ranking = x0, - athlete = x1, - country = x3, - time = x4, - comment = x5) |> - mutate(ranking = rep(seq(3), each = 4), - ranking = str_glue("{ranking}.")) |> + t_df_out <- t_df |> + filter(is.na(x0)) |> + rename( + ranking = x0, + athlete = x1, + country = x3, + time = x4, + comment = x5 + ) |> + mutate( + ranking = rep(seq(3), each = 4), + ranking = str_glue("{ranking}.") + ) |> fill(country, time, comment) |> - group_by(ranking) |> + group_by(ranking) |> mutate(athlete = toString(athlete)) |> - ungroup() |> - distinct() |> + ungroup() |> + distinct() |> select(-x2) - } else if (str_detect(json, "Men's Team pursuit|Women's Team pursuit")) { - - t_df_tp <- t_df |> - rename(ranking = x0, - country = x1, - time = x3, - comment = x4, - ranking2 = x5, - country2 = x6, - time2 = x8, - comment2 = x9) |> - select(seq(10), - -c(x2,x7)) |> + t_df_tp <- t_df |> + rename( + ranking = x0, + country = x1, + time = x3, + comment = x4, + ranking2 = x5, + country2 = x6, + time2 = x8, + comment2 = x9 + ) |> + select( + seq(10), + -c(x2, x7) + ) |> slice(seq(0, min(which(nchar(ranking) > 3)) - 1)) - - t_df_out <- bind_rows(t_df_tp |> - select(seq(4)), - t_df_tp |> - select(seq(5,last_col())) |> - rename_with( ~ c("ranking","country","time","comment"))) - + + t_df_out <- bind_rows( + t_df_tp |> + select(seq(4)), + t_df_tp |> + select(seq(5, last_col())) |> + rename_with(~ c("ranking", "country", "time", "comment")) + ) } else { - - t_df <- t_df |> - rename(ranking = x0, - athlete = x1, - country = x3, - time = x4, - comment = x5) |> + t_df <- t_df |> + rename( + ranking = x0, + athlete = x1, + country = x3, + time = x4, + comment = x5 + ) |> select(-x2) - + if (str_detect(json, "Men's 10000 m 1928", negate = TRUE)) { t_df[seq(3), "ranking"] <- str_glue("{seq(3)}.") } - + t_df_out <- t_df - } - + return(t_df_out) } ``` -Okay, now that we have the function to parse the json file, let's look at some R code. We'll load the json file using the `{jsonlite}` package, and then parse each json string using the `map()` function from `{purrr}`. +Okay, now that we have the function to parse the JSON file, let's look at some R code. We'll load the JSON file using the `{jsonlite}` package, and then parse each JSON string using the `map()` function from `{purrr}`. Then when this data is parsed, we'll wrangle the nested data frames into one long data frame, and then we'll tidy up the data. Tied places are denoted using a single dash, we want to get rid of that. Then we'll fill the missing place numbers using the `fill()` function. However, there were also a number of people who either did not finish or start or were disqualified and so they don't have a ranking. These instances are denoted in the `time` column with a "dnf", "dns", or "dq". Since those are the only times it uses the lowercase d, we can use this feature to replace the ranking with a missing value. We'll then also add the comment from the `time` column to the `comment` column. Then there are also some artifacts which we can easily remove since the `country` column uses IOC 3-letter abbreviations, so any entry there that's longer than 3 characters we can remove. -Then we'll also create two vectors that contain the breaks we'll use later for the visualizations. Until 1992 both summer and winter olympic games were held in the same year. 
However, since 1994 they moved the Olympic Winter Games up 2 years to get the alternating schedule we have today. The Olympic Games were also not held during the second world war. I want to account for that so I create a vector with each unique entry in the `year` column. I also want a neatly organized ordering of the events, so I create a vector called `event_lims` that saves stores this preferred ordering. +Then we'll also create two vectors that contain the breaks we'll use later for the visualizations. Until 1992 both summer and winter olympic games were held in the same year. However, since 1994 they moved the Olympic Winter Games up 2 years to get the alternating schedule we have today. The Olympic Games were also not held during World War II, I want to account for that so I create a vector with each unique entry in the `year` column. I also want a neatly organized ordering of the events, so I create a vector called `event_lims` that saves stores this preferred ordering. ```{r} #| label: load-json-data @@ -137,29 +148,35 @@ Then we'll also create two vectors that contain the breaks we'll use later for t #| code-fold: true #| code-summary: "Show code" -data_load <- jsonlite::fromJSON("./all_events.json") |> +data_load <- jsonlite::fromJSON("./data/all_events.json") |> mutate(df = map(table, ~ parse_json(.x))) -data <- data_load |> - select(-table) |> - unnest(df) |> - group_by(year, distance, sex) |> - mutate(ranking = ifelse(str_detect(ranking, "-"), NA, ranking)) |> - fill(ranking) |> - ungroup() |> - mutate(ranking = parse_number(ranking), - ranking = ifelse(str_detect(time, "d"), NA, ranking), - comment = ifelse(str_detect(time, "d"), time, comment), - time = parse_number(time)) |> - filter(nchar(country) < 4) |> - arrange(year) |> +data <- data_load |> + select(-table) |> + unnest(df) |> + group_by(year, distance, sex) |> + mutate(ranking = ifelse(str_detect(ranking, "-"), NA, ranking)) |> + fill(ranking) |> + ungroup() |> + mutate( + ranking = parse_number(ranking), + ranking = ifelse(str_detect(time, "d"), NA, ranking), + comment = ifelse(str_detect(time, "d"), time, comment), + time = parse_number(time) + ) |> + filter(nchar(country) < 4) |> + arrange(year) |> glimpse() game_years <- unique(data$year) -event_lims <- c("500 m", "1000 m", "1500 m", "3000 m", "5000 m", "10000 m", "Combined", "Team pursuit", "Mass Start") +event_lims <- c("500 m", "1000 m", "1500 m", "3000 m", "5000 m", "10000 m", "Combined", "Team pursuit", "Mass Start") ``` +{{{< sidenote br="4em" >}}} +If you happen to get here by searching for "Gantt chart in ggplot", you can find an actual tutorial for that [here](https://jtr13.github.io/cc19/gantt-charts.html) +{{{< /sidenote >}}} + Then we can finally create some plots. Not all speed skating events were present from the start in 1924. Back then only men competed in Olympic speed skating, the women's program started in 1960. Here we'll create something that looks a bit like a Gantt chart. We'll use a `geom_segment()` to visualize the timeline and since there's a few events which have only been on the program once we'll use a `geom_point()` for those since `geom_segment()` requires a begin and end point that are different. Since this is just a casual visualization for illustrative purposes we can take some creative liberty and experiment a bit with the design. That's why I chose to remove the grid lines and axes, make the lines fairly big and added the individual distances as a label on top of the lines. 
I also made the text quite large and moved the labels slightly up. The first year an event was held is shown slightly below the line. ```{r} @@ -168,31 +185,48 @@ Then we can finally create some plots. Not all speed skating events were present #| code-fold: true #| code-summary: "Show code" -data |> - select(year, distance, sex) |> +data |> + select(year, distance, sex) |> distinct() |> - mutate(distance = fct_relevel(distance, ~ event_lims)) |> - group_by(distance, sex) |> - arrange(year) |> - summarise(first_year = min(year), - last_year = max(year)) |> - ggplot(aes(x = first_year, y = distance)) + - geom_segment(aes(xend = last_year, yend = distance, color = distance), - linewidth = 8, lineend = "round", alpha = 0.4) + - geom_point(data = . %>% filter(first_year == last_year), - aes(color = distance), - size = 8, alpha = 0.5) + - geom_text(aes(x = first_year, label = first_year), - color = "#333333", size = 3, family = "custom", nudge_y = -0.25) + - geom_text(aes(x = 2018, label = distance), - size = 10, color = "grey30", fontface = "bold", - family = "custom", hjust = 1, nudge_y = 0.2) + + mutate(distance = fct_relevel(distance, ~event_lims)) |> + group_by(distance, sex) |> + arrange(year) |> + summarise( + first_year = min(year), + last_year = max(year) + ) |> + ggplot(aes(x = first_year, y = distance)) + + geom_segment( + aes( + xend = last_year, yend = distance, + color = distance + ), + linewidth = 8, lineend = "round", alpha = 0.4 + ) + + geom_point( + data = . %>% filter(first_year == last_year), + aes(color = distance), + size = 8, alpha = 0.5 + ) + + geom_text(aes(x = first_year, label = first_year), + color = "#333333", size = 3, + family = "custom", nudge_y = -0.25 + ) + + geom_text(aes(x = 2018, label = distance), + size = 10, color = "grey30", fontface = "bold", + family = "custom", hjust = 1, nudge_y = 0.2 + ) + scale_y_discrete(limits = rev(event_lims)) + scico::scale_color_scico_d(guide = "none") + - facet_wrap(~ sex, scales = "free", strip.position = "top") + - theme_void(base_family = "custom") + - theme(text = element_text(color = "#333333"), - strip.text = element_text(face = "bold", size = 42)) + facet_wrap(~sex, + scales = "free_y", + strip.position = "top" + ) + + theme_void(base_family = "custom") + + theme( + text = element_text(color = "#333333"), + strip.text = element_text(face = "bold", size = 42) + ) ``` As we can see, the first Winter Olympic Games had only 5 events. This also included an event called "combined", which is the ranking for the all-round best score at the speed skating tournament. This event was only part of the Olympics in 1924 and an all-round medal hasn't been awarded since that tournament in 1924. The women's competition at the Olympics started in 1960 with 4 distances. Today the only difference is that the men have a 10.000 m event, and the women have a 3000 m event. Both competitions have a team pursuit event, but the men skate 8 laps around the 400 m track, while women do 6 laps. Why? I don't know. I think there's quite a lot of female athletes who'd love to show how fast they can skate a 10k, and there's a lot of male athletes who'd love the chance to earn a medal at the medium-distance 3000 m. The mass start is a new event that was added only in 2018, it is a spectacular event that mimics some of the scenarios from the eventful short-track tournament. @@ -205,38 +239,60 @@ Now, let's dive into the medals. 
First let's create a simple barplot with the to #| code-fold: true #| code-summary: "Show code" -data |> - filter(year >= 1960, - ranking %in% seq(3)) |> - mutate(country_long = countrycode::countrycode(country, origin = "ioc", destination = "country.name"), - country_long = case_when(str_detect(country, "URS") ~ "Soviet Union", - str_detect(country, "GDR") ~ "East Germany", - str_detect(country, "FRG") ~ "West Germany", - str_detect(country, "OAR") ~ "Olympic Athletes from Russia", - TRUE ~ country_long), - country_label = str_glue("{country_long} ({country})")) |> - count(country_label, sort = TRUE) |> - mutate(highlight_col = ifelse(str_detect(country_label, "NED"), "#FF9B00", "grey")) |> - ggplot(aes(x = n, y = reorder(country_label, n))) + - geom_col(aes(fill = highlight_col)) + - geom_vline(xintercept = 0, linewidth = 1) + - geom_richtext(data = tibble(), aes(x = 24, y = 15, - label = "Total number of medals won per country since 1960"), - family = "custom", size = 7, fontface = "bold", hjust = 0, - label.padding = unit(0.75,"lines"), label.color = NA) + - geom_richtext(data = tibble(), aes(x = 24, y = 13, - label = "The Netherlands has won more than twice as many medals as the runner-up"), - family = "custom", size = 4, hjust = 0, - label.padding = unit(0.75,"lines"), label.color = NA) + - labs(x = NULL, - y = NULL) + - scale_x_continuous(expand = expansion(add = c(0,9)), position = "top") + - scale_fill_identity() + +data |> + filter( + year >= 1960, + ranking %in% seq(3) + ) |> + mutate( + country_long = countrycode::countrycode( + country, + origin = "ioc", + destination = "country.name" + ), + country_long = case_when( + str_detect(country, "URS") ~ "Soviet Union", + str_detect(country, "GDR") ~ "East Germany", + str_detect(country, "FRG") ~ "West Germany", + str_detect(country, "OAR") ~ "Olympic Athletes from Russia", + TRUE ~ country_long + ), + country_label = str_glue("{country_long} ({country})") + ) |> + count(country_label, sort = TRUE) |> + mutate(highlight_col = ifelse( + str_detect(country_label, "NED"), "#FF9B00", "grey" + )) |> + ggplot(aes(x = n, y = reorder(country_label, n))) + + geom_col(aes(fill = highlight_col)) + + geom_vline(xintercept = 0, linewidth = 1) + + geom_richtext( + data = tibble(), aes( + x = 24, y = 15, + label = "Total number of medals won per country since 1960" + ), + family = "custom", size = 7, fontface = "bold", hjust = 0, + label.padding = unit(0.75, "lines"), label.color = NA + ) + + geom_richtext( + data = tibble(), aes( + x = 24, y = 13, + label = "The Netherlands has won more than twice as many medals as the runner-up" + ), + family = "custom", size = 4, hjust = 0, + label.padding = unit(0.75, "lines"), label.color = NA + ) + + labs( + x = NULL, + y = NULL + ) + + scale_x_continuous(expand = expansion(add = c(0, 9)), position = "top") + + scale_fill_identity() + theme_minimal(base_family = "custom") + theme(panel.grid.major.y = element_blank()) ``` -As you can see, the Netherlands has earned by far the most medals since 1960 than any other country. In fact, it's earned more medals than number two and three combined. Now, news agencies have reported on the total number of medals, and numbers may slightly differ between reports. This is the number reported by the source, and unless I made some errors in scraping, parsing, or wrangling the data I'll stand by it. However, differences of 3 or 4 medals won't change the message that the Netherlands is absolutely dominant in this area of the Winter Olympics. 
+As you can see, the Netherlands has earned by far the most medals since 1960 than any other country. In fact, it's earned more medals than number two and three combined. Now, news agencies have reported on the total number of medals, and numbers may slightly differ between reports. This is the number reported by the source, unless I made some errors in scraping, parsing, or wrangling the data. Even if, differences of 3 or 4 medals won't change the message that the Netherlands is absolutely dominant in this area of the Winter Olympics. Let's look at how this distribution is spread out across the different Olympic events. We'll start in 1960 since that's when the women's tournament was added and I consider that the proper start of the Winter Olympics. Since 1960 we've had 16 Winter Olympics (the 17th is currently underway). Since not all games had the same number of medals (events were added at different years), I'll calculate the percentage of medals won per year. @@ -248,42 +304,56 @@ Let's look at how this distribution is spread out across the different Olympic e #| code-fold: true #| code-summary: "Show code" -data |> - filter(ranking %in% seq(3), - year >= 1960) |> - group_by(year) |> - mutate(total_medals = n()) |> - group_by(year, country) |> - summarise(medals_won = n(), - total_medals = first(total_medals)) |> - mutate(perc_won = medals_won / total_medals, - perc_label = str_glue("{round(perc_won * 100)}%"), - highlight_col = ifelse(country == "NED", "#FF9B00", "grey"), - country = tidytext::reorder_within(country, perc_won, year)) |> - slice_max(perc_won, n = 5) |> - ggplot(aes(x = perc_won, y = country)) + - geom_col(aes(fill = highlight_col)) + - geom_text(aes(label = perc_label), family = "custom", - size = 2, hjust = 0, nudge_x = 0.01) + - labs(title = "**Most medals won per country per Olympic Game**", - subtitle = "The Netherlands has won the largest proportion (shared in 1994 and 2002) of speed skating medals **every Game since 1994**", - x = "Percentage of all medals won", - y = NULL) + +data |> + filter( + ranking %in% seq(3), + year >= 1960 + ) |> + group_by(year) |> + mutate(total_medals = n()) |> + group_by(year, country) |> + summarise( + medals_won = n(), + total_medals = first(total_medals) + ) |> + mutate( + perc_won = medals_won / total_medals, + perc_label = str_glue("{round(perc_won * 100)}%"), + highlight_col = ifelse(country == "NED", "#FF9B00", "grey"), + country = tidytext::reorder_within(country, perc_won, year) + ) |> + slice_max(perc_won, n = 5) |> + ggplot(aes(x = perc_won, y = country)) + + geom_col(aes(fill = highlight_col)) + + geom_text(aes(label = perc_label), + family = "custom", + size = 2, hjust = 0, nudge_x = 0.01 + ) + + labs( + title = "**Most medals won per country per Olympic Game**", + subtitle = "The Netherlands has won the largest proportion (shared in 1994 and 2002) of speed skating medals **every Game since 1994**", + x = "Percentage of all medals won", + y = NULL + ) + tidytext::scale_y_reordered() + - scale_x_continuous(limits = c(0, 0.7), - labels = scales::label_percent()) + + scale_x_continuous( + limits = c(0, 0.7), + labels = scales::label_percent() + ) + scale_fill_identity() + - facet_wrap(~ year, scales = "free_y") + + facet_wrap(~year, scales = "free_y") + theme_minimal(base_family = "custom") + - theme(plot.title = element_markdown(size = 26), - plot.subtitle = element_markdown(size = 12), - strip.text = element_text(size = 16, face = "bold"), - panel.grid.major.y = element_blank()) + theme( + plot.title = 
element_markdown(size = 26), - plot.subtitle = element_markdown(size = 12), - strip.text = element_text(size = 16, face = "bold"), - panel.grid.major.y = element_blank()) + theme( + plot.title = element_markdown(size = 26), + plot.subtitle = element_markdown(size = 12), + strip.text = element_text(size = 16, face = "bold"), + panel.grid.major.y = element_blank() + ) ``` As you can see from the plot, of the 16 Olympic Games since 1960, the Netherlands has earned the largest share of medals 9 times (`r 100*9/16`%). It has topped the list every game since 1994 (shared in 1994 and 2002). In 2014, the Netherlands took home 64% of all medals. Due to ISU and IOC qualification rules it's technically impossible for a single country to take home all medals (e.g. a country cannot send three teams for the team pursuit). So there might be 36 medals available (from 12 events), but a single country can only take home 32. Since I didn't want to go into the (sometimes changing) qualification rules since 1960 I made it simple and calculated based on the total number of medals, not the total number available to a country. -But of course, not all medals are created equal. In Olympic rankings or medal tables, the order is determined by the number of gold medals first, then silver, then bronze. Total number of medals does not matter here. So a country with 2 gold medals and no other metal will be ranked above a country with 1 gold medal, 10 silver, and 15 bronze medals. So the Netherlands can win a lot of medals, but for rankins the color matters too. So let's create a metal table. Again, we'll only look at results from 1960. We'll calculate the number of medals each country won, then we'll fill in the blank spaces with the amazing `complete()` function. Since not all medals are equal, we'll add a multiplier and then calculate a theoretical score (where gold counts 10 times stronger than a silver etc.). Then we'll look at the top 10 countries and use `geom_point()` to create a table. +But of course, not all medals are created equal. In Olympic rankings or medal tables, the order is determined by the number of gold medals first, then silver, then bronze. Total number of medals does not matter here. So a country with 2 gold medals and no other medals will be ranked above a country with 1 gold medal, 10 silver, and 15 bronze medals. So the Netherlands can win a lot of medals, but for rankings the color matters too. So let's create a medal table. Again, we'll only look at results from 1960. We'll calculate the number of medals each country won, then we'll fill in the blank spaces with the amazing `complete()` function. Since not all medals are equal, we'll add a multiplier and then calculate a theoretical score (where a gold counts 10 times as much as a silver, etc.). Then we'll look at the top 10 countries and use `geom_point()` to create a table. ```{r} #| label: medal-table @@ -292,46 +362,68 @@ But of course, not all medals are created equal.
In Olympic rankings or medal ta #| code-fold: true #| code-summary: "Show code" -data |> - filter(year >= 1960, - ranking %in% seq(3)) |> - group_by(country, ranking) |> - summarise(n_medals = n()) |> - ungroup() |> - complete(country, ranking) |> - replace_na(list(n_medals = 0)) |> - mutate(country_long = countrycode::countrycode(country, origin = "ioc", destination = "country.name"), - country_long = case_when(str_detect(country, "URS") ~ "Soviet Union", - str_detect(country, "GDR") ~ "East Germany", - TRUE ~ country_long), - country_label = str_glue("{country_long} ({country})"), - ranking_color = case_when(ranking == 1 ~ "#F8CC46", - ranking == 2 ~ "#DFDFE7", - ranking == 3 ~ "#D8B581"), - rank_mult = case_when(ranking == 1 ~ 100, - ranking == 2 ~ 10, - ranking == 3 ~ 1), - rank_score = n_medals * rank_mult) |> - group_by(country) |> - mutate(country_rank = sum(rank_score)) |> +data |> + filter( + year >= 1960, + ranking %in% seq(3) + ) |> + group_by(country, ranking) |> + summarise(n_medals = n()) |> ungroup() |> - slice_max(country_rank, n = 30) |> - ggplot(aes(x = ranking, y = reorder(country_label, country_rank), - fill = ranking_color, alpha = n_medals)) + - geom_point(shape = 21, size = 10, stroke = 0, show.legend = FALSE) + + complete(country, ranking) |> + replace_na(list(n_medals = 0)) |> + mutate( + country_long = countrycode::countrycode( + country, + origin = "ioc", destination = "country.name" + ), + country_long = case_when( + str_detect(country, "URS") ~ "Soviet Union", + str_detect(country, "GDR") ~ "East Germany", + TRUE ~ country_long + ), + country_label = str_glue("{country_long} ({country})"), + ranking_color = case_when( + ranking == 1 ~ "#F8CC46", + ranking == 2 ~ "#DFDFE7", + ranking == 3 ~ "#D8B581" + ), + rank_mult = case_when( + ranking == 1 ~ 100, + ranking == 2 ~ 10, + ranking == 3 ~ 1 + ), + rank_score = n_medals * rank_mult + ) |> + group_by(country) |> + mutate(country_rank = sum(rank_score)) |> + ungroup() |> + slice_max(country_rank, n = 30) |> + ggplot(aes( + x = ranking, y = reorder(country_label, country_rank), + fill = ranking_color, alpha = n_medals + )) + + geom_point( + shape = 21, size = 10, + stroke = 0, show.legend = FALSE + ) + geom_text(aes(label = n_medals), alpha = 1, family = "custom") + - labs(title = "**Medal table since 1960**", - subtitle = "Ten countries with the highest total ranking", - x = NULL, - y = NULL) + + labs( + title = "**Medal table since 1960**", + subtitle = "Ten countries with the highest total ranking", + x = NULL, + y = NULL + ) + scale_x_discrete(position = "top") + - scale_fill_identity() + - coord_fixed(ratio = 1/2) + - theme_void(base_family = "custom") + - theme(plot.title.position = "plot", - plot.title = element_markdown(size = 26), - plot.subtitle = element_markdown(size = 13), - axis.text.y = element_text(hjust = 1)) + scale_fill_identity() + + coord_fixed(ratio = 1 / 2) + + theme_void(base_family = "custom") + + theme( + plot.title.position = "plot", + plot.title = element_markdown(size = 26), + plot.subtitle = element_markdown(size = 13), + axis.text.y = element_text(hjust = 1) + ) ``` To show that a country is dominant in a particular competition it helps to show that a country can deliver not just one, but a few contenders for Olympic gold. The greatest display of strength for a country is to take home all medals in a single event, a so-called _podium sweep_. If a country can take home gold, silver, and bronze in a single event it may show they're competing mostly with each other. 
Now, to calculate this we can simply take the rankings, group by event and country, and count how often a single country took home three medals in a single event. For this we'll create a simple stacked barplot. @@ -341,39 +433,50 @@ To show that a country is dominant in a particular competition it helps to show #| code-fold: true #| code-summary: "Show code" -data |> - filter(year >= 1960, - ranking %in% seq(3)) |> - group_by(year, distance, sex, country) |> - count(year, distance, sex, country, name = "medals_won") |> - filter(medals_won == 3) |> - mutate(sweeps = medals_won / 3) |> - ggplot(aes(x = year, y = sweeps, fill = country)) + - geom_col(key_glyph = "point") + +data |> + filter( + year >= 1960, + ranking %in% seq(3) + ) |> + group_by(year, distance, sex, country) |> + count(year, distance, sex, country, name = "medals_won") |> + filter(medals_won == 3) |> + mutate(sweeps = medals_won / 3) |> + ggplot(aes(x = year, y = sweeps, fill = country)) + + geom_col(key_glyph = "point") + geom_hline(yintercept = 0) + - labs(title = "**Podium sweeps since 1960**", - subtitle = "The Netherlands had 7 out 13 podium sweeps (winning gold, silver, **and** bronze in a single event),
including 4 at the 2014 Olympics in Sochi", - x = NULL, - y = "Number of podium sweeps", - fill = NULL) + + labs( + title = "**Podium sweeps since 1960**", + subtitle = "The Netherlands had 7 out of 13 podium sweeps (winning gold, silver, **and** bronze in a single event),
including 4 at the 2014 Olympics in Sochi", + x = NULL, + y = "Number of podium sweeps", + fill = NULL + ) + scale_x_continuous(limits = c(1960, NA), breaks = game_years) + - scale_y_continuous(expand = expansion(mult = c(0,0.05))) + - scico::scale_fill_scico_d(palette = "batlow", guide = guide_legend( - override.aes = c(shape = 21, size = 4) - )) + - theme_minimal(base_family = "custom") + - theme(plot.title = element_markdown(size = 26), - legend.text = element_text(size = 10), - legend.key.height = unit(0.75, "lines"), - plot.subtitle = element_markdown(size = 13, lineheight = 0.5), - axis.text.x = element_text(size = 7), - axis.text.y = element_text(size = 12), - axis.title.y = element_text(size = 12), - panel.grid.major.x = element_blank(), - panel.grid.minor = element_blank()) + scale_y_continuous(expand = expansion(mult = c(0, 0.05))) + + scico::scale_fill_scico_d( + palette = "batlow", + guide = guide_legend(override.aes = c(shape = 21, size = 4)) + ) + + theme_minimal(base_family = "custom") + + theme( + plot.title = element_markdown(size = 26), + legend.text = element_text(size = 10), + legend.key.height = unit(0.75, "lines"), + plot.subtitle = element_markdown(size = 13, lineheight = 0.5), + axis.text.x = element_text(size = 7), + axis.text.y = element_text(size = 12), + axis.title.y = element_text(size = 12), + panel.grid.major.x = element_blank(), + panel.grid.minor = element_blank() + ) ``` -As you might gather, from this and the previous plot, the Winter Olympic Games from 2014 were a very good year for the Dutch speed skating team. That one year the Netherlands had *four* podium sweeps. For one of these four podium sweeps (the [women's 1500 m](https://en.wikipedia.org/wiki/Speed_skating_at_the_2014_Winter_Olympics_–_Women%27s_1500_metres)) the fourth place was also a Dutch women (Marrit Leenstra), a first in the Winter Olympics. +{{{< sidenote >}}} +For one of these four podium sweeps (the [2014 Women's 1500 m](https://en.wikipedia.org/wiki/Speed_skating_at_the_2014_Winter_Olympics_–_Women%27s_1500_metres)) the fourth place was also Dutch (Marrit Leenstra), a historic first +{{{< /sidenote >}}} + +As you might gather, from this and the previous plot, the Winter Olympic Games from 2014 were a very good year for the Dutch speed skating team. That one year the Netherlands had *four* podium sweeps. ```{r} #| label: olympic-records @@ -381,41 +484,56 @@ As you might gather, from this and the previous plot, the Winter Olympic Games f #| code-fold: true #| code-summary: "Show code" -data |> - mutate(distance = fct_relevel(distance, ~ event_lims)) |> - filter(str_detect(comment, "OR"), - distance != "Combined") |> - group_by(distance, sex) |> - arrange(year) |> - mutate(no = row_number()) |> - ggplot(aes(x = year, y = no, color = distance)) + - geom_vline(xintercept = c(1940, 1944), linetype = "dashed", color = "grey92") + - geom_step(linewidth = 1.5, alpha = 0.4, show.legend = FALSE) + +data |> + mutate(distance = fct_relevel(distance, ~event_lims)) |> + filter( + str_detect(comment, "OR"), + distance != "Combined" + ) |> + group_by(distance, sex) |> + arrange(year) |> + mutate(no = row_number()) |> + ggplot(aes(x = year, y = no, color = distance)) + + geom_vline( + xintercept = c(1940, 1944), + linetype = "dashed", color = "grey92" + ) + + geom_step(linewidth = 1.5, alpha = 0.4, show.legend = FALSE) + geom_point(size = 4, alpha = 0.75, stroke = 0) + - ggrepel::geom_text_repel(data = . 
|> filter(no == max(no)), - aes(label = country), show.legend = FALSE, seed = 2, - color = "#333333", size = 4, - family = "custom", fontface = "bold") + - labs(title = "**Olympic Records over the years**", - subtitle = "The Netherlands hold 4/6 olympic records with the men, and 3/6 records with the women.
+ ggrepel::geom_text_repel( + data = . %>% filter(no == max(no)), + aes(label = country), show.legend = FALSE, seed = 2, + color = "#333333", size = 4, + family = "custom", fontface = "bold" + ) + + labs( + title = "**Olympic Records over the years**", + subtitle = "The Netherlands hold 4/6 olympic records with the men, and 3/6 records with the women.
Current record holder indicated with the IOC abbreviation", - x = "Winter Games", - y = NULL, - color = NULL) + - scale_x_continuous(breaks = game_years, - labels = str_replace(game_years, "^19|^20", "'")) + + x = "Winter Games", + y = NULL, + color = NULL + ) + + scale_x_continuous( + breaks = game_years, + labels = str_replace(game_years, "^19|^20", "'") + ) + scale_y_continuous(breaks = NULL) + - scico::scale_color_scico_d(guide = guide_legend(override.aes = c(size = 4, alpha = 1))) + - facet_wrap(~ sex, nrow = 2, strip.position = "left") + - theme_minimal(base_family = "custom") + - theme(text = element_text(color = "#333333"), - legend.position = c(0.2, 0.25), - legend.key.height = unit(0.75, "lines"), - plot.title = element_markdown(size = 26), - plot.subtitle = element_markdown(size = 13), - strip.text = element_text(size = 24, face = "bold"), - panel.grid.major.y = element_blank(), - panel.grid.minor = element_blank()) + scico::scale_color_scico_d( + guide = guide_legend(override.aes = c(size = 4, alpha = 1)) + ) + + facet_wrap(~sex, nrow = 2, strip.position = "left") + + theme_minimal(base_family = "custom") + + theme( + text = element_text(color = "#333333"), + legend.position = c(0.2, 0.25), + legend.key.height = unit(0.75, "lines"), + plot.title = element_markdown(size = 26), + plot.subtitle = element_markdown(size = 13), + strip.text = element_text(size = 24, face = "bold"), + panel.grid.major.y = element_blank(), + panel.grid.minor = element_blank() + ) ``` Next, I want to highlight one athlete in particular. The Dutch team is a powerhouse of speed skating, but a team is still made up of individual athletes. And one of those athletes deserves some special attention: Ireen Wüst. She is one of the most successful Winter Olympic athletes ever and the most succesful speed skater of all time. As of time of writing (9/2/2022) she won 6 gold, 5 silver, and 1 bronze medals across 5 Winter Olympic Games. She's the only Olympian (Winter or Summer) to win individual gold in 5 different Olympic Games. So let's look at her performance. Let's extract all events where Ireen Wüst participated. One caveat here is that we can't only look for her name in the `athlete` column, and as we saw before, there's also team pursuit where individual names aren't registered in the website. Lucky for us, Ireen Wüst participated in all team pursuit events (only held since 2006), so we'll extract all instances where the Dutch team pursuit team participated. Since the 2022 Olympics are already underway and Ireen has already won a gold medal in her first event, I'll add a row manually to include this data too. @@ -425,15 +543,17 @@ Next, I want to highlight one athlete in particular. The Dutch team is a powerho #| code-fold: true #| code-summary: "Show code" -data_wust <- data |> - filter(str_detect(athlete, "Ireen") | - str_detect(title, "Women's Team pursuit") & - country == "NED") |> - add_row(tibble(year = 2022, - distance = "1500 m", - sex = "Women", - ranking = 1, - comment = "OR")) |> +data_wust <- data |> + filter(str_detect(athlete, "Ireen") | + str_detect(title, "Women's Team pursuit") & + country == "NED") |> + add_row(tibble( + year = 2022, + distance = "1500 m", + sex = "Women", + ranking = 1, + comment = "OR" + )) |> glimpse() ``` @@ -444,81 +564,118 @@ So Ireen participated in 18 events across 5 Olympic Games. 
She participated in a #| code-fold: true #| code-summary: "Show code" -data_wust |> - group_by(year, medals_won = ranking %in% seq(3)) |> - mutate(medals_won = ifelse(medals_won, "medals_won", "medals_not_won")) |> - count() |> - ungroup() |> - complete(year, medals_won, fill = list(n = 0)) |> - pivot_wider(names_from = medals_won, values_from = n) |> - mutate(total_events = medals_won + medals_not_won, - perc_won = medals_won / total_events, - perc_won_label = str_glue("{round(perc_won * 100,1)}%"), - perc_won_label = ifelse(year == 2022, str_glue("{perc_won_label}*"), perc_won_label), - year = as_factor(year), - year = fct_rev(year)) |> - ggplot(aes(x = perc_won, y = year)) + +data_wust |> + group_by(year, medals_won = ranking %in% seq(3)) |> + mutate(medals_won = ifelse(medals_won, "medals_won", "medals_not_won")) |> + count() |> + ungroup() |> + complete(year, medals_won, fill = list(n = 0)) |> + pivot_wider(names_from = medals_won, values_from = n) |> + mutate( + total_events = medals_won + medals_not_won, + perc_won = medals_won / total_events, + perc_won_label = str_glue("{round(perc_won * 100,1)}%"), + perc_won_label = ifelse( + year == 2022, str_glue("{perc_won_label}*"), perc_won_label + ), + year = as_factor(year), + year = fct_rev(year) + ) |> + ggplot(aes(x = perc_won, y = year)) + geom_segment(aes(x = 0, xend = perc_won, yend = year), - linewidth = 10, lineend = "round", color = "#FF9B00") + - geom_text(aes(label = perc_won_label), size = 4, family = "custom", hjust = 1) + - labs(title = "**Will Ireen win a medal if she shows up?**", - subtitle = "Of all the events Ireen Wüst participated in, how often did she win a medal (of any color)?
2022 Olympics is still ongoing, shown is win rate **so far***", - caption = "*As of time of writing (09/02/2022)", - x = NULL, - y = NULL) + - scale_x_continuous(breaks = NULL, - expand = expansion(add = c(0,0.05))) + - coord_fixed(1/12) + - theme_minimal(base_family = "custom") + - theme(plot.title.position = "plot", - plot.title = element_markdown(size = 26), - plot.subtitle = element_markdown(size = 13), - axis.text.y = element_markdown(size = 13), - panel.grid.major.y = element_blank()) + linewidth = 10, lineend = "round", color = "#FF9B00" + ) + + geom_text(aes(label = perc_won_label), + size = 4, + family = "custom", hjust = 1 + ) + + labs( + title = "**Will Ireen win a medal if she shows up?**", + subtitle = "Of all the events Ireen Wüst participated in, how often did she win a medal (of any color)?
2022 Olympics is still ongoing, shown is win rate **so far***", + caption = "*As of time of writing (09/02/2022)", + x = NULL, + y = NULL + ) + + scale_x_continuous( + breaks = NULL, + expand = expansion(add = c(0, 0.05)) + ) + + coord_fixed(1 / 12) + + theme_minimal(base_family = "custom") + + theme( + plot.title.position = "plot", + plot.title = element_markdown(size = 26), + plot.subtitle = element_markdown(size = 13), + axis.text.y = element_markdown(size = 13), + panel.grid.major.y = element_blank() + ) ``` With the caveat that Ireen has only participated in one event in 2022 (as of time of writing, 9/2/2022), there has been one instance where she took home a medal on every single event she participated in. The Sochi Olympics in 2014 were successful for the Dutch team and for Ireen Wüst individually too. -Finally, we can also visualize the individual medals she won. Again, I'll take some artistic liberty here by creating a sort-of bar plot, but instead with `geom_points()` in the shape and color of the medals. +Finally, we can also visualize the individual medals she won. Again, I'll take some artistic liberty here by creating a sort-of bar plot, but instead with `geom_point()`'s in the shape and color of the medals. ```{r} #| label: wust-medals #| code-fold: true #| code-summary: "Show code" -data_wust |> - filter(ranking %in% seq(3)) |> - mutate(ranking_color = case_when(ranking == 1 ~ "#F8CC46", - ranking == 2 ~ "#DFDFE7", - ranking == 3 ~ "#D8B581"), - label = str_glue("{sex}'s {distance}")) |> - group_by(year) |> - arrange(ranking) |> - mutate(y = row_number()) |> - ggplot(aes(x = year, y = -y)) + - geom_point(aes(color = ranking_color), size = 12) + - geom_text(aes(label = label), size = 4, family = "custom", hjust = 0.1) + - geom_richtext(data = tibble(), aes(x = 2004.5, y = -3.5, - label = "**Medals earned by Ireen Wüst**"), - family = "custom", size = 8, hjust = 0, label.color = NA) + - geom_richtext(data = tibble(), aes(x = 2004.5, y = -4.2, - label = "Ireen Wüst earned **12 medals*** (of which 6 gold) across
5 Olympic games, the first Winter Olympian in history
to reach this milestone"), - family = "custom", size = 4, hjust = 0, label.color = NA, - lineheight = 1) + - labs(x = NULL, - y = NULL, - caption = "*As of time of writing (09/02/2022)") + - scale_x_continuous(breaks = c(game_years, 2022), position = "top", - expand = expansion(add = c(1,2.5))) + - scale_y_continuous(breaks = FALSE, - expand = expansion(add = c(0.5, 0.5))) + - scale_color_identity() + +data_wust |> + filter(ranking %in% seq(3)) |> + mutate( + ranking_color = case_when( + ranking == 1 ~ "#F8CC46", + ranking == 2 ~ "#DFDFE7", + ranking == 3 ~ "#D8B581" + ), + label = str_glue("{sex}'s {distance}") + ) |> + group_by(year) |> + arrange(ranking) |> + mutate(y = row_number()) |> + ggplot(aes(x = year, y = -y)) + + geom_point(aes(color = ranking_color), size = 12) + + geom_text(aes(label = label), + size = 4, + family = "custom", hjust = 0.1 + ) + + geom_richtext( + data = tibble(), aes( + x = 2004.5, y = -3.5, + label = "**Medals earned by Ireen Wüst**" + ), + family = "custom", size = 8, hjust = 0, label.color = NA + ) + + geom_richtext( + data = tibble(), aes( + x = 2004.5, y = -4.2, + label = "Ireen Wüst earned **12 medals*** (of which 6 gold) across
5 Olympic games, the first Winter Olympian in history
to reach this milestone" + ), + family = "custom", size = 4, hjust = 0, label.color = NA, + lineheight = 1 + ) + + labs( + x = NULL, + y = NULL, + caption = "*As of time of writing (09/02/2022)" + ) + + scale_x_continuous( + breaks = c(game_years, 2022), position = "top", + expand = expansion(add = c(1, 2.5)) + ) + + scale_y_continuous( + breaks = FALSE, + expand = expansion(add = c(0.5, 0.5)) + ) + + scale_color_identity() + coord_fixed(ratio = 2) + - theme_minimal(base_family = "custom") + - theme(plot.title = element_markdown(size = 26), - plot.subtitle = element_markdown(size = 13), - axis.text.x = element_markdown(size = 13), - panel.grid = element_blank()) + theme_minimal(base_family = "custom") + + theme( + plot.title = element_markdown(size = 26), + plot.subtitle = element_markdown(size = 13), + axis.text.x = element_markdown(size = 13), + panel.grid = element_blank() + ) ``` Of course the Olympics are still ongoing, but I had a lot of fun collecting and visualizing this data. Again, not all numbers might correspond to official IOC records (as detailed [here](https://www.olympiandatabase.com/index.php?id=13738&L=1)), and I'll welcome any feedback on the code in this post. I'll use these posts as a creative outlet for data visualization ideas that my current professional work doesn't allow for. Since this is my own website and these posts aren't always very serious, I have some creative liberty. I enjoy writing these posts and they get the creative juices flowing. I hope for those not interested in speed skating they at least found the data wrangling process and visualization useful. I enjoy reading other people's blogposts since I usually learn a new function or approach, so I hope I can do the same for others. The Winter Olympics happen every four years so I won't get much opportunity to do this again any time soon, but it might update this post later with the latest data. 
diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/scrape_data.py b/content/blog/2022-dutch-performance-olympic-speed-skating/scrape_data.py index 571accb..4e71647 100644 --- a/content/blog/2022-dutch-performance-olympic-speed-skating/scrape_data.py +++ b/content/blog/2022-dutch-performance-olympic-speed-skating/scrape_data.py @@ -14,88 +14,94 @@ verbose = True -base_url = 'https://www.olympiandatabase.com' -parent_url = f'{base_url}/index.php?id=6934&L=1' +base_url = "https://www.olympiandatabase.com" +parent_url = f"{base_url}/index.php?id=6934&L=1" -out_path = './event_files' +out_path = "./data/event_files" # -- Get website ------------------------ options = webdriver.ChromeOptions() -options.add_argument('--headless') +options.add_argument("--headless") driver = webdriver.Chrome(options=options) driver.get(parent_url) html_source = driver.page_source -soup = BeautifulSoup(html_source, 'html.parser') +soup = BeautifulSoup(html_source, "html.parser") # -- Get list of games ------------------------ -parent_table = soup.find_all('table', attrs={'class': 'frame_space'})[-1] +parent_table = soup.find_all("table", attrs={"class": "frame_space"})[-1] game_links = [] -for link in parent_table.find_all('a'): - game_links.append(link.get('href')) +for link in parent_table.find_all("a"): + game_links.append(link.get("href")) -game_links = [i for i in game_links if not re.compile(r'http://.*$').match(i)] +game_links = [i for i in game_links if not re.compile(r"http://.*$").match(i)] game_links = game_links[:-1] # -- Get list of events per game ------------------------ for i in game_links: - - driver.get(f'{base_url}{i}') + driver.get(f"{base_url}{i}") html_source = driver.page_source - soup = BeautifulSoup(html_source, 'html.parser') - - event_table = soup.find_all('table', attrs={'class': 'frame_space'})[-1] - + soup = BeautifulSoup(html_source, "html.parser") + + event_table = soup.find_all("table", attrs={"class": "frame_space"})[-1] + event_links = [] - for link in event_table.find_all('a'): - if link.find(text=re.compile('0 m|Combined|Mass|Team')): - event_links.append(link.get('href')) - - event_links = [j for j in event_links if not re.compile(r'/index.php\?id=13738&L=1').match(j)] - + for link in event_table.find_all("a"): + if link.find(text=re.compile("0 m|Combined|Mass|Team")): + event_links.append(link.get("href")) + + event_links = [ + j for j in event_links if not re.compile(r"/index.php\?id=13738&L=1").match(j) + ] + for j in event_links: - - driver.get(f'{base_url}{j}') + driver.get(f"{base_url}{j}") html_source = driver.page_source - soup = BeautifulSoup(html_source, 'html.parser') - - id = re.search('id=(.*)&', j).group(1) - if id != '11769': - title = soup.find('h1').text - year = re.search('Speed Skating (.*) Winter Olympics', title).group(1).split()[-1] - distance = re.search('\'s (.*) -', title).group(1) - sex = re.search('^(.*)\'s', title).group(1).split()[0] - tab = pd.read_html(f'{base_url}{j}', match='Final')[0] + soup = BeautifulSoup(html_source, "html.parser") + + id = re.search("id=(.*)&", j).group(1) + if id != "11769": + title = soup.find("h1").text + year = ( + re.search("Speed Skating (.*) Winter Olympics", title) + .group(1) + .split()[-1] + ) + distance = re.search("'s (.*) -", title).group(1) + sex = re.search("^(.*)'s", title).group(1).split()[0] + tab = pd.read_html(f"{base_url}{j}", match="Final")[0] else: - year = '1994' - distance = '5000 m' - sex = 'Men' - title = f'{sex}\'s {distance} - Speed Skating Lillehammer {year} Winter Olympics' 
- tab = pd.read_html(f'{base_url}{j}')[2] - + year = "1994" + distance = "5000 m" + sex = "Men" + title = ( + f"{sex}'s {distance} - Speed Skating Lillehammer {year} Winter Olympics" + ) + tab = pd.read_html(f"{base_url}{j}")[2] + if verbose: - print(f'Extracting data for the {sex}\'s {distance} from {year}') - + print(f"Extracting data for the {sex}'s {distance} from {year}") + # Write to json out_data = { - 'title': title, - 'year': int(year), - 'distance': distance, - 'sex': sex, - 'table': tab.to_json(), - 'id': int(id) + "title": title, + "year": int(year), + "distance": distance, + "sex": sex, + "table": tab.to_json(), + "id": int(id), } - + file_name = f'{year}_{distance.lower().replace(" ", "")}_{sex.lower()}.json' - with open(f'{out_path}/{file_name}', 'w') as file_out: + with open(f"{out_path}/{file_name}", "w") as file_out: json.dump(out_data, file_out, indent=4) - + pass - + # -- Quit browser ------------------------ driver.quit() @@ -103,7 +109,7 @@ # -- Merge json files ------------------------- if verbose: - print('Merging json files') + print("Merging json files") json_file_list = [] for file in os.listdir(out_path): @@ -112,12 +118,12 @@ # -- Define function to merge json files ------------------------ -out_name = "./all_events.json" +out_name = "./data/all_events.json" result = [] -for f in glob.glob(f'{out_path}/*.json'): +for f in glob.glob(f"{out_path}/*.json"): with open(f, "rb") as infile: result.append(json.load(infile)) -with open(out_name, 'w') as outfile: - json.dump(result, outfile, indent=4) +with open(out_name, "w") as outfile: + json.dump(result, outfile, indent=4) diff --git a/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/lm-plot-1.png b/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/lm-plot-1.png index 98864a3..488ffcb 100644 Binary files a/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/lm-plot-1.png and b/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/lm-plot-1.png differ diff --git a/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/lm-plot-error-1.png b/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/lm-plot-error-1.png index 5dc51f4..2fcc4d7 100644 Binary files a/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/lm-plot-error-1.png and b/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/lm-plot-error-1.png differ diff --git a/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/lm-plot-mean-1.png b/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/lm-plot-mean-1.png index 4deddf7..8f3868a 100644 Binary files a/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/lm-plot-mean-1.png and b/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/lm-plot-mean-1.png differ diff --git a/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/lm-plot-squares-1.png b/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/lm-plot-squares-1.png index a8f558b..8237917 100644 Binary files 
a/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/lm-plot-squares-1.png and b/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/lm-plot-squares-1.png differ diff --git a/content/blog/2022-everything-is-a-linear-model/index.md b/content/blog/2022-everything-is-a-linear-model/index.md index 46bfcb2..56f57e2 100644 --- a/content/blog/2022-everything-is-a-linear-model/index.md +++ b/content/blog/2022-everything-is-a-linear-model/index.md @@ -15,6 +15,8 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- Let's imagine you're incredibly lazy and you want to learn R, but you only want to learn one function to do statistics. What function do you learn? I'd recommend to learn to use the `lm()` function. Why? Because most common statistical tests are in fact nothing more than some variation of a linear model, from the simplest One-Sample T-test to a repeated-measures ANOVA. I think most people that have Googled for this question have found Jonas Lindeløv's post on how [common statistical tests are linear models](https://lindeloev.github.io/tests-as-linear/) (as they should, it's an amazing post). Here I want to go a bit more in depth into the mathematics behind this statement to show how common statistical tests are in fact variations of a linear model. @@ -30,15 +32,19 @@ library(patchwork) set.seed(2022) ``` -# One-Sample T-test +## One-Sample T-test -Let's start simple with the One-Sample T-test. This test can be used to test how the mean value of your sample measure differs from a reference number. Throughout this page, I'll throw around a bunch of formulas, which, depending on your background, can either be informative or confusing. The formula for a One-Sample T-test is: +Let's start simple with the One-Sample T-test. This test can be used to test how the mean value of your sample measure differs from a reference number. Throughout this post, I'll throw around a bunch of formulas, which, depending on your background, can either be informative or confusing. The formula for a One-Sample T-test is: + +{{< sidenote >}} +The $\overline{x}$ is commonly called "x-bar" in conversation +{{< /sidenote >}} $$ t = \frac{\overline{x} - \mu}{\frac{\sigma}{\sqrt{n}}} = \frac{sample~mean - population~mean}{\frac{standard~deviation}{\sqrt{sample~size}}} $$ -What this says is that the effect size (*t*) is equal to the sample mean minus the population mean (or reference number) and you divide it by the standard deviation of the sample divided by the square root of the sample size. This formula will output the *t*-value that you would usually report when doing a T-test. The formula requires the standard deviation (*σ*) of the sample values, which is: +What this says is that the effect size ($t$) is equal to the sample mean minus the population mean (or reference number) and you divide it by the standard deviation of the sample divided by the square root of the sample size. This formula will output the $t$-value that you would usually report when doing a T-test. The formula requires the standard deviation (*σ*) of the sample values, which is: $$ \sigma = \sqrt{\frac{\sum\limits\_{i=1}^n{(x\_{i} - \overline{x})^2}}{n - 1}} @@ -46,6 +52,10 @@ $$ In this formula, you'd subtract the average across the sample values from each individual value, square it, and sum all these resulting values. 
This sum you would then divide by the size of the sample minus one (or the degrees of freedom), and take the square root of the whole thing. This will give the standard deviation (*σ*). Alright, let's now consider a study where we collected blood samples from a number of patients and measured for instance sodium levels in the blood. We don't have a control group for this study, but we know from medical textbooks that the reference value for sodium in healthy individuals for the age and sex distribution in our sample is for instance 2.5 mmol/L. Then we measure the sodium levels for 30 patients, we can simulate some fake measurements by generating a random sequence of values with a mean of 3 and a standard deviation of 1.2. +{{< sidenote >}} +I cannot condone generating data for your study using `rnorm()` but this is just for illustrative purposes +{{< /sidenote >}} + ``` r ref_concentration <- 2.5 @@ -62,12 +72,12 @@ $$ this formula would like like this when implemented in R: ``` r -sqrt( sum(abs(concentration - mean(concentration))^2) / (n - 1) ) +sqrt(sum(abs(concentration - mean(concentration))^2) / (n - 1)) ``` [1] 1.157324 -But of course in any normal setting, you'd use the `sd()` function, which will give the same result as the code above, but I just wanted to show it for illustrative purposes. Anywhere else I'll use the `sd()` function. Now let's calculate the *t*-value. In formula form this would look like this: +But of course in any normal setting, you'd use the `sd()` function, which will give the same result as the code above, but I just wanted to show it for illustrative purposes. Anywhere else I'll use the `sd()` function. Now let's calculate the $t$-value. In formula form this would look like this: $$ t = \frac{\overline{x} - \mu}{\frac{\sigma}{\sqrt{n}}} = \frac{2.855 - 2.5}{\frac{1.157}{\sqrt{30}}} = 1.681 @@ -81,7 +91,7 @@ So just to over this formula again, you take the mean of your sample, subtract t [1] 1.680503 -Now we can compare this to the `t.test()` function and then we'd find the same *t*-value (barring some rounding and digit cutoffs). In this function, since we're not comparing two samples, we set the population mean (`mu`) we want to compare to as the reference concentration (the default value for a One-Sample T-test is 0). What the `mu` option does is nothing else than subtract the reference value from all values. By doing this it centers all the values relative to 0, so if we'd run `t.test(concentration - ref_concentration)`, we'd get the same result, obviously with a different mean and the values of the confidence interval have changed, although the range stays the same. +Now we can compare this to the `t.test()` function and then we'd find the same $t$-value (barring some rounding and digit cutoffs). In this function, since we're not comparing two samples, we set the population mean (`mu`) we want to compare to as the reference concentration (the default value for a One-Sample T-test is 0). What the `mu` option does is nothing else than subtract the reference value from all values. By doing this it centers all the values relative to 0, so if we'd run `t.test(concentration - ref_concentration)`, we'd get the same result, obviously with a different mean and the values of the confidence interval have changed, although the range stays the same. 
``` r t.test(concentration, mu = ref_concentration) @@ -115,13 +125,19 @@ t.test(concentration - ref_concentration, mu = 0) mean of x 0.3550862 -So now back to the premise of this exercise, how is a T-test the same as a linear model? Like we showed before, subtracting the reference value from the sample values and adding that to a T-test comparing the values to 0 is equivalent to comparing the sample values to the reference value. Now let's consider what a linear model does. You might recall from high-school mathematics that the formula for a straight line is always some form of *y* = \*a\*\*x* + *c\*, the linear model formula is somewhat similar: +So now back to the premise of this exercise, how is a T-test the same as a linear model? Like we showed before, subtracting the reference value from the sample values and adding that to a T-test comparing the values to 0 is equivalent to comparing the sample values to the reference value. Now let's consider what a linear model does. You might recall from high-school mathematics that the formula for a straight line is always some form of $y = ax + c$, the linear model formula is somewhat similar: -*Y**i* = *β*0 + *β*1*x* + *ϵ**i* +$$ +Y_i = \beta_{0} + \beta_{1}x + \epsilon_{i} +$$ -In this formula *Y**i* is the dependent variable, *x* is the independent variable. *β*0 is equivalent to the intercept at the y-axis, similar to *c* in the formula for a straight line. *β*1 is the slope (equivalent to *a* in the formula earlier). Finally, the *ϵ**i* is the random error term. +{{< sidenote br="2em" >}} +$\beta_1$ in this case is equivalent to $a$ in formula $y = ax + c$ +{{< /sidenote >}} -Now let's build the linear model. Remember that the formula for the linear model included this term: *β*1*x*. In this case, since we only have one sample, we don't have any value to multiply our value to, so we multiply it by 1. If we wanted to correlate two variables, for instance concentration with age, we would substitute the 1 with a continuous variable, i.e. age, but in this case we correlate all sample values with 1. Since we still want to compare our value to 0, we subtract the reference value from our sample values like we did before for the `t.test()`. Let's build the linear model. +In this formula $Y_i$ is the dependent variable, $x$ is the independent variable. $\beta_0$ is equivalent to the intercept at the y-axis, similar to $c$ in the formula for a straight line. $\beta_1$ is the slope. Finally, the $\epsilon_i$ is the random error term. + +Now let's build the linear model. Remember that the formula for the linear model included this term: $\beta_{1}x$. In this case, since we only have one sample, we don't have any value to multiply our value to, so we multiply it by 1. If we wanted to correlate two variables, for instance concentration with age, we would substitute the 1 with a continuous variable, i.e. age, but in this case we correlate all sample values with 1. Since we still want to compare our value to 0, we subtract the reference value from our sample values like we did before for the `t.test()`. Let's build the linear model. ``` r ost_model <- lm((concentration - ref_concentration) ~ 1) @@ -142,17 +158,20 @@ summary(ost_model) Residual standard error: 1.157 on 29 degrees of freedom -Again, we find the same *t*- and *p*-value as when we ran the `t.test()`! How exciting is that! We now have three ways to obtain the same values. Later I'll go into what the `Residuals`, `Estimate` and `Std. 
Error` mean when running comparing group means with a linear model. +Again, we find the same $t$- and $p$-value as when we ran the `t.test()`! How exciting is that! We now have three ways to obtain the same values. Later I'll go into what the `Residuals`, `Estimate` and `Std. Error` mean when comparing group means with a linear model. -# Two-Sample T-test +## Two-Sample T-test Now we'll apply the same logic we used for the One-Sample T-test to show how a Two-Sample T-test is in essence a linear model. First we'll look at the formula again, then the implementation using the `t.test()` function, and then the linear model. Let's now consider another experiment using the blood measurements we had before, but now we actually do have a control sample. We have 30 participants in both samples. Let's generate some random data: ``` r n <- 30 data <- tibble( - concentration = c(rnorm(n, mean = 4, sd = 1.5), rnorm(n, mean = 6, sd = 2)), - group = rep(c("HC","PAT"), each = n) + concentration = c( + rnorm(n, mean = 4, sd = 1.5), + rnorm(n, mean = 6, sd = 2) + ), + group = rep(c("HC", "PAT"), each = n) ) ``` @@ -162,17 +181,17 @@ $$ t = \frac{(\overline{x_1} - \overline{x_2})}{\sqrt{\frac{\sigma_1^2}{n_1} + \frac{\sigma_2^2}{n_2}}} = \frac{(3.838073 - 5.455809)}{\sqrt{\frac{1.343565^2}{30} + \frac{1.69711^2}{30}}} = -4.093524 $$ -It's a bit too complex to describe in a sentence, but the definitions are perhaps familiar: $\overline{x}$ for group means, *σ* for group standard deviations, and *n* for group size. I find that the simplest way to implement this in R is by first separating the groups and then adding them in the formula. +It's a bit too complex to describe in a sentence, but the definitions are perhaps familiar: $\overline{x}$ for group means, $\sigma$ for group standard deviations, and $n$ for group size. I find that the simplest way to implement this in R is by first separating the groups and then adding them in the formula. ``` r -g1 <- data |> - filter(group == "HC") |> +g1 <- data |> + filter(group == "HC") |> pull(concentration) -g2 <- data |> - filter(group == "PAT") |> +g2 <- data |> + filter(group == "PAT") |> pull(concentration) -(mean(g1) - mean(g2)) / sqrt( (sd(g1)^2 / length(g1)) + (sd(g2)^2 / length(g2)) ) +(mean(g1) - mean(g2)) / sqrt((sd(g1)^2 / length(g1)) + (sd(g2)^2 / length(g2))) ``` [1] -5.268195 @@ -195,7 +214,7 @@ t.test(g1, g2) mean of x mean of y 4.162838 6.746285 -Look at that! We find the same *t*-value! Before we move on to the linear model, I first want to do some plotting, it will help us visualize how the linear model applies here later. +Look at that! We find the same $t$-value! Before we move on to the linear model, I first want to do some plotting, it will help us visualize how the linear model applies here later.
Let's make a boxplot: ``` r ggplot(data, aes(x = group, y = concentration, fill = group)) + @@ -203,7 +222,7 @@ ggplot(data, aes(x = group, y = concentration, fill = group)) + geom_boxplot(width = 0.2) + geom_jitter(width = 5e-2, size = 2, alpha = 0.75) + scico::scale_fill_scico_d(palette = "hawaii") + - theme_minimal() + + theme_minimal() + theme(legend.position = "none") ``` @@ -217,11 +236,18 @@ mean_concentration <- data |> summarise(mean_conc = mean(concentration)) ggplot(data, aes(x = group)) + - geom_jitter(aes(y = concentration), width = 5e-2, size = 2, alpha = 0.75) + - geom_point(data = mean_concentration, aes(y = mean_conc), - color = "violet", size = 5) + - geom_line(data = mean_concentration, aes(y = mean_conc), group = 1, - linewidth = 2, color = "violet") + + geom_jitter(aes(y = concentration), + width = 5e-2, + size = 2, alpha = 0.75 + ) + + geom_point( + data = mean_concentration, aes(y = mean_conc), + color = "violet", size = 5 + ) + + geom_line( + data = mean_concentration, aes(y = mean_conc), group = 1, + linewidth = 2, color = "violet" + ) + theme_minimal() ``` @@ -253,9 +279,9 @@ summary(tst_model) Multiple R-squared: 0.3236, Adjusted R-squared: 0.312 F-statistic: 27.75 on 1 and 58 DF, p-value: 2.11e-06 -First of all, let's look at the `groupPAT`, there we find the same *t*-value as we did when we ran the T-tests earlier, although with the sign flipped. I'll show later why that is. +First of all, let's look at the `groupPAT`, there we find the same $t$-value as we did when we ran the T-tests earlier, although with the sign flipped. I'll show later why that is. -Now, back to the plot. The x-axis has two conditions: `HC` and `PAT`, but let's imagine those values are `0` and `1`. Let's now also throw back to the time we recalled the formula for a straight line: *y* = \*a\*\*x* + *c*. In this context we only have two x-values, `HC` and `PAT` or `0` and `1`. Then we can obtain *y\* in the formula by solving the equation when *x* is equal to `0`, in that case *y* becomes just the mean concentration of the healthy controls, or the big magenta dot in the previous plot, and that is a value we can calculate. Remember that `0` in the formula below stands for `HC`. That looks something like this: +Now, back to the plot. The x-axis has two conditions: `HC` and `PAT`, but let's imagine those values are `0` and `1`. Let's now also throw back to the time we recalled the formula for a straight line: $y = ax + c$. In this context we only have two x-values, `HC` and `PAT` or `0` and `1`. Then we can obtain $y$ in the formula by solving the equation when $x$ is equal to `0`, in that case $y$ becomes just the mean concentration of the healthy controls, or the big magenta dot in the previous plot, and that is a value we can calculate. Remember that `0` in the formula below stands for `HC`. That looks something like this: $$ \begin{eqnarray} @@ -265,7 +291,7 @@ c &=& \overline{x}\_{0} \newline \end{eqnarray} $$ -So that's the constant our formula. If we look back at the output from the `lm()` function, we see that this value is represented as the `Estimate` of the `(Intercept)` row! Let's also solve *a*. Remember that *a* represents the slope of the line. How do we get the slope? The slope is basically nothing more than the difference between the mean values of `HC` and `PAT`, but let's solve it in a more elegant way, by using the same formula we used to find *c*. We'll use the same coding as before, `0` for `HC` and `1` for `PAT`. 
Remember that *c* is equal to the mean value of `HC` (aka $\overline{x}_{0}$). +So that's the constant in our formula. If we look back at the output from the `lm()` function, we see that this value is represented as the `Estimate` of the `(Intercept)` row! Let's also solve $a$. Remember that $a$ represents the slope of the line. How do we get the slope? The slope is basically nothing more than the difference between the mean values of `HC` and `PAT`, but let's solve it in a more elegant way, by using the same formula we used to find *c*. We'll use the same coding as before, `0` for `HC` and `1` for `PAT`. Remember that *c* is equal to the mean value of `HC` (aka $\overline{x}_{0}$). $$ \begin{eqnarray} @@ -277,9 +303,13 @@ a &=& \overline{x}\_{1} - \overline{x}\_{0} \newline \end{eqnarray} $$ -And then we find that *a* is equal to the `Estimate` column for the `groupPAT` row. +And then we find that $a$ is equal to the `Estimate` column for the `groupPAT` row. + +{{< sidenote br="3em" >}} +inb4 the angry statisticians: I know it's more complicated than that but let's not get into this right now +{{< /sidenote >}} -We can reverse engineer the *t*-value too using just the output from the `lm()` function. One can imagine that if one would plot a situation where the null hypothesis (H0) is true, the slope of that line would be 0 since then there's no difference between the mean of the two groups (inb4 the angry statisticians: it's more complicated than that but let me use this for now). We'll take the difference between our observed slope, or the slope of the alternative hypothesis (H0), and the slope of the null hypothesis, which is 0, and divide that by the standard error of the sampling distribution, which is given by the `lm()` function as the `Std. Error` of the `groupPAT` row: +We can reverse engineer the $t$-value too using just the output from the `lm()` function. One can imagine that if one would plot a situation where the null hypothesis (H0) is true, the slope of that line would be 0 since then there's no difference between the mean of the two groups. We'll take the difference between our observed slope, or the slope of the alternative hypothesis (H1), and the slope of the null hypothesis, which is 0, and divide that by the standard error of the sampling distribution, which is given by the `lm()` function as the `Std. Error` of the `groupPAT` row: $$ \begin{eqnarray} @@ -290,21 +320,71 @@ $$ Which as you'll notice is one thousandths-decimal place off, which is due to rounding errors. `lm()` reports up to 4 decimal points while it uses more for the calculation. And now we've come full circle, because the slope of the regression line is nothing more than the difference between the mean of the second group minus the mean of the first group. Now we can go back to the figure we made earlier and see how all these values relate: +<details>
+Show code + +``` r +ggplot(data, aes(x = group)) + + geom_jitter(aes(y = concentration), + width = 5e-2, + size = 2, alpha = 0.75 + ) + + geom_point( + data = mean_concentration, aes(y = mean_conc), + color = "violet", size = 5 + ) + + geom_line( + data = mean_concentration, aes(y = mean_conc), + group = 1, linewidth = 2, color = "violet" + ) + + geom_segment( + data = NULL, aes( + x = 0.4, xend = 0.925, + y = mean(g1), yend = mean(g1) + ), + color = "grey", linewidth = 0.2 + ) + + geom_segment( + data = NULL, + aes( + x = 0.4, xend = 1.925, + y = mean(g2), yend = mean(g2) + ), + color = "grey", linewidth = 0.2 + ) + + geom_text( + data = data.frame(), + aes(x = 1.45, y = 0.18 + (mean(g1) + mean(g2)) / 2), + label = "a = 1.6177", angle = 21 + ) + + scale_y_continuous( + breaks = round(c(seq(2.5, 10, 2.5), mean(g1), mean(g2)), 4), + minor_breaks = seq(1.25, 8.75, 2.5) + ) + + theme_minimal() +``` + +
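+As a quick cross-check (a sketch that is not part of the original walkthrough; it only assumes the `tst_model` object fitted above and the row and column names of the coefficient table printed by `summary()`), the same ratio of estimate to standard error can be pulled out programmatically:
+
+``` r
+# reverse-engineer the t-value for the group effect from the fitted model
+coefs <- coef(summary(tst_model)) # matrix with Estimate, Std. Error, t value, Pr(>|t|)
+coefs["groupPAT", "Estimate"] / coefs["groupPAT", "Std. Error"]
+```
+
+This should reproduce the `t value` column of the `groupPAT` row without any manual rounding.
+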
+ And that's how a Two-Sample T-test is basically a linear model! -# ANOVA +## ANOVA Based on what we did in the previous section, you may already predict what we'll do in this section. Instead of one or two groups, we'll now show how this works for more than two groups. The mathematics becomes a bit more long-winded and the visualizations a bit less clear, so we'll just stick with the R code. Let's for instance say we have four groups of patients and each have a certain score on a questionnaire: ``` r n <- 30 -data <- tibble(score = round(c(rnorm(n, mean = 75, sd = 30), - rnorm(n, mean = 60, sd = 35), - rnorm(n, mean = 30, sd = 17), - rnorm(n, mean = 45, sd = 32))), - group = rep(c("SCZ", "BD", "MDD", "ASD"), each = n)) |> +data <- tibble( + score = round(c( + rnorm(n, mean = 75, sd = 30), + rnorm(n, mean = 60, sd = 35), + rnorm(n, mean = 30, sd = 17), + rnorm(n, mean = 45, sd = 32) + )), + group = rep(c("SCZ", "BD", "MDD", "ASD"), each = n) +) |> mutate(group = as_factor(group)) ``` @@ -349,27 +429,33 @@ summary(anova_lm_model) Multiple R-squared: 0.3151, Adjusted R-squared: 0.2974 F-statistic: 17.79 on 3 and 116 DF, p-value: 1.448e-09 -The first thing you might notice is that the *F*-statistic and the *p*-value are the same in both models. +The first thing you might notice is that the $F$-statistic and the $p$-value are the same in both models. ``` r -ref_mean <- data |> - filter(group == "SCZ") |> - pull(score) |> +ref_mean <- data |> + filter(group == "SCZ") |> + pull(score) |> mean() -anova_group_means <- data |> - group_by(group) |> - summarise(score = mean(score)) |> - mutate(ref_mean = ref_mean, - mean_adj = score - ref_mean) - -ggplot(data, aes(x = group, y = score - ref_mean)) + - stat_summary(fun = mean, geom = "point", - size = 10, color = "violet", shape = 18) + - geom_jitter(width = 0.2) + - ggtext::geom_richtext(data = anova_group_means, - aes(label = str_glue("x̄ = {round(mean_adj, 2)}")), - fill = "#ffffff80", nudge_x = 1/3) + +anova_group_means <- data |> + group_by(group) |> + summarise(score = mean(score)) |> + mutate( + ref_mean = ref_mean, + mean_adj = score - ref_mean + ) + +ggplot(data, aes(x = group, y = score - ref_mean)) + + stat_summary( + fun = mean, geom = "point", + size = 10, color = "violet", shape = 18 + ) + + geom_jitter(width = 0.2) + + ggtext::geom_richtext( + data = anova_group_means, + aes(label = str_glue("x̄ = {round(mean_adj, 2)}")), + fill = "#ffffff80", nudge_x = 1 / 3 + ) + theme_minimal() ``` @@ -384,22 +470,30 @@ residual~sum~of~squares &=& \sum\limits\_{j=1}^{J} \sum\limits\_{i=1}^{n\_{j}} ( \end{eqnarray} $$ -Just briefly, the first formula takes the mean value for the group in question, subtracts the overall mean (or grand mean) and squares the result. Then it multiplies this number by the sample size in this group. In this case we'll only do it for the first group since that's the one listed in the `summary(aov_model)` output. The second formula calculates the residual sum of squares (or sum of squared error), we'll come back to this later. In essence it substracts the group mean from each of the individual values, squares it, and sums it first within the group, and then sums it again across the groups. +{{< sidenote br="8em" >}} +We'll come back to residual sum of squares further down +{{< /sidenote >}} + +Just briefly, the first formula takes the mean value for the group in question, subtracts the overall mean (or grand mean) and squares the result. Then it multiplies this number by the sample size in this group. 
In this case we'll only do it for the first group since that's the one listed in the `summary(aov_model)` output. The second formula calculates the residual sum of squares (or sum of squared error). In essence it subtracts the group mean from each of the individual values, squares it, and sums it first within the group, and then sums it again across the groups. We can do both calculations in one go with the following quick code: ``` r -data |> - mutate(overall_mean = mean(score)) |> - group_by(group) |> - summarise(group_mean = mean(score), - group_n = n(), - overall_mean = first(overall_mean), - sq_group = group_n * (group_mean - overall_mean)^2, - sq_error = sum((score - group_mean)^2)) |> - ungroup() |> - summarise(ss_group = sum(sq_group), - ss_error = sum(sq_error)) +data |> + mutate(overall_mean = mean(score)) |> + group_by(group) |> + summarise( + group_mean = mean(score), + group_n = n(), + overall_mean = first(overall_mean), + sq_group = group_n * (group_mean - overall_mean)^2, + sq_error = sum((score - group_mean)^2) + ) |> + ungroup() |> + summarise( + ss_group = sum(sq_group), + ss_error = sum(sq_error) + ) ``` # A tibble: 1 × 2 @@ -409,21 +503,27 @@ data |> Now look back at the output from `summary(aov_model)` and we'll find the same values! I'll leave it here for now, but we'll come back to sum of squares (of different varieties) later. -# A linear model is a linear model +## A linear model is a linear model Well that's a statement of unequaled wisdom, isn't it? No wonder they give us doctorates to talk about this stuff. -I don't think I need a lot of effort to convince anyone that a linear model is a linear model. Actually, I'm so convinced that you are aware that a linear model is a linear model that I wanted to about something else instead. Instead I wanted to dive into residuals and R2. Before we start, let's first simulate some data, We'll create an age column, a sex column, and a measure column. We'll make it so that the measure column correlates with the age column. +I don't think I need a lot of effort to convince anyone that a linear model is a linear model. Actually, I'm so convinced that you are aware that a linear model is a linear model that I wanted to talk about something else instead. In particular, I wanted to dive into residuals and R{{< sup "2" >}}. Before we start, let's first simulate some data. We'll create an age column, a sex column, and a measure column. We'll make it so that the measure column correlates with the age column. ``` r n <- 20 -data <- tibble(age = round(runif(n = n, min = 18, max = 60)), - sex = factor(sample(c("Male", "Female"), size = n, replace = TRUE), - levels = c("Male", "Female"))) |> +data <- tibble( + age = round(runif(n = n, min = 18, max = 60)), + sex = factor( + sample(c("Male", "Female"), + size = n, replace = TRUE + ), + levels = c("Male", "Female") + ) +) |> mutate(measure = 1e-2 * age + sqrt(1e-2) * rnorm(n)) ``` -We've used the formula for a straight line in previous sections (*y* = \*a\*\*x* + *c*), we can apply it here too, but instead of the difference in the mean between two groups, the slope of the line (denoted by *a\*) is now derived from the slope at which the line has the least distance to all points, referred to as the best fit.
We will plot this later, but first we should maybe just run the linear model: +We've used the formula for a straight line in previous sections ($y = ax + c$), we can apply it here too, but instead of the difference in the mean between two groups, the slope of the line (denoted by $a$) is now derived from the slope at which the line has the least distance to all points, referred to as the best fit. We will plot this later, but first we should maybe just run the linear model: ``` r lm_model <- lm(measure ~ age, data = data) @@ -435,32 +535,30 @@ summary(lm_model) lm(formula = measure ~ age, data = data) Residuals: - Min 1Q Median 3Q Max - -0.11593 -0.08740 0.00416 0.02975 0.20023 + Min 1Q Median 3Q Max + -0.148243 -0.079997 -0.002087 0.053822 0.234803 Coefficients: - Estimate Std. Error t value Pr(>|t|) - (Intercept) 0.093800 0.089076 1.053 0.306255 - age 0.008074 0.002010 4.017 0.000808 *** + Estimate Std. Error t value Pr(>|t|) + (Intercept) -0.049454 0.089850 -0.550 0.589 + age 0.011142 0.002026 5.499 3.2e-05 *** --- Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 - Residual standard error: 0.09828 on 18 degrees of freedom - Multiple R-squared: 0.4728, Adjusted R-squared: 0.4435 - F-statistic: 16.14 on 1 and 18 DF, p-value: 0.0008079 + Residual standard error: 0.1083 on 18 degrees of freedom + Multiple R-squared: 0.6268, Adjusted R-squared: 0.6061 + F-statistic: 30.24 on 1 and 18 DF, p-value: 3.196e-05 -We find that there is a significant association between age and our measure, and the R2 is about 47%. Recall that R2 denotes the amount of variance explained by the predictor, or age in our case. We can plot the linear model in `ggplot` with the `geom_smooth()` function, and then setting the `method` to `"lm"`: +We find that there is a significant association between age and our measure, and the R{{< sup "2" >}} is about 63%. Recall that R{{< sup "2" >}} denotes the amount of variance explained by the predictor, or age in our case. We can plot the linear model in `ggplot` with the `geom_smooth()` function, and then setting the `method` to `"lm"`: ``` r ggplot(data, aes(x = age, y = measure)) + geom_point(size = 4, alpha = 0.8) + geom_smooth(method = "lm", color = "grey30") + - scale_x_continuous(limits = c(18,60)) + + scale_x_continuous(limits = c(18, 60)) + theme_minimal() ``` - `geom_smooth()` using formula = 'y ~ x' - The line in the figure above shows the line that best fits the points with a ribbon indicating the standard error. @@ -468,7 +566,7 @@ The line in the figure above shows the line that best fits the points with a rib Back to our data. We know that a linear model fits a line that "predicts" outcome based on some other variable. This is heavily simplified, but it'll make clear what we'll do next. So what we did before with the best fit line was create one line that best fits all the data points, but now we want to relate that back to our data points. What would our values be if they would be exactly on this line? To get this, all we have to do is calculate the difference between the current data point and the value of the best fit line at the corresponding "predictor" value. We could do it by hand, but since this section is quite long already, I'll skip straight to the R function, which is appropriately called `predict.lm()`. It takes the linear model we created with the `lm()` function earlier as input. It outputs a vector with the predicted values based on the model.
``` r -data <- data |> +data <- data |> mutate(measure_pred = predict.lm(lm_model)) ``` @@ -484,7 +582,7 @@ or in R terms (the degrees of freedom is 18 here, too complicated to explain for sqrt(sum((data$measure - data$measure_pred)^2) / 18) ``` - [1] 0.09827566 + [1] 0.1082947 So that checks out. What we can then also do is calculate the difference between the observed and the predicted values values, this is called the residual: @@ -496,15 +594,17 @@ data <- data |> We can check that this is correct too by comparing the residuals we calculated with the output from the `predict.lm()` function to the output of the `residuals(lm_model)`: ``` r -tibble(residual_manual = data$residual, - residual_lm = residuals(lm_model)) |> +tibble( + residual_manual = data$residual, + residual_lm = residuals(lm_model) +) |> glimpse() ``` Rows: 20 Columns: 2 - $ residual_manual -0.022378283, 0.092888415, -0.106844019, -0.047841680,… - $ residual_lm -0.022378283, 0.092888415, -0.106844019, -0.047841680,… + $ residual_manual 0.020849595, 0.144178221, -0.075260734, -0.036675981, … + $ residual_lm 0.020849595, 0.144178221, -0.075260734, -0.036675981, … Predictably, when we sum all the individual differences (or residuals) we would get 0 (allowing for rounding errors) since the regression line perfectly fits in between the datapoints. @@ -512,24 +612,80 @@ Predictably, when we sum all the individual differences (or residuals) we would sum(data$residual) ``` - [1] 9.992007e-16 + [1] 1.970646e-15 We can visualize the residuals using the `geom_smooth()` function. First I just want to show the difference visually in the scatter plot we had before. I added points along the regression line to indicate where each point will move to, and an arrow to indicate the size and the direction of the difference between the observed and the predicted value: +
+Show code + +``` r +ggplot(data, aes(x = age)) + + geom_smooth(aes(y = measure), method = "lm", color = "grey30") + + geom_point(aes(y = measure), size = 4, alpha = 0.8) + + geom_point(aes(y = measure_pred), alpha = 0.5, size = 2.5) + + geom_segment(aes(xend = age, y = measure, yend = measure_pred), + arrow = arrow(length = unit(4, "points")), + color = "black", alpha = 0.8, show.legend = FALSE + ) + + scale_x_continuous(limits = c(18, 60)) + + scico::scale_color_scico_d() + + theme_minimal() +``` + +
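+As a brief aside (again a sketch rather than something from the original post; it assumes the `lm_model` fit and the `measure_pred` column created above), the residual standard error we computed by hand can also be pulled straight from the model object, which makes for an easy sanity check:
+
+``` r
+# residual standard error as stored in the model object ...
+sigma(lm_model)
+# ... and recomputed by hand from the predicted values
+sqrt(sum((data$measure - data$measure_pred)^2) / df.residual(lm_model))
+```
+
+Both lines should print the same value.
+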
+ + You might have noticed now that the size of the arrow is defined as the difference between the observed and predicted value, i.e. the residual! Now, you might have come across the term "sum of squared error" in different textbooks. With the values that we've calculated so far we can illustrate where this comes from. Imagine that the arrow in the figure above is one side of a square. How do you get the area of a square? You multiply the length of one side of the square by itself, i.e. you square it! That's where the "squared error" part of the term comes from. Perhaps the figure below helps illustrate it: +<details>
+Show code + +``` r +ggplot(data, aes(x = age)) + + geom_smooth(aes(y = measure), method = "lm", color = "grey30") + + geom_point(aes(y = measure), size = 4, alpha = 0.8) + + geom_point(aes(y = measure_pred), alpha = 0.5, size = 2.5) + + geom_segment(aes(xend = age, y = measure, yend = measure_pred), + arrow = arrow(length = unit(4, "points")), + color = "black", alpha = 0.8, show.legend = FALSE + ) + + geom_rect( + data = . %>% filter(age < 35 & age > 30), + aes( + xmin = age, xmax = age - (1.6 * residual * age), + ymin = measure_pred, ymax = measure + ), + alpha = 0.5, color = "grey30" + ) + + geom_rect( + data = . %>% filter(age == 50), + aes( + xmin = age, xmax = age - (1.1 * residual * age), + ymin = measure_pred, ymax = measure + ), + alpha = 0.5, color = "grey30" + ) + + scale_x_continuous(limits = c(18, 60)) + + scico::scale_color_scico_d() + + theme_minimal() +``` + +
+ The "sum" part of "sum of squared error" refers to the sum of the areas of those squares. Simply, you sum the square of the sides. You can also look at it in mathematical form: +{{< sidenote >}} +We'll use this formula again a bit later to calculate the R{{< sup "2" >}}. +{{< /sidenote >}} + $$ \sum{(residual~or~difference~with~regression~line^2)} $$ -We'll use this formula again a bit later to calculate the R2. - In order to calculate the squared regression coefficient, we should also calculate the mean value of the measure across all points. This is necessary because the squared regression coefficient is defined as a perfect correlation (i.e. a correlation coefficient of 1) minus the unexplained (residual) variance divided by the total variance, or in formula form: $$ @@ -545,14 +701,44 @@ Important here is to notice that the error term has switched from the difference We've already plotted the sum of squared error, now we'll also illustrate sum of squared total. Remember the sum of squared total is the sum of squared differences between the observed values and the mean value. I'll also add the original regression line in the background to show the difference with the sum of squared error. +<details>
+Show code + +``` r +ggplot(data, aes(x = age)) + + geom_smooth(aes(y = measure), + method = "lm", + color = "grey95", alpha = 0.1 + ) + + geom_hline( + yintercept = mean(data$measure), + color = "grey30", linewidth = 1 + ) + + geom_point(aes(y = measure), size = 4, alpha = 0.8) + + geom_point(aes(y = mean(measure)), alpha = 0.5, size = 2.5) + + geom_segment( + aes( + xend = age, y = measure, + yend = mean(measure) + ), + arrow = arrow(length = unit(4, "points")), + color = "black", alpha = 0.8, show.legend = FALSE + ) + + theme_minimal() +``` + +
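+Before assembling the remaining pieces by hand, here is one more optional cross-check (a sketch; it assumes the `lm_model` fit and the `residual` column created earlier): both building blocks of the R{{< sup "2" >}} can also be computed directly.
+
+``` r
+# residual sum of squares: from the model object and from our own residual column
+deviance(lm_model)
+sum(data$residual^2)
+
+# total sum of squares: squared differences between the observed values and their mean
+sum((data$measure - mean(data$measure))^2)
+```
+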
+ We already calculated the difference with the regression line; calculating the difference with the mean is just as simple: ``` r -data <- data |> - mutate(measure_mean = mean(measure), - difference_mean = measure - measure_mean) +data <- data |> + mutate( + measure_mean = mean(measure), + difference_mean = measure - measure_mean + ) ``` Sidenote, if you wanted to calculate the total variance the formula for that would look like this: $$ @@ -563,17 +749,17 @@ $$ Notice how the numerator is the same calculation as the sum of squared total, then divided by the sample size minus 1 (like the degrees of freedom). -To calculate the squared regression coefficient (R2) from the formula above is then simple. We take 1 (perfect correlation) and subtract the sum of the squared residuals (explained variance) divided by the sum of the squared difference with the mean (total variance). In R terms, that would look like this: +Calculating the squared regression coefficient (R{{< sup "2" >}}) from the formula above is then simple. We take 1 (perfect correlation) and subtract the sum of the squared residuals (the unexplained variance) divided by the sum of the squared difference with the mean (the total variance). In R terms, that would look like this: ``` r 1 - sum(data$residual^2) / sum(data$difference_mean^2) ``` - [1] 0.4727552 + [1] 0.6268363 -And there we have it, the regression coefficient R2! You can check that it's the same by scrolling up to where we ran `summary(lm_model)` and you'll find the same number. We could also calculate the F-statistic and the *t*- and *p*-values, but I think this tutorial has drained enough cognitive energy. For this last section, I hope it's become clear what we mean when we talk about "residuals", "sum of squares", and "variance" in the context of linear models. I also hope it's enlightened you a bit on what a linear model does and how it works. +And there we have it, the regression coefficient R{{< sup "2" >}}! You can check that it's the same by scrolling up to where we ran `summary(lm_model)` and you'll find the same number. We could also calculate the F-statistic and the $t$- and $p$-values, but I think this tutorial has drained enough cognitive energy. For this last section, I hope it's become clear what we mean when we talk about "residuals", "sum of squares", and "variance" in the context of linear models. I also hope it's enlightened you a bit on what a linear model does and how it works. -# Conclusion +## Conclusion There's many more things we could go over, multiple linear regression, non-parametric tests, etc., but I think we have done enough nerding for today. I hope I managed to show you the overlap in different statistical tests. Does that mean that you should only run linear models for now? No, of course not. But I do think it may be good to have an overview of where the values you get come from and what they might mean in different contexts. Hope this was enlightening. Happy learning! @@ -585,10 +771,7 @@ There's many more things we could go over, multiple linear regression, non-param - [ANOVA for Regression - Rahul Pathak](https://towardsdatascience.com/anova-for-regression-fdb49cf5d684) - [Explaining the `lm` summary in R - Learn by Marketing](https://www.learnbymarketing.com/tutorials/explaining-the-lm-summary-in-r/) -<details>
- -Session info for reproducibility purposes - +{{< details "Session info for reproducibility purposes" >}} ``` r sessionInfo() @@ -612,23 +795,23 @@ sessionInfo() [1] stats graphics grDevices datasets utils methods base other attached packages: - [1] patchwork_1.1.2 lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0 + [1] patchwork_1.1.3 lubridate_1.9.3 forcats_1.0.0 stringr_1.5.0 [5] dplyr_1.1.3 purrr_1.0.2 readr_2.1.4 tidyr_1.3.0 - [9] tibble_3.2.1 ggplot2_3.4.3 tidyverse_2.0.0 + [9] tibble_3.2.1 ggplot2_3.4.4 tidyverse_2.0.0 loaded via a namespace (and not attached): - [1] utf8_1.2.3 generics_0.1.3 renv_0.17.3 xml2_1.3.4 - [5] lattice_0.21-9 stringi_1.7.12 hms_1.1.3 digest_0.6.33 - [9] magrittr_2.0.3 evaluate_0.22 grid_4.3.0 timechange_0.2.0 - [13] fastmap_1.1.1 Matrix_1.6-1.1 jsonlite_1.8.7 ggtext_0.1.2 - [17] mgcv_1.9-0 fansi_1.0.4 scales_1.2.1 scico_1.4.0 - [21] cli_3.6.1 rlang_1.1.1 splines_4.3.0 commonmark_1.9.0 - [25] munsell_0.5.0 withr_2.5.1 yaml_2.3.7 tools_4.3.0 - [29] tzdb_0.4.0 colorspace_2.1-0 vctrs_0.6.3 R6_2.5.1 - [33] lifecycle_1.0.3 pkgconfig_2.0.3 pillar_1.9.0 gtable_0.3.4 - [37] glue_1.6.2 Rcpp_1.0.11 xfun_0.40 tidyselect_1.2.0 - [41] rstudioapi_0.14 knitr_1.44 farver_2.1.1 nlme_3.1-163 - [45] htmltools_0.5.6 rmarkdown_2.25 labeling_0.4.3 compiler_4.3.0 - [49] markdown_1.7 gridtext_0.1.5 - -
+ [1] utf8_1.2.4 generics_0.1.3 renv_1.0.3 xml2_1.3.5 + [5] lattice_0.22-5 stringi_1.7.12 hms_1.1.3 digest_0.6.33 + [9] magrittr_2.0.3 evaluate_0.22 grid_4.3.0 timechange_0.2.0 + [13] fastmap_1.1.1 Matrix_1.6-1.1 jsonlite_1.8.7 ggtext_0.1.2 + [17] mgcv_1.9-0 fansi_1.0.5 scales_1.2.1 scico_1.5.0 + [21] cli_3.6.1 rlang_1.1.1 splines_4.3.0 commonmark_1.9.0 + [25] munsell_0.5.0 withr_2.5.1 yaml_2.3.7 tools_4.3.0 + [29] tzdb_0.4.0 colorspace_2.1-0 vctrs_0.6.4 R6_2.5.1 + [33] lifecycle_1.0.3 pkgconfig_2.0.3 pillar_1.9.0 gtable_0.3.4 + [37] glue_1.6.2 Rcpp_1.0.11 xfun_0.40 tidyselect_1.2.0 + [41] rstudioapi_0.15.0 knitr_1.44 farver_2.1.1 nlme_3.1-163 + [45] htmltools_0.5.6.1 rmarkdown_2.25 labeling_0.4.3 compiler_4.3.0 + [49] markdown_1.11 gridtext_0.1.5 + +{{< /details >}} diff --git a/content/blog/2022-everything-is-a-linear-model/index.qmd b/content/blog/2022-everything-is-a-linear-model/index.qmd index 2591a27..bcedcfd 100644 --- a/content/blog/2022-everything-is-a-linear-model/index.qmd +++ b/content/blog/2022-everything-is-a-linear-model/index.qmd @@ -15,6 +15,8 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- Let's imagine you're incredibly lazy and you want to learn R, but you only want to learn one function to do statistics. What function do you learn? I'd recommend to learn to use the `lm()` function. Why? Because most common statistical tests are in fact nothing more than some variation of a linear model, from the simplest One-Sample T-test to a repeated-measures ANOVA. I think most people that have Googled for this question have found Jonas Lindeløv's post on how [common statistical tests are linear models](https://lindeloev.github.io/tests-as-linear/) (as they should, it's an amazing post). Here I want to go a bit more in depth into the mathematics behind this statement to show how common statistical tests are in fact variations of a linear model. @@ -33,15 +35,19 @@ library(patchwork) set.seed(2022) ``` -# One-Sample T-test +## One-Sample T-test -Let's start simple with the One-Sample T-test. This test can be used to test how the mean value of your sample measure differs from a reference number. Throughout this page, I'll throw around a bunch of formulas, which, depending on your background, can either be informative or confusing. The formula for a One-Sample T-test is: +Let's start simple with the One-Sample T-test. This test can be used to test how the mean value of your sample measure differs from a reference number. Throughout this post, I'll throw around a bunch of formulas, which, depending on your background, can either be informative or confusing. The formula for a One-Sample T-test is: + +{{{< sidenote >}}} +The $\overline{x}$ is commonly called "x-bar" in conversation +{{{< /sidenote >}}} $$ t = \frac{\overline{x} - \mu}{\frac{\sigma}{\sqrt{n}}} = \frac{sample~mean - population~mean}{\frac{standard~deviation}{\sqrt{sample~size}}} $$ -What this says is that the effect size (*t*) is equal to the sample mean minus the population mean (or reference number) and you divide it by the standard deviation of the sample divided by the square root of the sample size. This formula will output the *t*-value that you would usually report when doing a T-test. 
The formula requires the standard deviation (*σ*) of the sample values, which is: +What this says is that the effect size ($t$) is equal to the sample mean minus the population mean (or reference number) and you divide it by the standard deviation of the sample divided by the square root of the sample size. This formula will output the $t$-value that you would usually report when doing a T-test. The formula requires the standard deviation (*σ*) of the sample values, which is: $$ \sigma = \sqrt{\frac{\sum\limits\_{i=1}^n{(x\_{i} - \overline{x})^2}}{n - 1}} @@ -49,6 +55,10 @@ $$ In this formula, you'd subtract the average across the sample values from each individual value, square it, and sum all these resulting values. This sum you would then divide by the size of the sample minus one (or the degrees of freedom), and take the square root of the whole thing. This will give the standard deviation (*σ*). Alright, let's now consider a study where we collected blood samples from a number of patients and measured for instance sodium levels in the blood. We don't have a control group for this study, but we know from medical textbooks that the reference value for sodium in healthy individuals for the age and sex distribution in our sample is for instance 2.5 mmol/L. Then we measure the sodium levels for 30 patients, we can simulate some fake measurements by generating a random sequence of values with a mean of 3 and a standard deviation of 1.2. +{{{< sidenote >}}} +I cannot condone generating data for your study using `rnorm()` but this is just for illustrative purposes +{{{< /sidenote >}}} + ```{r} #| label: ost-setup @@ -69,10 +79,10 @@ this formula would like like this when implemented in R: ```{r} #| label: sd-calc -sqrt( sum(abs(concentration - mean(concentration))^2) / (n - 1) ) +sqrt(sum(abs(concentration - mean(concentration))^2) / (n - 1)) ``` -But of course in any normal setting, you'd use the `sd()` function, which will give the same result as the code above, but I just wanted to show it for illustrative purposes. Anywhere else I'll use the `sd()` function. Now let's calculate the *t*-value. In formula form this would look like this: +But of course in any normal setting, you'd use the `sd()` function, which will give the same result as the code above, but I just wanted to show it for illustrative purposes. Anywhere else I'll use the `sd()` function. Now let's calculate the $t$-value. In formula form this would look like this: $$ t = \frac{\overline{x} - \mu}{\frac{\sigma}{\sqrt{n}}} = \frac{2.855 - 2.5}{\frac{1.157}{\sqrt{30}}} = 1.681 @@ -86,7 +96,7 @@ So just to over this formula again, you take the mean of your sample, subtract t (mean(concentration) - ref_concentration) / (sd(concentration) / sqrt(n)) ``` -Now we can compare this to the `t.test()` function and then we'd find the same *t*-value (barring some rounding and digit cutoffs). In this function, since we're not comparing two samples, we set the population mean (`mu`) we want to compare to as the reference concentration (the default value for a One-Sample T-test is 0). What the `mu` option does is nothing else than subtract the reference value from all values. By doing this it centers all the values relative to 0, so if we'd run `t.test(concentration - ref_concentration)`, we'd get the same result, obviously with a different mean and the values of the confidence interval have changed, although the range stays the same. 
+Now we can compare this to the `t.test()` function and then we'd find the same $t$-value (barring some rounding and digit cutoffs). In this function, since we're not comparing two samples, we set the population mean (`mu`) we want to compare to as the reference concentration (the default value for a One-Sample T-test is 0). What the `mu` option does is nothing else than subtract the reference value from all values. By doing this it centers all the values relative to 0, so if we'd run `t.test(concentration - ref_concentration)`, we'd get the same result, obviously with a different mean and the values of the confidence interval have changed, although the range stays the same. ```{r} #| label: ost-test @@ -95,13 +105,19 @@ t.test(concentration, mu = ref_concentration) t.test(concentration - ref_concentration, mu = 0) ``` -So now back to the premise of this exercise, how is a T-test the same as a linear model? Like we showed before, subtracting the reference value from the sample values and adding that to a T-test comparing the values to 0 is equivalent to comparing the sample values to the reference value. Now let's consider what a linear model does. You might recall from high-school mathematics that the formula for a straight line is always some form of *y* = *a**x* + *c*, the linear model formula is somewhat similar: +So now back to the premise of this exercise, how is a T-test the same as a linear model? Like we showed before, subtracting the reference value from the sample values and adding that to a T-test comparing the values to 0 is equivalent to comparing the sample values to the reference value. Now let's consider what a linear model does. You might recall from high-school mathematics that the formula for a straight line is always some form of $y = ax + c$, the linear model formula is somewhat similar: + +$$ +Y_i = \beta_{0} + \beta_{1}x + \epsilon_{i} +$$ -*Y**i* = *β*0 + *β*1*x* + *ϵ**i* +{{{< sidenote br="2em" >}}} +$\beta_1$ in this case is equivalent to $a$ in formula $y = ax + c$ +{{{< /sidenote >}}} -In this formula *Y**i* is the dependent variable, *x* is the independent variable. *β*0 is equivalent to the intercept at the y-axis, similar to *c* in the formula for a straight line. *β*1 is the slope (equivalent to *a* in the formula earlier). Finally, the *ϵ**i* is the random error term. +In this formula $Y_i$ is the dependent variable, $x$ is the independent variable. $\beta_0$ is equivalent to the intercept at the y-axis, similar to $c$ in the formula for a straight line. $\beta_1$ is the slope. Finally, the $\epsilon_i$ is the random error term. -Now let's build the linear model. Remember that the formula for the linear model included this term: *β*1*x*. In this case, since we only have one sample, we don't have any value to multiply our value to, so we multiply it by 1. If we wanted to correlate two variables, for instance concentration with age, we would substitute the 1 with a continuous variable, i.e. age, but in this case we correlate all sample values with 1. Since we still want to compare our value to 0, we subtract the reference value from our sample values like we did before for the `t.test()`. Let's build the linear model. +Now let's build the linear model. Remember that the formula for the linear model included this term: $\beta_{1}x$. In this case, since we only have one sample, we don't have any value to multiply our value to, so we multiply it by 1. 
If we wanted to correlate two variables, for instance concentration with age, we would substitute the 1 with a continuous variable, i.e. age, but in this case we correlate all sample values with 1. Since we still want to compare our value to 0, we subtract the reference value from our sample values like we did before for the `t.test()`. Let's build the linear model. ```{r} #| label: ost-lm ost_model <- lm((concentration - ref_concentration) ~ 1) summary(ost_model) ``` -Again, we find the same *t*- and *p*-value as when we ran the `t.test()`! How exciting is that! We now have three ways to obtain the same values. Later I'll go into what the `Residuals`, `Estimate` and `Std. Error` mean when running comparing group means with a linear model. +Again, we find the same $t$- and $p$-value as when we ran the `t.test()`! How exciting is that! We now have three ways to obtain the same values. Later I'll go into what the `Residuals`, `Estimate` and `Std. Error` mean when comparing group means with a linear model. -# Two-Sample T-test +## Two-Sample T-test Now we'll apply the same logic we used for the One-Sample T-test to show how a Two-Sample T-test is in essence a linear model. First we'll look at the formula again, then the implementation using the `t.test()` function, and then the linear model. Let's now consider another experiment using the blood measurements we had before, but now we actually do have a control sample. We have 30 participants in both samples. Let's generate some random data: @@ -121,8 +137,11 @@ Now we'll apply the same logic we used for the One-Sample T-test to show how an n <- 30 data <- tibble( - concentration = c(rnorm(n, mean = 4, sd = 1.5), rnorm(n, mean = 6, sd = 2)), - group = rep(c("HC","PAT"), each = n) + concentration = c( + rnorm(n, mean = 4, sd = 1.5), + rnorm(n, mean = 6, sd = 2) + ), + group = rep(c("HC", "PAT"), each = n) ) ``` @@ -132,19 +151,19 @@ $$ t = \frac{(\overline{x_1} - \overline{x_2})}{\sqrt{\frac{\sigma_1^2}{n_1} + \frac{\sigma_2^2}{n_2}}} = \frac{(3.838073 - 5.455809)}{\sqrt{\frac{1.343565^2}{30} + \frac{1.69711^2}{30}}} = -4.093524 $$ -It's a bit too complex to describe in a sentence, but the definitions are perhaps familiar: $\overline{x}$ for group means, *σ* for group standard deviations, and *n* for group size. I find that the simplest way to implement this in R is by first separating the groups and then adding them in the formula. +It's a bit too complex to describe in a sentence, but the definitions are perhaps familiar: $\overline{x}$ for group means, $\sigma$ for group standard deviations, and $n$ for group size. I find that the simplest way to implement this in R is by first separating the groups and then adding them in the formula. ```{r} #| label: tst-t-calc -g1 <- data |> - filter(group == "HC") |> +g1 <- data |> + filter(group == "HC") |> pull(concentration) -g2 <- data |> - filter(group == "PAT") |> +g2 <- data |> + filter(group == "PAT") |> pull(concentration) -(mean(g1) - mean(g2)) / sqrt( (sd(g1)^2 / length(g1)) + (sd(g2)^2 / length(g2)) ) +(mean(g1) - mean(g2)) / sqrt((sd(g1)^2 / length(g1)) + (sd(g2)^2 / length(g2))) ``` Then running the regular T-test is easy. ```{r} #| label: tst-test t.test(g1, g2) ``` -Look at that! We find the same *t*-value! +Look at that! We find the same $t$-value!
Before we move on to the linear model, I first want to do some plotting, it will help us visualize how the linear model applies here later. Let's make a boxplot: ```{r} #| label: tst-boxplot @@ -165,7 +184,7 @@ ggplot(data, aes(x = group, y = concentration, fill = group)) + geom_boxplot(width = 0.2) + geom_jitter(width = 5e-2, size = 2, alpha = 0.75) + scico::scale_fill_scico_d(palette = "hawaii") + - theme_minimal() + + theme_minimal() + theme(legend.position = "none") ``` @@ -179,11 +198,18 @@ mean_concentration <- data |> summarise(mean_conc = mean(concentration)) ggplot(data, aes(x = group)) + - geom_jitter(aes(y = concentration), width = 5e-2, size = 2, alpha = 0.75) + - geom_point(data = mean_concentration, aes(y = mean_conc), - color = "violet", size = 5) + - geom_line(data = mean_concentration, aes(y = mean_conc), group = 1, - linewidth = 2, color = "violet") + + geom_jitter(aes(y = concentration), + width = 5e-2, + size = 2, alpha = 0.75 + ) + + geom_point( + data = mean_concentration, aes(y = mean_conc), + color = "violet", size = 5 + ) + + geom_line( + data = mean_concentration, aes(y = mean_conc), group = 1, + linewidth = 2, color = "violet" + ) + theme_minimal() ``` @@ -196,9 +222,9 @@ tst_model <- lm(concentration ~ group, data = data) summary(tst_model) ``` -First of all, let's look at the `groupPAT`, there we find the same *t*-value as we did when we ran the T-tests earlier, although with the sign flipped. I'll show later why that is. +First of all, let's look at the `groupPAT`, there we find the same $t$-value as we did when we ran the T-tests earlier, although with the sign flipped. I'll show later why that is. -Now, back to the plot. The x-axis has two conditions: `HC` and `PAT`, but let's imagine those values are `0` and `1`. Let's now also throw back to the time we recalled the formula for a straight line: *y* = *a**x* + *c*. In this context we only have two x-values, `HC` and `PAT` or `0` and `1`. Then we can obtain *y* in the formula by solving the equation when *x* is equal to `0`, in that case *y* becomes just the mean concentration of the healthy controls, or the big magenta dot in the previous plot, and that is a value we can calculate. Remember that `0` in the formula below stands for `HC`. That looks something like this: +Now, back to the plot. The x-axis has two conditions: `HC` and `PAT`, but let's imagine those values are `0` and `1`. Let's now also throw back to the time we recalled the formula for a straight line: $y = ax + c$. In this context we only have two x-values, `HC` and `PAT` or `0` and `1`. Then we can obtain $y$ in the formula by solving the equation when $x$ is equal to `0`, in that case $y$ becomes just the mean concentration of the healthy controls, or the big magenta dot in the previous plot, and that is a value we can calculate. Remember that `0` in the formula below stands for `HC`. That looks something like this: $$ \begin{eqnarray} @@ -208,7 +234,7 @@ c &=& \overline{x}\_{0} \newline \end{eqnarray} $$ -So that's the constant our formula. If we look back at the output from the `lm()` function, we see that this value is represented as the `Estimate` of the `(Intercept)` row! Let's also solve *a*. Remember that *a* represents the slope of the line. How do we get the slope? The slope is basically nothing more than the difference between the mean values of `HC` and `PAT`, but let's solve it in a more elegant way, by using the same formula we used to find *c*. We'll use the same coding as before, `0` for `HC` and `1` for `PAT`. 
Remember that *c* is equal to the mean value of `HC` (aka $\overline{x}_{0}$). +So that's the constant our formula. If we look back at the output from the `lm()` function, we see that this value is represented as the `Estimate` of the `(Intercept)` row! Let's also solve $a$. Remember that $a$ represents the slope of the line. How do we get the slope? The slope is basically nothing more than the difference between the mean values of `HC` and `PAT`, but let's solve it in a more elegant way, by using the same formula we used to find *c*. We'll use the same coding as before, `0` for `HC` and `1` for `PAT`. Remember that *c* is equal to the mean value of `HC` (aka $\overline{x}_{0}$). $$ \begin{eqnarray} @@ -220,9 +246,13 @@ a &=& \overline{x}\_{1} - \overline{x}\_{0} \newline \end{eqnarray} $$ -And then we find that *a* is equal to the `Estimate` column for the `groupPAT` row. +And then we find that $a$ is equal to the `Estimate` column for the `groupPAT` row. + +{{{< sidenote br="3em" >}}} +inb4 the angry statisticians: I know it's more complicated than that but let's not get into this right now +{{{< /sidenote >}}} -We can reverse engineer the *t*-value too using just the output from the `lm()` function. One can imagine that if one would plot a situation where the null hypothesis (H0) is true, the slope of that line would be 0 since then there's no difference between the mean of the two groups (inb4 the angry statisticians: it's more complicated than that but let me use this for now). We'll take the difference between our observed slope, or the slope of the alternative hypothesis (H0), and the slope of the null hypothesis, which is 0, and divide that by the standard error of the sampling distribution, which is given by the `lm()` function as the `Std. Error` of the `groupPAT` row: +We can reverse engineer the $t$-value too using just the output from the `lm()` function. One can imagine that if one would plot a situation where the null hypothesis (H0) is true, the slope of that line would be 0 since then there's no difference between the mean of the two groups. We'll take the difference between our observed slope, or the slope of the alternative hypothesis (H0), and the slope of the null hypothesis, which is 0, and divide that by the standard error of the sampling distribution, which is given by the `lm()` function as the `Std. 
Error` of the `groupPAT` row: $$ \begin{eqnarray} @@ -235,28 +265,52 @@ Which as you'll notice is one thousandths-decimal place off, which is due to rou ```{r} #| label: tst-plot-w-annot -#| echo: false +#| code-fold: true +#| code-summary: "Show code" ggplot(data, aes(x = group)) + - geom_jitter(aes(y = concentration), width = 5e-2, size = 2, alpha = 0.75) + - geom_point(data = mean_concentration, aes(y = mean_conc), - color = "violet", size = 5) + - geom_line(data = mean_concentration, aes(y = mean_conc), group = 1, - linewidth = 2, color = "violet") + - geom_segment(data = NULL, aes(x = 0.4, xend = 0.925, y = mean(g1), yend = mean(g1)), - color = "grey", linewidth = 0.2) + - geom_segment(data = NULL, aes(x = 0.4, xend = 1.925, y = mean(g2), yend = mean(g2)), - color = "grey", linewidth = 0.2) + - geom_text(data = data.frame(), aes(x = 1.45, y = 0.18 + (mean(g1) + mean(g2))/2), - label = "a = 1.6177", angle = 21) + - scale_y_continuous(breaks = round(c(seq(2.5,10,2.5), mean(g1), mean(g2)), 4), - minor_breaks = seq(1.25,8.75,2.5)) + + geom_jitter(aes(y = concentration), + width = 5e-2, + size = 2, alpha = 0.75 + ) + + geom_point( + data = mean_concentration, aes(y = mean_conc), + color = "violet", size = 5 + ) + + geom_line( + data = mean_concentration, aes(y = mean_conc), + group = 1, linewidth = 2, color = "violet" + ) + + geom_segment( + data = NULL, aes( + x = 0.4, xend = 0.925, + y = mean(g1), yend = mean(g1) + ), + color = "grey", linewidth = 0.2 + ) + + geom_segment( + data = NULL, + aes( + x = 0.4, xend = 1.925, + y = mean(g2), yend = mean(g2) + ), + color = "grey", linewidth = 0.2 + ) + + geom_text( + data = data.frame(), + aes(x = 1.45, y = 0.18 + (mean(g1) + mean(g2)) / 2), + label = "a = 1.6177", angle = 21 + ) + + scale_y_continuous( + breaks = round(c(seq(2.5, 10, 2.5), mean(g1), mean(g2)), 4), + minor_breaks = seq(1.25, 8.75, 2.5) + ) + theme_minimal() ``` And that's how a Two-Sample T-test is basically a linear model! -# ANOVA +## ANOVA Based on what we did in the previous section, you may already predict what we'll do in this section. Instead of one or two groups, we'll now show how this works for more than two groups. The mathematics becomes a bit more long-winded and the visualizations a bit less clear, so we'll just stick with the R code. Let's for instance say we have four groups of patients and each have a certain score on a questionnaire: @@ -264,11 +318,15 @@ Based on what we did in the previous section, you may already predict what we'll #| label: aov-setup n <- 30 -data <- tibble(score = round(c(rnorm(n, mean = 75, sd = 30), - rnorm(n, mean = 60, sd = 35), - rnorm(n, mean = 30, sd = 17), - rnorm(n, mean = 45, sd = 32))), - group = rep(c("SCZ", "BD", "MDD", "ASD"), each = n)) |> +data <- tibble( + score = round(c( + rnorm(n, mean = 75, sd = 30), + rnorm(n, mean = 60, sd = 35), + rnorm(n, mean = 30, sd = 17), + rnorm(n, mean = 45, sd = 32) + )), + group = rep(c("SCZ", "BD", "MDD", "ASD"), each = n) +) |> mutate(group = as_factor(group)) ``` @@ -290,29 +348,35 @@ anova_lm_model <- lm(score ~ group, data = data) summary(anova_lm_model) ``` -The first thing you might notice is that the *F*-statistic and the *p*-value are the same in both models. +The first thing you might notice is that the $F$-statistic and the $p$-value are the same in both models. 
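As a quick check, here is a minimal sketch (assuming the `aov_model` and `anova_lm_model` objects created in the chunks above, and nothing else): calling `anova()` on the `lm()` fit prints the same ANOVA table, and the two F-statistics can be pulled out and compared directly.

```r
# minimal sketch, assuming the aov_model and anova_lm_model objects
# from the chunks above (aov() and lm() on score ~ group)
anova(anova_lm_model) # same Df, Sum Sq, F value and Pr(>F) as summary(aov_model)

# pull the F-statistic out of both fits and compare
f_from_aov <- summary(aov_model)[[1]][1, "F value"]
f_from_lm <- summary(anova_lm_model)$fstatistic[["value"]]
all.equal(f_from_aov, f_from_lm)
```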
```{r} #| label: aov-plot -ref_mean <- data |> - filter(group == "SCZ") |> - pull(score) |> +ref_mean <- data |> + filter(group == "SCZ") |> + pull(score) |> mean() -anova_group_means <- data |> - group_by(group) |> - summarise(score = mean(score)) |> - mutate(ref_mean = ref_mean, - mean_adj = score - ref_mean) - -ggplot(data, aes(x = group, y = score - ref_mean)) + - stat_summary(fun = mean, geom = "point", - size = 10, color = "violet", shape = 18) + - geom_jitter(width = 0.2) + - ggtext::geom_richtext(data = anova_group_means, - aes(label = str_glue("x̄ = {round(mean_adj, 2)}")), - fill = "#ffffff80", nudge_x = 1/3) + +anova_group_means <- data |> + group_by(group) |> + summarise(score = mean(score)) |> + mutate( + ref_mean = ref_mean, + mean_adj = score - ref_mean + ) + +ggplot(data, aes(x = group, y = score - ref_mean)) + + stat_summary( + fun = mean, geom = "point", + size = 10, color = "violet", shape = 18 + ) + + geom_jitter(width = 0.2) + + ggtext::geom_richtext( + data = anova_group_means, + aes(label = str_glue("x̄ = {round(mean_adj, 2)}")), + fill = "#ffffff80", nudge_x = 1 / 3 + ) + theme_minimal() ``` @@ -325,45 +389,59 @@ residual~sum~of~squares &=& \sum\limits\_{j=1}^{J} \sum\limits\_{i=1}^{n\_{j}} ( \end{eqnarray} $$ -Just briefly, the first formula takes the mean value for the group in question, subtracts the overall mean (or grand mean) and squares the result. Then it multiplies this number by the sample size in this group. In this case we'll only do it for the first group since that's the one listed in the `summary(aov_model)` output. The second formula calculates the residual sum of squares (or sum of squared error), we'll come back to this later. In essence it substracts the group mean from each of the individual values, squares it, and sums it first within the group, and then sums it again across the groups. +{{{< sidenote br="8em" >}}} +We'll come back to residual sum of squares further down +{{{< /sidenote >}}} + +Just briefly, the first formula takes the mean value for the group in question, subtracts the overall mean (or grand mean) and squares the result. Then it multiplies this number by the sample size in this group. In this case we'll only do it for the first group since that's the one listed in the `summary(aov_model)` output. The second formula calculates the residual sum of squares (or sum of squared error). In essence it substracts the group mean from each of the individual values, squares it, and sums it first within the group, and then sums it again across the groups. We can do both calculations in one go with the following quick code: ```{r} #| label: aov-calc-ss -data |> - mutate(overall_mean = mean(score)) |> - group_by(group) |> - summarise(group_mean = mean(score), - group_n = n(), - overall_mean = first(overall_mean), - sq_group = group_n * (group_mean - overall_mean)^2, - sq_error = sum((score - group_mean)^2)) |> - ungroup() |> - summarise(ss_group = sum(sq_group), - ss_error = sum(sq_error)) +data |> + mutate(overall_mean = mean(score)) |> + group_by(group) |> + summarise( + group_mean = mean(score), + group_n = n(), + overall_mean = first(overall_mean), + sq_group = group_n * (group_mean - overall_mean)^2, + sq_error = sum((score - group_mean)^2) + ) |> + ungroup() |> + summarise( + ss_group = sum(sq_group), + ss_error = sum(sq_error) + ) ``` Now look back at the output from `summary(aov_model)` and we'll find the same values! I'll leave it here for now, but we'll come back to sum of squares (of different varieties later). 
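To close the loop on those two numbers, here is a minimal sketch (reusing the `data` tibble from the ANOVA chunks above; `ss`, `ms_group` and `ms_error` are my own illustrative names, not from the post) showing that the F-statistic is just the ratio of the two mean squares, i.e. each sum of squares divided by its degrees of freedom.

```r
# minimal sketch, assuming the `data` tibble from the ANOVA chunks above
ss <- data |>
  mutate(overall_mean = mean(score)) |>
  group_by(group) |>
  summarise(
    sq_group = n() * (mean(score) - first(overall_mean))^2,
    sq_error = sum((score - mean(score))^2)
  ) |>
  summarise(ss_group = sum(sq_group), ss_error = sum(sq_error))

ms_group <- ss$ss_group / (4 - 1)   # 4 groups, so 3 degrees of freedom
ms_error <- ss$ss_error / (120 - 4) # 120 observations minus 4 group means
ms_group / ms_error                 # should match the F value in summary(aov_model)
```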
-# A linear model is a linear model +## A linear model is a linear model Well that's a statement of unequaled wisdom, isn't it? No wonder they give us doctorates to talk about this stuff. -I don't think I need a lot of effort to convince anyone that a linear model is a linear model. Actually, I'm so convinced that you are aware that a linear model is a linear model that I wanted to about something else instead. Instead I wanted to dive into residuals and R2. Before we start, let's first simulate some data, We'll create an age column, a sex column, and a measure column. We'll make it so that the measure column correlates with the age column. +I don't think I need a lot of effort to convince anyone that a linear model is a linear model. Actually, I'm so convinced that you are aware that a linear model is a linear model that I wanted to about something else instead. Instead I wanted to dive into residuals and R{{{< sup "2" >}}}. Before we start, let's first simulate some data, We'll create an age column, a sex column, and a measure column. We'll make it so that the measure column correlates with the age column. ```{r} #| label: lm-setup n <- 20 -data <- tibble(age = round(runif(n = n, min = 18, max = 60)), - sex = factor(sample(c("Male", "Female"), size = n, replace = TRUE), - levels = c("Male", "Female"))) |> +data <- tibble( + age = round(runif(n = n, min = 18, max = 60)), + sex = factor( + sample(c("Male", "Female"), + size = n, replace = TRUE + ), + levels = c("Male", "Female") + ) +) |> mutate(measure = 1e-2 * age + sqrt(1e-2) * rnorm(n)) ``` -We've used the formula for a straight line in previous sections (*y* = *a**x* + *c*), we can apply it here too, but instead of the difference in the mean between two groups, the slope of the line (denoted by *a*) is now derived from the slope at which the line has the least distance to all points, referred to as the best fit. We will plot this later, but first we should maybe just run the linear model: +We've used the formula for a straight line in previous sections ($y = ax + c$), we can apply it here too, but instead of the difference in the mean between two groups, the slope of the line (denoted by $a$) is now derived from the slope at which the line has the least distance to all points, referred to as the best fit. We will plot this later, but first we should maybe just run the linear model: ```{r} #| label: lm @@ -372,15 +450,16 @@ lm_model <- lm(measure ~ age, data = data) summary(lm_model) ``` -We find that there is a significant association between age and our measure, and the R2 is about 47%. Recall that R2 denotes the amount of variance explained by the predictor, or age in our case. We can plot the linear model in `ggplot` with the `geom_smooth()` function, and then setting the `method` to `"lm"`: +We find that there is a significant association between age and our measure, and the R{{{< sup "2" >}}} is about 47%. Recall that R{{{< sup "2" >}}} denotes the amount of variance explained by the predictor, or age in our case. We can plot the linear model in `ggplot` with the `geom_smooth()` function, and then setting the `method` to `"lm"`: ```{r} #| label: lm-plot +#| message: false ggplot(data, aes(x = age, y = measure)) + geom_point(size = 4, alpha = 0.8) + geom_smooth(method = "lm", color = "grey30") + - scale_x_continuous(limits = c(18,60)) + + scale_x_continuous(limits = c(18, 60)) + theme_minimal() ``` @@ -391,7 +470,7 @@ Back to our data. 
We know that a linear models fits a line that "predicts" outco ```{r} #| label: lm-predict -data <- data |> +data <- data |> mutate(measure_pred = predict.lm(lm_model)) ``` @@ -423,8 +502,10 @@ We can check that this is correct too by comparing the residuals we calculated w ```{r} #| label: lm-compare-residuals -tibble(residual_manual = data$residual, - residual_lm = residuals(lm_model)) |> +tibble( + residual_manual = data$residual, + residual_lm = residuals(lm_model) +) |> glimpse() ``` @@ -440,18 +521,20 @@ We can visualize the residuals using the `geom_smooth()` function. First I just ```{r} #| label: lm-plot-error -#| echo: false #| message: false +#| code-fold: true +#| code-summary: "Show code" ggplot(data, aes(x = age)) + - geom_smooth(aes(y = measure), method = 'lm', color = "grey30") + + geom_smooth(aes(y = measure), method = "lm", color = "grey30") + geom_point(aes(y = measure), size = 4, alpha = 0.8) + geom_point(aes(y = measure_pred), alpha = 0.5, size = 2.5) + - geom_segment(aes(xend = age, y = measure, yend = measure_pred), - arrow = arrow(length = unit(4, "points")), - color = "black", alpha = 0.8, show.legend = FALSE) + - scale_x_continuous(limits = c(18,60)) + - scico::scale_color_scico_d() + + geom_segment(aes(xend = age, y = measure, yend = measure_pred), + arrow = arrow(length = unit(4, "points")), + color = "black", alpha = 0.8, show.legend = FALSE + ) + + scale_x_continuous(limits = c(18, 60)) + + scico::scale_color_scico_d() + theme_minimal() ``` @@ -459,37 +542,49 @@ You might have noticed now that the size of the arrow is defined as the differen ```{r} #| label: lm-plot-squares -#| echo: false #| message: false +#| code-fold: true +#| code-summary: "Show code" ggplot(data, aes(x = age)) + - geom_smooth(aes(y = measure), method = 'lm', color = "grey30") + + geom_smooth(aes(y = measure), method = "lm", color = "grey30") + geom_point(aes(y = measure), size = 4, alpha = 0.8) + geom_point(aes(y = measure_pred), alpha = 0.5, size = 2.5) + - geom_segment(aes(xend = age, y = measure, yend = measure_pred), - arrow = arrow(length = unit(4, "points")), - color = "black", alpha = 0.8, show.legend = FALSE) + - geom_rect(data = . %>% filter(age < 35 & age > 30), - aes(xmin = age, xmax = age - (1.6 * residual * age), - ymin = measure_pred, ymax = measure), - alpha = 0.5, color = "grey30") + - geom_rect(data = . %>% filter(age == 50), - aes(xmin = age, xmax = age - (1.1 * residual * age), - ymin = measure_pred, ymax = measure), - alpha = 0.5, color = "grey30") + - scale_x_continuous(limits = c(18,60)) + - scico::scale_color_scico_d() + + geom_segment(aes(xend = age, y = measure, yend = measure_pred), + arrow = arrow(length = unit(4, "points")), + color = "black", alpha = 0.8, show.legend = FALSE + ) + + geom_rect( + data = . %>% filter(age < 35 & age > 30), + aes( + xmin = age, xmax = age - (1.6 * residual * age), + ymin = measure_pred, ymax = measure + ), + alpha = 0.5, color = "grey30" + ) + + geom_rect( + data = . %>% filter(age == 50), + aes( + xmin = age, xmax = age - (1.1 * residual * age), + ymin = measure_pred, ymax = measure + ), + alpha = 0.5, color = "grey30" + ) + + scale_x_continuous(limits = c(18, 60)) + + scico::scale_color_scico_d() + theme_minimal() ``` The "sum" part of "sum of squared error" refers to the sum of the areas of those squares. Simply, you sum the square of the sides. You can also look at it in mathematical form: +{{{< sidenote >}}} +We'll use this formula again a bit later to calculate the R{{{< sup "2" >}}}. 
+{{{< /sidenote >}}} + $$ \sum{(residual~or~difference~with~regression~line^2)} $$ -We'll use this formula again a bit later to calculate the R2. - In order to calculate the squared regression coefficient, we should also calculate the mean value of the measure across all points. This is necessary because the squared regression coefficient is defined as a perfect correlation (i.e. a correlation coefficient of 1) minus the explained variance divided by the total variance, or in formula form: $$ @@ -507,17 +602,29 @@ We've already plotted the sum of squared error, now we'll also illustrate sum of ```{r} #| label: lm-plot-mean -#| echo: false #| message: false +#| code-fold: true +#| code-summary: "Show code" ggplot(data, aes(x = age)) + - geom_smooth(aes(y = measure), method = 'lm', color = "grey95", alpha = 0.1) + - geom_hline(yintercept = mean(data$measure), color = "grey30", linewidth = 1) + + geom_smooth(aes(y = measure), + method = "lm", + color = "grey95", alpha = 0.1 + ) + + geom_hline( + yintercept = mean(data$measure), + color = "grey30", linewidth = 1 + ) + geom_point(aes(y = measure), size = 4, alpha = 0.8) + geom_point(aes(y = mean(measure)), alpha = 0.5, size = 2.5) + - geom_segment(aes(xend = age, y = measure, yend = mean(measure)), - arrow = arrow(length = unit(4, "points")), - color = "black", alpha = 0.8, show.legend = FALSE) + + geom_segment( + aes( + xend = age, y = measure, + yend = mean(measure) + ), + arrow = arrow(length = unit(4, "points")), + color = "black", alpha = 0.8, show.legend = FALSE + ) + theme_minimal() ``` @@ -526,9 +633,11 @@ We already calculated the difference with the regression line, then to calculate ```{r} #| label: lm-calc-differences -data <- data |> - mutate(measure_mean = mean(measure), - difference_mean = measure - measure_mean) +data <- data |> + mutate( + measure_mean = mean(measure), + difference_mean = measure - measure_mean + ) ``` Sidenote, if you wanted to calculate the total variance the formula for that would look like this: @@ -539,7 +648,7 @@ $$ Notice how the numerator is the same calculation as the sum of squared total, then divided by the sample size minus 1 (like the degrees of freedom). -To calculate the squared regression coefficient (R2) from the formula above is then simple. We take 1 (perfect correlation) and subtract the sum of the squared residuals (explained variance) divided by the sum of the squared difference with the mean (total variance). In R terms, that would look like this: +To calculate the squared regression coefficient (R{{{< sup "2" >}}}) from the formula above is then simple. We take 1 (perfect correlation) and subtract the sum of the squared residuals (explained variance) divided by the sum of the squared difference with the mean (total variance). In R terms, that would look like this: ```{r} #| label: lm-calc-r2 @@ -547,9 +656,9 @@ To calculate the squared regression coefficient (R2) from the formula 1 - sum(data$residual^2) / sum(data$difference_mean^2) ``` -And there we have it, the regression coefficient R2! You can check that it's the same by scrolling up to where we ran `summary(lm_model)` and you'll find the same number. We could also calculate the F-statistic and the *t*- and *p*-values, but I think this tutorial has drained enough cognitive energy. For this last section, I hope it's become clear what we mean when we talk about "residuals", "sum of squares", and "variance" in the context of linear models. I also hope it's enlightened you a bit on what a linear model does and how it works. 
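For completeness, a minimal sketch (assuming the `data` and `lm_model` objects from the chunks above; `r2_manual` and `r2_lm` are my own names) that compares the manual number with what `lm()` reports; with a single predictor the same value also falls out of the plain Pearson correlation.

```r
# minimal sketch, assuming the `data` and `lm_model` objects built above
r2_manual <- 1 - sum(data$residual^2) / sum(data$difference_mean^2)
r2_lm <- summary(lm_model)$r.squared
all.equal(r2_manual, r2_lm)

# with one predictor, R squared is just the squared Pearson correlation
cor(data$age, data$measure)^2
```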
+And there we have it, the regression coefficient R{{{< sup "2" >}}}! You can check that it's the same by scrolling up to where we ran `summary(lm_model)` and you'll find the same number. We could also calculate the F-statistic and the $t$- and $p$-values, but I think this tutorial has drained enough cognitive energy. For this last section, I hope it's become clear what we mean when we talk about "residuals", "sum of squares", and "variance" in the context of linear models. I also hope it's enlightened you a bit on what a linear model does and how it works. -# Conclusion +## Conclusion There's many more things we could go over, multiple linear regression, non-parametric tests, etc., but I think we have done enough nerding for today. I hope I managed to show you the overlap in different statistical tests. Does that mean that you should only run linear models for now? No, of course not. But I do think it may be good to have an overview of where the values you get come from and what they might mean in different contexts. Hope this was enlightening. Happy learning! @@ -561,10 +670,10 @@ There's many more things we could go over, multiple linear regression, non-param - [ANOVA for Regression - Rahul Pathak](https://towardsdatascience.com/anova-for-regression-fdb49cf5d684) - [Explaining the `lm` summary in R - Learn by Marketing](https://www.learnbymarketing.com/tutorials/explaining-the-lm-summary-in-r/) -
<details><summary>Session info for reproducibility purposes</summary> +{{{< details "Session info for reproducibility purposes" >}}} ```{r} #| label: sessioninfo sessionInfo() ``` -</details>
+{{{< /details >}}} diff --git a/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-top-authors-1.png b/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-top-authors-1.png index 3824614..e840c87 100644 Binary files a/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-top-authors-1.png and b/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-top-authors-1.png differ diff --git a/content/blog/2023-nyt-books-api/index.md b/content/blog/2023-nyt-books-api/index.md index 1a1df34..44aa0dc 100644 --- a/content/blog/2023-nyt-books-api/index.md +++ b/content/blog/2023-nyt-books-api/index.md @@ -18,6 +18,10 @@ editor_options: chunk_output_type: console --- +{{< sidenote br="10em" >}} +REST: represen-
tational state transfer +{{< /sidenote >}} + Using an API (an *application programming interface*) is a very popular tool for data scientists to pull data from a public source. I've done a bit of web scraping in previous blogposts where I collected data from for example [Wikipedia](https://danielroelfs.com/blog/the-easier-way-to-create-a-map-of-norway-using-csmaps/) and the [Olympian Database](https://danielroelfs.com/blog/dutch-performance-at-olympic-speed-skating/). However, I'd much prefer using APIs whenever possible, this way the creator has more control of what data they want to share and how it is packaged. The most common APIs use a [RESTful](https://aws.amazon.com/what-is/restful-api/#:~:text=RESTful%20API%20is%20an%20interface,applications%20to%20perform%20various%20tasks.) approach where the interface and communication between you and the server the data is stored on follows a predictable approach and maintains some common standards. Most RESTful APIs return data in JSON format. In this blogpost I want to show you a simple way to interact with a RESTful API through Python and of course we'll do some analyses on the data we collect. The API we will be working with here is the [New York Times Books API](https://developer.nytimes.com/docs/books-product/1/overview). This API provides us with access to the current and previous (up to 2008) New York Times Best Sellers lists. It allows ut to download each individual list (both fiction, non-fiction, children's, etc), including ones that are no longer updated (such as the monthly lists on Espionage and Love & Relationships). Let's keep it simple for now and only focus on the main list on fiction, called the [Combined Print & E-Book Fiction](https://www.nytimes.com/books/best-sellers/) list which appears as the list on the home page of the Best Sellers webpage. @@ -50,7 +54,7 @@ def get_list_of_dates( return date_list_str ``` -Now that we have a function that will provide a list of dates, the next ingredient we need is the API key. The NYT Books API requires a personal API key to authenticate your request. Since this key is personal I won't upload it the internet, but instead I stored it in a `.secrets.yml` file that I put in my `.gitignore` file to avoid uploading it to GitHub. We'll use the functionality from the `yaml` module to parse the file. The function will return the API key as a string. +Now that we have a function that will provide a list of dates, the next ingredient we need is the API key. The NYT Books API requires a personal API key to authenticate your request. Since this key is personal I won't upload it the internet, but instead I stored it in a `.secrets.yml` file that I put in my `.gitignore` file to avoid storing it in Git log. We'll use the functionality from the `yaml` module to parse the file. The function will return the API key as a string. ``` python def _get_api_key(path="./.secrets.yml"): @@ -126,6 +130,10 @@ def api_request(list="fiction", date="current"): return df_out ``` +{{< sidenote br="12em" >}} +Given these limits, you can deduce how long it takes to download data for a few years. It's quite a while... +{{< /sidenote >}} + Now, each of the functions above dealt with performing a single API request. Since we are interested in obtaining an historical dataset, we should run the API request for each of the weeks we returned in the `get_list_of_dates()` function. In the `download_bestseller_lists()` function, we'll first get the list of dates. 
Then we'll loop through the list of dates going back roughly 5 years. In each iteration we'll run the `api_request()` function for the specified date and concatenate the result with the existing data frame. Then, in order to avoid hitting the API rate limit we'll wait for 12 seconds, the NYT Books API is rate limited at 500 requests per day and 5 requests per minute so we need to wait 12 seconds between each request to avoid hitting the rate limit. Finally, we'll run this `download_bestseller_lists()` function for the Fiction list, and save the output in a CSV file. ``` python @@ -163,7 +171,7 @@ download_bestseller_lists(list_type="fiction") Now that we've done the hard work in Python, let's do some data visualization in R using `ggplot` and a few other `{tidyverse}` and plotting related packages (`{ggtext}` and `{patchwork}`). Since the `download_bestseller_lists()` function in Python also saved the file, we can easily load it into R. Since I'm using the `{reticulate}` package to create this, I could have also used the `py$` functionality, but since the API call takes a lot of time to run, this seemed easier. ``` r -data <- read_csv("./data/nyt_list_fiction.csv") |> +data <- read_csv("./data/nyt_list_fiction.csv") |> glimpse() ``` @@ -189,9 +197,11 @@ data <- read_csv("./data/nyt_list_fiction.csv") |> As mentioned, the dataset we downloaded does not include all history of the Best Sellers list due to the API rate limit. Instead we specified that we want to look about 5 years back, let's see what the earliest to latest dates in the dataset are. ``` r -data |> - summarise(from = min(list_publication_date), - to = max(list_publication_date)) +data |> + summarise( + from = min(list_publication_date), + to = max(list_publication_date) + ) ``` # A tibble: 1 × 2 @@ -207,21 +217,25 @@ The purpose of this post is mostly to show the implementation of an API call in ``` r data |> count(author, title, name = "n_weeks_on_list") |> - mutate(book_label = str_glue("{author} - {title}")) |> - slice_max(order_by = n_weeks_on_list, n = 20) |> + mutate(book_label = str_glue("{author} - {title}")) |> + slice_max(order_by = n_weeks_on_list, n = 20) |> ggplot(aes(x = n_weeks_on_list, y = reorder(book_label, n_weeks_on_list))) + - geom_col(fill = "#5D7B92") + - geom_text(aes(label = n_weeks_on_list), nudge_x = -7, - color = "white", fontface = "bold") + + geom_col(fill = "#5D7B92") + + geom_text(aes(label = n_weeks_on_list), + nudge_x = -7, + color = "white", fontface = "bold" + ) + geom_vline(xintercept = 0, linewidth = 1) + labs( title = "Which books have spent the longest
on the NYT Best Sellers list?", x = NULL, y = NULL ) + - scale_x_continuous(position = "top", - labels = ~ str_glue("{.x} wk"), - expand = expansion(add = c(0, 20))) + + scale_x_continuous( + position = "top", + labels = ~ str_glue("{.x} wk"), + expand = expansion(add = c(0, 20)) + ) + theme_minimal() + theme( plot.title.position = "plot", @@ -242,23 +256,27 @@ Let's also look at which authors have had the largest number of books in the lis Code for the plot below ``` r -data |> +data |> distinct(author, title) |> count(author, name = "n_books_in_list") |> - slice_max(order_by = n_books_in_list, n = 10) |> - ggplot(aes(x = n_books_in_list, y = reorder(author, n_books_in_list))) + - geom_col(fill = "#5D7B92") + - geom_text(aes(label = n_books_in_list), nudge_x = -1, - color = "white", fontface = "bold") + + slice_max(order_by = n_books_in_list, n = 10) |> + ggplot(aes(x = n_books_in_list, y = reorder(author, n_books_in_list))) + + geom_col(fill = "#5D7B92") + + geom_text(aes(label = n_books_in_list), + nudge_x = -1, + color = "white", fontface = "bold" + ) + geom_vline(xintercept = 0, linewidth = 1) + labs( - title = "Which books have spent the longest
on the NYT Best Sellers list?", + title = "Which authors have the most books
on the NYT Best Sellers list?", x = NULL, y = NULL ) + - scale_x_continuous(position = "top", - labels = ~ str_glue("{.x} books"), - expand = expansion(add = c(0, 5))) + + scale_x_continuous( + position = "top", + labels = ~ str_glue("{.x} books"), + expand = expansion(add = c(0, 5)) + ) + theme_minimal() + theme( plot.title.position = "plot", @@ -271,6 +289,10 @@ data |> +{{< sidenote br="8em" >}} +The LA Review of Books even used the term ["supermarket schlock"](https://lareviewofbooks.org/article/the-sublime-danielle-steel/) +{{< /sidenote >}} + This genuinely surprised me, I had expected some authors to have maybe 6 or 7 books on the list, but it turns out there are some really prolific authors out there, with Danielle Steel definitely taking the top spot. According to her separate (!) [bibliography page on Wikipedia](https://en.wikipedia.org/wiki/Danielle_Steel_bibliography) she publishes several books a year to make a total of 190 published books, including 141 novels. Wikipedia also mentions a ["resounding lack of critical acclaim"](https://www.publishersweekly.com/978-0-312-11257-8), which just emphasizes the point that the New York Times Best Sellers list is not necessarily a mark of quality, but rather of popularity, although one would hope good books are also the books that sell well. Note that some authors have received critical acclaim *and* have published multiple popular books such as Colleen Hoover and Stephen King in the plot above. Since we have essentially time series data, we could also plot the trajectory of a book. So let's take the book that spent the longest on the Best Sellers list in our timeframe: *Where The Crawdads Sing*. We can plot rank on the list over time. We'll use the `geom_bump()` function from the `{ggbump}` package to get a nicer-looking curve. Since `geom_path()` and `geom_bump()` will show a continuous line for missing values and dropping those empty values looks ugly, we'll hide the weeks where the book wasn't on the Best Sellers list in an area below the plot. 
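The key data step for that, shown here as a minimal sketch (the `crawdads_ranks` name is mine, the column names come from the dataset loaded above): join the book's rows back onto the full set of list dates and park the missing weeks at a pseudo-rank of 16, which the plot then covers with the grey "Not on list" band.

```r
# minimal sketch, assuming the `data` tibble loaded above;
# crawdads_ranks is an illustrative name, not from the post
crawdads_ranks <- data |>
  filter(title == "Where The Crawdads Sing") |>
  right_join(
    data |> distinct(list_publication_date),
    by = "list_publication_date"
  ) |>
  replace_na(list(rank = 16)) # weeks off the list get parked below the axis
```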
@@ -279,35 +301,44 @@ Since we have essentially time series data, we could also plot the trajectory of Code for the plot below ``` r -title_of_interest = "Where The Crawdads Sing" - -data |> - filter(title == title_of_interest) |> - right_join(data |> select(list_publication_date) |> distinct(), - by = "list_publication_date") |> - replace_na(list(rank = 16)) |> - ggplot(aes(x = list_publication_date, y = rank)) + +title_of_interest <- "Where The Crawdads Sing" + +data |> + filter(title == title_of_interest) |> + right_join(data |> select(list_publication_date) |> distinct(), + by = "list_publication_date" + ) |> + replace_na(list(rank = 16)) |> + ggplot(aes(x = list_publication_date, y = rank)) + geom_hline(yintercept = 1, color = "grey", linetype = "dotted") + ggbump::geom_bump(linewidth = 1, color = "#5D7B92") + - geom_rect(aes(xmin = min(data$list_publication_date), - xmax = max(data$list_publication_date), - ymin = 15.2, ymax = Inf), fill = "grey40") + - geom_text(data = tibble(), aes(x = mean(data$list_publication_date), y = 15.75, - label = "Not on list"), color = "white") + + geom_rect(aes( + xmin = min(data$list_publication_date), + xmax = max(data$list_publication_date), + ymin = 15.2, ymax = Inf + ), fill = "grey40") + + geom_text(data = tibble(), aes( + x = mean(data$list_publication_date), y = 15.75, + label = "Not on list" + ), color = "white") + labs( title = str_glue("How _{title_of_interest}_ ranked over time"), x = NULL, y = "Position on Best Sellers list" ) + - scale_y_continuous(trans = "reverse", - breaks = seq(15), - expand = expansion(add = c(0.5, 0))) + + scale_y_continuous( + trans = "reverse", + breaks = seq(15), + expand = expansion(add = c(0.5, 0)) + ) + coord_cartesian(clip = "off") + theme_minimal() + theme( plot.title.position = "plot", - plot.title = element_markdown(size = 18, face = "bold", - padding = margin(b = 10)), + plot.title = element_markdown( + size = 18, face = "bold", + padding = margin(b = 10) + ), panel.grid.minor.y = element_blank() ) ``` @@ -318,7 +349,11 @@ data |> That is quite an impressive trajectory. There are a few moments where the book leaves the Best Sellers list, but it had a spot on the list every year from 2018 when the book was published to 2023. -The jump in the plot above that happens in 2022 seemed very remarkable, jumping from spot 13 to 1 within a week. I was wondering if that had anything to do with the release of the movie adaptation. So let's have a look. The movie was released on July 15th, 2022. Let's also look at a few other books I know were on the Best Sellers list that have gotten an adaptation. This is just anecdotal and hand-picked evidence of course so needs to be taken with a grain of salt. Perhaps another time I'll find a more systematic way of testing this. In addition to [*Where The Crawdads Sing*](https://www.imdb.com/title/tt9411972/), I've picked two other movies and three TV series. The other two movies are [*Bird Box*](https://www.imdb.com/title/tt2737304/) released in late 2018 and [*It Chapter Two*](https://www.imdb.com/title/tt7349950/), the second installment of the adaptation of *It* by Stephen King. 
For the TV series I'll look at the wonderful [*Normal People*](https://www.imdb.com/title/tt9059760/) based on the novel by Sally Rooney (who also has writing credits on the series!), initially released in 2020, [*The Outsider*](https://www.imdb.com/title/tt8550800/) also based on a book by Stephen King, and [*The Handmaid's Tale*](https://www.imdb.com/title/tt5834204/) based on the book by Margaret Atwood. In the plots below the vertical dashed line indicates the release date of the adaptation. +{{< sidenote br="18em" >}} +Sally Rooney actually has [writing credits](https://www.imdb.com/name/nm10745607/?ref_=nv_sr_srsg_0_tt_0_nm_8_q_sally%2520roon) on the series too! +{{< /sidenote >}} + +The jump in the plot above that happens in 2022 seemed very remarkable, jumping from spot 13 to 1 within a week. I was wondering if that had anything to do with the release of the movie adaptation. So let's have a look. The movie was released on July 15th, 2022. Let's also look at a few other books I know were on the Best Sellers list that have gotten an adaptation. This is just anecdotal and hand-picked evidence of course so needs to be taken with a grain of salt. Perhaps another time I'll find a more systematic way of testing this. In addition to [*Where The Crawdads Sing*](https://www.imdb.com/title/tt9411972/), I've picked two other movies and three TV series. The other two movies are [*Bird Box*](https://www.imdb.com/title/tt2737304/) released in late 2018 and [*It Chapter Two*](https://www.imdb.com/title/tt7349950/), the second installment of the adaptation of *It* by Stephen King. For the TV series I'll look at the wonderful [*Normal People*](https://www.imdb.com/title/tt9059760/) based on the novel by Sally Rooney, initially released in 2020, [*The Outsider*](https://www.imdb.com/title/tt8550800/) also based on a book by Stephen King, and [*The Handmaid's Tale*](https://www.imdb.com/title/tt5834204/) based on the book by Margaret Atwood. In the plots below the vertical dashed line indicates the release date of the adaptation.
Code for the plot below @@ -334,54 +369,63 @@ titles_w_adaptation <- tribble( "The Handmaid'S Tale", "2019-6-5" ) -rplot = list() -for (i in seq(nrow(titles_w_adaptation))) { - - book_title <- titles_w_adaptation |> - slice(i) |> - mutate(title = str_replace(title, "'S", "'s")) |> +rplot <- list() +for (i in seq_len(nrow(titles_w_adaptation))) { + book_title <- titles_w_adaptation |> + slice(i) |> + mutate(title = str_replace(title, "'S", "'s")) |> pull(title) - - rplot[[i]] <- data |> - right_join(titles_w_adaptation |> slice(i), by = "title") |> - right_join(data |> select(list_publication_date) |> distinct(), - by = "list_publication_date") |> - replace_na(list(rank = 16)) |> - ggplot(aes(x = list_publication_date, y = rank)) + + + rplot[[i]] <- data |> + right_join(titles_w_adaptation |> + slice(i), by = "title") |> + right_join(data |> select(list_publication_date) |> distinct(), + by = "list_publication_date" + ) |> + replace_na(list(rank = 16)) |> + ggplot(aes(x = list_publication_date, y = rank)) + geom_hline(yintercept = 1, color = "grey", linetype = "dotted") + ggbump::geom_bump(linewidth = 1, alpha = 0.25) + - geom_rect(aes(xmin = min(data$list_publication_date), - xmax = max(data$list_publication_date), - ymin = 15.2, ymax = Inf), fill = "grey40", show.legend = FALSE) + - geom_text(data = tibble(), aes(x = mean(data$list_publication_date), y = 15.75, - label = "Not on list"), color = "white") + + geom_rect(aes( + xmin = min(data$list_publication_date), + xmax = max(data$list_publication_date), + ymin = 15.2, ymax = Inf + ), fill = "grey40", show.legend = FALSE) + + geom_text(data = tibble(), aes( + x = mean(data$list_publication_date), y = 15.75, + label = "Not on list" + ), color = "white") + geom_vline(aes(xintercept = as.Date(screen_release_date)), - linewidth = 1, linetype = "dashed") + + linewidth = 1, linetype = "dashed" + ) + labs( title = str_glue("Ranking of _{book_title}_"), x = NULL, y = "Rank on list", color = NULL ) + - scale_y_continuous(trans = "reverse", - breaks = seq(15), - expand = expansion(add = c(0.5, 0))) + + scale_y_continuous( + trans = "reverse", + breaks = seq(15), + expand = expansion(add = c(0.5, 0)) + ) + coord_cartesian(clip = "off") + theme_minimal() + theme( plot.title.position = "plot", - plot.title = element_markdown(size = 14, face = "bold", - padding = margin(b = 10)), + plot.title = element_markdown( + size = 14, face = "bold", + padding = margin(b = 10) + ), panel.grid.minor.y = element_blank(), legend.position = "bottom", legend.direction = "vertical" ) - } -(rplot[[1]] + rplot[[2]]) / - (rplot[[3]] + rplot[[4]]) / - (rplot[[5]] + rplot[[6]]) + +(rplot[[1]] + rplot[[2]]) / + (rplot[[3]] + rplot[[4]]) / + (rplot[[5]] + rplot[[6]]) + plot_annotation( caption = "Vertical dashed line indicates the adaptation's release date" ) & diff --git a/content/blog/2023-nyt-books-api/index.qmd b/content/blog/2023-nyt-books-api/index.qmd index 946fbf7..d7962e2 100644 --- a/content/blog/2023-nyt-books-api/index.qmd +++ b/content/blog/2023-nyt-books-api/index.qmd @@ -18,6 +18,10 @@ editor_options: chunk_output_type: console --- +{{{< sidenote br="10em" >}}} +REST: represen-
tational state transfer +{{{< /sidenote >}}} + Using an API (an _application programming interface_) is a very popular tool for data scientists to pull data from a public source. I've done a bit of web scraping in previous blogposts where I collected data from for example [Wikipedia](https://danielroelfs.com/blog/the-easier-way-to-create-a-map-of-norway-using-csmaps/) and the [Olympian Database](https://danielroelfs.com/blog/dutch-performance-at-olympic-speed-skating/). However, I'd much prefer using APIs whenever possible, this way the creator has more control of what data they want to share and how it is packaged. The most common APIs use a [RESTful](https://aws.amazon.com/what-is/restful-api/#:~:text=RESTful%20API%20is%20an%20interface,applications%20to%20perform%20various%20tasks.) approach where the interface and communication between you and the server the data is stored on follows a predictable approach and maintains some common standards. Most RESTful APIs return data in JSON format. In this blogpost I want to show you a simple way to interact with a RESTful API through Python and of course we'll do some analyses on the data we collect. The API we will be working with here is the [New York Times Books API](https://developer.nytimes.com/docs/books-product/1/overview). This API provides us with access to the current and previous (up to 2008) New York Times Best Sellers lists. It allows ut to download each individual list (both fiction, non-fiction, children's, etc), including ones that are no longer updated (such as the monthly lists on Espionage and Love & Relationships). Let's keep it simple for now and only focus on the main list on fiction, called the [Combined Print & E-Book Fiction](https://www.nytimes.com/books/best-sellers/) list which appears as the list on the home page of the Best Sellers webpage. @@ -66,7 +70,7 @@ def get_list_of_dates( return date_list_str ``` -Now that we have a function that will provide a list of dates, the next ingredient we need is the API key. The NYT Books API requires a personal API key to authenticate your request. Since this key is personal I won't upload it the internet, but instead I stored it in a `.secrets.yml` file that I put in my `.gitignore` file to avoid uploading it to GitHub. We'll use the functionality from the `yaml` module to parse the file. The function will return the API key as a string. +Now that we have a function that will provide a list of dates, the next ingredient we need is the API key. The NYT Books API requires a personal API key to authenticate your request. Since this key is personal I won't upload it the internet, but instead I stored it in a `.secrets.yml` file that I put in my `.gitignore` file to avoid storing it in Git log. We'll use the functionality from the `yaml` module to parse the file. The function will return the API key as a string. ```{python} #| label: py-get-api-key @@ -148,6 +152,10 @@ def api_request(list="fiction", date="current"): return df_out ``` +{{{< sidenote br="12em" >}}} +Given these limits, you can deduce how long it takes to download data for a few years. It's quite a while... +{{{< /sidenote >}}} + Now, each of the functions above dealt with performing a single API request. Since we are interested in obtaining an historical dataset, we should run the API request for each of the weeks we returned in the `get_list_of_dates()` function. In the `download_bestseller_lists()` function, we'll first get the list of dates. Then we'll loop through the list of dates going back roughly 5 years. 
In each iteration we'll run the `api_request()` function for the specified date and concatenate the result with the existing data frame. Then, in order to avoid hitting the API rate limit we'll wait for 12 seconds, the NYT Books API is rate limited at 500 requests per day and 5 requests per minute so we need to wait 12 seconds between each request to avoid hitting the rate limit. Finally, we'll run this `download_bestseller_lists()` function for the Fiction list, and save the output in a CSV file. ```{python} @@ -190,7 +198,7 @@ Now that we've done the hard work in Python, let's do some data visualization in ```{r} #| label: load-data -data <- read_csv("./data/nyt_list_fiction.csv") |> +data <- read_csv("./data/nyt_list_fiction.csv") |> glimpse() ``` @@ -199,9 +207,11 @@ As mentioned, the dataset we downloaded does not include all history of the Best ```{r} #| label: date-range -data |> - summarise(from = min(list_publication_date), - to = max(list_publication_date)) +data |> + summarise( + from = min(list_publication_date), + to = max(list_publication_date) + ) ``` The purpose of this post is mostly to show the implementation of an API call in Python, but I want to make some plots anyway because it's fun. Let's first look at what book has spend the longest on the Best Sellers list. Just to make it easier to determine the book, we'll create a composite variable of both the book author and title before plotting and take the 20 books that have the most occurences on the list using the `count()` function and then use the `slice_max()` to limit the dataset to 20. @@ -213,21 +223,25 @@ The purpose of this post is mostly to show the implementation of an API call in data |> count(author, title, name = "n_weeks_on_list") |> - mutate(book_label = str_glue("{author} - {title}")) |> - slice_max(order_by = n_weeks_on_list, n = 20) |> + mutate(book_label = str_glue("{author} - {title}")) |> + slice_max(order_by = n_weeks_on_list, n = 20) |> ggplot(aes(x = n_weeks_on_list, y = reorder(book_label, n_weeks_on_list))) + - geom_col(fill = "#5D7B92") + - geom_text(aes(label = n_weeks_on_list), nudge_x = -7, - color = "white", fontface = "bold") + + geom_col(fill = "#5D7B92") + + geom_text(aes(label = n_weeks_on_list), + nudge_x = -7, + color = "white", fontface = "bold" + ) + geom_vline(xintercept = 0, linewidth = 1) + labs( title = "Which books have spent the longest
on the NYT Best Sellers list?", x = NULL, y = NULL ) + - scale_x_continuous(position = "top", - labels = ~ str_glue("{.x} wk"), - expand = expansion(add = c(0, 20))) + + scale_x_continuous( + position = "top", + labels = ~ str_glue("{.x} wk"), + expand = expansion(add = c(0, 20)) + ) + theme_minimal() + theme( plot.title.position = "plot", @@ -245,23 +259,27 @@ Let's also look at which authors have had the largest number of books in the lis #| code-fold: true #| code-summary: "Code for the plot below" -data |> +data |> distinct(author, title) |> count(author, name = "n_books_in_list") |> - slice_max(order_by = n_books_in_list, n = 10) |> - ggplot(aes(x = n_books_in_list, y = reorder(author, n_books_in_list))) + - geom_col(fill = "#5D7B92") + - geom_text(aes(label = n_books_in_list), nudge_x = -1, - color = "white", fontface = "bold") + + slice_max(order_by = n_books_in_list, n = 10) |> + ggplot(aes(x = n_books_in_list, y = reorder(author, n_books_in_list))) + + geom_col(fill = "#5D7B92") + + geom_text(aes(label = n_books_in_list), + nudge_x = -1, + color = "white", fontface = "bold" + ) + geom_vline(xintercept = 0, linewidth = 1) + labs( - title = "Which books have spent the longest
on the NYT Best Sellers list?", + title = "Which authors have the most books
on the NYT Best Sellers list?", x = NULL, y = NULL ) + - scale_x_continuous(position = "top", - labels = ~ str_glue("{.x} books"), - expand = expansion(add = c(0, 5))) + + scale_x_continuous( + position = "top", + labels = ~ str_glue("{.x} books"), + expand = expansion(add = c(0, 5)) + ) + theme_minimal() + theme( plot.title.position = "plot", @@ -270,6 +288,10 @@ data |> ) ``` +{{{< sidenote br="8em" >}}} +The LA Review of Books even used the term ["supermarket schlock"](https://lareviewofbooks.org/article/the-sublime-danielle-steel/) +{{{< /sidenote >}}} + This genuinely surprised me, I had expected some authors to have maybe 6 or 7 books on the list, but it turns out there are some really prolific authors out there, with Danielle Steel definitely taking the top spot. According to her separate (!) [bibliography page on Wikipedia](https://en.wikipedia.org/wiki/Danielle_Steel_bibliography) she publishes several books a year to make a total of 190 published books, including 141 novels. Wikipedia also mentions a ["resounding lack of critical acclaim"](https://www.publishersweekly.com/978-0-312-11257-8), which just emphasizes the point that the New York Times Best Sellers list is not necessarily a mark of quality, but rather of popularity, although one would hope good books are also the books that sell well. Note that some authors have received critical acclaim _and_ have published multiple popular books such as Colleen Hoover and Stephen King in the plot above. Since we have essentially time series data, we could also plot the trajectory of a book. So let's take the book that spent the longest on the Best Sellers list in our timeframe: _Where The Crawdads Sing_. We can plot rank on the list over time. We'll use the `geom_bump()` function from the `{ggbump}` package to get a nicer-looking curve. Since `geom_path()` and `geom_bump()` will show a continuous line for missing values and dropping those empty values looks ugly, we'll hide the weeks where the book wasn't on the Best Sellers list in an area below the plot. 
@@ -279,42 +301,55 @@ Since we have essentially time series data, we could also plot the trajectory of #| code-fold: true #| code-summary: "Code for the plot below" -title_of_interest = "Where The Crawdads Sing" +title_of_interest <- "Where The Crawdads Sing" -data |> - filter(title == title_of_interest) |> - right_join(data |> select(list_publication_date) |> distinct(), - by = "list_publication_date") |> - replace_na(list(rank = 16)) |> - ggplot(aes(x = list_publication_date, y = rank)) + +data |> + filter(title == title_of_interest) |> + right_join(data |> select(list_publication_date) |> distinct(), + by = "list_publication_date" + ) |> + replace_na(list(rank = 16)) |> + ggplot(aes(x = list_publication_date, y = rank)) + geom_hline(yintercept = 1, color = "grey", linetype = "dotted") + ggbump::geom_bump(linewidth = 1, color = "#5D7B92") + - geom_rect(aes(xmin = min(data$list_publication_date), - xmax = max(data$list_publication_date), - ymin = 15.2, ymax = Inf), fill = "grey40") + - geom_text(data = tibble(), aes(x = mean(data$list_publication_date), y = 15.75, - label = "Not on list"), color = "white") + + geom_rect(aes( + xmin = min(data$list_publication_date), + xmax = max(data$list_publication_date), + ymin = 15.2, ymax = Inf + ), fill = "grey40") + + geom_text(data = tibble(), aes( + x = mean(data$list_publication_date), y = 15.75, + label = "Not on list" + ), color = "white") + labs( title = str_glue("How _{title_of_interest}_ ranked over time"), x = NULL, y = "Position on Best Sellers list" ) + - scale_y_continuous(trans = "reverse", - breaks = seq(15), - expand = expansion(add = c(0.5, 0))) + + scale_y_continuous( + trans = "reverse", + breaks = seq(15), + expand = expansion(add = c(0.5, 0)) + ) + coord_cartesian(clip = "off") + theme_minimal() + theme( plot.title.position = "plot", - plot.title = element_markdown(size = 18, face = "bold", - padding = margin(b = 10)), + plot.title = element_markdown( + size = 18, face = "bold", + padding = margin(b = 10) + ), panel.grid.minor.y = element_blank() ) ``` That is quite an impressive trajectory. There are a few moments where the book leaves the Best Sellers list, but it had a spot on the list every year from 2018 when the book was published to 2023. -The jump in the plot above that happens in 2022 seemed very remarkable, jumping from spot 13 to 1 within a week. I was wondering if that had anything to do with the release of the movie adaptation. So let's have a look. The movie was released on July 15th, 2022. Let's also look at a few other books I know were on the Best Sellers list that have gotten an adaptation. This is just anecdotal and hand-picked evidence of course so needs to be taken with a grain of salt. Perhaps another time I'll find a more systematic way of testing this. In addition to [_Where The Crawdads Sing_](https://www.imdb.com/title/tt9411972/), I've picked two other movies and three TV series. The other two movies are [_Bird Box_](https://www.imdb.com/title/tt2737304/) released in late 2018 and [_It Chapter Two_](https://www.imdb.com/title/tt7349950/), the second installment of the adaptation of _It_ by Stephen King. 
For the TV series I'll look at the wonderful [_Normal People_](https://www.imdb.com/title/tt9059760/) based on the novel by Sally Rooney (who also has writing credits on the series!), initially released in 2020, [_The Outsider_](https://www.imdb.com/title/tt8550800/) also based on a book by Stephen King, and [_The Handmaid's Tale_](https://www.imdb.com/title/tt5834204/) based on the book by Margaret Atwood. In the plots below the vertical dashed line indicates the release date of the adaptation. +{{{< sidenote br="18em" >}}} +Sally Rooney actually has [writing credits](https://www.imdb.com/name/nm10745607/?ref_=nv_sr_srsg_0_tt_0_nm_8_q_sally%2520roon) on the series too! +{{{< /sidenote >}}} + +The jump in the plot above that happens in 2022 seemed very remarkable, jumping from spot 13 to 1 within a week. I was wondering if that had anything to do with the release of the movie adaptation. So let's have a look. The movie was released on July 15th, 2022. Let's also look at a few other books I know were on the Best Sellers list that have gotten an adaptation. This is just anecdotal and hand-picked evidence of course so needs to be taken with a grain of salt. Perhaps another time I'll find a more systematic way of testing this. In addition to [_Where The Crawdads Sing_](https://www.imdb.com/title/tt9411972/), I've picked two other movies and three TV series. The other two movies are [_Bird Box_](https://www.imdb.com/title/tt2737304/) released in late 2018 and [_It Chapter Two_](https://www.imdb.com/title/tt7349950/), the second installment of the adaptation of _It_ by Stephen King. For the TV series I'll look at the wonderful [_Normal People_](https://www.imdb.com/title/tt9059760/) based on the novel by Sally Rooney, initially released in 2020, [_The Outsider_](https://www.imdb.com/title/tt8550800/) also based on a book by Stephen King, and [_The Handmaid's Tale_](https://www.imdb.com/title/tt5834204/) based on the book by Margaret Atwood. In the plots below the vertical dashed line indicates the release date of the adaptation. 
```{r} #| label: plot-trajectory-adaptations @@ -333,54 +368,63 @@ titles_w_adaptation <- tribble( "The Handmaid'S Tale", "2019-6-5" ) -rplot = list() -for (i in seq(nrow(titles_w_adaptation))) { - - book_title <- titles_w_adaptation |> - slice(i) |> - mutate(title = str_replace(title, "'S", "'s")) |> +rplot <- list() +for (i in seq_len(nrow(titles_w_adaptation))) { + book_title <- titles_w_adaptation |> + slice(i) |> + mutate(title = str_replace(title, "'S", "'s")) |> pull(title) - - rplot[[i]] <- data |> - right_join(titles_w_adaptation |> slice(i), by = "title") |> - right_join(data |> select(list_publication_date) |> distinct(), - by = "list_publication_date") |> - replace_na(list(rank = 16)) |> - ggplot(aes(x = list_publication_date, y = rank)) + + + rplot[[i]] <- data |> + right_join(titles_w_adaptation |> + slice(i), by = "title") |> + right_join(data |> select(list_publication_date) |> distinct(), + by = "list_publication_date" + ) |> + replace_na(list(rank = 16)) |> + ggplot(aes(x = list_publication_date, y = rank)) + geom_hline(yintercept = 1, color = "grey", linetype = "dotted") + ggbump::geom_bump(linewidth = 1, alpha = 0.25) + - geom_rect(aes(xmin = min(data$list_publication_date), - xmax = max(data$list_publication_date), - ymin = 15.2, ymax = Inf), fill = "grey40", show.legend = FALSE) + - geom_text(data = tibble(), aes(x = mean(data$list_publication_date), y = 15.75, - label = "Not on list"), color = "white") + + geom_rect(aes( + xmin = min(data$list_publication_date), + xmax = max(data$list_publication_date), + ymin = 15.2, ymax = Inf + ), fill = "grey40", show.legend = FALSE) + + geom_text(data = tibble(), aes( + x = mean(data$list_publication_date), y = 15.75, + label = "Not on list" + ), color = "white") + geom_vline(aes(xintercept = as.Date(screen_release_date)), - linewidth = 1, linetype = "dashed") + + linewidth = 1, linetype = "dashed" + ) + labs( title = str_glue("Ranking of _{book_title}_"), x = NULL, y = "Rank on list", color = NULL ) + - scale_y_continuous(trans = "reverse", - breaks = seq(15), - expand = expansion(add = c(0.5, 0))) + + scale_y_continuous( + trans = "reverse", + breaks = seq(15), + expand = expansion(add = c(0.5, 0)) + ) + coord_cartesian(clip = "off") + theme_minimal() + theme( plot.title.position = "plot", - plot.title = element_markdown(size = 14, face = "bold", - padding = margin(b = 10)), + plot.title = element_markdown( + size = 14, face = "bold", + padding = margin(b = 10) + ), panel.grid.minor.y = element_blank(), legend.position = "bottom", legend.direction = "vertical" ) - } -(rplot[[1]] + rplot[[2]]) / - (rplot[[3]] + rplot[[4]]) / - (rplot[[5]] + rplot[[6]]) + +(rplot[[1]] + rplot[[2]]) / + (rplot[[3]] + rplot[[4]]) / + (rplot[[5]] + rplot[[6]]) + plot_annotation( caption = "Vertical dashed line indicates the adaptation's release date" ) & @@ -421,7 +465,3 @@ get_nyt_lists() ``` If you wish to explore other APIs, there's [a ton of public APIs](https://github.com/public-apis/public-apis) that you can explore using the approach discussed here. Thanks for reading, and thanks sticking all the way through! Happy coding! 
- - - - diff --git a/content/blog/2023-nyt-books-api/requirements.txt b/content/blog/2023-nyt-books-api/requirements.txt new file mode 100644 index 0000000..6d53b6a --- /dev/null +++ b/content/blog/2023-nyt-books-api/requirements.txt @@ -0,0 +1,12 @@ +certifi==2023.7.22 +charset-normalizer==3.3.0 +idna==3.4 +numpy==1.26.0 +pandas==2.1.1 +python-dateutil==2.8.2 +pytz==2023.3.post1 +PyYAML==6.0.1 +requests==2.31.0 +six==1.16.0 +tzdata==2023.3 +urllib3==2.0.6 diff --git a/content/blog/2023-sunrise-sunset-differences/index.md b/content/blog/2023-sunrise-sunset-differences/index.md index 21ca6c0..998d35b 100644 --- a/content/blog/2023-sunrise-sunset-differences/index.md +++ b/content/blog/2023-sunrise-sunset-differences/index.md @@ -15,6 +15,8 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- Today I want to share a project that deals with data scientist's favorite kind of data: date- and timedata involving time zones and daylight savings time. Who doesn't love it? @@ -28,15 +30,17 @@ library(tidyverse) library(lubridate) library(ggtext) -reticulate::use_virtualenv('./.venv', required = TRUE) +reticulate::use_virtualenv("./.venv", required = TRUE) -theme_custom <- function(...){ +theme_custom <- function(...) { ggthemes::theme_fivethirtyeight(...) + - theme(plot.background = element_rect(fill = "transparent"), - panel.background = element_rect(fill = "transparent"), - legend.background = element_rect(fill = "transparent"), - legend.key = element_rect(fill = "transparent")) - } + theme( + plot.background = element_rect(fill = "transparent"), + panel.background = element_rect(fill = "transparent"), + legend.background = element_rect(fill = "transparent"), + legend.key = element_rect(fill = "transparent") + ) +} ``` For the Python part, we'll use `pandas` as usual, the `datetime` and `time` module to deal with date- and timedata. The `dateutil` module to deal with time zones. And to get the actual raw data we'll use the `astral` module to get the sunrise and sunset times for different locations. Unfortunately, not all cities are included, it includes all capital cities plus some additional cities in the UK and USA. To get the coordinates for the cities we want to look at, we'll use the `geopy` module which allows you to look up the longitude, latitude of a given location without needing to create an account somewhere or dealing with an API (directly). To deal with some wrangling of strings we'll use the `re` module. @@ -52,80 +56,84 @@ from geopy.geocoders import Nominatim import re ``` +{{< sidenote br="20em" >}} +If you thought working with dates and times was a headache, imagine adding DST to the mix as well +{{< /sidenote >}} + For this project we'll collect a number of variables related to sunrise and sunset for different locations throughout the year. The `astral` module makes this very easy, and in order to keep things clean and efficient we'll create a function to efficient collect this data. We'll write it in such a way that it can collect a number of locations within one function call using a list input (which isn't the cleanest, but works quite well here). We'll provide the list of cities we want to analyze in the form `'/'` to avoid any possible misconceptions (e.g. Cambridge, Cambridgeshire in the UK or Cambridge, MA in the US). We'll then extract the coordinates for that location using the `geopy` module. We'll also supply the reference time zone (for this project Central Europe). 
The `astral` module has a [known issue](https://github.com/sffjunkie/astral/issues/67) when dusk happens past midnight. So we'll extract everything in [UTC](https://en.wikipedia.org/wiki/Coordinated_Universal_Time) and convert it to the time zone of interest later. We'll collect the data throughout the year for the various locations in a loop. And we'll cycle through the year using a `while` loop. We'll also collect the difference in time from the [Daylights Savings Time](https://en.wikipedia.org/wiki/Daylight_saving_time). We'll convert it to the time zone of interest using the `_convert_timezone()` function. In the final data frame we'll only get the times (not the dates) so in case the dusk happens past midnight, we'll consider that "no dusk takes place this day" instead of having it happen "early in the morning the same day", implemented in the `_fix_dusks_past_midnight()` function. This will help with the plots later. Finally we'll convert some of the variables to a format that'll make it easy to deal with in R later. ``` python def _convert_timezone(x, to_tz=tz.tzlocal()): - ''' - Convert the default time zone to local - ''' - - x_out = x.apply(lambda k: k.astimezone(to_tz)) - - return x_out + """ + Convert the default time zone to local + """ + + x_out = x.apply(lambda k: k.astimezone(to_tz)) + + return x_out + def _fix_dusks_past_midnight(sunset, dusk): - ''' - Replace the dusk time with NaN if it's past midnight - ''' - - sunset_dt = datetime.strptime(sunset, '%H:%M:%S') - dusk_dt = datetime.strptime(dusk, '%H:%M:%S') - - if dusk_dt < sunset_dt: - dusk_out = '23:59:59' - else: - dusk_out = dusk_dt.strftime('%H:%M:%S') - - return dusk_out - + """ + Replace the dusk time with NaN if it's past midnight + """ + + sunset_dt = datetime.strptime(sunset, "%H:%M:%S") + dusk_dt = datetime.strptime(dusk, "%H:%M:%S") + + if dusk_dt < sunset_dt: + dusk_out = "23:59:59" + else: + dusk_out = dusk_dt.strftime("%H:%M:%S") + + return dusk_out + + def get_sun_data( - cities=['Norway/Oslo', 'Netherlands/Amsterdam'], - ref_tz='Europe/Berlin'): - ''' + cities=["Norway/Oslo", "Netherlands/Amsterdam"], + ref_tz="Europe/Berlin" +): + """ Get sunset data from location - ''' - + """ + geolocator = Nominatim(user_agent="sunset-sunrise-app") - + df = pd.DataFrame() for i, city in enumerate(cities): df_tmp = pd.DataFrame() - + loc = geolocator.geocode(city) city_loc = LocationInfo( - timezone=city, - latitude=loc.latitude, - longitude=loc.longitude - ) - - start_date = datetime.strptime( - date.today().strftime('%Y-01-01'), '%Y-%m-%d') - end_date = datetime.strptime( - date.today().strftime('%Y-12-31'), '%Y-%m-%d') + timezone=city, latitude=loc.latitude, longitude=loc.longitude + ) + + start_date = datetime.strptime(date.today().strftime("%Y-01-01"), "%Y-%m-%d") + end_date = datetime.strptime(date.today().strftime("%Y-12-31"), "%Y-%m-%d") delta = timedelta(days=1) while start_date <= end_date: s = sun(city_loc.observer, date=start_date) - s['dst'] = time.localtime(start_date.timestamp()).tm_isdst + s["dst"] = time.localtime(start_date.timestamp()).tm_isdst df_tmp = pd.concat([df_tmp, pd.DataFrame(s, index=[0])]) start_date += delta - df_tmp['city_no'] = i + 1 - df_tmp['location'] = city_loc.timezone - df_tmp['lat'] = loc.latitude - df_tmp['long'] = loc.longitude + df_tmp["city_no"] = i + 1 + df_tmp["location"] = city_loc.timezone + df_tmp["lat"] = loc.latitude + df_tmp["long"] = loc.longitude df = pd.concat([df, df_tmp]) df.reset_index(drop=True, inplace=True) - df['date'] = df['noon'].dt.strftime('%Y-%m-%d') 
- df['dst'] = df['dst'].shift(-1, fill_value=0) - df['city'] = df['location'].apply(lambda x: re.findall('\\/(.*)', x)[0]) + df["date"] = df["noon"].dt.strftime("%Y-%m-%d") + df["dst"] = df["dst"].shift(-1, fill_value=0) + df["city"] = df["location"].apply(lambda x: re.findall("\\/(.*)", x)[0]) - cols = ['dawn', 'sunrise', 'noon', 'sunset', 'dusk'] + cols = ["dawn", "sunrise", "noon", "sunset", "dusk"] df[cols] = df[cols].apply(lambda k: _convert_timezone(k, to_tz=ref_tz)) - df[cols] = df[cols].apply(lambda k: k.dt.strftime('%H:%M:%S')) - df['dusk'] = df.apply( - lambda k: _fix_dusks_past_midnight(sunset=k['sunset'], dusk=k['dusk']), axis=1 - ) + df[cols] = df[cols].apply(lambda k: k.dt.strftime("%H:%M:%S")) + df["dusk"] = df.apply( + lambda k: _fix_dusks_past_midnight(sunset=k["sunset"], + dusk=k["dusk"]), axis=1 + ) return df ``` @@ -134,9 +142,10 @@ Let's run this function once for Oslo, Amsterdam, Warsaw, and Madrid and once fo ``` python df = get_sun_data( - ['Norway/Oslo', 'Netherlands/Amsterdam', 'Poland/Warsaw', 'Spain/Madrid'] + ["Norway/Oslo", "Netherlands/Amsterdam", + "Poland/Warsaw", "Spain/Madrid"] ) -df_deux = get_sun_data(['Norway/Oslo', 'Netherlands/Amsterdam']) +df_deux = get_sun_data(["Norway/Oslo", "Netherlands/Amsterdam"]) print(df.info(), df.head()) ``` @@ -174,14 +183,16 @@ Here's where we'll define a function to get everything into R and create a data ``` r parse_sun_data <- function(df) { #' Get the data frame from Python into R - - data <- tibble(df) |> - relocate(c("city","date"), .before = "dawn") |> - mutate(date = parse_date(date, "%Y-%m-%d"), - across(dawn:dusk, ~ parse_time(.x, format = "%H:%M:%S")), - day_length = hms::as_hms(sunset - sunrise), - city = fct_reorder(city, city_no)) - + + data <- tibble(df) |> + relocate(c("city", "date"), .before = "dawn") |> + mutate( + date = parse_date(date, "%Y-%m-%d"), + across(dawn:dusk, ~ parse_time(.x, format = "%H:%M:%S")), + day_length = hms::as_hms(sunset - sunrise), + city = fct_reorder(city, city_no) + ) + return(data) } ``` @@ -189,16 +200,23 @@ parse_sun_data <- function(df) { Let's have a look at when sunset happens throughout the year for these cities. I deliberately picked a few cities on extreme ends of the Central European time zone to show the variety within a single time zone. We'll include the [DST](https://en.wikipedia.org/wiki/Daylight_saving_time) also. ``` r -data <- parse_sun_data(reticulate::py$df) - -data |> - ggplot(aes(x = date, y = sunset, color = city, group = city)) + - geom_line(linewidth = 2, lineend = "round", key_glyph = "point") + - labs(x = NULL, - y = "Sunset time", - color = NULL) + +data <- parse_sun_data(reticulate::py$df) + +data |> + ggplot(aes(x = date, y = sunset, color = city, group = city)) + + geom_line( + linewidth = 2, lineend = "round", + key_glyph = "point" + ) + + labs( + x = NULL, + y = "Sunset time", + color = NULL + ) + scale_x_date(labels = scales::label_date(format = "%B")) + - ggthemes::scale_color_tableau(guide = guide_legend(override.aes = list(size = 4))) + + ggthemes::scale_color_tableau( + guide = guide_legend(override.aes = list(size = 4)) + ) + theme_custom() ``` @@ -207,35 +225,29 @@ data |> As expected, the latitude (how far north or south a location is) has the most to say about sunset times in summer in winter. Oslo has both the earliest and latest sunsets in winter and summer. And also predictable, the longitude (how far east or west a location is) merely shifts the curve up or down. 
Madrid has later sunsets in summer than Warsaw (which is further north) but also later sunsets in winter and a flatter curve overall. Oslo has a much steeper curve than e.g. Amsterdam while simultaneously also having the curve slightly shifted due to Oslo being further east than Amsterdam. The plot below shows the locations of of the cities in this analysis. ``` r -rnaturalearth::ne_countries(scale = "medium", - returnclass = "sf", - continent = "Europe") |> - ggplot() + +rnaturalearth::ne_countries( + scale = "medium", + returnclass = "sf", + continent = "Europe" +) |> + ggplot() + geom_sf(color = "grey60", fill = "#DDD5C7", linewidth = 0.1) + - geom_point(data = data |> distinct(city, lat, long), - aes(x = long, y = lat, color = city), - shape = 18, size = 4) + - labs(color = NULL) + + geom_point( + data = data |> distinct(city, lat, long), + aes(x = long, y = lat, color = city), + shape = 18, size = 4 + ) + + labs(color = NULL) + ggthemes::scale_color_tableau() + coord_sf(xlim = c(-15, 25), ylim = c(35, 65)) + - theme_custom() + - theme(legend.position = "right", - legend.direction = "vertical", - panel.grid.major = element_line(size = 0.1)) + theme_custom() + + theme( + legend.position = "right", + legend.direction = "vertical", + panel.grid.major = element_line(size = 0.1) + ) ``` - The legacy packages maptools, rgdal, and rgeos, underpinning the sp package, - which was just loaded, will retire in October 2023. - Please refer to R-spatial evolution reports for details, especially - https://r-spatial.org/r/2023/05/15/evolution4.html. - It may be desirable to make the sf package available; - package maintainers should consider adding sf to Suggests:. - The sp package is now running under evolution status 2 - (status 2 uses the sf package in place of rgdal) - - Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0. - ℹ Please use the `linewidth` argument instead. - Perhaps we can have a look at day length to get a better reflection of the isolated effect of latitude. To make the plot more legible, we'll use only 2 cities this time. @@ -243,54 +255,71 @@ Perhaps we can have a look at day length to get a better reflection of the isola We'll create the plot using the `geom_ribbon()` function to shade the area between sunrise and sunset. 
``` r -parse_sun_data(reticulate::py$df_deux) |> - ggplot(aes(xmin = date, x = date, xmax = date, - ymin = sunrise, y = noon, ymax = sunset, - fill = city, color = city, group = city)) + - geom_hline(yintercept = parse_time("12:00", "%H:%M"), - size = 1) + +parse_sun_data(reticulate::py$df_deux) |> + ggplot(aes( + xmin = date, x = date, xmax = date, + ymin = sunrise, y = noon, ymax = sunset, + fill = city, color = city, group = city + )) + + geom_hline( + yintercept = parse_time("12:00", "%H:%M"), + linewidth = 1 + ) + geom_ribbon(alpha = 0.5, color = NA, key_glyph = "point") + - labs(x = NULL, - y = NULL, - color = NULL, - fill = NULL) + - scale_x_date(expand = expansion(add = 0)) + - scale_y_time(limits = hms::hms(hours = c(0, 24)), - breaks = hms::hms(hours = seq(0, 24, 2))) + + labs( + x = NULL, + y = NULL, + color = NULL, + fill = NULL + ) + + scale_x_date(expand = expansion(add = 0)) + + scale_y_time( + limits = hms::hms(hours = c(0, 24)), + breaks = hms::hms(hours = seq(0, 24, 2)) + ) + ggthemes::scale_fill_tableau(guide = guide_legend( - override.aes = list(shape = 21, alpha = 0.5, size = 8))) + + override.aes = list(shape = 21, alpha = 0.5, size = 8) + )) + theme_custom() ``` - Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0. - ℹ Please use `linewidth` instead. - -As you see, in neither cities 12:00 in the afternoon is perfectly in the middle between sunrise and sunset, although this trend is slightly enhanced for Oslo (again due to longitude differences). We can also see that the length of days in Oslo is longer in summer particularly because the mornings start earlier. We'll get back to the distance between sunset and dusk, which is also a lot larger in Oslo which means darkness sets in much later in Oslo despite sunset happening around the same time. This means it's lighter outside longer than in Amsterdam. +{{< sidenote br="1em" >}} +The difference between dusk and sunset is also a lot larger in Oslo, but we'll get back to that +{{< /sidenote >}} + +As you see, in neither cities 12:00 in the afternoon is perfectly in the middle between sunrise and sunset, although this trend is slightly enhanced for Oslo (again due to longitude differences). We can also see that the length of days in Oslo is longer in summer particularly because the mornings start earlier. So let's also squish everything on the bottom axis and just look at day length in total. 
``` r -parse_sun_data(reticulate::py$df_deux) |> - mutate(day_length = hms::as_hms(sunset - sunrise)) |> - ggplot(aes(x = date, y = day_length, fill = city, group = city)) + +parse_sun_data(reticulate::py$df_deux) |> + mutate(day_length = hms::as_hms(sunset - sunrise)) |> + ggplot(aes(x = date, y = day_length, fill = city, group = city)) + geom_area(position = "identity", alpha = 0.5, key_glyph = "point") + - labs(title = "Comparison in day length across the year", - x = NULL, - y = "Day length (_hours_)", - fill = NULL) + + labs( + title = "Comparison in day length across the year", + x = NULL, + y = "Day length (_hours_)", + fill = NULL + ) + scale_x_date(labels = scales::label_date(format = "%B")) + - scale_y_time(labels = scales::label_time(format = "%H hours"), - limits = parse_time(c("00:00", "23:59"), "%H:%M"), - breaks = hms::hms(hours = seq(0, 24, 4)), - expand = expansion(add = 0)) + + scale_y_time( + labels = scales::label_time(format = "%H hours"), + limits = parse_time(c("00:00", "23:59"), "%H:%M"), + breaks = hms::hms(hours = seq(0, 24, 4)), + expand = expansion(add = 0) + ) + ggthemes::scale_fill_tableau(guide = guide_legend( - override.aes = list(shape = 21, size = 8), direction = "vertical")) + + override.aes = list(shape = 21, size = 8), direction = "vertical" + )) + theme_custom() + - theme(plot.title.position = "plot", - axis.title.y = element_markdown(), - legend.position = c(0.85, 0.85)) + theme( + plot.title.position = "plot", + axis.title.y = element_markdown(), + legend.position = c(0.85, 0.85) + ) ``` @@ -298,12 +327,14 @@ parse_sun_data(reticulate::py$df_deux) |> Now I'm curious when the largest difference occurs and when the days are roughly equal. Let's select the relevant columns and get the day length in Oslo and Amsterdam next to each other using the `pivot_wider()` function so we can easily compare. Let's then select the largest and smallest differences. ``` r -day_length_diff <- parse_sun_data(reticulate::py$df_deux) |> - select(city, date, day_length) |> - group_by(city) |> - pivot_wider(id_cols = date, names_from = city, values_from = day_length, - names_glue = "day_length_{city}") |> - janitor::clean_names() |> +day_length_diff <- parse_sun_data(reticulate::py$df_deux) |> + select(city, date, day_length) |> + group_by(city) |> + pivot_wider( + id_cols = date, names_from = city, values_from = day_length, + names_glue = "day_length_{city}" + ) |> + janitor::clean_names() |> mutate(difference = hms::as_hms(day_length_oslo - day_length_amsterdam)) print("Biggest positive difference:") @@ -338,20 +369,29 @@ print(day_length_diff |> slice_min(abs(difference), n = 5)) Seems like the largest difference occur around the 20th of June when Oslo gets about 2 full hours of daylight more than Oslo. -Finally, let's look at the effect of longitude (how far east or west a place is) on the sunset/sunrise times. The easiest way to look at this is to compare the time solar noon happens. Due to (most of) mainland Europe being in the same time zone ([CET](https://en.wikipedia.org/wiki/Central_European_Time)), it's 12:00 at the same time across central Europe. However, the sun did not get the note and still travels (from Earth perspective) in a very consistent pace from east to west. This means that the sun arrives first in Poland and leaves last in Spain. This means that solar noon (the time the sun is at its highest point in the sky) is unequal across the continent. Let's look at when solar noon happens across the four cities in this analysis. 
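As a rough sanity check on that two-hour midsummer gap: the textbook sunrise equation, cos(hour angle) = -tan(latitude) * tan(declination), gives day length from latitude alone. Plugging in approximate latitudes (about 59.9°N for Oslo and 52.4°N for Amsterdam, assumed here rather than taken from the geocoded data) and a June-solstice declination of roughly 23.44°, while ignoring refraction and the size of the solar disc, lands in the same ballpark.

``` python
import math


def day_length_hours(latitude_deg, declination_deg=23.44):
    """Approximate day length from the sunrise equation (ignores refraction)."""
    phi = math.radians(latitude_deg)
    delta = math.radians(declination_deg)
    # Hour angle of sunrise/sunset; clamping handles polar day and polar night
    cos_omega = max(-1.0, min(1.0, -math.tan(phi) * math.tan(delta)))
    omega_deg = math.degrees(math.acos(cos_omega))
    return 2 * omega_deg / 15  # the sun covers 15 degrees per hour


oslo, amsterdam = day_length_hours(59.9), day_length_hours(52.4)
print(f"Oslo: {oslo:.1f} h, Amsterdam: {amsterdam:.1f} h")
print(f"Difference at the June solstice: {oslo - amsterdam:.1f} h")  # about 2 h
```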
+{{< sidenote br="1em" >}} +solar noon: the time the sun is at its highest point in the sky +{{< /sidenote >}} + +Finally, let's look at the effect of longitude (how far east or west a place is) on the sunset/sunrise times. The easiest way to look at this is to compare the time solar noon happens. Due to (most of) mainland Europe being in the same time zone ([CET](https://en.wikipedia.org/wiki/Central_European_Time)), it's 12:00 at the same time across central Europe. However, the sun did not get the note and still travels (from Earth perspective) in a very consistent pace from east to west. This means that the sun arrives first in Poland and leaves last in Spain. This means that solar noon is unequal across the continent. Let's look at when solar noon happens across the four cities in this analysis. ``` r -data |> +data |> ggplot(aes(x = date, y = noon, group = city, color = city)) + geom_hline(yintercept = hms::hms(hours = 12), linewidth = 1) + geom_line(linewidth = 2, key_glyph = "point") + - labs(title = "What time is solar noon?", - color = NULL) + - scale_x_date(expand = expansion(add = 0)) + - scale_y_time(limits = hms::hms(hours = c(11, 15)), - breaks = hms::hms(hours = seq(11, 15, 0.5))) + + labs( + title = "What time is solar noon?", + color = NULL + ) + + scale_x_date(expand = expansion(add = 0)) + + scale_y_time( + limits = hms::hms(hours = c(11, 15)), + breaks = hms::hms(hours = seq(11, 15, 0.5)) + ) + ggthemes::scale_color_tableau(guide = guide_legend( - override.aes = list(size = 4))) + + override.aes = list(size = 4) + )) + ggthemes::scale_fill_tableau() + theme_custom() ``` diff --git a/content/blog/2023-sunrise-sunset-differences/index.qmd b/content/blog/2023-sunrise-sunset-differences/index.qmd index 272ca35..1dcc1f4 100644 --- a/content/blog/2023-sunrise-sunset-differences/index.qmd +++ b/content/blog/2023-sunrise-sunset-differences/index.qmd @@ -15,6 +15,8 @@ execute: fig.show: hold results: hold out.width: 80% +editor_options: + chunk_output_type: console --- Today I want to share a project that deals with data scientist's favorite kind of data: date- and timedata involving time zones and daylight savings time. Who doesn't love it? @@ -31,15 +33,17 @@ library(tidyverse) library(lubridate) library(ggtext) -reticulate::use_virtualenv('./.venv', required = TRUE) +reticulate::use_virtualenv("./.venv", required = TRUE) -theme_custom <- function(...){ +theme_custom <- function(...) { ggthemes::theme_fivethirtyeight(...) + - theme(plot.background = element_rect(fill = "transparent"), - panel.background = element_rect(fill = "transparent"), - legend.background = element_rect(fill = "transparent"), - legend.key = element_rect(fill = "transparent")) - } + theme( + plot.background = element_rect(fill = "transparent"), + panel.background = element_rect(fill = "transparent"), + legend.background = element_rect(fill = "transparent"), + legend.key = element_rect(fill = "transparent") + ) +} ``` For the Python part, we'll use `pandas` as usual, the `datetime` and `time` module to deal with date- and timedata. The `dateutil` module to deal with time zones. And to get the actual raw data we'll use the `astral` module to get the sunrise and sunset times for different locations. Unfortunately, not all cities are included, it includes all capital cities plus some additional cities in the UK and USA. 
To get the coordinates for the cities we want to look at, we'll use the `geopy` module which allows you to look up the longitude, latitude of a given location without needing to create an account somewhere or dealing with an API (directly). To deal with some wrangling of strings we'll use the `re` module. @@ -57,82 +61,86 @@ from geopy.geocoders import Nominatim import re ``` +{{{< sidenote br="20em" >}}} +If you thought working with dates and times was a headache, imagine adding DST to the mix as well +{{{< /sidenote >}}} + For this project we'll collect a number of variables related to sunrise and sunset for different locations throughout the year. The `astral` module makes this very easy, and in order to keep things clean and efficient we'll create a function to efficient collect this data. We'll write it in such a way that it can collect a number of locations within one function call using a list input (which isn't the cleanest, but works quite well here). We'll provide the list of cities we want to analyze in the form `'/'` to avoid any possible misconceptions (e.g. Cambridge, Cambridgeshire in the UK or Cambridge, MA in the US). We'll then extract the coordinates for that location using the `geopy` module. We'll also supply the reference time zone (for this project Central Europe). The `astral` module has a [known issue](https://github.com/sffjunkie/astral/issues/67) when dusk happens past midnight. So we'll extract everything in [UTC](https://en.wikipedia.org/wiki/Coordinated_Universal_Time) and convert it to the time zone of interest later. We'll collect the data throughout the year for the various locations in a loop. And we'll cycle through the year using a `while` loop. We'll also collect the difference in time from the [Daylights Savings Time](https://en.wikipedia.org/wiki/Daylight_saving_time). We'll convert it to the time zone of interest using the `_convert_timezone()` function. In the final data frame we'll only get the times (not the dates) so in case the dusk happens past midnight, we'll consider that "no dusk takes place this day" instead of having it happen "early in the morning the same day", implemented in the `_fix_dusks_past_midnight()` function. This will help with the plots later. Finally we'll convert some of the variables to a format that'll make it easy to deal with in R later. 
```{python} #| label: py-define-extraction-function def _convert_timezone(x, to_tz=tz.tzlocal()): - ''' - Convert the default time zone to local - ''' - - x_out = x.apply(lambda k: k.astimezone(to_tz)) - - return x_out + """ + Convert the default time zone to local + """ + + x_out = x.apply(lambda k: k.astimezone(to_tz)) + + return x_out + def _fix_dusks_past_midnight(sunset, dusk): - ''' - Replace the dusk time with NaN if it's past midnight - ''' - - sunset_dt = datetime.strptime(sunset, '%H:%M:%S') - dusk_dt = datetime.strptime(dusk, '%H:%M:%S') - - if dusk_dt < sunset_dt: - dusk_out = '23:59:59' - else: - dusk_out = dusk_dt.strftime('%H:%M:%S') - - return dusk_out - + """ + Replace the dusk time with NaN if it's past midnight + """ + + sunset_dt = datetime.strptime(sunset, "%H:%M:%S") + dusk_dt = datetime.strptime(dusk, "%H:%M:%S") + + if dusk_dt < sunset_dt: + dusk_out = "23:59:59" + else: + dusk_out = dusk_dt.strftime("%H:%M:%S") + + return dusk_out + + def get_sun_data( - cities=['Norway/Oslo', 'Netherlands/Amsterdam'], - ref_tz='Europe/Berlin'): - ''' + cities=["Norway/Oslo", "Netherlands/Amsterdam"], + ref_tz="Europe/Berlin" +): + """ Get sunset data from location - ''' - + """ + geolocator = Nominatim(user_agent="sunset-sunrise-app") - + df = pd.DataFrame() for i, city in enumerate(cities): df_tmp = pd.DataFrame() - + loc = geolocator.geocode(city) city_loc = LocationInfo( - timezone=city, - latitude=loc.latitude, - longitude=loc.longitude - ) - - start_date = datetime.strptime( - date.today().strftime('%Y-01-01'), '%Y-%m-%d') - end_date = datetime.strptime( - date.today().strftime('%Y-12-31'), '%Y-%m-%d') + timezone=city, latitude=loc.latitude, longitude=loc.longitude + ) + + start_date = datetime.strptime(date.today().strftime("%Y-01-01"), "%Y-%m-%d") + end_date = datetime.strptime(date.today().strftime("%Y-12-31"), "%Y-%m-%d") delta = timedelta(days=1) while start_date <= end_date: s = sun(city_loc.observer, date=start_date) - s['dst'] = time.localtime(start_date.timestamp()).tm_isdst + s["dst"] = time.localtime(start_date.timestamp()).tm_isdst df_tmp = pd.concat([df_tmp, pd.DataFrame(s, index=[0])]) start_date += delta - df_tmp['city_no'] = i + 1 - df_tmp['location'] = city_loc.timezone - df_tmp['lat'] = loc.latitude - df_tmp['long'] = loc.longitude + df_tmp["city_no"] = i + 1 + df_tmp["location"] = city_loc.timezone + df_tmp["lat"] = loc.latitude + df_tmp["long"] = loc.longitude df = pd.concat([df, df_tmp]) df.reset_index(drop=True, inplace=True) - df['date'] = df['noon'].dt.strftime('%Y-%m-%d') - df['dst'] = df['dst'].shift(-1, fill_value=0) - df['city'] = df['location'].apply(lambda x: re.findall('\\/(.*)', x)[0]) + df["date"] = df["noon"].dt.strftime("%Y-%m-%d") + df["dst"] = df["dst"].shift(-1, fill_value=0) + df["city"] = df["location"].apply(lambda x: re.findall("\\/(.*)", x)[0]) - cols = ['dawn', 'sunrise', 'noon', 'sunset', 'dusk'] + cols = ["dawn", "sunrise", "noon", "sunset", "dusk"] df[cols] = df[cols].apply(lambda k: _convert_timezone(k, to_tz=ref_tz)) - df[cols] = df[cols].apply(lambda k: k.dt.strftime('%H:%M:%S')) - df['dusk'] = df.apply( - lambda k: _fix_dusks_past_midnight(sunset=k['sunset'], dusk=k['dusk']), axis=1 - ) + df[cols] = df[cols].apply(lambda k: k.dt.strftime("%H:%M:%S")) + df["dusk"] = df.apply( + lambda k: _fix_dusks_past_midnight(sunset=k["sunset"], + dusk=k["dusk"]), axis=1 + ) return df @@ -144,9 +152,10 @@ Let's run this function once for Oslo, Amsterdam, Warsaw, and Madrid and once fo #| label: run-python-script df = get_sun_data( - 
['Norway/Oslo', 'Netherlands/Amsterdam', 'Poland/Warsaw', 'Spain/Madrid'] + ["Norway/Oslo", "Netherlands/Amsterdam", + "Poland/Warsaw", "Spain/Madrid"] ) -df_deux = get_sun_data(['Norway/Oslo', 'Netherlands/Amsterdam']) +df_deux = get_sun_data(["Norway/Oslo", "Netherlands/Amsterdam"]) print(df.info(), df.head()) ``` @@ -158,14 +167,16 @@ Here's where we'll define a function to get everything into R and create a data parse_sun_data <- function(df) { #' Get the data frame from Python into R - - data <- tibble(df) |> - relocate(c("city","date"), .before = "dawn") |> - mutate(date = parse_date(date, "%Y-%m-%d"), - across(dawn:dusk, ~ parse_time(.x, format = "%H:%M:%S")), - day_length = hms::as_hms(sunset - sunrise), - city = fct_reorder(city, city_no)) - + + data <- tibble(df) |> + relocate(c("city", "date"), .before = "dawn") |> + mutate( + date = parse_date(date, "%Y-%m-%d"), + across(dawn:dusk, ~ parse_time(.x, format = "%H:%M:%S")), + day_length = hms::as_hms(sunset - sunrise), + city = fct_reorder(city, city_no) + ) + return(data) } ``` @@ -175,16 +186,23 @@ Let's have a look at when sunset happens throughout the year for these cities. I ```{r} #| label: plot-sunset -data <- parse_sun_data(reticulate::py$df) - -data |> - ggplot(aes(x = date, y = sunset, color = city, group = city)) + - geom_line(linewidth = 2, lineend = "round", key_glyph = "point") + - labs(x = NULL, - y = "Sunset time", - color = NULL) + +data <- parse_sun_data(reticulate::py$df) + +data |> + ggplot(aes(x = date, y = sunset, color = city, group = city)) + + geom_line( + linewidth = 2, lineend = "round", + key_glyph = "point" + ) + + labs( + x = NULL, + y = "Sunset time", + color = NULL + ) + scale_x_date(labels = scales::label_date(format = "%B")) + - ggthemes::scale_color_tableau(guide = guide_legend(override.aes = list(size = 4))) + + ggthemes::scale_color_tableau( + guide = guide_legend(override.aes = list(size = 4)) + ) + theme_custom() ``` @@ -192,22 +210,29 @@ As expected, the latitude (how far north or south a location is) has the most to ```{r} #| label: plot-map - -rnaturalearth::ne_countries(scale = "medium", - returnclass = "sf", - continent = "Europe") |> - ggplot() + +#| warning: false + +rnaturalearth::ne_countries( + scale = "medium", + returnclass = "sf", + continent = "Europe" +) |> + ggplot() + geom_sf(color = "grey60", fill = "#DDD5C7", linewidth = 0.1) + - geom_point(data = data |> distinct(city, lat, long), - aes(x = long, y = lat, color = city), - shape = 18, size = 4) + - labs(color = NULL) + + geom_point( + data = data |> distinct(city, lat, long), + aes(x = long, y = lat, color = city), + shape = 18, size = 4 + ) + + labs(color = NULL) + ggthemes::scale_color_tableau() + coord_sf(xlim = c(-15, 25), ylim = c(35, 65)) + - theme_custom() + - theme(legend.position = "right", - legend.direction = "vertical", - panel.grid.major = element_line(size = 0.1)) + theme_custom() + + theme( + legend.position = "right", + legend.direction = "vertical", + panel.grid.major = element_line(size = 0.1) + ) ``` Perhaps we can have a look at day length to get a better reflection of the isolated effect of latitude. To make the plot more legible, we'll use only 2 cities this time. 
@@ -217,51 +242,71 @@ We'll create the plot using the `geom_ribbon()` function to shade the area betwe ```{r} #| label: plot-day-length-areas -parse_sun_data(reticulate::py$df_deux) |> - ggplot(aes(xmin = date, x = date, xmax = date, - ymin = sunrise, y = noon, ymax = sunset, - fill = city, color = city, group = city)) + - geom_hline(yintercept = parse_time("12:00", "%H:%M"), - size = 1) + +parse_sun_data(reticulate::py$df_deux) |> + ggplot(aes( + xmin = date, x = date, xmax = date, + ymin = sunrise, y = noon, ymax = sunset, + fill = city, color = city, group = city + )) + + geom_hline( + yintercept = parse_time("12:00", "%H:%M"), + linewidth = 1 + ) + geom_ribbon(alpha = 0.5, color = NA, key_glyph = "point") + - labs(x = NULL, - y = NULL, - color = NULL, - fill = NULL) + - scale_x_date(expand = expansion(add = 0)) + - scale_y_time(limits = hms::hms(hours = c(0, 24)), - breaks = hms::hms(hours = seq(0, 24, 2))) + + labs( + x = NULL, + y = NULL, + color = NULL, + fill = NULL + ) + + scale_x_date(expand = expansion(add = 0)) + + scale_y_time( + limits = hms::hms(hours = c(0, 24)), + breaks = hms::hms(hours = seq(0, 24, 2)) + ) + ggthemes::scale_fill_tableau(guide = guide_legend( - override.aes = list(shape = 21, alpha = 0.5, size = 8))) + + override.aes = list(shape = 21, alpha = 0.5, size = 8) + )) + theme_custom() ``` -As you see, in neither cities 12:00 in the afternoon is perfectly in the middle between sunrise and sunset, although this trend is slightly enhanced for Oslo (again due to longitude differences). We can also see that the length of days in Oslo is longer in summer particularly because the mornings start earlier. We'll get back to the distance between sunset and dusk, which is also a lot larger in Oslo which means darkness sets in much later in Oslo despite sunset happening around the same time. This means it's lighter outside longer than in Amsterdam. +{{{< sidenote br="1em" >}}} +The difference between dusk and sunset is also a lot larger in Oslo, but we'll get back to that +{{{< /sidenote >}}} + +As you see, in neither cities 12:00 in the afternoon is perfectly in the middle between sunrise and sunset, although this trend is slightly enhanced for Oslo (again due to longitude differences). We can also see that the length of days in Oslo is longer in summer particularly because the mornings start earlier. So let's also squish everything on the bottom axis and just look at day length in total. 
```{r} #| label: plot-day-length-curve -parse_sun_data(reticulate::py$df_deux) |> - mutate(day_length = hms::as_hms(sunset - sunrise)) |> - ggplot(aes(x = date, y = day_length, fill = city, group = city)) + +parse_sun_data(reticulate::py$df_deux) |> + mutate(day_length = hms::as_hms(sunset - sunrise)) |> + ggplot(aes(x = date, y = day_length, fill = city, group = city)) + geom_area(position = "identity", alpha = 0.5, key_glyph = "point") + - labs(title = "Comparison in day length across the year", - x = NULL, - y = "Day length (_hours_)", - fill = NULL) + + labs( + title = "Comparison in day length across the year", + x = NULL, + y = "Day length (_hours_)", + fill = NULL + ) + scale_x_date(labels = scales::label_date(format = "%B")) + - scale_y_time(labels = scales::label_time(format = "%H hours"), - limits = parse_time(c("00:00", "23:59"), "%H:%M"), - breaks = hms::hms(hours = seq(0, 24, 4)), - expand = expansion(add = 0)) + + scale_y_time( + labels = scales::label_time(format = "%H hours"), + limits = parse_time(c("00:00", "23:59"), "%H:%M"), + breaks = hms::hms(hours = seq(0, 24, 4)), + expand = expansion(add = 0) + ) + ggthemes::scale_fill_tableau(guide = guide_legend( - override.aes = list(shape = 21, size = 8), direction = "vertical")) + + override.aes = list(shape = 21, size = 8), direction = "vertical" + )) + theme_custom() + - theme(plot.title.position = "plot", - axis.title.y = element_markdown(), - legend.position = c(0.85, 0.85)) + theme( + plot.title.position = "plot", + axis.title.y = element_markdown(), + legend.position = c(0.85, 0.85) + ) ``` Now I'm curious when the largest difference occurs and when the days are roughly equal. Let's select the relevant columns and get the day length in Oslo and Amsterdam next to each other using the `pivot_wider()` function so we can easily compare. Let's then select the largest and smallest differences. @@ -270,12 +315,14 @@ Now I'm curious when the largest difference occurs and when the days are roughly #| label: get-biggest-difference #| results: hold -day_length_diff <- parse_sun_data(reticulate::py$df_deux) |> - select(city, date, day_length) |> - group_by(city) |> - pivot_wider(id_cols = date, names_from = city, values_from = day_length, - names_glue = "day_length_{city}") |> - janitor::clean_names() |> +day_length_diff <- parse_sun_data(reticulate::py$df_deux) |> + select(city, date, day_length) |> + group_by(city) |> + pivot_wider( + id_cols = date, names_from = city, values_from = day_length, + names_glue = "day_length_{city}" + ) |> + janitor::clean_names() |> mutate(difference = hms::as_hms(day_length_oslo - day_length_amsterdam)) print("Biggest positive difference:") @@ -290,26 +337,35 @@ print(day_length_diff |> slice_min(abs(difference), n = 5)) Seems like the largest difference occur around the 20th of June when Oslo gets about 2 full hours of daylight more than Oslo. -Finally, let's look at the effect of longitude (how far east or west a place is) on the sunset/sunrise times. The easiest way to look at this is to compare the time solar noon happens. Due to (most of) mainland Europe being in the same time zone ([CET](https://en.wikipedia.org/wiki/Central_European_Time)), it's 12:00 at the same time across central Europe. However, the sun did not get the note and still travels (from Earth perspective) in a very consistent pace from east to west. This means that the sun arrives first in Poland and leaves last in Spain. 
This means that solar noon (the time the sun is at its highest point in the sky) is unequal across the continent. Let's look at when solar noon happens across the four cities in this analysis. +{{{< sidenote br="1em" >}}} +solar noon: the time the sun is at its highest point in the sky +{{{< /sidenote >}}} + +Finally, let's look at the effect of longitude (how far east or west a place is) on the sunset/sunrise times. The easiest way to look at this is to compare the time solar noon happens. Due to (most of) mainland Europe being in the same time zone ([CET](https://en.wikipedia.org/wiki/Central_European_Time)), it's 12:00 at the same time across central Europe. However, the sun did not get the note and still travels (from Earth perspective) in a very consistent pace from east to west. This means that the sun arrives first in Poland and leaves last in Spain. This means that solar noon is unequal across the continent. Let's look at when solar noon happens across the four cities in this analysis. ```{r} #| label: plot-noon -data |> +data |> ggplot(aes(x = date, y = noon, group = city, color = city)) + geom_hline(yintercept = hms::hms(hours = 12), linewidth = 1) + geom_line(linewidth = 2, key_glyph = "point") + - labs(title = "What time is solar noon?", - color = NULL) + - scale_x_date(expand = expansion(add = 0)) + - scale_y_time(limits = hms::hms(hours = c(11, 15)), - breaks = hms::hms(hours = seq(11, 15, 0.5))) + + labs( + title = "What time is solar noon?", + color = NULL + ) + + scale_x_date(expand = expansion(add = 0)) + + scale_y_time( + limits = hms::hms(hours = c(11, 15)), + breaks = hms::hms(hours = seq(11, 15, 0.5)) + ) + ggthemes::scale_color_tableau(guide = guide_legend( - override.aes = list(size = 4))) + + override.aes = list(size = 4) + )) + ggthemes::scale_fill_tableau() + theme_custom() ``` This one actually quite surprised me! Due to it's location on the far west of the time zone Madrid has solar noon only at around 14:00. Oslo and Warsaw are actually the only cities that are at time roughly near the 12:00 mark corrected for DST. However, with DST Oslo has solar noon close to midday (12:00) in late October right after switching off from DST and Warsaw most of the winter months early in the year. Also, solar noon in Madrid is nearly 2 hours later than solar noon in Warsaw, which is quite a big difference for a single time zone, which is one of the few reasons Portugal is in the [western European time zone](https://en.wikipedia.org/wiki/Western_European_Time)) -If I had time (and motivation) I would explore some other extremes like [China which one uses a single time zone](https://en.wikipedia.org/wiki/Time_in_China) despite probably needing 4 to 5 due to its size, [Australia with it's half hour time zones](https://en.wikipedia.org/wiki/Time_in_Australia), and locations that are on roughly the same longitude but aren't in the same time zone due to economic and/or political reasons (e.g. China, and [Iceland which doesn't observe DST](https://en.wikipedia.org/wiki/Time_in_Iceland)). This was a fun little project, I certainly gained some insight into how latitude and longitude affect sunrise and sunset times and how much of a difference exists within central Europe. For an added bonus we got to create some fun plots, what's not to love! Thanks for reading! 
\ No newline at end of file +If I had time (and motivation) I would explore some other extremes like [China, which uses a single time zone](https://en.wikipedia.org/wiki/Time_in_China) despite probably needing 4 or 5 given its size, [Australia with its half-hour time zones](https://en.wikipedia.org/wiki/Time_in_Australia), and locations that are on roughly the same longitude but aren't in the same time zone due to economic and/or political reasons (e.g. China, and [Iceland, which doesn't observe DST](https://en.wikipedia.org/wiki/Time_in_Iceland)). This was a fun little project; I certainly gained some insight into how latitude and longitude affect sunrise and sunset times, and how much of a difference exists within central Europe. As an added bonus we got to create some fun plots, so what's not to love! Thanks for reading! diff --git a/content/cv/index.qmd index 6e2d650..238ffb5 100644 --- a/content/cv/index.qmd +++ b/content/cv/index.qmd @@ -26,7 +26,9 @@ iframe { ```{r} #| label: iframe #| echo: FALSE -htmltools::tags$iframe(src = "https://danielroelfs.github.io/cv", - onload = "this.width='100%';this.height=screen.height*0.6;", - frameBorder="0") +htmltools::tags$iframe( + src = "https://danielroelfs.github.io/cv", + onload = "this.width='100%';this.height=screen.height*0.6;", + frameBorder = "0" +) ``` diff --git a/content/photography/index.qmd index 0f758ea..a85b90f 100644 --- a/content/photography/index.qmd +++ b/content/photography/index.qmd @@ -30,7 +30,9 @@ See the website for my photography here:
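Circling back to the solar-noon spread in the sunrise/sunset post: a quick back-of-the-envelope check is that mean solar noon shifts by roughly four minutes per degree of longitude away from CET's 15°E reference meridian. The longitudes below are approximate values used purely for illustration, and the calculation ignores DST and the equation of time.

``` python
# Offset of mean solar noon from 12:00 CET: about 4 minutes per degree of
# longitude away from the 15 degrees east reference meridian (approximate;
# ignores DST and the equation of time)
CET_MERIDIAN = 15.0  # degrees east

cities = {"Warsaw": 21.0, "Madrid": -3.7, "Oslo": 10.7, "Amsterdam": 4.9}

for name, longitude in cities.items():
    offset_min = (CET_MERIDIAN - longitude) * 4
    print(f"{name}: solar noon roughly {offset_min:+.0f} min from 12:00 CET")

# Warsaw comes out about 24 minutes early and Madrid about 75 minutes late,
# a spread of roughly 100 minutes, which is the bulk of the "nearly 2 hours"
# described in the post
```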