diff --git a/.github/workflows/test-build.yml b/.github/workflows/test-build.yml new file mode 100644 index 0000000..0f7d07a --- /dev/null +++ b/.github/workflows/test-build.yml @@ -0,0 +1,39 @@ +name: Test build + +on: + push: + branches: + - update-theme + +jobs: + build: + + runs-on: ubuntu-latest + + env: + HUGO_VERSION: 0.116.1 + + steps: + - name: 🛎 Check out repo + uses: actions/checkout@v3 + with: + ref: update-theme + fetch-depth: 1 + submodules: recursive + - name: Use Node.js + uses: actions/setup-node@master + with: + node-version: 10.x + - run: npm i -g postcss postcss-cli autoprefixer + - name: 🤵 Install Hugo + run: | + wget -O ${{ runner.temp }}/hugo.deb https://github.com/gohugoio/hugo/releases/download/v${HUGO_VERSION}/hugo_extended_${HUGO_VERSION}_linux-amd64.deb \ + && sudo dpkg -i ${{ runner.temp }}/hugo.deb + - name: 🧹 Clean site + run: | + if [ -d "public" ]; then + rm -rf public/* + fi + - name: 🍳 Build site + run: | + hugo --gc --minify diff --git a/.gitmodules b/.gitmodules new file mode 100644 index 0000000..f6ed6f4 --- /dev/null +++ b/.gitmodules @@ -0,0 +1,4 @@ +[submodule "themes/typography"] + path = themes/typography + url = https://github.com/danielroelfs/hugo-theme-typography.git + branch = dev diff --git a/archetypes/blog.md b/archetypes/blog.md deleted file mode 100644 index cf7b9dd..0000000 --- a/archetypes/blog.md +++ /dev/null @@ -1,14 +0,0 @@ ---- -title: "{{ replace .TranslationBaseName "-" " " | title }}" -author: Daniel Roelfs -date: {{ .Date }} - -draft: true -output: - html_document: - keep_md: yes -always_allow_html: true - -tags: [R, ] -image: index_files/ ---- \ No newline at end of file diff --git a/assets/custom_style.css b/assets/custom_style.css new file mode 100644 index 0000000..9b1f6ae --- /dev/null +++ b/assets/custom_style.css @@ -0,0 +1,6 @@ +@media all and (min-width: 1200px) { + img:not(.icon) { + margin-left: -6rem; + min-width: 120%; + } +} diff --git a/config.toml b/config.toml index c7f9724..4554e82 100644 --- a/config.toml +++ b/config.toml @@ -1,18 +1,8 @@ -baseurl = "/" # Hostname (and path) to the root. -title = "Daniel Roelfs" # Site title. -theme = "hugo-coder-portfolio-edit" # Set the theme. -languagecode = "en" # The site’s language code used to generate RSS. -defaultcontentlanguage = "en" # The default content language. -defaultContentLanguageInSubdir = true +baseurl = "/" +title = "Daniel Roelfs" +theme = "typography" -paginate = 20 # Default number of pages per page in pagination. - -PygmentsStyle = "monokai" # Color-theme or style for syntax highlighting. -PygmentsCodeFences = true # Enable code fence background highlighting. -PygmentsCodeFencesGuessSyntax = false # Enable syntax guessing for code fences without specified language. -PygmentsUseClasses = true # new add - -ignoreFiles = ["\\.qmd$", "\\.ipynb$", "\\.Rmd$", "\\.Rmarkdown$", "_cache$", "index\\.html", "about\\.html", "static/blog/\\*_files/", "README"] +ignoreFiles = ["\\.qmd$", "\\.ipynb$", "\\.Rmd$", "\\.Rmarkdown$", "_cache$", "index\\.html", "about\\.html", "static/blog/\\*_files/", "README.md"] [markup] defaultMarkdownHandler = "goldmark" @@ -20,93 +10,79 @@ ignoreFiles = ["\\.qmd$", "\\.ipynb$", "\\.Rmd$", "\\.Rmarkdown$", "_cache$", "i [markup.goldmark.renderer] unsafe = true -[params] # theme parameters - author = "Daniel Roelfs" # Author's name. - info = "PhD Candidate in Brain Imaging & Genetics" # Author's job title or info. - description = "Daniel Roelfs' personal website" # Site description. 
- keywords = "blog,personal,coding" # Site keywords. - avatarurl = "images/avatar.png" # Contain the path of the optional avatar in the static folder. - - # Whether you want to hide copyright and credits in the footer. - hideCredits = true - hideCopyright = true +[params] + introtag = "introduction" + intro = "PhD Candidate in Brain Imaging & Genetics, Data Scientist within Anti-Money Laundering at DNB" - # Custom CSS - custom_css = [] + favicon = "avatar.png" - # Alignment of Mobile Menu items - itemscentered = true + iconsource = "simple-icons" - # RTL support - rtl = false + description = "Daniel Roelfs' personal website" + keywords = "blog, personal, coding" - # Bottom sns share - snsShare = true # new add - # Popular sns share - # if you want add sns. please message! - enableTwitterShare = false # new add - enableFacebookShare = false # new add - enableHatenaShare = false # new add - enableLineShare = false # new add - enableLinkedInShare = false # new add + customcss = ["custom_style.css"] - thumbnail = "images/avatar.png" # default sns thumbnail + umami = true + umamilink = "https://analytics-danielroelfs.netlify.app/script.js" + umamiid = "da48a88a-2e87-4024-8c99-639222aab54d" -# Social links -[[params.social]] - name = "Twitter" - icon = "fab fa-twitter" - weight = 1 - url = "https://twitter.com/dthroelfs/" -[[params.social]] - name = "GitHub" - icon = "fab fa-github" - weight = 2 - url = "https://github.com/danielroelfs/" -[[params.social]] - name = "ORCID" - icon = "fab fa-orcid" - weight = 3 - url = "https://orcid.org/0000-0002-1083-002X" -[[params.social]] - name = "Google Scholar" - icon = "fas fa-graduation-cap" - weight = 4 - url = "https://scholar.google.com/citations?user=QmVQcsAAAAAJ&hl=en" -[[params.social]] - name = "LinkedIn" - icon = "fab fa-linkedin" - weight = 5 - url = "https://www.linkedin.com/in/danielroelfs/" -[[params.social]] - name = "Mastodon" - icon = "fab fa-mastodon" - weight = 6 - url = "https://mastodon.online/@danielroelfs#" -[[params.social]] - name = "Mail" - icon = "fas fa-envelope" - weight = 7 - url = "mailto:daniel.roelfs@medisin.uio.no" + # Social links + [[params.social]] + name = "GitHub" + icon = "github" + weight = 1 + url = "https://github.com/danielroelfs/" + [[params.social]] + name = "Twitter" + icon = "x" + weight = 2 + url = "https://twitter.com/dthroelfs/" + [[params.social]] + name = "LinkedIn" + icon = "linkedin" + weight = 3 + url = "https://www.linkedin.com/in/danielroelfs/" + [[params.social]] + name = "ORCID" + icon = "orcid" + weight = 4 + url = "https://orcid.org/0000-0002-1083-002X" + [[params.social]] + name = "Google Scholar" + icon = "googlescholar" + weight = 5 + url = "https://scholar.google.com/citations?user=QmVQcsAAAAAJ&hl=en" + [[params.social]] + name = "Mail" + icon = "minutemailer" + weight = 6 + url = "mailto:daniel.roelfs@medisin.uio.no" # Menu links -[[menu.main]] - name = "Blog" - weight = 1 - url = "blog" -[[menu.main]] - name = "Photography" - weight = 2 - url = "photography" -[[menu.main]] - name = "Publications" - weight = 3 - url = "publications" -[[menu.main]] - name = "Curriculum Vitæ" - weight = 4 - url = "cv" -[[menu.main]] - name = "About" - weight = 5 - url = "about" \ No newline at end of file +[menu] + [[menu.main]] + name = "Home" + url = "/" + weight = 1 + [[menu.main]] + name = "Blog" + url = "/blog" + weight = 2 + [[menu.main]] + name = "Photography" + url = "/photography" + weight = 3 + identifier = "hide-from-nav" + [[menu.main]] + name = "Publications" + url = "/publications" + weight = 4 
+ [[menu.main]] + name = "CV" + url = "/cv" + weight = 5 + [[menu.main]] + name = "About" + url = "/about" + weight = 6 diff --git a/content/about/index.md b/content/about/index.md index 73e135a..802451e 100644 --- a/content/about/index.md +++ b/content/about/index.md @@ -1,21 +1,15 @@ --- -description: "about" -slug: "about" -thumbnail: "images/avatar.png" -title: "About" -author: "Daniel Roelfs" -format: hugo +title: About +description: About +hidetopnav: true --- - +{{< sidenote >}} +![avatar](../../avatar.png) +{{< /sidenote >}} -I'm Daniel Roelfs, currently working as a Data Scientist at the Norwegian bank DNB. Prior to this I was a PhD candidate working at the Norwegian Centre for Mental Disorders Research (NORMENT) in Oslo. My doctoral research focused on the intersection between neuroimaging and genetics to discover more about the human brain and the biology underlying psychiatric disorders. +Hi, I'm Daniel Roelfs! I'm currently working as a Data Scientist at the Norwegian bank DNB. Prior to this I was a PhD candidate working at the Norwegian Centre for Mental Disorders Research (NORMENT) in Oslo. My doctoral research focused on the intersection between neuroimaging and genetics to discover more about the human brain and the biology underlying psychiatric disorders. Born and raised in the Netherlands, I have also studied in France and Sweden before moving to Norway. In my free time I run, ski, hike, and take photos. This blog serves as a way to share some of the code I write in R and Python for personal and professional interests. -If you have any questions or comments about the tutorials and posts here, or just want to get in touch, please send me an e-mail at daniel.roelfs\[a\]medisin.uio.no. +If you have any questions or comments about the tutorials and posts here, or just want to get in touch, please send me an e-mail at [daniel.roelfs\[a\]medisin.uio.no](mailto:daniel.roelfs@medisin.uio.no). diff --git a/content/about/index.qmd b/content/about/index.qmd index b41e588..b0db240 100644 --- a/content/about/index.qmd +++ b/content/about/index.qmd @@ -1,21 +1,15 @@ --- -description: "about" -slug: "about" -thumbnail: "images/avatar.png" -title: "About" -author: "Daniel Roelfs" -format: hugo +title: About +description: About +hidetopnav: true --- - +{{{< sidenote >}}} +![avatar](/avatar.png) +{{{< /sidenote >}}} -I'm Daniel Roelfs, currently working as a Data Scientist at the Norwegian bank DNB. Prior to this I was a PhD candidate working at the Norwegian Centre for Mental Disorders Research (NORMENT) in Oslo. My doctoral research focused on the intersection between neuroimaging and genetics to discover more about the human brain and the biology underlying psychiatric disorders. +Hi, I'm Daniel Roelfs! I'm currently working as a Data Scientist at the Norwegian bank DNB. Prior to this I was a PhD candidate working at the Norwegian Centre for Mental Disorders Research (NORMENT) in Oslo. My doctoral research focused on the intersection between neuroimaging and genetics to discover more about the human brain and the biology underlying psychiatric disorders. Born and raised in the Netherlands, I have also studied in France and Sweden before moving to Norway. In my free time I run, ski, hike, and take photos. This blog serves as a way to share some of the code I write in R and Python for personal and professional interests. -If you have any questions or comments about the tutorials and posts here, or just want to get in touch, please send me an e-mail at daniel.roelfs[a]medisin.uio.no. 
\ No newline at end of file +If you have any questions or comments about the tutorials and posts here, or just want to get in touch, please send me an e-mail at [daniel.roelfs[a]medisin.uio.no](mailto:daniel.roelfs@medisin.uio.no). \ No newline at end of file diff --git a/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/BWVperyear-plot-1.png b/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/BWVperyear-plot-1.png index 7891cdc..1f85faa 100644 Binary files a/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/BWVperyear-plot-1.png and b/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/BWVperyear-plot-1.png differ diff --git a/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/mapplot-1.png b/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/mapplot-1.png index acd263e..5a0bb32 100644 Binary files a/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/mapplot-1.png and b/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/mapplot-1.png differ diff --git a/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/plot-with-city-1.png b/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/plot-with-city-1.png index 6c20a34..f811362 100644 Binary files a/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/plot-with-city-1.png and b/content/blog/2019-analyzing-bach/index.markdown_strict_files/figure-markdown_strict/plot-with-city-1.png differ diff --git a/content/blog/2019-analyzing-bach/index.md b/content/blog/2019-analyzing-bach/index.md index 1d918ac..d7eb155 100644 --- a/content/blog/2019-analyzing-bach/index.md +++ b/content/blog/2019-analyzing-bach/index.md @@ -1,17 +1,13 @@ --- title: Analyzing Bach -author: Daniel Roelfs -date: '2019-12-01' +date: 2019-12-01 +description: Analyzing Bach with R slug: analyzing-bach categories: - coding tags: - ggplot - R - - Bach -description: 'Analyzing Bach' -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -64,14 +60,14 @@ Let's collect the BWVs: ``` r index_url <- "http://www.bachcentral.com/BWV/index.html" -BWVs <- paste(readLines(index_url), collapse = "\n") %>% - str_match_all("% - unlist() %>% - unique() %>% - str_extract_all("[0-9]+.html") %>% - unlist() %>% - unique() %>% - str_extract_all("[0-9]+") %>% +BWVs <- paste(readLines(index_url), collapse = "\n") |> + str_match_all(" + unlist() |> + unique() |> + str_extract_all("[0-9]+.html") |> + unlist() |> + unique() |> + str_extract_all("[0-9]+") |> as.character() str(BWVs) ``` @@ -103,11 +99,11 @@ for (i in 1:length(BWVs)) { url <- sprintf("http://www.bachcentral.com/BWV/%s.html", BWVs[i]) webpage <- read_html(url) - text <- webpage %>% - html_nodes("ul") %>% - html_text() %>% - gsub('[\t]', '', .) %>% - strsplit(., "\\n") %>% + text <- webpage |> + html_nodes("ul") |> + html_text() |> + gsub('[\t]', '', .) |> + strsplit(., "\\n") |> unlist() values <- text[seq(2,length(text),2)] @@ -144,14 +140,14 @@ All columns are currently character arrays, and this is appropriate for most of The character arrays are appropriate, but for further analyes I'd prefer to make at least the categories explicitely factorial. 
Then I'll also rename awkwardly named columns, and reorder the columns to start with the BWV instead of the number title. ``` r -data <- scraped_data %>% +data <- scraped_data |> mutate(BWV = as.numeric(BWV), category1 = factor(category1), category2 = factor(category2), category3 = factor(category3), cantate_cat1 = substring(cantate_cat1,4), - CLC_BWV_w_epifix = str_replace(CLC_BWV_w_epifix, " ", "")) %>% - rename(BWV_w_epifix = CLC_BWV_w_epifix) %>% + CLC_BWV_w_epifix = str_replace(CLC_BWV_w_epifix, " ", "")) |> + rename(BWV_w_epifix = CLC_BWV_w_epifix) |> select(BWV, BWV_epifix, BWV_w_epifix, everything()) str(data) ``` @@ -177,9 +173,9 @@ Now we have this data, we can do some descriptive visualizations of the data. Ov The first descritive I wanted to see was what kind of work Bach wrote and in what numbers. First the main category, which differentiates between choral pieces and instrumental pieces. ``` r -data %>% - group_by(category1) %>% - summarise(n = n()) %>% +data |> + group_by(category1) |> + summarise(n = n()) |> ggplot(aes(x = category1, y = n, color = category1)) + geom_segment(aes(xend = category1, yend = 0), size = 12, color = "grey40", alpha = 0.9) + @@ -204,9 +200,9 @@ data %>% It seems Bach didn't have a strong preference for either instrumental or choral music. I suppose he didn't have infinite freedom with what to compose, since his employer might also request him to compose certain types of music. I wanted to see the same for the secondary category, which differentias between the type of composition (e.g. cantate, passions, organ pieces, symphonies, and so on). ``` r -data %>% - group_by(category2) %>% - summarise(n = n()) %>% +data |> + group_by(category2) |> + summarise(n = n()) |> ggplot(aes(x = reorder(category2,-n), y = n, color = category2)) + geom_segment(aes(xend = category2, yend = 0), size = 6, color = "grey40", alpha = 0.9) + @@ -228,10 +224,10 @@ data %>% From this it seems that most of the intrumental pieces are made up by just solo pieces for organ and the harpsichord and that the choral pieces are made up mostly by cantates and chorales for four voices. While I appreciate the information dissemination qualities of a barplot (or lollipop plot) like this, in that it's good in communicating absolute quantities (while I admit that this could also be displayed in a table). One thing it is less good at, is communicating the subjective volume of the works. There's more than a thousand pieces in the BWV, and I feel that the plots above don't do a good enough job at communicating just how many pieces this is. Therefore, I created a tile plot (using `{waffle}`), where every tile represents one piece. I colored again based on the secondary category. In order to still maintain the quantities of each category, I added the number of compositions to the legend. 
``` r -data %>% - group_by(category2) %>% - summarise(n = n()) %>% - mutate(category2 = sprintf("%s (%s)", category2, n)) %>% +data |> + group_by(category2) |> + summarise(n = n()) |> + mutate(category2 = sprintf("%s (%s)", category2, n)) |> ggplot(aes(fill = category2, values = n)) + geom_waffle(n_rows = 17, size = 0.2, colour = "#FFFFFC", flip = FALSE, na.rm = TRUE) + scale_fill_daniel(palette = "staalmeesters", name = NULL) + @@ -253,26 +249,26 @@ The dataset I just created was comprehensive and clean, but it didn't contain an url <- "https://en.wikipedia.org/wiki/List_of_compositions_by_Johann_Sebastian_Bach" webpage <- read_html(url) -wikitext <- webpage %>% - html_nodes(xpath='//*[@id="TOP"]') %>% - html_table(fill = TRUE) %>% +wikitext <- webpage |> + html_nodes(xpath='//*[@id="TOP"]') |> + html_table(fill = TRUE) |> as.data.frame() ``` Then I cleaned the data somewhat and extracted the number from the BWV columns. ``` r -wikidata <- wikitext %>% - rename(BWV_full = BWV) %>% +wikidata <- wikitext |> + rename(BWV_full = BWV) |> mutate(BWV = sub('.*(\\d{3}).*', '\\1', BWV_full), - BWV = parse_integer(BWV)) %>% + BWV = parse_integer(BWV)) |> filter(!rev(duplicated(rev(BWV)))) ``` Then I merged the data. I lost a number of compositions in the process, but I was okay with it, mostly because it took me too much time and effort to try and fix it. Hurray for laziness. I extracted the year from the slightly messy `Date` column. Some strings in this column contain two dates, one for the first compilation and one for the date of completion. I extracted the last number, the year of completion. ``` r -merged_data <- merge(data, wikidata, by = "BWV") %>% +merged_data <- merge(data, wikidata, by = "BWV") |> mutate(year = sub(".*(\\d{4}).*", "\\1", Date), year = as.numeric(year), age = year - 1685) @@ -281,9 +277,9 @@ merged_data <- merge(data, wikidata, by = "BWV") %>% I noticed that some entries in the `year` column exceeded the year of Bach's death. Bach died in 1750. I assumed that these years indicated the year of first performance or publication. In this scenario, it would be possible that certain pieces were lost and rediscovered at a later date and only then published. I thought to make a barplot of the number of compositions Bach over the course of his life, trying to see if there was any period where he was particularly productive. I also added some annotations to give the plot some context. ``` r -BWVperyear <- merged_data %>% - filter(!is.na(year)) %>% - group_by(year,age) %>% +BWVperyear <- merged_data |> + filter(!is.na(year)) |> + group_by(year,age) |> summarise(n = n()) palette <- daniel_pal("staalmeesters")(6) @@ -338,15 +334,15 @@ It seems there were two particularly productive years. But since the year column The Wikipedia page also contained information on the key of most of the compositions (21 were missing). I was wondering if Bach had a particular preference for a specific key. So I calculated the number of compositions written in each key, and separated based on the first category. 
``` r -summ_key_cat1 <- merged_data %>% - group_by(category1, Key) %>% - summarise(n = n()) %>% - filter(nchar(Key) > 1) %>% +summ_key_cat1 <- merged_data |> + group_by(category1, Key) |> + summarise(n = n()) |> + filter(nchar(Key) > 1) |> mutate(Key = sub("\u266D", "b", Key), Key = sub("\U266F", "#", Key), Key = ifelse(nchar(Key) > 10, substring(Key,1,nchar(Key)/2), Key) - ) %>% - group_by(category1, Key) %>% + ) |> + group_by(category1, Key) |> summarise(n = sum(n)) ``` @@ -374,19 +370,19 @@ ggplot(summ_key_cat1, aes(x = reorder(Key,-n), y = n, fill = category1)) + I noticed that there were no double keys, as in B flat is the same as A sharp. The person who compiled the table must have taken it into account since it's very unlikely that this was by chance. I also wanted to make the same comparison for the second category. Since I didn't like the idea of a large number of very skinny barplots, or the idea of a stacked barplot, I thought the best way to go about this is to make it into a tile plot. Since the tile plot is quite an efficient way of disseminating information, I could also include the counts from the primary category. ``` r -summ_key_cat2 <- merged_data %>% - group_by(category2, Key) %>% - summarise(n = n()) %>% - filter(nchar(Key) > 1) %>% +summ_key_cat2 <- merged_data |> + group_by(category2, Key) |> + summarise(n = n()) |> + filter(nchar(Key) > 1) |> mutate(Key = sub("\u266D", "b", Key), Key = sub("\U266F", "#", Key), Key = ifelse(nchar(Key) > 10, substring(Key,1,nchar(Key)/2), Key) - ) %>% - group_by(category2, Key) %>% + ) |> + group_by(category2, Key) |> summarise(n = sum(n)) -plotdat <- rbind(summ_key_cat1 %>% rename(category2 = category1), - summ_key_cat2) %>% +plotdat <- rbind(summ_key_cat1 |> rename(category2 = category1), + summ_key_cat2) |> cbind(gr = c(rep(1,nrow(summ_key_cat1)), rep(2,nrow(summ_key_cat2))) ) @@ -422,8 +418,8 @@ places <- data_frame( city = c("Eisenach","Ohrdruf","Lüneburg","Weimar","Arnstadt","Mühlhausen","Weimar","Köthen","Leipzig"), year_from = c(1685,1695,1700,1702,1703,1707,1708,1717,1723), year_to = c(1695,1700,1702,1703,1707,1708,1717,1723,1750) -) %>% - mutate_geocode(city) %>% +) |> + mutate_geocode(city) |> mutate(duration = year_to - year_from) ``` @@ -440,8 +436,8 @@ It's obvious that Bach lived in Leipzich the longest. He had a few productive pe Lastly, just for fun, I created a map. 
``` r -places_unique <- places %>% - distinct(., city, .keep_all = TRUE) +places_unique <- places |> + distinct(city, .keep_all = TRUE) ggplot(places_unique, aes(x = lon, y = lat)) + borders("world", colour = palette[3], fill = palette[4], alpha = 1) + diff --git a/content/blog/2019-analyzing-bach/index.qmd b/content/blog/2019-analyzing-bach/index.qmd index 5286d4f..57fb017 100644 --- a/content/blog/2019-analyzing-bach/index.qmd +++ b/content/blog/2019-analyzing-bach/index.qmd @@ -1,17 +1,13 @@ --- title: Analyzing Bach -author: Daniel Roelfs -date: '2019-12-01' +date: 2019-12-01 +description: Analyzing Bach with R slug: analyzing-bach categories: - coding tags: - ggplot - R - - Bach -description: 'Analyzing Bach' -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -20,7 +16,10 @@ execute: out.width: 80% --- -```{css, echo=FALSE} +```{css} +#| label: style +#| echo: false + @import url('https://fonts.googleapis.com/css2?family=Alegreya:ital,wght@0,400;0,700;1,400;1,700&family=IM+Fell+English+SC&family=IM+Fell+English:ital@0;1&display=swap'); h1 { @@ -78,14 +77,14 @@ Let's collect the BWVs: #| label: scrape-index index_url <- "http://www.bachcentral.com/BWV/index.html" -BWVs <- paste(readLines(index_url), collapse = "\n") %>% - str_match_all("% - unlist() %>% - unique() %>% - str_extract_all("[0-9]+.html") %>% - unlist() %>% - unique() %>% - str_extract_all("[0-9]+") %>% +BWVs <- paste(readLines(index_url), collapse = "\n") |> + str_match_all(" + unlist() |> + unique() |> + str_extract_all("[0-9]+.html") |> + unlist() |> + unique() |> + str_extract_all("[0-9]+") |> as.character() str(BWVs) ``` @@ -119,11 +118,11 @@ for (i in 1:length(BWVs)) { url <- sprintf("http://www.bachcentral.com/BWV/%s.html", BWVs[i]) webpage <- read_html(url) - text <- webpage %>% - html_nodes("ul") %>% - html_text() %>% - gsub('[\t]', '', .) %>% - strsplit(., "\\n") %>% + text <- webpage |> + html_nodes("ul") |> + html_text() |> + gsub('[\t]', '', .) 
|> + strsplit(., "\\n") |> unlist() values <- text[seq(2,length(text),2)] @@ -157,14 +156,14 @@ The character arrays are appropriate, but for further analyes I'd prefer to make ```{r} #| label: clean-scraped -data <- scraped_data %>% +data <- scraped_data |> mutate(BWV = as.numeric(BWV), category1 = factor(category1), category2 = factor(category2), category3 = factor(category3), cantate_cat1 = substring(cantate_cat1,4), - CLC_BWV_w_epifix = str_replace(CLC_BWV_w_epifix, " ", "")) %>% - rename(BWV_w_epifix = CLC_BWV_w_epifix) %>% + CLC_BWV_w_epifix = str_replace(CLC_BWV_w_epifix, " ", "")) |> + rename(BWV_w_epifix = CLC_BWV_w_epifix) |> select(BWV, BWV_epifix, BWV_w_epifix, everything()) str(data) ``` @@ -178,9 +177,9 @@ The first descritive I wanted to see was what kind of work Bach wrote and in wha ```{r} #| label: lollipop-cat1 -data %>% - group_by(category1) %>% - summarise(n = n()) %>% +data |> + group_by(category1) |> + summarise(n = n()) |> ggplot(aes(x = category1, y = n, color = category1)) + geom_segment(aes(xend = category1, yend = 0), size = 12, color = "grey40", alpha = 0.9) + @@ -202,9 +201,9 @@ It seems Bach didn't have a strong preference for either instrumental or choral ```{r} #| label: lollipop-cat2 -data %>% - group_by(category2) %>% - summarise(n = n()) %>% +data |> + group_by(category2) |> + summarise(n = n()) |> ggplot(aes(x = reorder(category2,-n), y = n, color = category2)) + geom_segment(aes(xend = category2, yend = 0), size = 6, color = "grey40", alpha = 0.9) + @@ -227,10 +226,10 @@ From this it seems that most of the intrumental pieces are made up by just solo #| label: waffle #| fig-width: 12 -data %>% - group_by(category2) %>% - summarise(n = n()) %>% - mutate(category2 = sprintf("%s (%s)", category2, n)) %>% +data |> + group_by(category2) |> + summarise(n = n()) |> + mutate(category2 = sprintf("%s (%s)", category2, n)) |> ggplot(aes(fill = category2, values = n)) + geom_waffle(n_rows = 17, size = 0.2, colour = "#FFFFFC", flip = FALSE, na.rm = TRUE) + scale_fill_daniel(palette = "staalmeesters", name = NULL) + @@ -248,12 +247,12 @@ data %>% #| eval: false library(tidytext) -title_words <- data %>% - select(Title) %>% - unnest_tokens(output = word, input = Title) %>% - anti_join(get_stopwords(language = "de")) %>% - count(word, sort = TRUE) %>% - mutate(nchar = nchar(word)) %>% +title_words <- data |> + select(Title) |> + unnest_tokens(output = word, input = Title) |> + anti_join(get_stopwords(language = "de")) |> + count(word, sort = TRUE) |> + mutate(nchar = nchar(word)) |> filter(nchar >= 3) ``` @@ -267,9 +266,9 @@ The dataset I just created was comprehensive and clean, but it didn't contain an url <- "https://en.wikipedia.org/wiki/List_of_compositions_by_Johann_Sebastian_Bach" webpage <- read_html(url) -wikitext <- webpage %>% - html_nodes(xpath='//*[@id="TOP"]') %>% - html_table(fill = TRUE) %>% +wikitext <- webpage |> + html_nodes(xpath='//*[@id="TOP"]') |> + html_table(fill = TRUE) |> as.data.frame() ``` @@ -279,10 +278,10 @@ Then I cleaned the data somewhat and extracted the number from the BWV columns. #| label: clean-wiki #| warning: false -wikidata <- wikitext %>% - rename(BWV_full = BWV) %>% +wikidata <- wikitext |> + rename(BWV_full = BWV) |> mutate(BWV = sub('.*(\\d{3}).*', '\\1', BWV_full), - BWV = parse_integer(BWV)) %>% + BWV = parse_integer(BWV)) |> filter(!rev(duplicated(rev(BWV)))) ``` @@ -292,7 +291,7 @@ Then I merged the data. 
I lost a number of compositions in the process, but I wa #| label: merge-scraped #| warning: false -merged_data <- merge(data, wikidata, by = "BWV") %>% +merged_data <- merge(data, wikidata, by = "BWV") |> mutate(year = sub(".*(\\d{4}).*", "\\1", Date), year = as.numeric(year), age = year - 1685) @@ -306,9 +305,9 @@ I noticed that some entries in the `year` column exceeded the year of Bach's dea #| message: false #| fig-width: 12 -BWVperyear <- merged_data %>% - filter(!is.na(year)) %>% - group_by(year,age) %>% +BWVperyear <- merged_data |> + filter(!is.na(year)) |> + group_by(year,age) |> summarise(n = n()) palette <- daniel_pal("staalmeesters")(6) @@ -363,15 +362,15 @@ The Wikipedia page also contained information on the key of most of the composit ```{r} #| label: colplot-key-cat1 -summ_key_cat1 <- merged_data %>% - group_by(category1, Key) %>% - summarise(n = n()) %>% - filter(nchar(Key) > 1) %>% +summ_key_cat1 <- merged_data |> + group_by(category1, Key) |> + summarise(n = n()) |> + filter(nchar(Key) > 1) |> mutate(Key = sub("\u266D", "b", Key), Key = sub("\U266F", "#", Key), Key = ifelse(nchar(Key) > 10, substring(Key,1,nchar(Key)/2), Key) - ) %>% - group_by(category1, Key) %>% + ) |> + group_by(category1, Key) |> summarise(n = sum(n)) ggplot(summ_key_cat1, aes(x = reorder(Key,-n), y = n, fill = category1)) + @@ -394,19 +393,19 @@ I noticed that there were no double keys, as in B flat is the same as A sharp. T #| warning: false #| fig-width: 12 -summ_key_cat2 <- merged_data %>% - group_by(category2, Key) %>% - summarise(n = n()) %>% - filter(nchar(Key) > 1) %>% +summ_key_cat2 <- merged_data |> + group_by(category2, Key) |> + summarise(n = n()) |> + filter(nchar(Key) > 1) |> mutate(Key = sub("\u266D", "b", Key), Key = sub("\U266F", "#", Key), Key = ifelse(nchar(Key) > 10, substring(Key,1,nchar(Key)/2), Key) - ) %>% - group_by(category2, Key) %>% + ) |> + group_by(category2, Key) |> summarise(n = sum(n)) -plotdat <- rbind(summ_key_cat1 %>% rename(category2 = category1), - summ_key_cat2) %>% +plotdat <- rbind(summ_key_cat1 |> rename(category2 = category1), + summ_key_cat2) |> cbind(gr = c(rep(1,nrow(summ_key_cat1)), rep(2,nrow(summ_key_cat2))) ) @@ -443,8 +442,8 @@ places <- data_frame( city = c("Eisenach","Ohrdruf","Lüneburg","Weimar","Arnstadt","Mühlhausen","Weimar","Köthen","Leipzig"), year_from = c(1685,1695,1700,1702,1703,1707,1708,1717,1723), year_to = c(1695,1700,1702,1703,1707,1708,1717,1723,1750) -) %>% - mutate_geocode(city) %>% +) |> + mutate_geocode(city) |> mutate(duration = year_to - year_from) ``` @@ -470,13 +469,13 @@ Since this time I wanted to visualize whether Bach was more productive in certai #| echo: false #| fig-width: 12 -places_plot <- places %>% +places_plot <- places |> mutate(meanyear = year_from + ((year_to - year_from)/2)) ggplot(places_plot) + geom_rect(aes(xmin = year_from + 0.5, xmax = year_to + 0.5, ymin = 0, ymax = 60, fill = reorder(city,year_from)), alpha = 0.75) + - stat_smooth(data = BWVperyear %>% filter(year < 1751), mapping = aes(x = year + 0.5, y = n, group = 1), fill = "black", geom = 'area', method = 'loess', span = 0.225, alpha = 0.75) + + stat_smooth(data = BWVperyear |> filter(year < 1751), mapping = aes(x = year + 0.5, y = n, group = 1), fill = "black", geom = 'area', method = 'loess', span = 0.225, alpha = 0.75) + geom_text(aes(x = meanyear, label = city, y = 62.5), family = "Alegreya", angle = 60, hjust = 0) + labs(x = NULL, @@ -498,8 +497,8 @@ Lastly, just for fun, I created a map. 
```{r} #| label: mapplot -places_unique <- places %>% - distinct(., city, .keep_all = TRUE) +places_unique <- places |> + distinct(city, .keep_all = TRUE) ggplot(places_unique, aes(x = lon, y = lat)) + borders("world", colour = palette[3], fill = palette[4], alpha = 1) + diff --git a/content/blog/2019-analyzing-bach/index_files/figure-gfm/BWVperyear-plot-1.png b/content/blog/2019-analyzing-bach/index_files/figure-gfm/BWVperyear-plot-1.png deleted file mode 100644 index 8c4fee8..0000000 Binary files a/content/blog/2019-analyzing-bach/index_files/figure-gfm/BWVperyear-plot-1.png and /dev/null differ diff --git a/content/blog/2019-analyzing-bach/index_files/figure-gfm/colplot-key-cat1-1.png b/content/blog/2019-analyzing-bach/index_files/figure-gfm/colplot-key-cat1-1.png deleted file mode 100644 index 8dd0990..0000000 Binary files a/content/blog/2019-analyzing-bach/index_files/figure-gfm/colplot-key-cat1-1.png and /dev/null differ diff --git a/content/blog/2019-analyzing-bach/index_files/figure-gfm/lollipop-cat1-1.png b/content/blog/2019-analyzing-bach/index_files/figure-gfm/lollipop-cat1-1.png deleted file mode 100644 index bf05f89..0000000 Binary files a/content/blog/2019-analyzing-bach/index_files/figure-gfm/lollipop-cat1-1.png and /dev/null differ diff --git a/content/blog/2019-analyzing-bach/index_files/figure-gfm/lollipop-cat2-1.png b/content/blog/2019-analyzing-bach/index_files/figure-gfm/lollipop-cat2-1.png deleted file mode 100644 index af035ae..0000000 Binary files a/content/blog/2019-analyzing-bach/index_files/figure-gfm/lollipop-cat2-1.png and /dev/null differ diff --git a/content/blog/2019-analyzing-bach/index_files/figure-gfm/mapplot-1.png b/content/blog/2019-analyzing-bach/index_files/figure-gfm/mapplot-1.png deleted file mode 100644 index f53696e..0000000 Binary files a/content/blog/2019-analyzing-bach/index_files/figure-gfm/mapplot-1.png and /dev/null differ diff --git a/content/blog/2019-analyzing-bach/index_files/figure-gfm/plot-with-city-1.png b/content/blog/2019-analyzing-bach/index_files/figure-gfm/plot-with-city-1.png deleted file mode 100644 index e473d39..0000000 Binary files a/content/blog/2019-analyzing-bach/index_files/figure-gfm/plot-with-city-1.png and /dev/null differ diff --git a/content/blog/2019-analyzing-bach/index_files/figure-gfm/tileplot-key-cat2-1.png b/content/blog/2019-analyzing-bach/index_files/figure-gfm/tileplot-key-cat2-1.png deleted file mode 100644 index c0a0d5c..0000000 Binary files a/content/blog/2019-analyzing-bach/index_files/figure-gfm/tileplot-key-cat2-1.png and /dev/null differ diff --git a/content/blog/2019-analyzing-bach/index_files/figure-gfm/waffle-1.png b/content/blog/2019-analyzing-bach/index_files/figure-gfm/waffle-1.png deleted file mode 100644 index 9a70ffb..0000000 Binary files a/content/blog/2019-analyzing-bach/index_files/figure-gfm/waffle-1.png and /dev/null differ diff --git a/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.md b/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.md index 1745d85..bd35e87 100644 --- a/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.md +++ b/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.md @@ -1,16 +1,13 @@ --- title: How I Create Manhattan Plots Using ggplot -author: Daniel Roelfs -date: '2019-04-24' +date: 2019-04-24 +description: How I Create Manhattan Plots Using ggplot slug: how-i-create-manhattan-plots-using-ggplot categories: - coding tags: - ggplot - R -description: 'How I Create Manhattan Plots Using ggplot' 
-thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -36,7 +33,7 @@ library(normentR) ``` r set.seed(2404) -gwas_data_load <- simulateGWAS(nSNPs = 1e6, nSigCols = 3) %>% +gwas_data_load <- simulateGWAS(nSNPs = 1e6, nSigCols = 3) |> janitor::clean_names() ``` @@ -58,11 +55,11 @@ This will create a dataframe with as many rows as there are SNPs in the summary A vast majority of the datapoints will overlap in the non-significant region of the Manhattan plot, these data points are not particularly informative, and it's possible to select a random number of SNPs in this region to make it less computationally heavy. The data I used here does not have such a large volume, so I only needed to filter a small number of SNPs (10% in this case). ``` r -sig_data <- gwas_data_load %>% +sig_data <- gwas_data_load |> subset(p < 0.05) -notsig_data <- gwas_data_load %>% - subset(p >= 0.05) %>% - group_by(chr) %>% +notsig_data <- gwas_data_load |> + subset(p >= 0.05) |> + group_by(chr) |> sample_frac(0.1) gwas_data <- bind_rows(sig_data, notsig_data) ``` @@ -72,27 +69,27 @@ gwas_data <- bind_rows(sig_data, notsig_data) Since the only columns we have indicating position are the chromosome number and the base pair position of the SNP on that chromosome, we want to combine those so that we have one column with position that we can use for the x-axis. So, what we want to do is to create a column with cumulative base pair position in a way that puts the SNPs on the first chromosome first, and the SNPs on chromosome 22 last. I create a data frame frame called `data_cum` (for cumulative), which selects the largest position for each chromosome, and then calculates the cumulative sum of those. Since we don't need to add anything to the first chromosome, we move everything one row down (using the `lag()` function). We then merge this data frame with the original dataframe and calculate a the cumulative basepair position for each SNP by adding the relative position and the adding factor together. This will create a column (here called `bp_cum`) in which the relative base pair position is the position as if it was stitched together. This code is shown below: ``` r -data_cum <- gwas_data %>% - group_by(chr) %>% - summarise(max_bp = max(bp)) %>% - mutate(bp_add = lag(cumsum(max_bp), default = 0)) %>% +data_cum <- gwas_data |> + group_by(chr) |> + summarise(max_bp = max(bp)) |> + mutate(bp_add = lag(cumsum(max_bp), default = 0)) |> select(chr, bp_add) -gwas_data <- gwas_data %>% - inner_join(data_cum, by = "chr") %>% +gwas_data <- gwas_data |> + inner_join(data_cum, by = "chr") |> mutate(bp_cum = bp + bp_add) ``` When this is done, the next thing I want to do is to get a couple of parameters that I'll use for the plot later. First, I want the centre position of each chromosome. This position I'll use later to place the labels on the x-axis of the Manhattan plot neatly in the middle of each chromosome. In order to get this position, I'll pipe the `gwas_data` dataframe into this powerful `{dplyr}` function which I then ask to calculate the difference between the maximum and minimum cumulative base pair position for each chromosome and divide it by two to get the middle of each chromosome. I also want to set the limit of the y-axis, as not to cut off any highly significant SNPs. If you want to compare multiple GWAS statistics, then I highly recommend to hard code the limit of the y-axis, and then explore the data beforehand to make sure your chosen limit does not cut off any SNPs. 
Since the y-axis will be log transformed, we need an integer that is lower than the largest negative exponent. But since the y-axis will be linear and positive, I transform the largest exponent to positive and add 2, to give some extra space on the top edge of the plot. When plotting, I actually convert it back to a log scale, but it's a bit easier to add a constant to it by transforming it to a regular integer first. Then, we also want to indicate the significance threshold, I prefer to save this in a variable. Here, I choose to get a Bonferroni-corrected threshold, which is 0.05 divided by the number of SNPs in the summary statistics. I believe many scientists will use the "standard" threshold of 0.05 divided by 1e-6, which is 5e-8. However, in the data I had I believed it to be best to use the Bonferroni-corrected threshold since the sample encompassed different populations, and because it contained less than a million SNPs were used in the association testing, which would make a standard correction overly conservative. These three parameters were calculated as follows: ``` r -axis_set <- gwas_data %>% - group_by(chr) %>% +axis_set <- gwas_data |> + group_by(chr) |> summarize(center = mean(bp_cum)) -ylim <- gwas_data %>% - filter(p == min(p)) %>% - mutate(ylim = abs(floor(log10(p))) + 2) %>% +ylim <- gwas_data |> + filter(p == min(p)) |> + mutate(ylim = abs(floor(log10(p))) + 2) |> pull(ylim) sig <- 5e-8 diff --git a/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.qmd b/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.qmd index ecc985b..3ad0694 100644 --- a/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.qmd +++ b/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index.qmd @@ -1,16 +1,13 @@ --- title: How I Create Manhattan Plots Using ggplot -author: Daniel Roelfs -date: '2019-04-24' +date: 2019-04-24 +description: How I Create Manhattan Plots Using ggplot slug: how-i-create-manhattan-plots-using-ggplot categories: - coding tags: - ggplot - R -description: 'How I Create Manhattan Plots Using ggplot' -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -39,7 +36,7 @@ library(normentR) set.seed(2404) -gwas_data_load <- simulateGWAS(nSNPs = 1e6, nSigCols = 3) %>% +gwas_data_load <- simulateGWAS(nSNPs = 1e6, nSigCols = 3) |> janitor::clean_names() ``` @@ -50,11 +47,11 @@ A vast majority of the datapoints will overlap in the non-significant region of ```{r} #| label: reduce-size -sig_data <- gwas_data_load %>% +sig_data <- gwas_data_load |> subset(p < 0.05) -notsig_data <- gwas_data_load %>% - subset(p >= 0.05) %>% - group_by(chr) %>% +notsig_data <- gwas_data_load |> + subset(p >= 0.05) |> + group_by(chr) |> sample_frac(0.1) gwas_data <- bind_rows(sig_data, notsig_data) ``` @@ -65,14 +62,14 @@ Since the only columns we have indicating position are the chromosome number and ```{r} #| label: cumulative-bp -data_cum <- gwas_data %>% - group_by(chr) %>% - summarise(max_bp = max(bp)) %>% - mutate(bp_add = lag(cumsum(max_bp), default = 0)) %>% +data_cum <- gwas_data |> + group_by(chr) |> + summarise(max_bp = max(bp)) |> + mutate(bp_add = lag(cumsum(max_bp), default = 0)) |> select(chr, bp_add) -gwas_data <- gwas_data %>% - inner_join(data_cum, by = "chr") %>% +gwas_data <- gwas_data |> + inner_join(data_cum, by = "chr") |> mutate(bp_cum = bp + bp_add) ``` @@ -81,13 +78,13 @@ When this is done, the next thing I want to do is to get a couple of parameters ```{r} #| label: set-axes -axis_set <- 
gwas_data %>% - group_by(chr) %>% +axis_set <- gwas_data |> + group_by(chr) |> summarize(center = mean(bp_cum)) -ylim <- gwas_data %>% - filter(p == min(p)) %>% - mutate(ylim = abs(floor(log10(p))) + 2) %>% +ylim <- gwas_data |> + filter(p == min(p)) |> + mutate(ylim = abs(floor(log10(p))) + 2) |> pull(ylim) sig <- 5e-8 diff --git a/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index_files/figure-gfm/print-plot-1.png b/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index_files/figure-gfm/print-plot-1.png deleted file mode 100644 index 0f1411d..0000000 Binary files a/content/blog/2019-how-i-create-manhattan-plots-using-ggplot/index_files/figure-gfm/print-plot-1.png and /dev/null differ diff --git a/content/blog/2019-how-i-plot-erps-in-r/index.md b/content/blog/2019-how-i-plot-erps-in-r/index.md index e65eff1..f29d7af 100644 --- a/content/blog/2019-how-i-plot-erps-in-r/index.md +++ b/content/blog/2019-how-i-plot-erps-in-r/index.md @@ -1,7 +1,7 @@ --- title: How I Plot ERPs in R -author: Daniel Roelfs -date: '2019-03-30' +date: 2019-03-30 +description: How I Plot ERPs in R slug: how-i-plot-erps-in-r categories: - coding @@ -9,9 +9,6 @@ tags: - R - ggplot - ERP -description: 'How I Plot ERPs in R' -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -46,16 +43,16 @@ I also have a file with the timepoints for each value. i.e. with epoch of 500 m Since I didn't include any headers in my file, I rename them here. I give the the identifying columns their appropriate names, and for the amplitudes, I attach the values from the `times` variable as names to these columns, so -100 will be one column, -99 will be another, and so on. ``` r -ERPdata <- data %>% +ERPdata <- data |> rename(Channel = V1, ID = V2, - Condition = V3) %>% + Condition = V3) |> mutate(ID = factor(ID), Condition = factor(Condition)) oldnames <- sprintf("V%s", 1:ncol(times) + 3) -ERPdata <- ERPdata %>% +ERPdata <- ERPdata |> rename_at(vars(all_of(oldnames)), ~ as.character(times)) ``` @@ -70,15 +67,15 @@ coi <- c("P1", "P2", "Pz", "P3", "P4", "PO3", "PO4", "POz"); Then I calculate the means across channels and conditions. This goes in two steps. First I'll select only the channels of interest, then I'll group by ID, condition, and channel. And then calculate the average of every other column, in this case column 4 to the end of the file. Then I'll do the same, except now I'll group by ID and condition. So then we have one average ERP for every condition in all participants. ``` r -ERPdata_mChan <- ERPdata %>% - filter(Channel %in% coi) %>% +ERPdata_mChan <- ERPdata |> + filter(Channel %in% coi) |> group_by(ID,Condition,Channel) %>% - summarise_at(vars(names(.)[4:ncol(.)]), list(~ mean(., na.rm = TRUE))) %>% + summarise_at(vars(names(.)[4:ncol(.)]), list(~ mean(., na.rm = TRUE))) |> ungroup() -ERPdata_mCond <- ERPdata_mChan %>% +ERPdata_mCond <- ERPdata_mChan |> group_by(ID,Condition) %>% - summarise_at(vars(names(.)[4:ncol(.)]), list(~ mean(., na.rm = TRUE))) %>% + summarise_at(vars(names(.)[4:ncol(.)]), list(~ mean(., na.rm = TRUE))) |> ungroup() MeanERPs <- ERPdata_mCond @@ -89,10 +86,10 @@ MeanERPs <- ERPdata_mCond The next piece of code calculates the grand average. I will also calculate the confidence interval and then transform it from the interval relative to the mean to the absolute values representing the upper and lower boundaries of the confidence interval. Here I use a confidence interval of 95%. 
We first transform from wide to long format using the `pivot_longer()` function from the `{tidyr}` package. Then we convert the (now character) `Time` variable to numeric. Then we will calculate the average amplitude per time point. Then using the `CI()` function from the `{Rmisc}` package, we calculate the upper and lower bounds of the confidence interval. ``` r -ERP_plotdata <- MeanERPs %>% - pivot_longer(-c(ID,Condition), names_to = "Time", values_to = "Amplitude") %>% - mutate(Time = as.numeric(Time)) %>% - group_by(Condition,Time) %>% +ERP_plotdata <- MeanERPs |> + pivot_longer(-c(ID,Condition), names_to = "Time", values_to = "Amplitude") |> + mutate(Time = as.numeric(Time)) |> + group_by(Condition,Time) |> summarise(Mean_Amplitude = mean(Amplitude), CIlower = Rmisc::CI(Amplitude, ci = 0.95)["lower"], CIupper = Rmisc::CI(Amplitude, ci = 0.95)["upper"]) diff --git a/content/blog/2019-how-i-plot-erps-in-r/index.qmd b/content/blog/2019-how-i-plot-erps-in-r/index.qmd index 92cd792..aebeb37 100644 --- a/content/blog/2019-how-i-plot-erps-in-r/index.qmd +++ b/content/blog/2019-how-i-plot-erps-in-r/index.qmd @@ -1,7 +1,7 @@ --- title: How I Plot ERPs in R -author: Daniel Roelfs -date: '2019-03-30' +date: 2019-03-30 +description: How I Plot ERPs in R slug: how-i-plot-erps-in-r categories: - coding @@ -9,9 +9,6 @@ tags: - R - ggplot - ERP -description: 'How I Plot ERPs in R' -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -70,16 +67,16 @@ Since I didn't include any headers in my file, I rename them here. I give the th ```{r} #| label: prep-data -ERPdata <- data %>% +ERPdata <- data |> rename(Channel = V1, ID = V2, - Condition = V3) %>% + Condition = V3) |> mutate(ID = factor(ID), Condition = factor(Condition)) oldnames <- sprintf("V%s", 1:ncol(times) + 3) -ERPdata <- ERPdata %>% +ERPdata <- ERPdata |> rename_at(vars(all_of(oldnames)), ~ as.character(times)) ``` @@ -87,7 +84,7 @@ ERPdata <- ERPdata %>% #| label: remove-empty-col #| echo: false -ERPdata <- ERPdata %>% +ERPdata <- ERPdata |> select(-V824) ``` @@ -106,15 +103,15 @@ Then I calculate the means across channels and conditions. This goes in two step ```{r} #| label: calculate-grand-averages -ERPdata_mChan <- ERPdata %>% - filter(Channel %in% coi) %>% +ERPdata_mChan <- ERPdata |> + filter(Channel %in% coi) |> group_by(ID,Condition,Channel) %>% - summarise_at(vars(names(.)[4:ncol(.)]), list(~ mean(., na.rm = TRUE))) %>% + summarise_at(vars(names(.)[4:ncol(.)]), list(~ mean(., na.rm = TRUE))) |> ungroup() -ERPdata_mCond <- ERPdata_mChan %>% +ERPdata_mCond <- ERPdata_mChan |> group_by(ID,Condition) %>% - summarise_at(vars(names(.)[4:ncol(.)]), list(~ mean(., na.rm = TRUE))) %>% + summarise_at(vars(names(.)[4:ncol(.)]), list(~ mean(., na.rm = TRUE))) |> ungroup() MeanERPs <- ERPdata_mCond @@ -127,10 +124,10 @@ The next piece of code calculates the grand average. 
I will also calculate the c ```{r} #| label: long-format -ERP_plotdata <- MeanERPs %>% - pivot_longer(-c(ID,Condition), names_to = "Time", values_to = "Amplitude") %>% - mutate(Time = as.numeric(Time)) %>% - group_by(Condition,Time) %>% +ERP_plotdata <- MeanERPs |> + pivot_longer(-c(ID,Condition), names_to = "Time", values_to = "Amplitude") |> + mutate(Time = as.numeric(Time)) |> + group_by(Condition,Time) |> summarise(Mean_Amplitude = mean(Amplitude), CIlower = Rmisc::CI(Amplitude, ci = 0.95)["lower"], CIupper = Rmisc::CI(Amplitude, ci = 0.95)["upper"]) diff --git a/content/blog/2019-how-i-plot-erps-in-r/index_files/figure-gfm/erp-plot-1.png b/content/blog/2019-how-i-plot-erps-in-r/index_files/figure-gfm/erp-plot-1.png deleted file mode 100644 index 521466c..0000000 Binary files a/content/blog/2019-how-i-plot-erps-in-r/index_files/figure-gfm/erp-plot-1.png and /dev/null differ diff --git a/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.md b/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.md index 6a2747d..5cf53be 100644 --- a/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.md +++ b/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.md @@ -1,16 +1,13 @@ --- title: How I Make QQ Plots Using ggplot -author: Daniel Roelfs -date: '2020-04-10' +date: 2020-04-10 +description: How I Make QQ Plots Using ggplot slug: how-i-make-qq-plots-using-ggplot categories: - coding tags: - ggplot - R -description: 'How I Make QQ Plots Using ggplot' -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -77,11 +74,11 @@ Now we have a data frame that contains all the information we need for plotting. Since the majority of SNPs have a p-value that's between 1 and 0.01, plotting all these individual SNPs is unnecessary. Similar to what one would do in a Manhattan plot, we're going to filter out a large number of SNPs that have a p-value higher than 0.01. Recall that the -log10 of 0.01 is equal to 2, so we're going to take a subset of all expected values equal to or lower than 2. If this step causes some issue due to inflation etc., it will be easily visible in the plot later. 
``` r -plotdata_sub <- plotdata %>% - filter(expected <= 2) %>% +plotdata_sub <- plotdata |> + filter(expected <= 2) |> sample_frac(0.01) -plotdata_sup <- plotdata %>% +plotdata_sup <- plotdata |> filter(expected > 2) plotdata_small <- rbind(plotdata_sub, plotdata_sup) diff --git a/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.qmd b/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.qmd index db8223a..6e33364 100644 --- a/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.qmd +++ b/content/blog/2020-how-i-make-qq-plots-using-ggplot/index.qmd @@ -1,16 +1,13 @@ --- title: How I Make QQ Plots Using ggplot -author: Daniel Roelfs -date: '2020-04-10' +date: 2020-04-10 +description: How I Make QQ Plots Using ggplot slug: how-i-make-qq-plots-using-ggplot categories: - coding tags: - ggplot - R -description: 'How I Make QQ Plots Using ggplot' -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -75,11 +72,11 @@ Since the majority of SNPs have a p-value that's between 1 and 0.01, plotting al ```{r} #| label: reduce-size -plotdata_sub <- plotdata %>% - filter(expected <= 2) %>% +plotdata_sub <- plotdata |> + filter(expected <= 2) |> sample_frac(0.01) -plotdata_sup <- plotdata %>% +plotdata_sup <- plotdata |> filter(expected > 2) plotdata_small <- rbind(plotdata_sub, plotdata_sup) diff --git a/content/blog/2020-how-i-make-qq-plots-using-ggplot/index_files/figure-gfm/print-plot-1.png b/content/blog/2020-how-i-make-qq-plots-using-ggplot/index_files/figure-gfm/print-plot-1.png deleted file mode 100644 index b7b60df..0000000 Binary files a/content/blog/2020-how-i-make-qq-plots-using-ggplot/index_files/figure-gfm/print-plot-1.png and /dev/null differ diff --git a/content/blog/2020-plotting-star-destroyers-in-r/index.md b/content/blog/2020-plotting-star-destroyers-in-r/index.md index b9baf17..1588555 100644 --- a/content/blog/2020-plotting-star-destroyers-in-r/index.md +++ b/content/blog/2020-plotting-star-destroyers-in-r/index.md @@ -1,16 +1,13 @@ --- title: Plotting Star Destroyers in R -author: Daniel Roelfs -date: '2020-04-17' +date: 2020-04-17 +description: Plotting Star Destroyers in R slug: plotting-star-destroyers-in-r categories: - coding tags: - ggplot - R -description: 'Plotting Star Destroyers in R' -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -140,7 +137,14 @@ And that's balanced ternary enumeration. So obviously we are lazy, and don't want to do this process manually for thousands of values. that's why we're going to code it. For this step I translated some Python code to R syntax. The function I wrote to do one step of balanced ternary enumartion is shown below. The first value is always 0 (since it's a 0 in the first position, hence `$1*0 = 0$`). After this, we can incorporate the two steps (of converting into ternary and the enumeration) into one. The formula for this looks like this: -`$$\begin{aligned} a(0) &= 0\\ a(3n) &= 3 * a(n)\\ a(3n + 1) &= 3 * a(n) + 1\\ a(3n + 2) &= 3 * a(n) - 1\\ \end{aligned}$$` +$$ +\begin{aligned} +a(0) &= 0\newline +a(3n) &= 3 * a(n)\newline +a(3n + 1) &= 3 * a(n) + 1\newline +a(3n + 2) &= 3 * a(n) - 1 +\end{aligned} +$$ The Python code for this function came from a website that collects mathematical functions and sequences and can be found [here](https://oeis.org/A117966). I've adapted it to work in R. Since 0 will always result in 0, this is hard coded in. 
Afterwards it is a nested function (I know, we all love it) where it iteratively calls itself until the input to the function is 0 and it stops. At that point we have out balanced ternary. This function only performs the calculation for one value. So getting a sequence means putting it in a `map()` function. @@ -169,7 +173,7 @@ BTE(3) [1] 3 -Let's also look at a few other examples using the `map()` function again. Since the `BTE()` function outputs only integers, I use `map_dbl()`. Let's input a few exampels: +Let's also look at a few other examples using the `map()` function again. Since the `BTE()` function outputs only integers, I use `map_dbl()`. Let's input a few examples: ``` r example_vector <- c(0,seq(10),500,1500,9999) diff --git a/content/blog/2020-plotting-star-destroyers-in-r/index.qmd b/content/blog/2020-plotting-star-destroyers-in-r/index.qmd index 80ef2df..4b749ac 100644 --- a/content/blog/2020-plotting-star-destroyers-in-r/index.qmd +++ b/content/blog/2020-plotting-star-destroyers-in-r/index.qmd @@ -1,16 +1,13 @@ --- title: Plotting Star Destroyers in R -author: Daniel Roelfs -date: '2020-04-17' +date: 2020-04-17 +description: Plotting Star Destroyers in R slug: plotting-star-destroyers-in-r categories: - coding tags: - ggplot - R -description: 'Plotting Star Destroyers in R' -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -34,6 +31,8 @@ In a ternary system, this same principle applies as in the binary system, except #| message: false library(tidyverse) + +reticulate::use_virtualenv("./.venv") ``` ```{r} @@ -77,7 +76,7 @@ illust_table <- tibble( knitr::kable(illust_table, align = "ccc", - latex_header_includes = c("\\renewcommand{\\arraystretch}{1}")) %>% + latex_header_includes = c("\\renewcommand{\\arraystretch}{1}")) |> kableExtra::kable_styling(position = "center") ``` @@ -160,12 +159,14 @@ And that's balanced ternary enumeration. So obviously we are lazy, and don't want to do this process manually for thousands of values. that's why we're going to code it. For this step I translated some Python code to R syntax. The function I wrote to do one step of balanced ternary enumartion is shown below. The first value is always 0 (since it's a 0 in the first position, hence `$1*0 = 0$`). After this, we can incorporate the two steps (of converting into ternary and the enumeration) into one. The formula for this looks like this: -`$$\begin{aligned} -a(0) &= 0\\ -a(3n) &= 3 * a(n)\\ -a(3n + 1) &= 3 * a(n) + 1\\ -a(3n + 2) &= 3 * a(n) - 1\\ -\end{aligned}$$` +$$ +\begin{aligned} +a(0) &= 0\newline +a(3n) &= 3 * a(n)\newline +a(3n + 1) &= 3 * a(n) + 1\newline +a(3n + 2) &= 3 * a(n) - 1 +\end{aligned} +$$ The Python code for this function came from a website that collects mathematical functions and sequences and can be found [here](https://oeis.org/A117966). I've adapted it to work in R. Since 0 will always result in 0, this is hard coded in. Afterwards it is a nested function (I know, we all love it) where it iteratively calls itself until the input to the function is 0 and it stops. At that point we have out balanced ternary. This function only performs the calculation for one value. So getting a sequence means putting it in a `map()` function. @@ -211,7 +212,7 @@ If we plug the number 3 into the formula, we will get the same result: BTE(3) ``` -Let's also look at a few other examples using the `map()` function again. Since the `BTE()` function outputs only integers, I use `map_dbl()`. 
Let's input a few exampels: +Let's also look at a few other examples using the `map()` function again. Since the `BTE()` function outputs only integers, I use `map_dbl()`. Let's input a few examples: ```{r} #| label: bte-vector diff --git a/content/blog/2020-plotting-star-destroyers-in-r/index_files/figure-gfm/plot-castle-1.png b/content/blog/2020-plotting-star-destroyers-in-r/index_files/figure-gfm/plot-castle-1.png deleted file mode 100644 index 4fd854b..0000000 Binary files a/content/blog/2020-plotting-star-destroyers-in-r/index_files/figure-gfm/plot-castle-1.png and /dev/null differ diff --git a/content/blog/2020-plotting-star-destroyers-in-r/index_files/figure-gfm/plot-pretty-1.png b/content/blog/2020-plotting-star-destroyers-in-r/index_files/figure-gfm/plot-pretty-1.png deleted file mode 100644 index b11aff3..0000000 Binary files a/content/blog/2020-plotting-star-destroyers-in-r/index_files/figure-gfm/plot-pretty-1.png and /dev/null differ diff --git a/content/blog/2020-plotting-star-destroyers-in-r/index_files/figure-gfm/sw-minimal-1.png b/content/blog/2020-plotting-star-destroyers-in-r/index_files/figure-gfm/sw-minimal-1.png deleted file mode 100644 index 6474fa4..0000000 Binary files a/content/blog/2020-plotting-star-destroyers-in-r/index_files/figure-gfm/sw-minimal-1.png and /dev/null differ diff --git a/content/blog/2020-running-an-ica-on-questionnaires/index.markdown_strict_files/figure-markdown_strict/plot-wmatrix-nolabs-1.png b/content/blog/2020-running-an-ica-on-questionnaires/index.markdown_strict_files/figure-markdown_strict/plot-wmatrix-nolabs-1.png index 8759733..c3fc090 100644 Binary files a/content/blog/2020-running-an-ica-on-questionnaires/index.markdown_strict_files/figure-markdown_strict/plot-wmatrix-nolabs-1.png and b/content/blog/2020-running-an-ica-on-questionnaires/index.markdown_strict_files/figure-markdown_strict/plot-wmatrix-nolabs-1.png differ diff --git a/content/blog/2020-running-an-ica-on-questionnaires/index.md b/content/blog/2020-running-an-ica-on-questionnaires/index.md index ee16b46..a36a7df 100644 --- a/content/blog/2020-running-an-ica-on-questionnaires/index.md +++ b/content/blog/2020-running-an-ica-on-questionnaires/index.md @@ -1,7 +1,7 @@ --- title: Running an ICA on Questionnaires -author: Daniel Roelfs -date: '2020-10-24' +date: 2020-10-24 +description: Running an ICA on Questionnaires slug: running-an-ica-on-questionnaires categories: - coding @@ -9,9 +9,6 @@ tags: - statistics - R - ICA -description: 'Running an ICA on Questionnaires' -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -44,10 +41,10 @@ library(fastICA) There are two files we need, one with the actual data (we'll call this `loaddata`), and one with the list of questions (we'll call this `loadcodes`). ``` r -loaddata <- read_delim("data.csv", delim = "\t") %>% +loaddata <- read_delim("data.csv", delim = "\t") |> mutate(id = row_number()) -loadcodes <- read_delim("codebook_clean.txt", delim = "\t", col_names = FALSE) %>% +loadcodes <- read_delim("codebook_clean.txt", delim = "\t", col_names = FALSE) |> rename(qnum = X1, question = X2) ``` @@ -59,17 +56,17 @@ After cleaning, we still have more than 250.000 individual records left. This is Next, we want to prune the data. We want to exclude questions that have a low degree of variance and we might want to remove or impute questions with a large number of `NA`s. Our first step is to get the data in a format that's easier to work with, in this case, the long format. 
We'll select all questions, and put the question number in a column called `question` and the values in a column called `score`. ``` r -questdata <- loaddata %>% - select(starts_with("Q")) %>% +questdata <- loaddata |> + select(starts_with("Q")) |> pivot_longer(cols = everything(), names_to = "question", values_to = "score") ``` We'll first find if there's any unanswered questions. These are rows where `score` is `NA`. ``` r -questdata %>% - filter(is.na(score)) %>% - nrow(.) +questdata |> + filter(is.na(score)) |> + nrow() ``` [1] 0 @@ -79,13 +76,13 @@ We find that there's no missing values (crazy, I know!). If there were any missi The next step is to find if there are questions with insufficient variation in answers. For this we'll calculate the percentage that each answer on the Likert scale represents in the total variance of the dataset. We'll take our long-format data, group by the question and the answers on the question (`score`). Count the number of times that answer has been given to that question (using `count()`). Then we'll calculate the percentage that this answer represented within each question (using `perc = n / sum(n)`). Then we'll sort the answers within the questions on the percentage (with the answer that's been answered the most on top, and the answer that's answered the least on the bottom (`arrange(question, -perc)`)). Then we'll take the second-most common answer (`slice(2)`) and select the questions where the second-most answer represented less than 15% of the answers (`perc < 0.15`). The threshold of 15% is fairly arbitrary. Usually I'd go for 10%, but due to the source of the data, I'm being a bit more stringent here. The output from these steps will give us the questions that we might want to exclude due to low variance. ``` r -less15var <- questdata %>% - group_by(question, score) %>% - count() %>% - group_by(question) %>% - mutate(perc = n / sum(n)) %>% - arrange(question, -perc) %>% - slice(2) %>% +less15var <- questdata |> + group_by(question, score) |> + count() |> + group_by(question) |> + mutate(perc = n / sum(n)) |> + arrange(question, -perc) |> + slice(2) |> filter(perc < 0.15) print(less15var) @@ -98,21 +95,21 @@ print(less15var) So no question has too little variance. If there was, I'd remove that question like so: ``` r -data <- loaddata %>% +data <- loaddata |> select(-less15var$question) ``` Next we want to normalize the data. Usually you'd do this to ensure that all answers on a questionnaire are in the same domain. Let's imagine a scenario where some questions are scored from 0 to 10, others are scored from 0 to 5, and a third is scored from 80 to 120. The ICA is then biased towards questions that have larger values, like the third question in the example above. That's why we want to normalize the data. The statistical term for this is z-score normalization. The defining property of z-scored transformed data is that the mean is 0 and the standard deviation is one. The z-score transformation is obtained by subtracting the mean from each individual value and then dividing by the standard deviation. This is implemented in R with the `scale()` function. We'll also check if it worked afterwards. 
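For reference, the transformation described here is just the standard z-score, computed per question:

$$
z_{i} = \frac{x_{i} - \bar{x}}{s}
$$

where `$\bar{x}$` is the mean score on that question and `$s$` its standard deviation.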
``` r -data <- data %>% - pivot_longer(starts_with("Q"), names_to = "qnum", values_to = "score") %>% - group_by(qnum) %>% +data <- data |> + pivot_longer(starts_with("Q"), names_to = "qnum", values_to = "score") |> + group_by(qnum) |> mutate(score_z = scale(score)) -data %>% - ungroup() %>% - select(score_z) %>% +data |> + ungroup() |> + select(score_z) |> summarise(mean = mean(score_z), sd = sd(score_z), min = min(score_z), @@ -133,9 +130,9 @@ There's two clustering approaches we'll use. One will actually help us do the ot The PCA is implemented in the `prcomp()` function. This function doesn't accept a data frame, but instead it requires a matrix. So we'll have to make that first. We'll transform the long-format data that we created earlier back into wide format (using `pivot_wider()`), then select only the columns that we want to include in the analysis (i.e. the questions (`select(starts_with("Q"))`)), and then we'll turn it into a matrix (using `as.matrix()`). Then we'll put the resulting matrix into the `prcomp()` function. ``` r -mat <- data %>% - pivot_wider(names_from = "qnum", values_from = "score_z", id_cols = "id") %>% - select(starts_with("Q")) %>% +mat <- data |> + pivot_wider(names_from = "qnum", values_from = "score_z", id_cols = "id") |> + select(starts_with("Q")) |> as.matrix() pca_data <- prcomp(mat) @@ -172,8 +169,8 @@ ggplot(pca_stats, aes(x = seq(var), y = var)) + It can be a bit hard to see from the plot how many components have an eigenvalue larger than 1. But we can calulate it. The number of components with an eigenvalue larger than 1 will be the number of independent components we'll request from the ICA. ``` r -nICs <- pca_stats %>% - filter(var > 1) %>% +nICs <- pca_stats |> + filter(var > 1) |> nrow() print(nICs) ``` @@ -201,9 +198,9 @@ glimpse(ica_model) List of 5 $ X: num [1:14955, 1:26] -0.8724 0.0298 0.932 0.932 0.0298 ... $ K: num [1:26, 1:6] -0.0843 -0.0791 -0.0755 -0.0875 -0.0968 ... - $ W: num [1:6, 1:6] 0.7508 -0.0758 0.1641 0.5859 0.245 ... - $ A: num [1:6, 1:26] -0.283 0.13 -0.284 -0.142 0.58 ... - $ S: num [1:14955, 1:6] -1.037 -0.541 -1.086 -0.663 -0.431 ... + $ W: num [1:6, 1:6] 0.5122 0.2571 0.2286 -0.744 0.0655 ... + $ A: num [1:6, 1:26] -0.283 0.131 -0.387 0.127 -0.583 ... + $ S: num [1:14955, 1:6] -0.3649 1.0887 -1.2281 -0.1592 -0.0618 ... These names aren't very informative. In this function, `X` represens the pre-processed data matrix, `K` is the pre-whitening matrix, `W` is the estimated un-mixing matrix, `A` is the estimating mixing matrix (the loadings of the items on the independent components), and `S` is the source matrix (the individual IC loadings for all participants). 
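To keep the matrices straight, here is a small orientation sketch of my own (the dimensions are taken from the `glimpse()` output above; the reconstruction comment reflects the usual `{fastICA}` conventions as I understand them, so treat it as a sketch rather than the post's code):

``` r
# dimensions from the glimpse() output above
# (14 955 respondents, 26 questions, 6 independent components)
dim(ica_model$X)  # 14955 x 26: pre-processed data matrix
dim(ica_model$K)  # 26 x 6:     pre-whitening matrix
dim(ica_model$W)  # 6 x 6:      estimated un-mixing matrix
dim(ica_model$A)  # 6 x 26:     estimated mixing matrix (loadings of questions on ICs)
dim(ica_model$S)  # 14955 x 6:  estimated source matrix (IC scores per participant)

# under the ICA model the data are a linear mix of the sources,
# so S %*% A should approximately reconstruct the pre-processed data
dim(ica_model$S %*% ica_model$A)  # 14955 x 26
```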
@@ -214,18 +211,18 @@ cor(ica_model$S) ``` [,1] [,2] [,3] [,4] [,5] - [1,] 1.000000e+00 -1.416697e-14 4.240505e-15 2.487685e-14 4.087139e-16 - [2,] -1.416697e-14 1.000000e+00 1.135678e-14 1.159897e-14 1.131390e-15 - [3,] 4.240505e-15 1.135678e-14 1.000000e+00 -3.638422e-14 8.887895e-15 - [4,] 2.487685e-14 1.159897e-14 -3.638422e-14 1.000000e+00 -1.976354e-15 - [5,] 4.087139e-16 1.131390e-15 8.887895e-15 -1.976354e-15 1.000000e+00 - [6,] 3.711451e-15 -7.321958e-15 1.306657e-14 2.869582e-14 2.511484e-14 + [1,] 1.000000e+00 1.008510e-14 -1.701048e-14 3.537856e-14 -8.652989e-15 + [2,] 1.008510e-14 1.000000e+00 9.286019e-15 -9.827378e-15 -1.197023e-15 + [3,] -1.701048e-14 9.286019e-15 1.000000e+00 2.328882e-14 2.402569e-14 + [4,] 3.537856e-14 -9.827378e-15 2.328882e-14 1.000000e+00 -3.206080e-15 + [5,] -8.652989e-15 -1.197023e-15 2.402569e-14 -3.206080e-15 1.000000e+00 + [6,] 1.354061e-15 -6.944827e-15 -1.626536e-15 -2.241246e-14 -3.859294e-15 [,6] - [1,] 3.711451e-15 - [2,] -7.321958e-15 - [3,] 1.306657e-14 - [4,] 2.869582e-14 - [5,] 2.511484e-14 + [1,] 1.354061e-15 + [2,] -6.944827e-15 + [3,] -1.626536e-15 + [4,] -2.241246e-14 + [5,] -3.859294e-15 [6,] 1.000000e+00 The data we're interested in for now is the estimated mixing matrix (stored in `ica_model$A`). This is the loadings of the individual questions (i.e. the columns of the matrix) that we put in the ICA algorithm on the independent components. For any further analyses with individual data, you'd take the estimated source matrix, which is the loading of each independent component on each individual (i.e. the rows of the matrix). @@ -248,16 +245,16 @@ plot(clust) What we can see from the dendrogram is that for instance question 16 (*"I gravitate towards introspection"*) and question 17 (*"I am more comfortable interacting online than in person"*) on the left are very similar. Question 3 (*"I like to play RPGs. (Ex. D&D)"*) is somewhat similar, but not as much as question 16 and 17. The same goes for question 1 and 4, question 11 and 19, and so on. ``` r -codes <- loadcodes %>% +codes <- loadcodes |> filter(str_detect(qnum, "Q")) ``` Next we'll plot the weight matrix. For that we first create a column with the question number, we'll then merge the matrix with the list of questions so we can plot the actual question asked instead of the question number. Next, we'll put the data frame in a long format. Finally, we'll also reorder the questions to match the order determined by the hierarchical clustering. 
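One note: the `weight_matrix` object used in the next chunk isn't constructed in the code shown here. A minimal sketch of how it could be derived from the ICA output (my assumption, not necessarily the exact code used for the post) is to transpose the mixing matrix so that rows are questions and columns are components:

``` r
# hypothetical reconstruction: turn the 6 x 26 mixing matrix into a
# 26-row data frame with one column per independent component (IC1-IC6)
weight_matrix <- ica_model$A |>
  t() |>
  as.data.frame() |>
  setNames(paste0("IC", 1:6)) |>
  as_tibble()
```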
``` r -weight_matrix_long <- weight_matrix %>% - mutate(qnum = sprintf("Q%s",row_number())) %>% - inner_join(codes, by = "qnum") %>% +weight_matrix_long <- weight_matrix |> + mutate(qnum = str_glue("Q{row_number()}")) |> + inner_join(codes, by = "qnum") |> pivot_longer(cols = starts_with("IC"), names_to = "IC", values_to = "loading") %>% mutate(question = as_factor(question), question = fct_relevel(question, unique(.$question)[clust$order])) diff --git a/content/blog/2020-running-an-ica-on-questionnaires/index.qmd b/content/blog/2020-running-an-ica-on-questionnaires/index.qmd index 9ccec32..66dbae1 100644 --- a/content/blog/2020-running-an-ica-on-questionnaires/index.qmd +++ b/content/blog/2020-running-an-ica-on-questionnaires/index.qmd @@ -1,7 +1,7 @@ --- title: Running an ICA on Questionnaires -author: Daniel Roelfs -date: '2020-10-24' +date: 2020-10-24 +description: Running an ICA on Questionnaires slug: running-an-ica-on-questionnaires categories: - coding @@ -9,9 +9,6 @@ tags: - statistics - R - ICA -description: 'Running an ICA on Questionnaires' -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -51,10 +48,10 @@ There are two files we need, one with the actual data (we'll call this `loaddata #| message: false #| warning: false -loaddata <- read_delim("data.csv", delim = "\t") %>% +loaddata <- read_delim("data.csv", delim = "\t") |> mutate(id = row_number()) -loadcodes <- read_delim("codebook_clean.txt", delim = "\t", col_names = FALSE) %>% +loadcodes <- read_delim("codebook_clean.txt", delim = "\t", col_names = FALSE) |> rename(qnum = X1, question = X2) ``` @@ -68,8 +65,8 @@ Next, we want to prune the data. We want to exclude questions that have a low de ```{r} #| label: make-long -questdata <- loaddata %>% - select(starts_with("Q")) %>% +questdata <- loaddata |> + select(starts_with("Q")) |> pivot_longer(cols = everything(), names_to = "question", values_to = "score") ``` @@ -78,9 +75,9 @@ We'll first find if there's any unanswered questions. These are rows where `scor ```{r} #| label: find-nas -questdata %>% - filter(is.na(score)) %>% - nrow(.) +questdata |> + filter(is.na(score)) |> + nrow() ``` We find that there's no missing values (crazy, I know!). If there were any missing values we could possibly impute the scores in the original (wide) data (e.g. using k-nearest neighbor imputation) or we could remove these items (e.g. using `drop_na()`). @@ -90,13 +87,13 @@ The next step is to find if there are questions with insufficient variation in a ```{r} #| label: excl-vars -less15var <- questdata %>% - group_by(question, score) %>% - count() %>% - group_by(question) %>% - mutate(perc = n / sum(n)) %>% - arrange(question, -perc) %>% - slice(2) %>% +less15var <- questdata |> + group_by(question, score) |> + count() |> + group_by(question) |> + mutate(perc = n / sum(n)) |> + arrange(question, -perc) |> + slice(2) |> filter(perc < 0.15) print(less15var) @@ -107,7 +104,7 @@ So no question has too little variance. If there was, I'd remove that question l ```{r} #| label: remove-low-var -data <- loaddata %>% +data <- loaddata |> select(-less15var$question) ``` @@ -116,14 +113,14 @@ Next we want to normalize the data. 
Usually you'd do this to ensure that all ans ```{r} #| label: normalize-data -data <- data %>% - pivot_longer(starts_with("Q"), names_to = "qnum", values_to = "score") %>% - group_by(qnum) %>% +data <- data |> + pivot_longer(starts_with("Q"), names_to = "qnum", values_to = "score") |> + group_by(qnum) |> mutate(score_z = scale(score)) -data %>% - ungroup() %>% - select(score_z) %>% +data |> + ungroup() |> + select(score_z) |> summarise(mean = mean(score_z), sd = sd(score_z), min = min(score_z), @@ -141,9 +138,9 @@ The PCA is implemented in the `prcomp()` function. This function doesn't accept ```{r} #| label: run-pca -mat <- data %>% - pivot_wider(names_from = "qnum", values_from = "score_z", id_cols = "id") %>% - select(starts_with("Q")) %>% +mat <- data |> + pivot_wider(names_from = "qnum", values_from = "score_z", id_cols = "id") |> + select(starts_with("Q")) |> as.matrix() pca_data <- prcomp(mat) @@ -181,8 +178,8 @@ It can be a bit hard to see from the plot how many components have an eigenvalue ```{r} #| label: get-n-ics -nICs <- pca_stats %>% - filter(var > 1) %>% +nICs <- pca_stats |> + filter(var > 1) |> nrow() print(nICs) ``` @@ -241,7 +238,7 @@ What we can see from the dendrogram is that for instance question 16 (_"I gravit ```{r} #| label: select-qs -codes <- loadcodes %>% +codes <- loadcodes |> filter(str_detect(qnum, "Q")) ``` @@ -250,9 +247,9 @@ Next we'll plot the weight matrix. For that we first create a column with the qu ```{r} #| label: create-plotdata -weight_matrix_long <- weight_matrix %>% - mutate(qnum = sprintf("Q%s",row_number())) %>% - inner_join(codes, by = "qnum") %>% +weight_matrix_long <- weight_matrix |> + mutate(qnum = str_glue("Q{row_number()}")) |> + inner_join(codes, by = "qnum") |> pivot_longer(cols = starts_with("IC"), names_to = "IC", values_to = "loading") %>% mutate(question = as_factor(question), question = fct_relevel(question, unique(.$question)[clust$order])) diff --git a/content/blog/2020-running-an-ica-on-questionnaires/index_files/figure-gfm/plot-var-1.png b/content/blog/2020-running-an-ica-on-questionnaires/index_files/figure-gfm/plot-var-1.png deleted file mode 100644 index ebaa82b..0000000 Binary files a/content/blog/2020-running-an-ica-on-questionnaires/index_files/figure-gfm/plot-var-1.png and /dev/null differ diff --git a/content/blog/2020-running-an-ica-on-questionnaires/index_files/figure-gfm/plot-wmatrix-nolabs-1.png b/content/blog/2020-running-an-ica-on-questionnaires/index_files/figure-gfm/plot-wmatrix-nolabs-1.png deleted file mode 100644 index fe06d4e..0000000 Binary files a/content/blog/2020-running-an-ica-on-questionnaires/index_files/figure-gfm/plot-wmatrix-nolabs-1.png and /dev/null differ diff --git a/content/blog/2020-running-an-ica-on-questionnaires/index_files/figure-gfm/show-hclust-1.png b/content/blog/2020-running-an-ica-on-questionnaires/index_files/figure-gfm/show-hclust-1.png deleted file mode 100644 index 518d4bc..0000000 Binary files a/content/blog/2020-running-an-ica-on-questionnaires/index_files/figure-gfm/show-hclust-1.png and /dev/null differ diff --git a/content/blog/2021-amsterdam-housing-market/index.md b/content/blog/2021-amsterdam-housing-market/index.md index 0c92886..5e4c4fd 100644 --- a/content/blog/2021-amsterdam-housing-market/index.md +++ b/content/blog/2021-amsterdam-housing-market/index.md @@ -1,22 +1,18 @@ --- title: Visualizing the State of the Amsterdam Housing Market -author: Daniel Roelfs -date: "2021-06-20" +date: 2021-06-20 +description: Visualizing the State of the Amsterdam Housing Market 
slug: visualizing-the-state-of-the-amsterdam-housing-market categories: - society tags: - ggplot - society -description: "Visualizing the State of the Amsterdam Housing Market" -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 - fig.align: center + fig.align: left fig.show: hold results: hold - out.width: 80% dev.args: list(bg = "#EBEBEB") --- @@ -49,6 +45,9 @@ Of all the worrying events in 2020, Millenials and Gen Z kids ranked financial i This will be a fairly basic data analysis and visualization post, I can't claim that this is an exhaustive list and that I didn't miss some nuances, but I'll give it . I collected some data from the Amsterdam Real Estate Association ([Makelaars Vereniging Amsterdam; MVA](https://www.mva.nl)) and statistics from the Central Bureau for Statistics ([Centraal Bureau for de Statistiek; CBS](https://www.cbs.nl)). As usual, I'll be using `{tidyverse}` *a lot*. I've also recently started using the `{ggtext}` package to manage the text elements in my plots, inspired by Cédric Scherer ([@CedScherer](https://twitter.com/CedScherer)). I'll use the `{gt}` package to spice up some of the tables, and `{patchwork}` for arranging plots. I'll use the `{cbsodataR}` to download data from the Central Bureau for Statistics ([CBS](https://www.cbs.nl)). +
+Show code + ``` r library(tidyverse) library(ggtext) @@ -77,63 +76,68 @@ theme_set(ggthemes::theme_economist(base_family = "nunito-sans") + legend.text = element_text(size = 10))) ``` +
+ +

### Getting the data

The first piece of data I'll use comes from the Amsterdam Real Estate Association. They publish quarterly data on a number of variables about the Amsterdam housing market ([link](https://www.mva.nl/over-de-mva/mva/kwartaalcijfers)), including asking price, final price paid, number of properties put on the market, number of properties sold and a few more, going back to the first quarter of 2012. *Obviously*, these numbers all come in pdf-format, because the people writing quarterly reports apparently have a *massive hatred* towards people that want to analyze this data. I downloaded the reports, used the online tool [PDFTables](https://pdftables.com) to convert them to Excel, and then stitched the tables together manually. Of course (remember the authors have a *massive hatred* towards us), the formatting of the numbers in the tables wasn't consistent between different quarterly reports, so I had to do some cleaning in R. I put each table in a separate sheet and then used functionality from the `{readxl}` package to load each table into a different variable and then do the cleaning per table. This was a bit cumbersome. Since this will probably be a very long post, I'll hide the code, but you can see it in the [Rmarkdown file](https://github.com/danielroelfs/danielroelfs.com/blob/main/content/blog/2020-running-an-ica-on-questionnaires/index.Rmd) (at least until I figure out how to get code folding to work on my laptop). You can look at the code I used to load and merge the files. It's a bit of a mess: +
+Show code + ``` r -asking_price <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 2) %>% - janitor::clean_names() %>% +asking_price <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 2) |> + janitor::clean_names() |> mutate(type_woning = as_factor(type_woning), across(where(is.double), ~ .x * 1e3), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) %>% + across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> pivot_longer(starts_with("x"), names_to = "date", values_to = "asking_price") -transaction_price <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 3) %>% - janitor::clean_names() %>% +transaction_price <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 3) |> + janitor::clean_names() |> mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) %>% + across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> pivot_longer(starts_with("x"), names_to = "date", values_to = "transaction_price") -price_per_m2 <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 4) %>% - janitor::clean_names() %>% +price_per_m2 <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 4) |> + janitor::clean_names() |> mutate(type_woning = as_factor(type_woning), across(where(is.double), ~ .x * 1e3), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) %>% + across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> pivot_longer(starts_with("x"), names_to = "date", values_to = "price_per_m2") -n_offered <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 5) %>% - janitor::clean_names() %>% +n_offered <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 5) |> + janitor::clean_names() |> mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) %>% + across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> pivot_longer(starts_with("x"), names_to = "date", values_to = "n_offered") -n_sold <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 6) %>% - janitor::clean_names() %>% +n_sold <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 6) |> + janitor::clean_names() |> mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) %>% + across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> pivot_longer(starts_with("x"), names_to = "date", values_to = "n_sold") -mortgage_months <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 7) %>% - janitor::clean_names() %>% +mortgage_months <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 7) |> + janitor::clean_names() |> mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) %>% + across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> pivot_longer(starts_with("x"), names_to = "date", values_to = "mortgage_months") -tightness_index <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 8) %>% - janitor::clean_names() %>% +tightness_index <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 8) |> + janitor::clean_names() |> mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_replace_all(.x, ",", ".")))) %>% + across(where(is.character), ~ parse_number(str_replace_all(.x, ",", ".")))) |> pivot_longer(starts_with("x"), 
names_to = "date", values_to = "tightness_index") -data_merged <- inner_join(asking_price, transaction_price) %>% - inner_join(price_per_m2) %>% - inner_join(n_offered) %>% - inner_join(n_sold) %>% - inner_join(mortgage_months) %>% - inner_join(tightness_index) %>% +data_merged <- inner_join(asking_price, transaction_price) |> + inner_join(price_per_m2) |> + inner_join(n_offered) |> + inner_join(n_sold) |> + inner_join(mortgage_months) |> + inner_join(tightness_index) |> mutate(asking_price = ifelse(asking_price < 1e5, asking_price * 1e3, asking_price), transaction_price = ifelse(transaction_price < 1e5, transaction_price * 1e3, transaction_price), price_per_m2 = ifelse(price_per_m2 > 1e4, price_per_m2 / 1e3, price_per_m2)) @@ -141,14 +145,21 @@ data_merged <- inner_join(asking_price, transaction_price) %>% write_rds(data_merged, "data_merged.rds") ``` +
+ Let's have a look at the dataset. +
+Show code + ``` r data_merged <- read_rds("data_merged.rds") glimpse(data_merged) ``` +
+ Rows: 259 Columns: 9 $ type_woning Tussenwoning, Tussenwoning, Tussenwoning, Tussenwoni… @@ -163,9 +174,12 @@ glimpse(data_merged) From this dataset, I want to create a few new variables. I want to create a date format from the quarterly date. Currently it's in the format `"x1e_kw_2012"`. We'll extract the year and the quarter. Since there's 4 quarters in the 12 months, we'll multiply the quarter by 3 and then create a date format. We'll also calculate the percentage difference between the asking price and the price paid, and the percentage difference between the houses offered and the houses sold. I'll also translate the character string from Dutch to English using the `case_when()` function. +
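Before unfolding the full wrangling code below, here is a quick standalone illustration of that date conversion (a toy example of my own, using the same regex patterns that appear in the chunk):

``` r
# toy example: turn the quarterly label "x1e_kw_2012" into a proper date
# (quarter 1 becomes month 3, i.e. the last month of that quarter)
quarter_label <- "x1e_kw_2012"

quarter <- parse_number(str_extract(quarter_label, "x(.*?)e"))  # 1
year <- parse_number(str_extract(quarter_label, "kw_(.*)"))     # 2012

lubridate::make_date(year = year, month = quarter * 3, day = 1)
#> [1] "2012-03-01"
```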
+Show code + ``` r -data <- data_merged %>% - rename(type = type_woning) %>% +data <- data_merged |> + rename(type = type_woning) |> mutate(quarter = str_extract(date, pattern = "x(.*?)e"), quarter = parse_number(quarter), year = str_extract(date, pattern = "kw_(.*)"), @@ -185,10 +199,12 @@ data <- data_merged %>% str_detect(type,"2-onder-1-kap") ~ "Semi-detached house"), type = factor(type, levels = c("Apartments (pre-1970)","Apartments (post-1970)", "Terraced house","Corner house","Detached house", - "Semi-detached house","Total"))) %>% + "Semi-detached house","Total"))) |> glimpse() ``` +
+ Rows: 259 Columns: 16 $ type Terraced house, Terraced house, Terraced house,… @@ -210,17 +226,20 @@ data <- data_merged %>% The first thing that seems interesting to do is to plot the percentage difference between the asking price and the price paid. This will give us an indication of the trend in overpaying on different types of properties. I'll use a color palette and legend design I again shamelessly stole from Cédric Scherer ([@CedScherer](https://twitter.com/CedScherer)). I'll use a simple line graph to visualize the percentage overpay. +
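For clarity, the percentage overpay plotted below is (as far as I can tell from the variable names) simply the difference between the price paid and the asking price, relative to the asking price. A quick toy calculation of my own:

``` r
# hypothetical example: a home listed at 400 000 euro that sells for 414 000 euro
asking_price <- 400000
transaction_price <- 414000

diff_ask_paid <- transaction_price - asking_price  # 14 000 euro over asking
diff_ask_paid_perc <- diff_ask_paid / asking_price

diff_ask_paid_perc
#> [1] 0.035  -> 3.5% above asking
```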
+Show code + ``` r colors <- c("#019868","#9dd292","#ec0b88","#651eac","#e18a1e","#2b7de5") -data %>% - filter(type != "Total") %>% - group_by(type) %>% +data |> + filter(type != "Total") |> + group_by(type) |> mutate(n_total = sum(n_sold), type_label = str_glue("{type} (n={format(n_total, big.mark = \".\", decimal.mark = \",\")})"), - type_label = str_replace(type_label,"\\) \\(", ", ")) %>% - arrange(type) %>% - mutate(type_label = factor(type_label)) %>% + type_label = str_replace(type_label,"\\) \\(", ", ")) |> + arrange(type) |> + mutate(type_label = factor(type_label)) |> ggplot(aes(x = date, y = diff_ask_paid_perc, color = type_label)) + geom_hline(yintercept = 0, color = "grey30", size = 1) + geom_line(size = 1.2, alpha = 0.8, lineend = "round", key_glyph = "point") + @@ -240,47 +259,57 @@ data %>% legend.key = element_rect(fill = "transparent", color = "transparent")) ``` +
+ + Prior to 2014, most properties in Amsterdam were sold at about 6% below asking price; by the last quarter of 2020, that trend had changed to more than 3.5% above asking. The variance is obviously larger for detached and semi-detached houses, since those are both expensive and scarce in Amsterdam, and are thus also bought and sold less often. The table below shows the stats for the first quarter of 2021. Apart from semi-detached houses, most other properties were sold at 4% over asking or more. The types of properties most accessible to first-time buyers are obviously the apartments: either the older type in the inner city or the newer apartments in the suburbs. People buying pre-1970 apartments spent more than €28 000 over the asking price and people buying apartments built after 1970 spent on average more than €18 000 over the asking price. +
+Show code + ``` r -data %>% +data |> filter(type != "Total", - date == max(date)) %>% - select(type, diff_ask_paid_perc, diff_ask_paid, transaction_price, n_sold) %>% - arrange(-diff_ask_paid_perc) %>% - gt() %>% - cols_align(columns = "type", align = "left") %>% - fmt_percent(columns = "diff_ask_paid_perc") %>% - fmt_currency(columns = "diff_ask_paid", currency = "EUR", decimals = 0, sep_mark = " ") %>% - fmt_currency(columns = "transaction_price", currency = "EUR", decimals = 0, sep_mark = " ") %>% - fmt_number(columns = "n_sold", sep_mark = " ", drop_trailing_zeros = TRUE) %>% + date == max(date)) |> + select(type, diff_ask_paid_perc, diff_ask_paid, transaction_price, n_sold) |> + arrange(-diff_ask_paid_perc) |> + gt() |> + cols_align(columns = "type", align = "left") |> + fmt_percent(columns = "diff_ask_paid_perc") |> + fmt_currency(columns = "diff_ask_paid", currency = "EUR", decimals = 0, sep_mark = " ") |> + fmt_currency(columns = "transaction_price", currency = "EUR", decimals = 0, sep_mark = " ") |> + fmt_number(columns = "n_sold", sep_mark = " ", drop_trailing_zeros = TRUE) |> cols_label( type = html("
Property type"),
    diff_ask_paid_perc = html("Percentage overpay"),
    diff_ask_paid = html("Mean overpay"),
    transaction_price = html("Mean transaction price"),
    n_sold = html("Number of properties<br>sold in period
") - ) %>% + ) |> tab_header(title = html("

Difference between<br>asking price and price paid
"), - subtitle = html("

Data from the first quarter of 2021
")) %>% - tab_source_note(source_note = md("_**Data**: MVA_")) %>% - tab_options(table.background.color = "#EBEBEB") %>% + subtitle = html("

Data from the first quarter of 2021
")) |> + tab_source_note(source_note = md("_**Data**: MVA_")) |> + tab_options(table.background.color = "#EBEBEB") |> gtsave("overpay-table.png", expand = 0) ``` +
+ + What contributed to this price increase? Simple supply and demand plays a part. The figure below shows the "tightness index" (Dutch: "krapte indicator") over time. This number represents the number of choices a potential buyer has. It is calculated per quarter by dividing the number of properties on the market halfway through the quarter by the number of transactions over the entire quarter. This number is then multiplied by 3 to correct for the fact that it is calculated per quarter instead of per month. When the "tightness index" is below 5, it's considered a "seller's market" (source: [NVM](https://www.nvm.nl/wonen/marktinformatie/)). A larger number is good for buyers, a smaller number is good for sellers. In the first quarter of 2021, the number was exactly 2. It varies a bit per property type, but for apartments specifically it hasn't been higher than 3 since 2016. This means that first-time buyers often have a choice of no more than 2 or 3 apartments per month. I tried to find some data on how many people are currently interested in buying a home in Amsterdam, but I couldn't find anything solid. There's only anecdotal evidence from viewings, where the number of people interested in viewing a property increased to 80 in 2020, compared to about 55 a year earlier (source: [Parool](https://www.parool.nl/amsterdam/huizenmarkt-is-gekkenhuis-kopers-profiteren-voor-1-april-nog-van-voordelige-belastingregel~bcadc579/)). +
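As a toy illustration of that definition (my own made-up numbers, not MVA data): with 1 200 properties on offer halfway through a quarter and 1 800 transactions over that quarter, the index works out as follows.

``` r
# hypothetical quarter, purely to illustrate the formula described above
n_on_offer_mid_quarter <- 1200
n_transactions_quarter <- 1800

tightness_index <- 3 * n_on_offer_mid_quarter / n_transactions_quarter
tightness_index
#> [1] 2

# below 5, so by the NVM rule of thumb this would count as a seller's market
```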
+Show code + ``` r -data %>% - filter(type != "Total") %>% - group_by(type) %>% +data |> + filter(type != "Total") |> + group_by(type) |> mutate(n_total = sum(n_sold), type_label = str_glue("{type} (n={format(n_total, big.mark = \".\", decimal.mark = \",\")})"), - type_label = str_replace(type_label,"\\) \\(", ", ")) %>% - arrange(type) %>% - mutate(type_label = factor(type_label)) %>% + type_label = str_replace(type_label,"\\) \\(", ", ")) |> + arrange(type) |> + mutate(type_label = factor(type_label)) |> ggplot(aes(x = date, y = tightness_index, color = type_label)) + geom_rect(data = tibble(), aes(xmin = as.Date(-Inf), xmax = as.Date(Inf), ymin = c(0,5,10), ymax = c(5,10,Inf), @@ -311,6 +340,8 @@ data %>% legend.key = element_rect(fill = "transparent")) ``` +
+ So, there's a lot of competition among buyers, and people looking to sell their houses can expect to be paid more than they anticipated. Dozens of buyers compete for the same properties, driving up the price. The figure below shows the percentage of properties sold compared to the number of properties offered. It's clear that after the 2008 housing bubble crisis, the housing market was still recovering in 2012 and 2013. However, since 2016, more apartments were sold than were put on the market. This means that the number of properties available for the growing number of people wanting to move to Amsterdam is decreasing. This decreases supply in a time with increasing demand, thus pushing the prices higher twice over. @@ -318,10 +349,12 @@ So, there's a lot of competition among buyers, and people looking to sell their

A new phenomenon that entered the scene a little while ago may indicate how skewed the market is. People wanting to buy a house in a certain neighborhood will put notes and letters in the mailboxes of people living in that area saying "Nice house! Are you interested in selling it to me?". This is no longer an uncommon strategy for finding a house. Some people living in popular neighborhoods are inundated with notes from agencies facilitating these practices. See also a news item by RTL.

+
+Show code ``` r -data %>% - filter(type != "Total") %>% +data |> + filter(type != "Total") |> ggplot(aes(x = date, y = perc_sold, fill = type)) + geom_col(key_glyph = "point") + geom_hline(yintercept = 1, linetype = "dashed", color = "grey20") + @@ -345,26 +378,31 @@ data %>% facet_wrap(~ type, strip.position = "top", scales = "free_x") ``` +
+ + This adds fuel to the fire. I guess I'm trying to show that there are a number of factors stacked against young buyers. Any money you have saved up you need to spend to outbid the massive competition. The massive competition means buyers only have a handful of feasible options. Again, because of the massive competition, the chance they actually get to buy one of those options is low. The number of options is not steady either: the number of properties sold has overtaken the number of properties being put on the market. There is a dwindling reserve of properties. Combine this with more and more young people wanting to move to the bigger cities and you have a perfect cocktail for a congested housing market. Building new properties can counteract this, but over the last few years the Netherlands has actually slowed the building of new properties. +
+Show code + ``` r decades <- tibble( x1 = seq(1920,2020,10), x2 = x1 + 10 -) %>% - slice(seq(1,1e2,2)) %>% +) |> + slice(seq(1,1e2,2)) |> filter(x2 < 2030) -hist_data_homes <- cbsodataR::cbs_get_data("82235NED") %>% - janitor::clean_names() %>% - rename(stock_end = eindstand_voorraad_8) %>% +hist_data_homes <- cbsodataR::cbs_get_data("82235NED") |> + janitor::clean_names() |> + rename(stock_end = eindstand_voorraad_8) |> mutate(year = parse_number(perioden), stock_end = stock_end * 1e3, diff = stock_end - lag(stock_end)) -total_hist_plot <- hist_data_homes %>% +total_hist_plot <- hist_data_homes |> ggplot(aes(x = year, y = stock_end)) + geom_rect(data = tibble(), aes(xmin = 1940.01, xmax = 1945, ymin = -Inf, ymax = Inf), fill = "red", alpha = 0.2, inherit.aes = FALSE) + @@ -379,7 +417,7 @@ total_hist_plot <- hist_data_homes %>% scale_y_continuous(labels = scales::label_number()) + coord_cartesian(clip = "off") -diff_hist_plot <- hist_data_homes %>% +diff_hist_plot <- hist_data_homes |> ggplot(aes(x = year, y = diff)) + geom_hline(yintercept = 0.5, color = "grey30", size = 1) + geom_rect(data = tibble(), aes(xmin = 1940.01, xmax = 1945, ymin = -Inf, ymax = Inf), @@ -403,6 +441,8 @@ total_hist_plot / diff_hist_plot + theme(plot.title = element_textbox(size = 20)) ``` +
+ The figure displays data from from the CBS through the `{cbsodataR}` package. It shows an increase in the number of homes after the second World War in the 50s and the 60s. Since around 1975 there's been a decline in the net number of new houses added year-over-year. This while demand hasn't decreased in the same period. @@ -411,6 +451,9 @@ It's also important to mention that between the 60s and 70s, the Netherlands sta But that's not all, there's a few other features that contribute to the gridlock. See, not only young people are on the market to buy apartments in Amsterdam. There's a thriving market for investors looking to take advantage of the rising prices in the Amsterdam housing market (source: [Algemeen Dagblad](https://www.ad.nl/wonen/beleggers-verwerven-massaal-koophuizen-voor-verhuur-in-amsterdam~afedc50c6/)). According to the Dutch central bank, about 1 in 5 properties are sold to investors, who are mostly looking to convert it to a rental property or flip it for a profit. I couldn't find the data the Dutch central bank relied on, but I found something else. The Central Bureau for Statistics collects data on the number of "mutations" among properties. The "mutation" in this case refers to the change of purpose of a property, e.g. if a house meant for sale is bought and then transformed to a rental property by either private individuals or corporations, or vice versa. I collected this data from the governmental yearly report on the housing market of 2020 ("*Staat van de Woningmarkt - Jaarrapportage 2020*", [link](https://www.rijksoverheid.nl/documenten/rapporten/2020/06/15/staat-van-de-woningmarkt-jaarrapportage-2020)). Instead of per quarter, this data is reported per year. Unfortunately, the data on housing mutations in the report (from 2020 mind you) only goes until 2017. It's important to note that these numbers are country-wide, not specific for Amsterdam. That means there could be some other factors in play. Many of these trends are present across the country, but they're massively amplified in the larger cities. The data was contained in a pdf that wasn't easily machine-readable, so I had to manually copy the numbers into a tibble, which was great... +
+Show code + ``` r house_mutations_in <- tribble( ~year, ~buy_to_rent_corp, ~buy_to_rent_other, ~rent_corp_to_buy, ~rent_corp_to_rent_other, ~rent_other_to_buy, ~rent_other_to_rent_corp, @@ -422,28 +465,33 @@ house_mutations_in <- tribble( 2017, 1600, 98900, 6400, 11000, 7300, 9000 ) -house_mutations <- house_mutations_in %>% - pivot_longer(cols = -year, names_to = "mutation", values_to = "n_mutations") %>% +house_mutations <- house_mutations_in |> + pivot_longer(cols = -year, names_to = "mutation", values_to = "n_mutations") |> mutate(from = str_extract(mutation, "(.*?)_to"), from = str_remove_all(from, "_to"), to = str_extract(mutation, "to_(.*?)$"), - to = str_remove_all(to, "to_")) %>% - group_by(year) %>% + to = str_remove_all(to, "to_")) |> + group_by(year) |> mutate(total_mutations = sum(n_mutations), perc_mutations = n_mutations / total_mutations, across(c(from,to), ~ case_when(str_detect(.x,"buy") ~ "buy", str_detect(.x,"rent_corp") ~ "rent (corporation)", str_detect(.x,"rent_other") ~ "rent (other)")), - mutation_label = str_glue("From {from} to {to}")) %>% + mutation_label = str_glue("From {from} to {to}")) |> ungroup() ``` +
+ + So not every year has the same number of "mutations" (transformations of purpose). That's why I thought I'd display this data in two different plots, one with the raw values per year, and one with the percentage-wise breakdown per year. Now obviously, first-time buyers don't care about the percentage of houses being taken off the market and transformed into rental properties, they care about the total number. However, I do think the percentage-wise plot shows the trend a bit more clearly. +
+Show code + ``` r -mutations_n_plot <- house_mutations %>% - arrange(from,to) %>% - mutate(mutation_label = fct_inorder(mutation_label)) %>% +mutations_n_plot <- house_mutations |> + arrange(from,to) |> + mutate(mutation_label = fct_inorder(mutation_label)) |> ggplot(aes(x = year, y = n_mutations, alluvium = mutation_label, fill = mutation_label)) + geom_point(shape = 21, color = "transparent", size = NA) + ggalluvial::geom_flow(width = 0, show.legend = FALSE) + @@ -458,9 +506,9 @@ mutations_n_plot <- house_mutations %>% override.aes = list(color = "transparent", size = 6, alpha = 1, stroke = 0))) + theme(legend.key = element_rect(fill = "transparent")) -mutations_perc_plot <- house_mutations %>% - arrange(from,to) %>% - mutate(mutation_label = fct_inorder(mutation_label)) %>% +mutations_perc_plot <- house_mutations |> + arrange(from,to) |> + mutate(mutation_label = fct_inorder(mutation_label)) |> ggplot(aes(x = year, y = perc_mutations, alluvium = mutation_label, fill = mutation_label)) + geom_point(shape = 21, color = "transparent", size = NA) + ggalluvial::geom_flow(width = 0, show.legend = FALSE) + @@ -484,28 +532,33 @@ mutations_n_plot / mutations_perc_plot + theme(plot.title = element_textbox(size = 20)) ``` +
+ The total number of mutations has hovered around 150 000 since 2012. There was a bump in 2014 when about 40 000 properties were changed from "rent (other)" to "rent (corporation)". The label "rent (other)" includes mostly private rentals or government-run rental properties, I think. I suspect that in 2014 one of those government-run rental agencies was privatized, causing the bump. The number of mutations from type to type has been fairly consistent until 2016. After 2016 there was a massive drop in the number of private rental properties being put up for sale, and a massive increase in the number of properties meant for sale being added to the private rental market. My guess is that this increase represents investors getting more and more involved in buying up properties and renting them out. Whereas in e.g. 2012, there was about an equal number of properties exchanged between the private rental market and the sale market, this balance is now heavily skewed. Relating this to the previous figures, where we showed that the number of houses sold exceeds the number of houses being put on the market, this also means that among those houses sold, a large number isn't kept in the sale market but rather added to the rental market. We can look at the net number of houses added to the rental market by adding up the different mutations. Again, prior to 2016, there were slightly more houses added to the sale market than the rental market, but after 2016, this number skyrocketed in favor of the rental market, when tens of thousands of properties were withdrawn from the sale market. Unfortunately I couldn't find any data since then to see if 2017 happened to be an outlier and if this number corrected to a more reasonable number since then. +
+Show code + ``` r -net_house_mutations <- house_mutations %>% +net_house_mutations <- house_mutations |> mutate(across(c(from,to), ~ case_when(str_detect(.x, "rent") ~ "rent", - str_detect(.x, "buy") ~ "buy"))) %>% - group_by(year) %>% - filter(from != to) %>% - mutate(total_mutations = sum(n_mutations)) %>% - group_by(year, from, to) %>% + str_detect(.x, "buy") ~ "buy"))) |> + group_by(year) |> + filter(from != to) |> + mutate(total_mutations = sum(n_mutations)) |> + group_by(year, from, to) |> summarise(n_mutations = sum(n_mutations), - total_mutations = first(total_mutations)) %>% + total_mutations = first(total_mutations)) |> pivot_wider(id_cols = c(year,total_mutations), names_from = c(from,to), values_from = n_mutations, - names_sep = "_to_") %>% + names_sep = "_to_") |> mutate(net_buy_to_rent = buy_to_rent - rent_to_buy, perc_buy_to_rent = buy_to_rent / total_mutations) -net_mutation_plot <- net_house_mutations %>% +net_mutation_plot <- net_house_mutations |> ggplot(aes(x = year, y = net_buy_to_rent)) + geom_hline(yintercept = 0, color = "grey30", size = 1) + #geom_ribbon(aes(ymin = 0, ymax = net_buy_to_rent), fill = "grey30", alpha = 0.2) + @@ -522,7 +575,7 @@ net_mutation_plot <- net_house_mutations %>% scale_y_continuous(labels = scales::label_number_auto(), limits = c(-3e4, NA), n.breaks = 5) + theme(axis.title.y = element_textbox(width = grid::unit(3.5,"in"))) -perc_mutation_plot <- net_house_mutations %>% +perc_mutation_plot <- net_house_mutations |> ggplot(aes(x = year, y = perc_buy_to_rent)) + geom_hline(yintercept = 0.5, color = "grey30", size = 1) + geom_line(color = "darkred", size = 1.5, lineend = "round") + @@ -539,59 +592,74 @@ net_mutation_plot + perc_mutation_plot + theme(plot.title = element_textbox(size = 20)) ``` +
+ -So in 2017 nearly 90 000 houses were mutated from sale to rental properties. In that same year, about 62 000 new homes were built (source: [CBS](https://www.cbs.nl/nl-nl/nieuws/2018/04/hoogste-aantal-nieuwbouwwoningen-in-acht-jaar)). Not all of those 62 000 homes are meant for sale, a proportion are added to the rental market, but even if all those new homes were meant for sale, the sale market still shrunk due to the (net) \~90 000 properties that were transformed and in that way removed from the sale market. +So in 2017 nearly 90 000 houses were mutated from sale to rental properties. In that same year, about 62 000 new homes were built (source: [CBS](https://www.cbs.nl/nl-nl/nieuws/2018/04/hoogste-aantal-nieuwbouwwoningen-in-acht-jaar)). Not all of those 62 000 homes are meant for sale, a proportion are added to the rental market, but even if all those new homes were meant for sale, the sale market still shrunk due to the (net) ~90 000 properties that were transformed and in that way removed from the sale market. + +
+Show code ``` r -house_mutations %>% - filter(year == max(year)) %>% - select(from, to, n_mutations, perc_mutations) %>% - mutate(across(c(from,to), ~ str_to_sentence(.x))) %>% - arrange(-perc_mutations) %>% - gt() %>% - fmt_number(columns = "n_mutations", sep_mark = " ", drop_trailing_zeros = TRUE) %>% - fmt_percent(columns = "perc_mutations") %>% +house_mutations |> + filter(year == max(year)) |> + select(from, to, n_mutations, perc_mutations) |> + mutate(across(c(from,to), ~ str_to_sentence(.x))) |> + arrange(-perc_mutations) |> + gt() |> + fmt_number(columns = "n_mutations", sep_mark = " ", drop_trailing_zeros = TRUE) |> + fmt_percent(columns = "perc_mutations") |> grand_summary_rows( columns = "n_mutations", fns = list(total = "sum"), formatter = fmt_number, sep_mark = " ", - drop_trailing_zeros = TRUE) %>% + drop_trailing_zeros = TRUE) |> grand_summary_rows( columns = "perc_mutations", fns = list(total = "sum"), formatter = fmt_percent, - missing_text = "") %>% + missing_text = "") |> cols_label( from = html("
From"),
    to = html("To"),
    n_mutations = html("Number of mutations"),
    perc_mutations = html("Percentage of total mutations
") - ) %>% + ) |> tab_header(title = html("

Mutations in the Dutch housing market
"), - subtitle = html("

Data from the year 2017
")) %>% - tab_source_note(source_note = md("_**Data**: CBS_")) %>% + subtitle = html("

Data from the year 2017
")) |> + tab_source_note(source_note = md("_**Data**: CBS_")) |> tab_options(table.background.color = "#EBEBEB", - grand_summary_row.text_transform = "capitalize") %>% + grand_summary_row.text_transform = "capitalize") |> gtsave("mutations-table.png", expand = 0) ``` +
+ + Warning: Since gt v0.9.0, the `formatter` argument (and associated `...`) has been + deprecated. + • Please use the `fmt` argument to provide formatting directives. + This warning is displayed once every 8 hours. + So what's the result of all these phenomena? The figure below shows the housing price index for Amsterdam. The first quarter of 2012 was taken as the reference point (100%). In the past 8(!) years, properties had on average nearly doubled in price. +
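A quick note on how to read the index below (toy numbers of my own): each quarter's average asking price is divided by the asking price in the first quarter of 2012, so an index of 2.07 corresponds to a 107% increase.

``` r
# hypothetical prices, purely to illustrate the indexing
asking_price_2012q1 <- 250000
asking_price_2021q1 <- 517500

asking_index <- asking_price_2021q1 / asking_price_2012q1
asking_index
#> [1] 2.07

(asking_index - 1) * 100  # percentage increase relative to the reference quarter
#> [1] 107
```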
+Show code + ``` r -asking_start <- data %>% +asking_start <- data |> filter(type != "Total", - date == min(date)) %>% - rename(asking_start = asking_price) %>% - select(type,asking_start) + date == min(date)) |> + rename(asking_start = asking_price) |> + select(type, asking_start) -data_asking_index <- data %>% - filter(type != "Total") %>% - left_join(asking_start) %>% +data_asking_index <- data |> + filter(type != "Total") |> + left_join(asking_start) |> mutate(asking_index = asking_price / asking_start) -data_asking_index %>% +data_asking_index |> ggplot(aes(x = date, y = asking_index, color = type, group = type)) + geom_line(size = 2, alpha = 0.15, lineend = "round", show.legend = FALSE) + geom_smooth(se = FALSE, show.legend = FALSE) + @@ -612,6 +680,8 @@ data_asking_index %>% legend.key = element_rect(fill = "transparent", color = "transparent")) ``` +
+ Adjusted for the number of properties offered, the average asking price in Amsterdam has increased by 107%. That means that prices since 2012 have (on average) doubled. For apartments, this increase was 110%. Inflation on the other hand was only about 8.5% since 2012. So young people wanting to buy a house in Amsterdam need to bring nearly twice as much money to the table as they did just a decade ago. The money they saved and the salaries they earned have by no means kept up with the increasing price though. @@ -626,26 +696,4 @@ I cannot know this for sure, because the data I presented here doesn't contain t Perhaps if we implement a few of these strategies we can make the housing market a little fairer and perhaps ensure that future generations will have the same opportunities current home-owners have. -
**EDIT (2021-12-03)**: Reintroduce the code to wrangle the data, generate the figures and tables etc. to the main document and put the code inside unfoldable tags to make it easier to see the code associated with a particular step while maintaining legibility. - - diff --git a/content/blog/2021-amsterdam-housing-market/index.qmd b/content/blog/2021-amsterdam-housing-market/index.qmd index 3964e22..bb897de 100644 --- a/content/blog/2021-amsterdam-housing-market/index.qmd +++ b/content/blog/2021-amsterdam-housing-market/index.qmd @@ -1,26 +1,25 @@ --- title: Visualizing the State of the Amsterdam Housing Market -author: Daniel Roelfs -date: "2021-06-20" +date: 2021-06-20 +description: Visualizing the State of the Amsterdam Housing Market slug: visualizing-the-state-of-the-amsterdam-housing-market categories: - society tags: - ggplot - society -description: "Visualizing the State of the Amsterdam Housing Market" -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 - fig.align: center + fig.align: left fig.show: hold results: hold - out.width: 80% dev.args: list(bg = "#EBEBEB") --- -```{css echo=FALSE} +```{css} +#| label: style +#| echo: FALSE + body { background-color: #EBEBEB; /* was #EFEFEF */ } @@ -51,6 +50,8 @@ This will be a fairly basic data analysis and visualization post, I can't claim ```{r} #| label: pkgs #| message: false +#| code-fold: true +#| code-summary: "Show code" library(tidyverse) library(ggtext) @@ -88,57 +89,59 @@ You can look at the code I used to load and merge the files. It's a bit of a mes ```{r} #| label: load-data #| eval: false +#| code-fold: true +#| code-summary: "Show code" -asking_price <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 2) %>% - janitor::clean_names() %>% +asking_price <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 2) |> + janitor::clean_names() |> mutate(type_woning = as_factor(type_woning), across(where(is.double), ~ .x * 1e3), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) %>% + across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> pivot_longer(starts_with("x"), names_to = "date", values_to = "asking_price") -transaction_price <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 3) %>% - janitor::clean_names() %>% +transaction_price <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 3) |> + janitor::clean_names() |> mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) %>% + across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> pivot_longer(starts_with("x"), names_to = "date", values_to = "transaction_price") -price_per_m2 <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 4) %>% - janitor::clean_names() %>% +price_per_m2 <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 4) |> + janitor::clean_names() |> mutate(type_woning = as_factor(type_woning), across(where(is.double), ~ .x * 1e3), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) %>% + across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> pivot_longer(starts_with("x"), names_to = "date", values_to = "price_per_m2") -n_offered <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 5) %>% - janitor::clean_names() %>% +n_offered <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 5) |> + janitor::clean_names() |> mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_remove_all(.x, 
fixed(" "))))) %>% + across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> pivot_longer(starts_with("x"), names_to = "date", values_to = "n_offered") -n_sold <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 6) %>% - janitor::clean_names() %>% +n_sold <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 6) |> + janitor::clean_names() |> mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) %>% + across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> pivot_longer(starts_with("x"), names_to = "date", values_to = "n_sold") -mortgage_months <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 7) %>% - janitor::clean_names() %>% +mortgage_months <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 7) |> + janitor::clean_names() |> mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) %>% + across(where(is.character), ~ parse_number(str_remove_all(.x, fixed(" "))))) |> pivot_longer(starts_with("x"), names_to = "date", values_to = "mortgage_months") -tightness_index <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 8) %>% - janitor::clean_names() %>% +tightness_index <- readxl::read_xlsx("MVA_kwartaalcijfers.xlsx", sheet = 8) |> + janitor::clean_names() |> mutate(type_woning = as_factor(type_woning), - across(where(is.character), ~ parse_number(str_replace_all(.x, ",", ".")))) %>% + across(where(is.character), ~ parse_number(str_replace_all(.x, ",", ".")))) |> pivot_longer(starts_with("x"), names_to = "date", values_to = "tightness_index") -data_merged <- inner_join(asking_price, transaction_price) %>% - inner_join(price_per_m2) %>% - inner_join(n_offered) %>% - inner_join(n_sold) %>% - inner_join(mortgage_months) %>% - inner_join(tightness_index) %>% +data_merged <- inner_join(asking_price, transaction_price) |> + inner_join(price_per_m2) |> + inner_join(n_offered) |> + inner_join(n_sold) |> + inner_join(mortgage_months) |> + inner_join(tightness_index) |> mutate(asking_price = ifelse(asking_price < 1e5, asking_price * 1e3, asking_price), transaction_price = ifelse(transaction_price < 1e5, transaction_price * 1e3, transaction_price), price_per_m2 = ifelse(price_per_m2 > 1e4, price_per_m2 / 1e3, price_per_m2)) @@ -150,6 +153,8 @@ Let's have a look at the dataset. ```{r} #| label: load-glimpse-data +#| code-fold: true +#| code-summary: "Show code" data_merged <- read_rds("data_merged.rds") @@ -160,9 +165,11 @@ From this dataset, I want to create a few new variables. 
I want to create a date ```{r} #| label: clean-data +#| code-fold: true +#| code-summary: "Show code" -data <- data_merged %>% - rename(type = type_woning) %>% +data <- data_merged |> + rename(type = type_woning) |> mutate(quarter = str_extract(date, pattern = "x(.*?)e"), quarter = parse_number(quarter), year = str_extract(date, pattern = "kw_(.*)"), @@ -182,7 +189,7 @@ data <- data_merged %>% str_detect(type,"2-onder-1-kap") ~ "Semi-detached house"), type = factor(type, levels = c("Apartments (pre-1970)","Apartments (post-1970)", "Terraced house","Corner house","Detached house", - "Semi-detached house","Total"))) %>% + "Semi-detached house","Total"))) |> glimpse() ``` @@ -191,17 +198,19 @@ The first thing that seems interesting to do is to plot the percentage differenc ```{r} #| label: plot-overpay #| warning: false +#| code-fold: true +#| code-summary: "Show code" colors <- c("#019868","#9dd292","#ec0b88","#651eac","#e18a1e","#2b7de5") -data %>% - filter(type != "Total") %>% - group_by(type) %>% +data |> + filter(type != "Total") |> + group_by(type) |> mutate(n_total = sum(n_sold), type_label = str_glue("{type} (n={format(n_total, big.mark = \".\", decimal.mark = \",\")})"), - type_label = str_replace(type_label,"\\) \\(", ", ")) %>% - arrange(type) %>% - mutate(type_label = factor(type_label)) %>% + type_label = str_replace(type_label,"\\) \\(", ", ")) |> + arrange(type) |> + mutate(type_label = factor(type_label)) |> ggplot(aes(x = date, y = diff_ask_paid_perc, color = type_label)) + geom_hline(yintercept = 0, color = "grey30", size = 1) + geom_line(size = 1.2, alpha = 0.8, lineend = "round", key_glyph = "point") + @@ -226,48 +235,52 @@ Prior to 2014, most properties in Amsterdam were sold at about 6% below asking p ```{r} #| label: overpay-table #| out-width: 60% +#| code-fold: true +#| code-summary: "Show code" -data %>% +data |> filter(type != "Total", - date == max(date)) %>% - select(type, diff_ask_paid_perc, diff_ask_paid, transaction_price, n_sold) %>% - arrange(-diff_ask_paid_perc) %>% - gt() %>% - cols_align(columns = "type", align = "left") %>% - fmt_percent(columns = "diff_ask_paid_perc") %>% - fmt_currency(columns = "diff_ask_paid", currency = "EUR", decimals = 0, sep_mark = " ") %>% - fmt_currency(columns = "transaction_price", currency = "EUR", decimals = 0, sep_mark = " ") %>% - fmt_number(columns = "n_sold", sep_mark = " ", drop_trailing_zeros = TRUE) %>% + date == max(date)) |> + select(type, diff_ask_paid_perc, diff_ask_paid, transaction_price, n_sold) |> + arrange(-diff_ask_paid_perc) |> + gt() |> + cols_align(columns = "type", align = "left") |> + fmt_percent(columns = "diff_ask_paid_perc") |> + fmt_currency(columns = "diff_ask_paid", currency = "EUR", decimals = 0, sep_mark = " ") |> + fmt_currency(columns = "transaction_price", currency = "EUR", decimals = 0, sep_mark = " ") |> + fmt_number(columns = "n_sold", sep_mark = " ", drop_trailing_zeros = TRUE) |> cols_label( type = html("
Property type"), diff_ask_paid_perc = html("Percentage overpay"), diff_ask_paid = html("Mean overpay"), transaction_price = html("Mean transaction price"), n_sold = html("Number of properties sold in period") - ) %>% + ) |> tab_header(title = html("Difference between asking price and price paid"), - subtitle = html("Data from the first quarter of 2021")) %>% - tab_source_note(source_note = md("_**Data**: MVA_")) %>% - tab_options(table.background.color = "#EBEBEB") %>% + subtitle = html("Data from the first quarter of 2021
")) |> + tab_source_note(source_note = md("_**Data**: MVA_")) |> + tab_options(table.background.color = "#EBEBEB") |> gtsave("overpay-table.png", expand = 0) ``` -What contributed to this price increase? A simple supply-and-demand plays a part. The figure below shows the "tightness index" (Dutch: "krapte indicator") over time. This number represents the number of choices a potential buyer has. This number is calculated per quarter by dividing the number of properties on the market halfway through the quarter divided by the number of transactions over the entire quarter. This number is then multiplied by 3 to correct for the fact that the number is calculated per quarter instead of per month. When the "tightness index" is below 5, it's considered a "sellers market" (source: [NVM](https://www.nvm.nl/wonen/marktinformatie/)). A larger number is good for buyers, a smaller number is good for sellers. In the first quarter of 2021, the number was exactly `r data %>% filter(type == "Total", date == max(date)) %>% pull(tightness_index)`. It varies a bit per property type, but for apartments specifically it hasn't been higher than 3 since 2016. This means that first-time buyers often don't have a choice between more than 2 or 3 apartments per month. I tried to find some data on how many people currently are interested in buying a home in Amsterdam, but I couldn't find anything solid. There's only anecdotal evidence from viewings where within a year, the number of people interested in viewing a property has increased to 80 in 2020, compared to about 55 a year earlier (source: [Parool](https://www.parool.nl/amsterdam/huizenmarkt-is-gekkenhuis-kopers-profiteren-voor-1-april-nog-van-voordelige-belastingregel~bcadc579/)). +What contributed to this price increase? A simple supply-and-demand plays a part. The figure below shows the "tightness index" (Dutch: "krapte indicator") over time. This number represents the number of choices a potential buyer has. This number is calculated per quarter by dividing the number of properties on the market halfway through the quarter divided by the number of transactions over the entire quarter. This number is then multiplied by 3 to correct for the fact that the number is calculated per quarter instead of per month. When the "tightness index" is below 5, it's considered a "sellers market" (source: [NVM](https://www.nvm.nl/wonen/marktinformatie/)). A larger number is good for buyers, a smaller number is good for sellers. In the first quarter of 2021, the number was exactly `r data |> filter(type == "Total", date == max(date)) |> pull(tightness_index)`. It varies a bit per property type, but for apartments specifically it hasn't been higher than 3 since 2016. This means that first-time buyers often don't have a choice between more than 2 or 3 apartments per month. I tried to find some data on how many people currently are interested in buying a home in Amsterdam, but I couldn't find anything solid. There's only anecdotal evidence from viewings where within a year, the number of people interested in viewing a property has increased to 80 in 2020, compared to about 55 a year earlier (source: [Parool](https://www.parool.nl/amsterdam/huizenmarkt-is-gekkenhuis-kopers-profiteren-voor-1-april-nog-van-voordelige-belastingregel~bcadc579/)). 
```{r} #| label: plot-tightness #| warning: false #| fig-width: 8 #| fig-height: 6 +#| code-fold: true +#| code-summary: "Show code" -data %>% - filter(type != "Total") %>% - group_by(type) %>% +data |> + filter(type != "Total") |> + group_by(type) |> mutate(n_total = sum(n_sold), type_label = str_glue("{type} (n={format(n_total, big.mark = \".\", decimal.mark = \",\")})"), - type_label = str_replace(type_label,"\\) \\(", ", ")) %>% - arrange(type) %>% - mutate(type_label = factor(type_label)) %>% + type_label = str_replace(type_label,"\\) \\(", ", ")) |> + arrange(type) |> + mutate(type_label = factor(type_label)) |> ggplot(aes(x = date, y = tightness_index, color = type_label)) + geom_rect(data = tibble(), aes(xmin = as.Date(-Inf), xmax = as.Date(Inf), ymin = c(0,5,10), ymax = c(5,10,Inf), @@ -307,9 +320,11 @@ So, there's a lot of competition among buyers, and people looking to sell their #| warning: false #| fig-width: 10 #| fig-height: 7 +#| code-fold: true +#| code-summary: "Show code" -data %>% - filter(type != "Total") %>% +data |> + filter(type != "Total") |> ggplot(aes(x = date, y = perc_sold, fill = type)) + geom_col(key_glyph = "point") + geom_hline(yintercept = 1, linetype = "dashed", color = "grey20") + @@ -341,22 +356,24 @@ This adds fuel to the fire. I guess I'm trying to show that there are a number o #| warning: false #| fig-width: 10 #| fig-height: 8 +#| code-fold: true +#| code-summary: "Show code" decades <- tibble( x1 = seq(1920,2020,10), x2 = x1 + 10 -) %>% - slice(seq(1,1e2,2)) %>% +) |> + slice(seq(1,1e2,2)) |> filter(x2 < 2030) -hist_data_homes <- cbsodataR::cbs_get_data("82235NED") %>% - janitor::clean_names() %>% - rename(stock_end = eindstand_voorraad_8) %>% +hist_data_homes <- cbsodataR::cbs_get_data("82235NED") |> + janitor::clean_names() |> + rename(stock_end = eindstand_voorraad_8) |> mutate(year = parse_number(perioden), stock_end = stock_end * 1e3, diff = stock_end - lag(stock_end)) -total_hist_plot <- hist_data_homes %>% +total_hist_plot <- hist_data_homes |> ggplot(aes(x = year, y = stock_end)) + geom_rect(data = tibble(), aes(xmin = 1940.01, xmax = 1945, ymin = -Inf, ymax = Inf), fill = "red", alpha = 0.2, inherit.aes = FALSE) + @@ -371,7 +388,7 @@ total_hist_plot <- hist_data_homes %>% scale_y_continuous(labels = scales::label_number()) + coord_cartesian(clip = "off") -diff_hist_plot <- hist_data_homes %>% +diff_hist_plot <- hist_data_homes |> ggplot(aes(x = year, y = diff)) + geom_hline(yintercept = 0.5, color = "grey30", size = 1) + geom_rect(data = tibble(), aes(xmin = 1940.01, xmax = 1945, ymin = -Inf, ymax = Inf), @@ -404,6 +421,8 @@ But that's not all, there's a few other features that contribute to the gridlock ```{r} #| label: load-housing-mutations +#| code-fold: true +#| code-summary: "Show code" house_mutations_in <- tribble( ~year, ~buy_to_rent_corp, ~buy_to_rent_other, ~rent_corp_to_buy, ~rent_corp_to_rent_other, ~rent_other_to_buy, ~rent_other_to_rent_corp, @@ -415,19 +434,19 @@ house_mutations_in <- tribble( 2017, 1600, 98900, 6400, 11000, 7300, 9000 ) -house_mutations <- house_mutations_in %>% - pivot_longer(cols = -year, names_to = "mutation", values_to = "n_mutations") %>% +house_mutations <- house_mutations_in |> + pivot_longer(cols = -year, names_to = "mutation", values_to = "n_mutations") |> mutate(from = str_extract(mutation, "(.*?)_to"), from = str_remove_all(from, "_to"), to = str_extract(mutation, "to_(.*?)$"), - to = str_remove_all(to, "to_")) %>% - group_by(year) %>% + to = str_remove_all(to, "to_")) |> + group_by(year) |> 
mutate(total_mutations = sum(n_mutations), perc_mutations = n_mutations / total_mutations, across(c(from,to), ~ case_when(str_detect(.x,"buy") ~ "buy", str_detect(.x,"rent_corp") ~ "rent (corporation)", str_detect(.x,"rent_other") ~ "rent (other)")), - mutation_label = str_glue("From {from} to {to}")) %>% + mutation_label = str_glue("From {from} to {to}")) |> ungroup() ``` @@ -438,10 +457,12 @@ So not every year there's the same number of "mutations" (transformations of pur #| warning: false #| fig-width: 10 #| fig-height: 9 +#| code-fold: true +#| code-summary: "Show code" -mutations_n_plot <- house_mutations %>% - arrange(from,to) %>% - mutate(mutation_label = fct_inorder(mutation_label)) %>% +mutations_n_plot <- house_mutations |> + arrange(from,to) |> + mutate(mutation_label = fct_inorder(mutation_label)) |> ggplot(aes(x = year, y = n_mutations, alluvium = mutation_label, fill = mutation_label)) + geom_point(shape = 21, color = "transparent", size = NA) + ggalluvial::geom_flow(width = 0, show.legend = FALSE) + @@ -456,9 +477,9 @@ mutations_n_plot <- house_mutations %>% override.aes = list(color = "transparent", size = 6, alpha = 1, stroke = 0))) + theme(legend.key = element_rect(fill = "transparent")) -mutations_perc_plot <- house_mutations %>% - arrange(from,to) %>% - mutate(mutation_label = fct_inorder(mutation_label)) %>% +mutations_perc_plot <- house_mutations |> + arrange(from,to) |> + mutate(mutation_label = fct_inorder(mutation_label)) |> ggplot(aes(x = year, y = perc_mutations, alluvium = mutation_label, fill = mutation_label)) + geom_point(shape = 21, color = "transparent", size = NA) + ggalluvial::geom_flow(width = 0, show.legend = FALSE) + @@ -491,22 +512,24 @@ We can look at the net number of houses added to the rental market by adding up #| message: false #| fig-width: 10 #| fig-height: 6 +#| code-fold: true +#| code-summary: "Show code" -net_house_mutations <- house_mutations %>% +net_house_mutations <- house_mutations |> mutate(across(c(from,to), ~ case_when(str_detect(.x, "rent") ~ "rent", - str_detect(.x, "buy") ~ "buy"))) %>% - group_by(year) %>% - filter(from != to) %>% - mutate(total_mutations = sum(n_mutations)) %>% - group_by(year, from, to) %>% + str_detect(.x, "buy") ~ "buy"))) |> + group_by(year) |> + filter(from != to) |> + mutate(total_mutations = sum(n_mutations)) |> + group_by(year, from, to) |> summarise(n_mutations = sum(n_mutations), - total_mutations = first(total_mutations)) %>% + total_mutations = first(total_mutations)) |> pivot_wider(id_cols = c(year,total_mutations), names_from = c(from,to), values_from = n_mutations, - names_sep = "_to_") %>% + names_sep = "_to_") |> mutate(net_buy_to_rent = buy_to_rent - rent_to_buy, perc_buy_to_rent = buy_to_rent / total_mutations) -net_mutation_plot <- net_house_mutations %>% +net_mutation_plot <- net_house_mutations |> ggplot(aes(x = year, y = net_buy_to_rent)) + geom_hline(yintercept = 0, color = "grey30", size = 1) + #geom_ribbon(aes(ymin = 0, ymax = net_buy_to_rent), fill = "grey30", alpha = 0.2) + @@ -523,7 +546,7 @@ net_mutation_plot <- net_house_mutations %>% scale_y_continuous(labels = scales::label_number_auto(), limits = c(-3e4, NA), n.breaks = 5) + theme(axis.title.y = element_textbox(width = grid::unit(3.5,"in"))) -perc_mutation_plot <- net_house_mutations %>% +perc_mutation_plot <- net_house_mutations |> ggplot(aes(x = year, y = perc_buy_to_rent)) + geom_hline(yintercept = 0.5, color = "grey30", size = 1) + geom_line(color = "darkred", size = 1.5, lineend = "round") + @@ -546,37 +569,39 @@ So in 
2017 nearly 90 000 houses were mutated from sale to rental properties. In ```{r} #| label: mutations-table #| out-width: 60% - -house_mutations %>% - filter(year == max(year)) %>% - select(from, to, n_mutations, perc_mutations) %>% - mutate(across(c(from,to), ~ str_to_sentence(.x))) %>% - arrange(-perc_mutations) %>% - gt() %>% - fmt_number(columns = "n_mutations", sep_mark = " ", drop_trailing_zeros = TRUE) %>% - fmt_percent(columns = "perc_mutations") %>% +#| code-fold: true +#| code-summary: "Show code" + +house_mutations |> + filter(year == max(year)) |> + select(from, to, n_mutations, perc_mutations) |> + mutate(across(c(from,to), ~ str_to_sentence(.x))) |> + arrange(-perc_mutations) |> + gt() |> + fmt_number(columns = "n_mutations", sep_mark = " ", drop_trailing_zeros = TRUE) |> + fmt_percent(columns = "perc_mutations") |> grand_summary_rows( columns = "n_mutations", fns = list(total = "sum"), formatter = fmt_number, sep_mark = " ", - drop_trailing_zeros = TRUE) %>% + drop_trailing_zeros = TRUE) |> grand_summary_rows( columns = "perc_mutations", fns = list(total = "sum"), formatter = fmt_percent, - missing_text = "") %>% + missing_text = "") |> cols_label( from = html("
From"), to = html("To"), n_mutations = html("Number of mutations"), perc_mutations = html("Percentage of total mutations") - ) %>% + ) |> tab_header(title = html("Mutations in the Dutch housing market"), - subtitle = html("Data from the year 2017")) %>% - tab_source_note(source_note = md("_**Data**: CBS_")) %>% + subtitle = html("Data from the year 2017
")) |> + tab_source_note(source_note = md("_**Data**: CBS_")) |> tab_options(table.background.color = "#EBEBEB", - grand_summary_row.text_transform = "capitalize") %>% + grand_summary_row.text_transform = "capitalize") |> gtsave("mutations-table.png", expand = 0) ``` @@ -588,19 +613,21 @@ So what's the result of all these phenomena? The figure below shows the housing #| warning: false #| fig-width: 10 #| fig-height: 6 +#| code-fold: true +#| code-summary: "Show code" -asking_start <- data %>% +asking_start <- data |> filter(type != "Total", - date == min(date)) %>% - rename(asking_start = asking_price) %>% - select(type,asking_start) + date == min(date)) |> + rename(asking_start = asking_price) |> + select(type, asking_start) -data_asking_index <- data %>% - filter(type != "Total") %>% - left_join(asking_start) %>% +data_asking_index <- data |> + filter(type != "Total") |> + left_join(asking_start) |> mutate(asking_index = asking_price / asking_start) -data_asking_index %>% +data_asking_index |> ggplot(aes(x = date, y = asking_index, color = type, group = type)) + geom_line(size = 2, alpha = 0.15, lineend = "round", show.legend = FALSE) + geom_smooth(se = FALSE, show.legend = FALSE) + @@ -622,7 +649,7 @@ data_asking_index %>% ``` -Adjusted for the number of properties offered, the average asking price in Amsterdam has increased by `r data_asking_index %>% filter(date == max(date)) %$% weighted.mean(asking_index, n_offered) %>% subtract(.,1) %>% scales::percent()`. That means that prices since 2012 have (on average) doubled. For apartments, this increase was `r data_asking_index %>% filter(str_detect(type,"Apartments"), date == max(date)) %$% weighted.mean(asking_index, n_offered) %>% subtract(.,1) %>% scales::percent()`. Inflation on the other hand was only about 8.5% since 2012. So young people wanting to buy a house in Amsterdam need to bring nearly twice as much money to the table as they did just a decade ago. The money they saved and the salaries they earned have by no means kept up with the increasing price though. +Adjusted for the number of properties offered, the average asking price in Amsterdam has increased by `r data_asking_index |> filter(date == max(date)) %$% weighted.mean(asking_index, n_offered) %>% subtract(.,1) |> scales::percent()`. That means that prices since 2012 have (on average) doubled. For apartments, this increase was `r data_asking_index |> filter(str_detect(type,"Apartments"), date == max(date)) %$% weighted.mean(asking_index, n_offered) %>% subtract(.,1) |> scales::percent()`. Inflation on the other hand was only about 8.5% since 2012. So young people wanting to buy a house in Amsterdam need to bring nearly twice as much money to the table as they did just a decade ago. The money they saved and the salaries they earned have by no means kept up with the increasing price though. I cannot know this for sure, because the data I presented here doesn't contain the answer, but perhaps these factors combined play a role in the financial uncertainty that millenials experience. How do we fix it? I'm not sure. This is a really complicated issue. But a few things might (in my opinion) help: @@ -634,26 +661,4 @@ I cannot know this for sure, because the data I presented here doesn't contain t Perhaps if we implement a few of these strategies we can make the housing market a little fairer and perhaps ensure that future generations will have the same opportunities current home-owners have. -
**EDIT (2021-12-03)**: Reintroduce the code to wrangle the data, generate the figures and tables etc. to the main document and put the code inside unfoldable tags to make it easier to see the code associated with a particular step while maintaining legibility. - - \ No newline at end of file diff --git a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/mutations-table-1.png b/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/mutations-table-1.png deleted file mode 100644 index 719c480..0000000 Binary files a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/mutations-table-1.png and /dev/null differ diff --git a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/net-shift-to-renters-1.png b/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/net-shift-to-renters-1.png deleted file mode 100644 index 204370d..0000000 Binary files a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/net-shift-to-renters-1.png and /dev/null differ diff --git a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/overpay-table-1.png b/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/overpay-table-1.png deleted file mode 100644 index 7e882ea..0000000 Binary files a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/overpay-table-1.png and /dev/null differ diff --git a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-asking-index-increase-1.png b/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-asking-index-increase-1.png deleted file mode 100644 index 96cc446..0000000 Binary files a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-asking-index-increase-1.png and /dev/null differ diff --git a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-housing-n_mutations-1.png b/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-housing-n_mutations-1.png deleted file mode 100644 index 03893a8..0000000 Binary files a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-housing-n_mutations-1.png and /dev/null differ diff --git a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-n-homes-1.png b/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-n-homes-1.png deleted file mode 100644 index 81ad138..0000000 Binary files a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-n-homes-1.png and /dev/null differ diff --git a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-n-sales-1.png b/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-n-sales-1.png deleted file mode 100644 index 2003ea8..0000000 Binary files a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-n-sales-1.png and /dev/null differ diff --git a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-overpay-1.png b/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-overpay-1.png deleted file mode 100644 index 5c44aeb..0000000 Binary files a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-overpay-1.png and /dev/null differ diff --git a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-tightness-1.png b/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-tightness-1.png deleted file mode 100644 index 153c19c..0000000 Binary files 
a/content/blog/2021-amsterdam-housing-market/index_files/figure-gfm/plot-tightness-1.png and /dev/null differ diff --git a/content/blog/2021-amsterdam-housing-market/mutations-table.png b/content/blog/2021-amsterdam-housing-market/mutations-table.png index 8d35e0f..3488a5c 100644 Binary files a/content/blog/2021-amsterdam-housing-market/mutations-table.png and b/content/blog/2021-amsterdam-housing-market/mutations-table.png differ diff --git a/content/blog/2021-amsterdam-housing-market/overpay-table.png b/content/blog/2021-amsterdam-housing-market/overpay-table.png index b398120..4af8208 100644 Binary files a/content/blog/2021-amsterdam-housing-market/overpay-table.png and b/content/blog/2021-amsterdam-housing-market/overpay-table.png differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/all-weight-matrices-1.png b/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/all-weight-matrices-1.png index d2745d1..be8cc54 100644 Binary files a/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/all-weight-matrices-1.png and b/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/all-weight-matrices-1.png differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/fa-weight-matrix-1.png b/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/fa-weight-matrix-1.png index 6042748..65d8be3 100644 Binary files a/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/fa-weight-matrix-1.png and b/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/fa-weight-matrix-1.png differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/hclust-weight-matrix-1.png b/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/hclust-weight-matrix-1.png index f64981b..ae83181 100644 Binary files a/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/hclust-weight-matrix-1.png and b/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/hclust-weight-matrix-1.png differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/ica-weight-matrix-1.png b/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/ica-weight-matrix-1.png index 908f1cd..57139c4 100644 Binary files a/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/ica-weight-matrix-1.png and b/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/ica-weight-matrix-1.png differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/pca-weight-matrix-1.png b/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/pca-weight-matrix-1.png index 7ae8b51..4f35a4e 100644 Binary files a/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/pca-weight-matrix-1.png and b/content/blog/2021-comparison-fa-pca-ica/index.markdown_strict_files/figure-markdown_strict/pca-weight-matrix-1.png differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index.md b/content/blog/2021-comparison-fa-pca-ica/index.md index 
f9a497b..00f9c7c 100644 --- a/content/blog/2021-comparison-fa-pca-ica/index.md +++ b/content/blog/2021-comparison-fa-pca-ica/index.md @@ -1,16 +1,13 @@ --- title: A Basic Comparison Between Factor Analysis, PCA, and ICA -author: Daniel Roelfs -date: "2021-09-14" +date: 2021-09-14 +description: A Basic Comparison Between Factor Analysis, PCA, and ICA slug: a-basic-comparison-between-factor-analysis-pca-and-ica categories: - statistics tags: - statistics - R -description: "A Basic Comparison Between Factor Analysis, PCA, and ICA" -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -21,9 +18,7 @@ execute: This is just a very quick blog post outlining some of the commonalities and differences between factor analysis (FA), principal component analysis (PCA), and independent component analysis (ICA). I was inspired to write some of this down through some confusion caused in the lab by SPSS' apparent dual usage of the term "factor analysis" and "principal components". A few of my colleagues who use SPSS showed me the following screen: -
+{{< image src="spss-screenshot.png" alt="spss screenshot" >}} This screen shows up when you click `Analyze` -\> `Dimension Reduction` -\> `Factor`, which then opens a window called "Factor Analysis: Extraction" which lets you pick "Principal components" as a method. To put the (apparent) PCA method as a sub-section of factor analysis is misleading at best, and straight-up erronious at worst. The other options for the method here is "Principal axis factoring" (which is closer to traditional factor analysis) and "Maximum likelihood" ([source for screenshot and SPSS interface](https://stats.idre.ucla.edu/spss/seminars/efa-spss/)). If you're wondering if you're the first one to be confused by SPSS' choice to present this in such a way, you're not ([link](https://stats.stackexchange.com/questions/1576/what-are-the-differences-between-factor-analysis-and-principal-component-analysi), [link](https://stats.stackexchange.com/questions/24781/interpreting-discrepancies-between-r-and-spss-with-exploratory-factor-analysis)). @@ -46,8 +41,8 @@ theme_set(theme_minimal()) We'll load the data into R, clean up the variable names, convert the outcome variable (`ca_cervix`) to a factor. We'll have a look at the dataset using some functions from `{skimr}`. This function will give us summary statistics and basic histograms of the different variables. ``` r -data <- read_csv("sobar-72.csv") %>% - janitor::clean_names() %>% +data <- read_csv("sobar-72.csv") |> + janitor::clean_names() |> mutate(ca_cervix = as_factor(ca_cervix)) skim_summ <- skimr::skim_with(base = skimr::sfl()) @@ -101,8 +96,8 @@ Data summary Now let's only select the variables containing the risk factors (called features from here on). We'll also scale all the features to have a mean of 0 and a standard deviation of 1 using the `scale()` function. We can check what the new variable looks like using the `summary()` function. ``` r -data_features <- data %>% - select(-ca_cervix) %>% +data_features <- data |> + select(-ca_cervix) |> mutate(across(everything(), ~ scale(.x))) summary(data_features$attitude_consistency) @@ -119,11 +114,11 @@ summary(data_features$attitude_consistency) So now we have a data frame with 72 entries and 19 normalized columns, each representing a feature that may or may not predict cervical cancer. We can create a correlation matrix to visualize the degree of correlation between the different features. For this we simply run `cor()` on the data frame with the features, transform the output to a data frame in long format and then visualize it using `ggplot()` and `geom_tile()`. ``` r -cor(data_features) %>% - as_tibble() %>% - mutate(feature_y = names(.)) %>% - pivot_longer(cols = -feature_y, names_to = "feature_x", values_to = "correlation") %>% - mutate(feature_y = fct_rev(feature_y)) %>% +cor(data_features) |> + as_tibble() %>% + mutate(feature_y = names(.)) |> + pivot_longer(cols = -feature_y, names_to = "feature_x", values_to = "correlation") |> + mutate(feature_y = fct_rev(feature_y)) |> ggplot(aes(x = feature_x, y = feature_y, fill = correlation)) + geom_tile() + labs(x = NULL, @@ -242,7 +237,7 @@ We may be tempted to immediately look at the *p*-value at the end of the output. The "Loadings" section of the results show a make-shift weight matrix, but in order to further interpret these results, let's create a plot showing the weight matrix. We'll get the results from the factor analysis model we created earlier using the `tidy()` function from `{broom}` and convert it to long format. 
We'll then create a weight matrix much in the same way we did earlier. ``` r -fa_weight_matrix <- broom::tidy(fa_model) %>% +fa_weight_matrix <- broom::tidy(fa_model) |> pivot_longer(starts_with("fl"), names_to = "factor", values_to = "loading") fa_loading_plot <- ggplot(fa_weight_matrix, aes(x = factor, y = variable, fill = loading)) + @@ -263,11 +258,11 @@ Here we can more easily see that there's two strong clusters in Factor 1 and Fac Lastly, I think it would be interesting to see how the different factors relate to each other. We'll take the Bartlett's scores and correlate them with each other much like before and create a correlation matrix like before. ``` r -fa_model$scores %>% - cor() %>% - data.frame() %>% - rownames_to_column("factor_x") %>% - pivot_longer(cols = -factor_x, names_to = "factor_y", values_to = "correlation") %>% +fa_model$scores |> + cor() |> + data.frame() |> + rownames_to_column("factor_x") |> + pivot_longer(cols = -factor_x, names_to = "factor_y", values_to = "correlation") |> ggplot(aes(x = factor_x, y = factor_y, fill = correlation)) + geom_tile() + geom_text(aes(label = round(correlation,4)), color = "white") + @@ -301,12 +296,12 @@ pc_model$variance <- pc_model$sdev^2 Next we can make a simple scree plot using the variance we calculate above. We'll create a plot with the principal components on the x-axis and the eigenvalue on the y-axis. The scree plot is a very popular plot to visualize features of a PCA. In this plot the elbow is quite clearly at principal component 3, but as discussed, the scree plot is not the best nor the only way to determine the optimal number of components. In the code below I also added a calculation for the cumulative variance (`cum_variance`) which showed that a little more than 80% of the variance is captured in the first 7 components, while the first 3 components combined capture only 56%. ``` r -pc_model$variance %>% - as_tibble() %>% - rename(eigenvalue = value) %>% - rownames_to_column("comp") %>% +pc_model$variance |> + as_tibble() |> + rename(eigenvalue = value) |> + rownames_to_column("comp") |> mutate(comp = parse_number(comp), - cum_variance = cumsum(eigenvalue)/sum(eigenvalue)) %>% + cum_variance = cumsum(eigenvalue)/sum(eigenvalue)) |> ggplot(aes(x = comp, y = eigenvalue)) + geom_hline(yintercept = 1) + geom_line(size = 1) + @@ -321,13 +316,13 @@ pc_model$variance %>% We'll also create a weight matrix again, based on the rotation from the PCA. We'll create the weight matrix much in the same way as before. A PCA by its very nature returns an equal number of components as the number of variables put in, however, we're interested in just the first 7 components, so we'll select just those using the `filter()` function. ``` r -pc_weight_matrix <- pc_model$rotation %>% - data.frame() %>% - rownames_to_column("variable") %>% +pc_weight_matrix <- pc_model$rotation |> + data.frame() |> + rownames_to_column("variable") |> pivot_longer(starts_with("PC"), names_to = "prin_comp", values_to = "loading") -pca_loading_plot <- pc_weight_matrix %>% - filter(parse_number(prin_comp) <= n_comps) %>% +pca_loading_plot <- pc_weight_matrix |> + filter(parse_number(prin_comp) <= n_comps) |> ggplot(aes(x = reorder(prin_comp, parse_number(prin_comp)), y = variable, fill = loading)) + geom_tile() + labs(title = "PCA loadings", @@ -346,13 +341,13 @@ One thing that immediately jumps out is that PC1 and PC2 are nearly identical to We can also make a correlation matrix for the different principal components. 
We'll use the `x` field and otherwise create the correlation matrix the same way as before: ``` r -pc_model$x %>% - cor() %>% - data.frame() %>% - rownames_to_column("comp_x") %>% - pivot_longer(cols = starts_with("PC"), names_to = "comp_y", values_to = "correlation") %>% +pc_model$x |> + cor() |> + data.frame() |> + rownames_to_column("comp_x") |> + pivot_longer(cols = starts_with("PC"), names_to = "comp_y", values_to = "correlation") |> filter(parse_number(comp_x) <= n_comps, - parse_number(comp_y) <= n_comps) %>% + parse_number(comp_y) <= n_comps) |> ggplot(aes(x = comp_x, y = comp_y, fill = correlation)) + geom_tile() + geom_text(aes(label = round(correlation,4)), color = "white") + @@ -392,8 +387,8 @@ pc_manual <- as.matrix(data_features) %*% eigenvector Let's look at the scree plot: ``` r -tibble(eigenvalues) %>% - ggplot(aes(x = seq_along(eigenvalues), y = eigenvalues)) + +tibble(eigenvalues) |> + ggplot(aes(x = seq_along(eigenvalues), y = eigenvalues)) + geom_hline(yintercept = 1) + geom_line(size = 1) + geom_point(size = 3) @@ -404,7 +399,7 @@ tibble(eigenvalues) %>% Looks identical to the previous one. Let's also look at the correlation matrix between the principal components. ``` r -data.frame(pc_manual) %>% +data.frame(pc_manual) |> cor() %>% round(., 4) ``` @@ -443,9 +438,9 @@ ica_model <- fastICA(data_features, n.comp = n_comps) Let's create a weight matrix again. The output from the `fastICA()` doesn't provide useful names, but the [documentation](https://www.rdocumentation.org/packages/fastICA/versions/1.2-2/topics/fastICA) provides sufficient information. To create the weight matrix we take the `A` field, transpose it, get the right names to the right places and then create the plot like we've done several times now. ``` r -ica_weight_matrix <- data.frame(t(ica_model$A)) %>% - rename_with(~ str_glue("IC{seq(.)}")) %>% - mutate(variable = names(data_features)) %>% +ica_weight_matrix <- data.frame(t(ica_model$A)) |> + rename_with(~ str_glue("IC{seq(.)}")) |> + mutate(variable = names(data_features)) |> pivot_longer(cols = starts_with("IC"), names_to = "ic", values_to = "loading") ica_loading_plot <- ggplot(ica_weight_matrix, aes(x = ic, y = variable, fill = loading)) + @@ -466,11 +461,11 @@ The FastICA method doesn't rank the components based on variance like factor ana Again, we can also visualize the correlation between the different components. 
``` r -ica_model$S %>% - cor() %>% - data.frame() %>% - rownames_to_column("comp_x") %>% - pivot_longer(cols = starts_with("X"), names_to = "comp_y", values_to = "correlation") %>% +ica_model$S |> + cor() |> + data.frame() |> + rownames_to_column("comp_x") |> + pivot_longer(cols = starts_with("X"), names_to = "comp_y", values_to = "correlation") |> ggplot(aes(x = comp_x, y = comp_y, fill = correlation)) + geom_tile() + geom_text(aes(label = round(correlation,4)), color = "white") + @@ -489,21 +484,21 @@ Okay, let's now compare the three approaches and put the loading matrices side b ``` r all_weight_matrices <- bind_rows( - fa_weight_matrix %>% - rename(comp = factor) %>% + fa_weight_matrix |> + rename(comp = factor) |> mutate(alg = "FA"), - pc_weight_matrix %>% - rename(comp = prin_comp) %>% + pc_weight_matrix |> + rename(comp = prin_comp) |> mutate(alg = "PCA"), - ica_weight_matrix %>% - rename(comp = ic) %>% + ica_weight_matrix |> + rename(comp = ic) |> mutate(alg = "ICA") ) -all_weight_matrices %>% - filter(parse_number(comp) <= n_comps) %>% +all_weight_matrices |> + filter(parse_number(comp) <= n_comps) |> mutate(alg = str_glue("{alg} loadings"), - alg = as_factor(alg)) %>% + alg = as_factor(alg)) |> ggplot(aes(x = comp, y = variable, fill = loading)) + geom_tile() + labs(x = NULL, @@ -555,10 +550,10 @@ Let's look at how the clusters are made up according to the hierarchical cluster ``` r hclust_weight_matrix %>% - data.frame() %>% - janitor::clean_names() %>% - rename(cluster = x) %>% - rownames_to_column("variable") %>% + data.frame() |> + janitor::clean_names() |> + rename(cluster = x) |> + rownames_to_column("variable") |> ggplot(aes(x = as_factor(cluster), y = variable, fill = as_factor(cluster))) + geom_tile() + labs(x = NULL, diff --git a/content/blog/2021-comparison-fa-pca-ica/index.qmd b/content/blog/2021-comparison-fa-pca-ica/index.qmd index c25a2b7..8cc0c05 100644 --- a/content/blog/2021-comparison-fa-pca-ica/index.qmd +++ b/content/blog/2021-comparison-fa-pca-ica/index.qmd @@ -1,16 +1,13 @@ --- title: A Basic Comparison Between Factor Analysis, PCA, and ICA -author: Daniel Roelfs -date: "2021-09-14" +date: 2021-09-14 +description: A Basic Comparison Between Factor Analysis, PCA, and ICA slug: a-basic-comparison-between-factor-analysis-pca-and-ica categories: - statistics tags: - statistics - R -description: "A Basic Comparison Between Factor Analysis, PCA, and ICA" -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -21,9 +18,7 @@ execute: This is just a very quick blog post outlining some of the commonalities and differences between factor analysis (FA), principal component analysis (PCA), and independent component analysis (ICA). I was inspired to write some of this down through some confusion caused in the lab by SPSS' apparent dual usage of the term "factor analysis" and "principal components". A few of my colleagues who use SPSS showed me the following screen: -
+{{{< image src="spss-screenshot.png" alt="spss screenshot" >}}} This screen shows up when you click `Analyze` -> `Dimension Reduction` -> `Factor`, which then opens a window called "Factor Analysis: Extraction" which lets you pick "Principal components" as a method. To put the (apparent) PCA method as a sub-section of factor analysis is misleading at best, and straight-up erronious at worst. The other options for the method here is "Principal axis factoring" (which is closer to traditional factor analysis) and "Maximum likelihood" ([source for screenshot and SPSS interface](https://stats.idre.ucla.edu/spss/seminars/efa-spss/)). If you're wondering if you're the first one to be confused by SPSS' choice to present this in such a way, you're not ([link](https://stats.stackexchange.com/questions/1576/what-are-the-differences-between-factor-analysis-and-principal-component-analysi), [link](https://stats.stackexchange.com/questions/24781/interpreting-discrepancies-between-r-and-spss-with-exploratory-factor-analysis)). @@ -52,8 +47,8 @@ We'll load the data into R, clean up the variable names, convert the outcome var #| label: load-data #| message: false -data <- read_csv("sobar-72.csv") %>% - janitor::clean_names() %>% +data <- read_csv("sobar-72.csv") |> + janitor::clean_names() |> mutate(ca_cervix = as_factor(ca_cervix)) skim_summ <- skimr::skim_with(base = skimr::sfl()) @@ -65,8 +60,8 @@ Now let's only select the variables containing the risk factors (called features ```{r} #| label: scale-data -data_features <- data %>% - select(-ca_cervix) %>% +data_features <- data |> + select(-ca_cervix) |> mutate(across(everything(), ~ scale(.x))) summary(data_features$attitude_consistency) @@ -77,11 +72,11 @@ So now we have a data frame with 72 entries and 19 normalized columns, each repr ```{r} #| label: corr-matrix-original -cor(data_features) %>% - as_tibble() %>% - mutate(feature_y = names(.)) %>% - pivot_longer(cols = -feature_y, names_to = "feature_x", values_to = "correlation") %>% - mutate(feature_y = fct_rev(feature_y)) %>% +cor(data_features) |> + as_tibble() %>% + mutate(feature_y = names(.)) |> + pivot_longer(cols = -feature_y, names_to = "feature_x", values_to = "correlation") |> + mutate(feature_y = fct_rev(feature_y)) |> ggplot(aes(x = feature_x, y = feature_y, fill = correlation)) + geom_tile() + labs(x = NULL, @@ -128,7 +123,7 @@ The "Loadings" section of the results show a make-shift weight matrix, but in or ```{r} #| label: fa-weight-matrix -fa_weight_matrix <- broom::tidy(fa_model) %>% +fa_weight_matrix <- broom::tidy(fa_model) |> pivot_longer(starts_with("fl"), names_to = "factor", values_to = "loading") fa_loading_plot <- ggplot(fa_weight_matrix, aes(x = factor, y = variable, fill = loading)) + @@ -149,11 +144,11 @@ Lastly, I think it would be interesting to see how the different factors relate ```{r} #| label: fa-corr-matrix -fa_model$scores %>% - cor() %>% - data.frame() %>% - rownames_to_column("factor_x") %>% - pivot_longer(cols = -factor_x, names_to = "factor_y", values_to = "correlation") %>% +fa_model$scores |> + cor() |> + data.frame() |> + rownames_to_column("factor_x") |> + pivot_longer(cols = -factor_x, names_to = "factor_y", values_to = "correlation") |> ggplot(aes(x = factor_x, y = factor_y, fill = correlation)) + geom_tile() + geom_text(aes(label = round(correlation,4)), color = "white") + @@ -190,12 +185,12 @@ Next we can make a simple scree plot using the variance we calculate above. 
We'l ```{r} #| label: pca-scree -pc_model$variance %>% - as_tibble() %>% - rename(eigenvalue = value) %>% - rownames_to_column("comp") %>% +pc_model$variance |> + as_tibble() |> + rename(eigenvalue = value) |> + rownames_to_column("comp") |> mutate(comp = parse_number(comp), - cum_variance = cumsum(eigenvalue)/sum(eigenvalue)) %>% + cum_variance = cumsum(eigenvalue)/sum(eigenvalue)) |> ggplot(aes(x = comp, y = eigenvalue)) + geom_hline(yintercept = 1) + geom_line(size = 1) + @@ -207,13 +202,13 @@ We'll also create a weight matrix again, based on the rotation from the PCA. We' ```{r} #| label: pca-weight-matrix -pc_weight_matrix <- pc_model$rotation %>% - data.frame() %>% - rownames_to_column("variable") %>% +pc_weight_matrix <- pc_model$rotation |> + data.frame() |> + rownames_to_column("variable") |> pivot_longer(starts_with("PC"), names_to = "prin_comp", values_to = "loading") -pca_loading_plot <- pc_weight_matrix %>% - filter(parse_number(prin_comp) <= n_comps) %>% +pca_loading_plot <- pc_weight_matrix |> + filter(parse_number(prin_comp) <= n_comps) |> ggplot(aes(x = reorder(prin_comp, parse_number(prin_comp)), y = variable, fill = loading)) + geom_tile() + labs(title = "PCA loadings", @@ -232,13 +227,13 @@ We can also make a correlation matrix for the different principal components. We ```{r} #| label: pca-corr-matrix -pc_model$x %>% - cor() %>% - data.frame() %>% - rownames_to_column("comp_x") %>% - pivot_longer(cols = starts_with("PC"), names_to = "comp_y", values_to = "correlation") %>% +pc_model$x |> + cor() |> + data.frame() |> + rownames_to_column("comp_x") |> + pivot_longer(cols = starts_with("PC"), names_to = "comp_y", values_to = "correlation") |> filter(parse_number(comp_x) <= n_comps, - parse_number(comp_y) <= n_comps) %>% + parse_number(comp_y) <= n_comps) |> ggplot(aes(x = comp_x, y = comp_y, fill = correlation)) + geom_tile() + geom_text(aes(label = round(correlation,4)), color = "white") + @@ -280,8 +275,8 @@ Let's look at the scree plot: ```{r} #| label: pca-manual-scree -tibble(eigenvalues) %>% - ggplot(aes(x = seq_along(eigenvalues), y = eigenvalues)) + +tibble(eigenvalues) |> + ggplot(aes(x = seq_along(eigenvalues), y = eigenvalues)) + geom_hline(yintercept = 1) + geom_line(size = 1) + geom_point(size = 3) @@ -294,11 +289,11 @@ Looks identical to the previous one. Let's also look at the correlation matrix b #| echo: false #| eval: false -data.frame(pc_manual) %>% - rename_with(~ str_glue("PC{parse_number(.x)}")) %>% - rownames_to_column("participant") %>% - pivot_longer(starts_with("PC"), names_to = "prin_comp", values_to = "loading") %>% - filter(parse_number(prin_comp) <= n_comps) %>% +data.frame(pc_manual) |> + rename_with(~ str_glue("PC{parse_number(.x)}")) |> + rownames_to_column("participant") |> + pivot_longer(starts_with("PC"), names_to = "prin_comp", values_to = "loading") |> + filter(parse_number(prin_comp) <= n_comps) |> ggplot(aes(x = reorder(prin_comp, parse_number(prin_comp)), y = parse_number(participant), fill = loading)) + geom_tile() + @@ -312,7 +307,7 @@ data.frame(pc_manual) %>% ```{r} #| label: pca-manual-corr-matrix -data.frame(pc_manual) %>% +data.frame(pc_manual) |> cor() %>% round(., 4) ``` @@ -335,9 +330,9 @@ Let's create a weight matrix again. 
The output from the `fastICA()` doesn't prov ```{r} #| label: ica-weight-matrix -ica_weight_matrix <- data.frame(t(ica_model$A)) %>% - rename_with(~ str_glue("IC{seq(.)}")) %>% - mutate(variable = names(data_features)) %>% +ica_weight_matrix <- data.frame(t(ica_model$A)) |> + rename_with(~ str_glue("IC{seq(.)}")) |> + mutate(variable = names(data_features)) |> pivot_longer(cols = starts_with("IC"), names_to = "ic", values_to = "loading") ica_loading_plot <- ggplot(ica_weight_matrix, aes(x = ic, y = variable, fill = loading)) + @@ -358,11 +353,11 @@ Again, we can also visualize the correlation between the different components. ```{r} #| label: ica-corr-matrix -ica_model$S %>% - cor() %>% - data.frame() %>% - rownames_to_column("comp_x") %>% - pivot_longer(cols = starts_with("X"), names_to = "comp_y", values_to = "correlation") %>% +ica_model$S |> + cor() |> + data.frame() |> + rownames_to_column("comp_x") |> + pivot_longer(cols = starts_with("X"), names_to = "comp_y", values_to = "correlation") |> ggplot(aes(x = comp_x, y = comp_y, fill = correlation)) + geom_tile() + geom_text(aes(label = round(correlation,4)), color = "white") + @@ -382,21 +377,21 @@ Okay, let's now compare the three approaches and put the loading matrices side b #| out.width: 100% all_weight_matrices <- bind_rows( - fa_weight_matrix %>% - rename(comp = factor) %>% + fa_weight_matrix |> + rename(comp = factor) |> mutate(alg = "FA"), - pc_weight_matrix %>% - rename(comp = prin_comp) %>% + pc_weight_matrix |> + rename(comp = prin_comp) |> mutate(alg = "PCA"), - ica_weight_matrix %>% - rename(comp = ic) %>% + ica_weight_matrix |> + rename(comp = ic) |> mutate(alg = "ICA") ) -all_weight_matrices %>% - filter(parse_number(comp) <= n_comps) %>% +all_weight_matrices |> + filter(parse_number(comp) <= n_comps) |> mutate(alg = str_glue("{alg} loadings"), - alg = as_factor(alg)) %>% + alg = as_factor(alg)) |> ggplot(aes(x = comp, y = variable, fill = loading)) + geom_tile() + labs(x = NULL, @@ -452,10 +447,10 @@ Let's look at how the clusters are made up according to the hierarchical cluster #| label: hclust-weight-matrix hclust_weight_matrix %>% - data.frame() %>% - janitor::clean_names() %>% - rename(cluster = x) %>% - rownames_to_column("variable") %>% + data.frame() |> + janitor::clean_names() |> + rename(cluster = x) |> + rownames_to_column("variable") |> ggplot(aes(x = as_factor(cluster), y = variable, fill = as_factor(cluster))) + geom_tile() + labs(x = NULL, diff --git a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/all-weight-matrices-1.png b/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/all-weight-matrices-1.png deleted file mode 100644 index 1045637..0000000 Binary files a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/all-weight-matrices-1.png and /dev/null differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/corr-matrix-original-1.png b/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/corr-matrix-original-1.png deleted file mode 100644 index ce1c0d5..0000000 Binary files a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/corr-matrix-original-1.png and /dev/null differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/dendrogram-1.png b/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/dendrogram-1.png deleted file mode 100644 index 8484fa4..0000000 Binary files a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/dendrogram-1.png and /dev/null differ diff --git 
a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/dendrogram-w-line-1.png b/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/dendrogram-w-line-1.png deleted file mode 100644 index 8226c86..0000000 Binary files a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/dendrogram-w-line-1.png and /dev/null differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/fa-corr-matrix-1.png b/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/fa-corr-matrix-1.png deleted file mode 100644 index f103501..0000000 Binary files a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/fa-corr-matrix-1.png and /dev/null differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/fa-weight-matrix-1.png b/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/fa-weight-matrix-1.png deleted file mode 100644 index 95eca95..0000000 Binary files a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/fa-weight-matrix-1.png and /dev/null differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/hclust-weight-matrix-1.png b/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/hclust-weight-matrix-1.png deleted file mode 100644 index f64981b..0000000 Binary files a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/hclust-weight-matrix-1.png and /dev/null differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/ica-corr-matrix-1.png b/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/ica-corr-matrix-1.png deleted file mode 100644 index 60ea990..0000000 Binary files a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/ica-corr-matrix-1.png and /dev/null differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/ica-weight-matrix-1.png b/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/ica-weight-matrix-1.png deleted file mode 100644 index 37bb391..0000000 Binary files a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/ica-weight-matrix-1.png and /dev/null differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/pca-biplot-1.png b/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/pca-biplot-1.png deleted file mode 100644 index 594a647..0000000 Binary files a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/pca-biplot-1.png and /dev/null differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/pca-corr-matrix-1.png b/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/pca-corr-matrix-1.png deleted file mode 100644 index a0bdcaf..0000000 Binary files a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/pca-corr-matrix-1.png and /dev/null differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/pca-manual-scree-1.png b/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/pca-manual-scree-1.png deleted file mode 100644 index 02cd68d..0000000 Binary files a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/pca-manual-scree-1.png and /dev/null differ diff --git a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/pca-scree-1.png b/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/pca-scree-1.png deleted file mode 100644 index a822ca6..0000000 Binary files a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/pca-scree-1.png and /dev/null differ diff --git 
a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/pca-weight-matrix-1.png b/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/pca-weight-matrix-1.png deleted file mode 100644 index fbcfe66..0000000 Binary files a/content/blog/2021-comparison-fa-pca-ica/index_files/figure-gfm/pca-weight-matrix-1.png and /dev/null differ diff --git a/content/blog/2021-easy-map-norway/index.md b/content/blog/2021-easy-map-norway/index.md index 03dfdde..0249baf 100644 --- a/content/blog/2021-easy-map-norway/index.md +++ b/content/blog/2021-easy-map-norway/index.md @@ -1,7 +1,7 @@ --- title: The Easier Way to Create a Map of Norway Using {csmaps} -author: Daniel Roelfs -date: "2021-08-24" +date: 2021-08-24 +description: The Easier Way to Create a Map of Norway Using {csmaps} slug: the-easier-way-to-create-a-map-of-norway-using-csmaps categories: - ggplot @@ -9,9 +9,6 @@ tags: - ggplot - map - norway -description: "The Easier Way to Create a Map of Norway Using {csmaps}" -thumbnail: images/avatar.png -format: hugo editor_options: chunk_output_type: console execute: diff --git a/content/blog/2021-easy-map-norway/index.qmd b/content/blog/2021-easy-map-norway/index.qmd index da22f25..c52efc6 100644 --- a/content/blog/2021-easy-map-norway/index.qmd +++ b/content/blog/2021-easy-map-norway/index.qmd @@ -1,7 +1,7 @@ --- title: The Easier Way to Create a Map of Norway Using {csmaps} -author: Daniel Roelfs -date: "2021-08-24" +date: 2021-08-24 +description: The Easier Way to Create a Map of Norway Using {csmaps} slug: the-easier-way-to-create-a-map-of-norway-using-csmaps categories: - ggplot @@ -9,9 +9,6 @@ tags: - ggplot - map - norway -description: "The Easier Way to Create a Map of Norway Using {csmaps}" -thumbnail: images/avatar.png -format: hugo editor_options: chunk_output_type: console execute: @@ -23,7 +20,10 @@ execute: --- -```{css echo=FALSE, label='css'} +```{css} +#| label: style +#| echo: FALSE + p.announcement { border-radius: 5px; background-color: #acc8d4; diff --git a/content/blog/2021-easy-map-norway/index_files/figure-gfm/age-plot-1.png b/content/blog/2021-easy-map-norway/index_files/figure-gfm/age-plot-1.png deleted file mode 100644 index 5ba6b40..0000000 Binary files a/content/blog/2021-easy-map-norway/index_files/figure-gfm/age-plot-1.png and /dev/null differ diff --git a/content/blog/2021-easy-map-norway/index_files/figure-gfm/minimal-plot-1.png b/content/blog/2021-easy-map-norway/index_files/figure-gfm/minimal-plot-1.png deleted file mode 100644 index 32f4029..0000000 Binary files a/content/blog/2021-easy-map-norway/index_files/figure-gfm/minimal-plot-1.png and /dev/null differ diff --git a/content/blog/2021-easy-map-norway/index_files/figure-gfm/plot-kommune-faceted-1.png b/content/blog/2021-easy-map-norway/index_files/figure-gfm/plot-kommune-faceted-1.png deleted file mode 100644 index e9c7f53..0000000 Binary files a/content/blog/2021-easy-map-norway/index_files/figure-gfm/plot-kommune-faceted-1.png and /dev/null differ diff --git a/content/blog/2021-easy-map-norway/index_files/figure-gfm/plot-oslo-1.png b/content/blog/2021-easy-map-norway/index_files/figure-gfm/plot-oslo-1.png deleted file mode 100644 index e1e810b..0000000 Binary files a/content/blog/2021-easy-map-norway/index_files/figure-gfm/plot-oslo-1.png and /dev/null differ diff --git a/content/blog/2021-easy-map-norway/index_files/figure-gfm/plot-w-cities-1.png b/content/blog/2021-easy-map-norway/index_files/figure-gfm/plot-w-cities-1.png deleted file mode 100644 index 55fc79d..0000000 Binary files 
a/content/blog/2021-easy-map-norway/index_files/figure-gfm/plot-w-cities-1.png and /dev/null differ diff --git a/content/blog/2021-easy-map-norway/index_files/figure-gfm/simple-plot-1.png b/content/blog/2021-easy-map-norway/index_files/figure-gfm/simple-plot-1.png deleted file mode 100644 index 2c1f54e..0000000 Binary files a/content/blog/2021-easy-map-norway/index_files/figure-gfm/simple-plot-1.png and /dev/null differ diff --git a/content/blog/2021-easy-map-norway/index_files/figure-gfm/vax-plot-1.png b/content/blog/2021-easy-map-norway/index_files/figure-gfm/vax-plot-1.png deleted file mode 100644 index 3be9069..0000000 Binary files a/content/blog/2021-easy-map-norway/index_files/figure-gfm/vax-plot-1.png and /dev/null differ diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/index.markdown_strict_files/figure-markdown_strict/events-timeline-1.png b/content/blog/2022-dutch-performance-olympic-speed-skating/index.markdown_strict_files/figure-markdown_strict/events-timeline-1.png index 0f0cf43..c354b49 100644 Binary files a/content/blog/2022-dutch-performance-olympic-speed-skating/index.markdown_strict_files/figure-markdown_strict/events-timeline-1.png and b/content/blog/2022-dutch-performance-olympic-speed-skating/index.markdown_strict_files/figure-markdown_strict/events-timeline-1.png differ diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/index.markdown_strict_files/figure-markdown_strict/podium-sweeps-1.png b/content/blog/2022-dutch-performance-olympic-speed-skating/index.markdown_strict_files/figure-markdown_strict/podium-sweeps-1.png index ef6dc53..8adc175 100644 Binary files a/content/blog/2022-dutch-performance-olympic-speed-skating/index.markdown_strict_files/figure-markdown_strict/podium-sweeps-1.png and b/content/blog/2022-dutch-performance-olympic-speed-skating/index.markdown_strict_files/figure-markdown_strict/podium-sweeps-1.png differ diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/index.md b/content/blog/2022-dutch-performance-olympic-speed-skating/index.md index cc98c6a..b7ef3d0 100644 --- a/content/blog/2022-dutch-performance-olympic-speed-skating/index.md +++ b/content/blog/2022-dutch-performance-olympic-speed-skating/index.md @@ -1,16 +1,13 @@ --- title: Dutch performance at Olympic speed skating -author: Daniel Roelfs -date: "2022-02-09" +date: 2022-02-09 +description: Dutch performance at Olympic speed skating slug: dutch-performance-at-olympic-speed-skating categories: - miscellaneous tags: - data visualization - R -description: "Dutch performance at Olympic speed skating" -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -25,6 +22,9 @@ Now, since the last Winter Olympic Games in 2018 I've learned quite a bit about First we'll load the packages, as usual, we'll use the `{tidyverse}` package. For some more functionality around text rendering in the plots, we'll also load the `{ggtext}` package, and in order to use different fonts than the default ones we'll use functionality from the `{showtext}` package and then load a nice sans-serif font called [Yanone Kaffeesatz](https://fonts.google.com/specimen/Yanone+Kaffeesatz?preview.text=solipsism&preview.text_type=custom). We'll incidentally use some other packages, but then we can use the `::` operator. +
+Show code + ``` r library(tidyverse) library(ggtext) @@ -34,10 +34,15 @@ font_add_google(name = "Yanone Kaffeesatz", family = "custom") showtext_auto() ``` +
+

Before we can do anything, we need to find a nice dataset. What was quite surprising to me was how tough it was to find a good dataset on Olympic events and results. Wikipedia of course has all the data one could want, but it's not always structured in an organized way, which makes it hard to scrape programmatically. I looked at the IOC, ISU (International Skating Union), and news agencies, but the best (i.e. most complete and most organized) resource I could find was a website called [Olympian Database](https://www.olympiandatabase.com/). The website looks rather outdated and the html is fairly dated too, but we can work with this. The website has a [main page for speed skating](https://www.olympiandatabase.com/index.php?id=6934&L=1), and then we can iteratively go through the games and events to scrape every individual webpage.

Previously we've used the `{rvest}` package to scrape websites, but since then I've actually gotten really fond of using Python for web scraping with the `Selenium` library, and then parsing the html with the `BeautifulSoup` library. So what we'll do first is scrape and parse the [main table](https://www.olympiandatabase.com/index.php?id=6934&L=1). This will give us the links to the speed skating events at each Winter Olympic Games. Each of those pages gives us a list of all events that were part of that tournament, and then we'll go one level deeper and scrape the table there. This table contains the final placements (and in case of team events, the results from the last rounds), the athlete, country, and a comment (Olympic records, disqualifications, etc.). We'll run through each game and each event iteratively, save the data in an individual json file, and then at the end merge the individual json files into a single large json which we can then parse in R. While running this script I found [one instance](https://www.olympiandatabase.com/index.php?id=11769&L=1) where the header data and some variables were missing, which made machine reading this page very difficult, so when the script got to that instance I filled in the data manually.

+
+Show code + ``` python ### DOWNLOAD OLYMPIC SPEED SKATING DATA ######################## @@ -164,38 +169,43 @@ with open(out_name, 'w') as outfile: json.dump(result, outfile, indent=4) ``` +
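For readers who'd rather stay in R, a minimal sketch of the same idea with `{rvest}` (the package mentioned above) could look like the block below. This is not the scraper used for this post; the URL comes from the main page linked above, and the assumption that the results sit in plain html tables (and which of the returned tables is the right one) is mine.

``` r
library(rvest)

# Minimal sketch, not the Selenium-based scraper used above: read one page
# from Olympian Database and pull its html tables into data frames.
# Which of the returned tables actually holds the event links or results
# is an assumption that would need to be checked against the page layout.
page <- read_html("https://www.olympiandatabase.com/index.php?id=6934&L=1")

tables <- page |>
  html_elements("table") |>
  html_table()

# Inspect the candidates and pick the table of interest
length(tables)
str(tables[[1]])
```

The Python script above remains the actual source of the json files used in the rest of this post.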
+

I said before that the data is neatly organized, which is true except for a few instances. The individual events are simple tables with a ranking and time for each athlete. The team pursuits are a bit more complicated: since team pursuit events are a direct competition with qualifying rounds and knock-out rounds, the table is structured differently. In this case we're just interested in the final ranking (so we dismiss the semi- and quarter-finals). The final ranking is split across two columns, so we stitch those together. For some reason the men's team pursuit from 2018 lists only the medal winners, and not in the same format as the other team pursuit events. One advantage here is that they list individual skaters too, but since this is the only time individual skaters are listed among the team pursuits it's still not very useful. It just meant we have to create another few lines in the "if else" statement to parse the json. In the html, the podium places aren't denoted with a numeric list, but rather with a gold, silver, and bronze badge. Since the python script doesn't parse those, we add those back here (except for the 1928 Men's 10.000 m event, which was canceled due to bad weather).

+
+Show code + ``` r parse_json <- function(json) { - t_df <- jsonlite::fromJSON(json) %>% - as_tibble() %>% - unnest() %>% + t_df <- jsonlite::fromJSON(json) |> + as_tibble() |> + unnest() |> janitor::clean_names() %>% slice(seq(3, nrow(.) - 2)) if (str_detect(json, "Men's Team pursuit 2018")) { - t_df_out <- t_df %>% - filter(is.na(x0)) %>% + t_df_out <- t_df |> + filter(is.na(x0)) |> rename(ranking = x0, athlete = x1, country = x3, time = x4, - comment = x5) %>% + comment = x5) |> mutate(ranking = rep(seq(3), each = 4), - ranking = str_glue("{ranking}.")) %>% - fill(country, time, comment) %>% - group_by(ranking) %>% - mutate(athlete = toString(athlete)) %>% - ungroup() %>% - distinct() %>% + ranking = str_glue("{ranking}.")) |> + fill(country, time, comment) |> + group_by(ranking) |> + mutate(athlete = toString(athlete)) |> + ungroup() |> + distinct() |> select(-x2) } else if (str_detect(json, "Men's Team pursuit|Women's Team pursuit")) { - t_df_tp <- t_df %>% + t_df_tp <- t_df |> rename(ranking = x0, country = x1, time = x3, @@ -203,25 +213,25 @@ parse_json <- function(json) { ranking2 = x5, country2 = x6, time2 = x8, - comment2 = x9) %>% + comment2 = x9) |> select(seq(10), - -c(x2,x7)) %>% + -c(x2,x7)) |> slice(seq(0, min(which(nchar(ranking) > 3)) - 1)) - t_df_out <- bind_rows(t_df_tp %>% + t_df_out <- bind_rows(t_df_tp |> select(seq(4)), - t_df_tp %>% - select(seq(5,last_col())) %>% + t_df_tp |> + select(seq(5,last_col())) |> rename_with( ~ c("ranking","country","time","comment"))) } else { - t_df <- t_df %>% + t_df <- t_df |> rename(ranking = x0, athlete = x1, country = x3, time = x4, - comment = x5) %>% + comment = x5) |> select(-x2) if (str_detect(json, "Men's 10000 m 1928", negate = TRUE)) { @@ -236,32 +246,39 @@ parse_json <- function(json) { } ``` +
+

Okay, now that we have the function to parse the json file, let's look at some R code. We'll load the json file using the `{jsonlite}` package, and then parse each json string using the `map()` function from `{purrr}`. Then when this data is parsed, we'll wrangle the nested data frames into one long data frame, and then we'll tidy up the data. Tied places are denoted using a single dash, which we want to get rid of. Then we'll fill the missing place numbers using the `fill()` function. However, there were also a number of people who either did not finish or start or were disqualified and so they don't have a ranking. These instances are denoted in the `time` column with a "dnf", "dns", or "dq". Since those are the only times the column uses a lowercase d, we can use this feature to replace the ranking with a missing value. We'll then also add the comment from the `time` column to the `comment` column. Then there are also some artifacts which we can easily remove since the `country` column uses IOC 3-letter abbreviations, so any entry there that's longer than 3 characters we can remove.

Then we'll also create two vectors that contain the breaks we'll use later for the visualizations. Until 1992 both the Summer and Winter Olympic Games were held in the same year. However, in 1994 the Olympic Winter Games were moved up 2 years to get the alternating schedule we have today. The Olympic Games were also not held during the Second World War. I want to account for that so I create a vector with each unique entry in the `year` column. I also want a neatly organized ordering of the events, so I create a vector called `event_lims` that stores this preferred ordering.

+
+Show code + ``` r -data_load <- jsonlite::fromJSON("./all_events.json") %>% +data_load <- jsonlite::fromJSON("./all_events.json") |> mutate(df = map(table, ~ parse_json(.x))) -data <- data_load %>% - select(-table) %>% - unnest(df) %>% - group_by(year, distance, sex) %>% - mutate(ranking = ifelse(str_detect(ranking, "-"), NA, ranking)) %>% - fill(ranking) %>% - ungroup() %>% +data <- data_load |> + select(-table) |> + unnest(df) |> + group_by(year, distance, sex) |> + mutate(ranking = ifelse(str_detect(ranking, "-"), NA, ranking)) |> + fill(ranking) |> + ungroup() |> mutate(ranking = parse_number(ranking), ranking = ifelse(str_detect(time, "d"), NA, ranking), comment = ifelse(str_detect(time, "d"), time, comment), - time = parse_number(time)) %>% - filter(nchar(country) < 4) %>% - arrange(year) %>% + time = parse_number(time)) |> + filter(nchar(country) < 4) |> + arrange(year) |> glimpse() ``` +
+ Rows: 5,712 Columns: 10 $ title "Men's 1500 m - Speed Skating Chamonix 1924 Winter Olympics",… @@ -275,26 +292,34 @@ data <- data_load %>% $ time 2.208, 2.220, 2.256, 2.266, 2.290, 2.292, 2.298, 2.316, 2.316… $ comment "OR", NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,… +
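As a small aside, the dash-and-fill step for tied places described above can be illustrated on a made-up four-row excerpt. The athletes and ranks here are hypothetical, but the `ifelse()` and `fill()` calls mirror the wrangling code above.

``` r
library(tidyverse)

# Hypothetical mini-example: a tie is marked with "-" in the source table,
# so we turn it into NA and let fill() carry the previous rank down.
tibble(ranking = c("1.", "2.", "-", "4."),
       athlete = c("A", "B", "C", "D")) |>
  mutate(ranking = ifelse(str_detect(ranking, "-"), NA, ranking)) |>
  fill(ranking)
```

Athlete C ends up sharing rank "2." with athlete B, which is exactly what happens to tied skaters in the real table.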
+Show code + ``` r game_years <- unique(data$year) event_lims <- c("500 m", "1000 m", "1500 m", "3000 m", "5000 m", "10000 m", "Combined", "Team pursuit", "Mass Start") ``` +
+

Then we can finally create some plots. Not all speed skating events were present from the start in 1924. Back then only men competed in Olympic speed skating; the women's program started in 1960. Here we'll create something that looks a bit like a Gantt chart. We'll use a `geom_segment()` to visualize the timeline, and since there are a few events which have only been on the program once, we'll use a `geom_point()` for those, because `geom_segment()` requires a begin and end point that are different. Since this is just a casual visualization for illustrative purposes we can take some creative liberty and experiment a bit with the design. That's why I chose to remove the grid lines and axes, make the lines fairly big, and add the individual distances as labels on top of the lines. I also made the text quite large and moved the labels slightly up. The first year an event was held is shown slightly below the line.

+
+Show code + ``` r -data %>% - select(year, distance, sex) %>% - distinct() %>% - mutate(distance = fct_relevel(distance, ~ event_lims)) %>% - group_by(distance, sex) %>% - arrange(year) %>% +data |> + select(year, distance, sex) |> + distinct() |> + mutate(distance = fct_relevel(distance, ~ event_lims)) |> + group_by(distance, sex) |> + arrange(year) |> summarise(first_year = min(year), - last_year = max(year)) %>% + last_year = max(year)) |> ggplot(aes(x = first_year, y = distance)) + geom_segment(aes(xend = last_year, yend = distance, color = distance), - size = 8, lineend = "round", alpha = 0.4) + + linewidth = 8, lineend = "round", alpha = 0.4) + geom_point(data = . %>% filter(first_year == last_year), aes(color = distance), size = 8, alpha = 0.5) + @@ -311,8 +336,7 @@ data %>% strip.text = element_text(face = "bold", size = 42)) ``` - Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0. - ℹ Please use `linewidth` instead. +
@@ -320,22 +344,25 @@ As we can see, the first Winter Olympic Games had only 5 events. This also inclu Now, let's dive into the medals. First let's create a simple barplot with the total number of medals. As I prefer, we'll rotate so that the bars extent across the x-axis instead of the y-axis. This leaves more space for the country names (which we'll extract from the IOC codes using the `{countrycodes}` package) so we don't have to rotate labels. A simple rule: never rotate labels if you can avoid it. It makes the labels harder to read and increases cognitive load. To make the plot a bit cleaner, we'll move the title and subtitle (which we'll create with `{ggtext}`'s `geom_richtext()`) to the empty space in the barplot. Since I want to draw attention to the Netherlands in particular, I'll highlight that bar in its national orange color. We can easily do that by creating a separate column which will store the hex-value of the color and then we can use `scale_fill_identity()` to make the bar the color saved in that column. +
+Show code + ``` r -data %>% +data |> filter(year >= 1960, - ranking %in% seq(3)) %>% + ranking %in% seq(3)) |> mutate(country_long = countrycode::countrycode(country, origin = "ioc", destination = "country.name"), country_long = case_when(str_detect(country, "URS") ~ "Soviet Union", str_detect(country, "GDR") ~ "East Germany", str_detect(country, "FRG") ~ "West Germany", str_detect(country, "OAR") ~ "Olympic Athletes from Russia", TRUE ~ country_long), - country_label = str_glue("{country_long} ({country})")) %>% - count(country_label, sort = TRUE) %>% - mutate(highlight_col = ifelse(str_detect(country_label, "NED"), "#FF9B00", "grey")) %>% + country_label = str_glue("{country_long} ({country})")) |> + count(country_label, sort = TRUE) |> + mutate(highlight_col = ifelse(str_detect(country_label, "NED"), "#FF9B00", "grey")) |> ggplot(aes(x = n, y = reorder(country_label, n))) + geom_col(aes(fill = highlight_col)) + - geom_vline(xintercept = 0, size = 1) + + geom_vline(xintercept = 0, linewidth = 1) + geom_richtext(data = tibble(), aes(x = 24, y = 15, label = "Total number of medals won per country since 1960"), family = "custom", size = 7, fontface = "bold", hjust = 0, @@ -352,26 +379,31 @@ data %>% theme(panel.grid.major.y = element_blank()) ``` +
+

As you can see, the Netherlands has earned far more medals since 1960 than any other country. In fact, it's earned more medals than numbers two and three combined. Now, news agencies have reported on the total number of medals, and numbers may slightly differ between reports. This is the number reported by the source, and unless I made some errors in scraping, parsing, or wrangling the data I'll stand by it. However, differences of 3 or 4 medals won't change the message that the Netherlands is absolutely dominant in this area of the Winter Olympics.

Let's look at how this distribution is spread out across the different Olympic events. We'll start in 1960 since that's when the women's tournament was added and I consider that the proper start of the Winter Olympics. Since 1960 we've had 16 Winter Olympics (the 17th is currently underway). Since not all games had the same number of medals (events were added in different years), I'll calculate the percentage of medals won per year.

+
+Show code + ``` r -data %>% +data |> filter(ranking %in% seq(3), - year >= 1960) %>% - group_by(year) %>% - mutate(total_medals = n()) %>% - group_by(year, country) %>% + year >= 1960) |> + group_by(year) |> + mutate(total_medals = n()) |> + group_by(year, country) |> summarise(medals_won = n(), - total_medals = first(total_medals)) %>% + total_medals = first(total_medals)) |> mutate(perc_won = medals_won / total_medals, perc_label = str_glue("{round(perc_won * 100)}%"), highlight_col = ifelse(country == "NED", "#FF9B00", "grey"), - country = tidytext::reorder_within(country, perc_won, year)) %>% - slice_max(perc_won, n = 5) %>% + country = tidytext::reorder_within(country, perc_won, year)) |> + slice_max(perc_won, n = 5) |> ggplot(aes(x = perc_won, y = country)) + geom_col(aes(fill = highlight_col)) + geom_text(aes(label = perc_label), family = "custom", @@ -392,21 +424,26 @@ data %>% panel.grid.major.y = element_blank()) ``` +
+

As you can see from the plot, of the 16 Olympic Games since 1960, the Netherlands has earned the largest share of medals 9 times (56.25%). It has topped the list every game since 1994 (shared in 1994 and 2002). In 2014, the Netherlands took home 64% of all medals. Due to ISU and IOC qualification rules it's technically impossible for a single country to take home all medals (e.g. a country cannot send three teams for the team pursuit). So there might be 36 medals available (from 12 events), but a single country can only take home 32. Since I didn't want to go into the (sometimes changing) qualification rules since 1960 I made it simple and calculated based on the total number of medals, not the total number available to a country.

But of course, not all medals are created equal. In Olympic rankings or medal tables, the order is determined by the number of gold medals first, then silver, then bronze. The total number of medals does not matter here. So a country with 2 gold medals and no other medals will be ranked above a country with 1 gold medal, 10 silver, and 15 bronze medals. So the Netherlands can win a lot of medals, but for rankings the color matters too. So let's create a medal table. Again, we'll only look at results from 1960. We'll calculate the number of medals each country won, then we'll fill in the blank spaces with the amazing `complete()` function. Since not all medals are equal, we'll add a multiplier and then calculate a theoretical score (where gold counts 10 times stronger than a silver etc.). Then we'll look at the top 10 countries and use `geom_point()` to create a table.

+
+Show code + ``` r -data %>% +data |> filter(year >= 1960, - ranking %in% seq(3)) %>% - group_by(country, ranking) %>% - summarise(n_medals = n()) %>% - ungroup() %>% - complete(country, ranking) %>% - replace_na(list(n_medals = 0)) %>% + ranking %in% seq(3)) |> + group_by(country, ranking) |> + summarise(n_medals = n()) |> + ungroup() |> + complete(country, ranking) |> + replace_na(list(n_medals = 0)) |> mutate(country_long = countrycode::countrycode(country, origin = "ioc", destination = "country.name"), country_long = case_when(str_detect(country, "URS") ~ "Soviet Union", str_detect(country, "GDR") ~ "East Germany", @@ -418,11 +455,11 @@ data %>% rank_mult = case_when(ranking == 1 ~ 100, ranking == 2 ~ 10, ranking == 3 ~ 1), - rank_score = n_medals * rank_mult) %>% - group_by(country) %>% - mutate(country_rank = sum(rank_score)) %>% - ungroup() %>% - slice_max(country_rank, n = 30) %>% + rank_score = n_medals * rank_mult) |> + group_by(country) |> + mutate(country_rank = sum(rank_score)) |> + ungroup() |> + slice_max(country_rank, n = 30) |> ggplot(aes(x = ranking, y = reorder(country_label, country_rank), fill = ranking_color, alpha = n_medals)) + geom_point(shape = 21, size = 10, stroke = 0, show.legend = FALSE) + @@ -441,18 +478,23 @@ data %>% axis.text.y = element_text(hjust = 1)) ``` +
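To see the scoring idea in isolation, here is a toy version of the `complete()` plus multiplier step. The country codes "AAA" and "BBB" and the medal counts are hypothetical and purely illustrative; only the multipliers (100, 10, 1) come from the code above.

``` r
library(tidyverse)

# Toy example, not the real data: "AAA" has 2 gold medals, "BBB" has
# 1 gold and 3 silver. complete() adds the missing ranking rows with
# 0 medals before the theoretical score is computed.
tribble(
  ~country, ~ranking, ~n_medals,
  "AAA",    1,        2,
  "BBB",    1,        1,
  "BBB",    2,        3
) |>
  complete(country, ranking = 1:3, fill = list(n_medals = 0)) |>
  mutate(rank_mult = case_when(ranking == 1 ~ 100,
                               ranking == 2 ~ 10,
                               ranking == 3 ~ 1),
         rank_score = n_medals * rank_mult) |>
  group_by(country) |>
  summarise(country_rank = sum(rank_score))
```

With these toy numbers the gold-heavy country scores 200 against 130 and sorts higher, which mirrors how `country_rank` orders the countries on the y-axis above.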
+

To show that a country is dominant in a particular competition it helps to show that a country can deliver not just one, but a few contenders for Olympic gold. The greatest display of strength for a country is to take home all medals in a single event, a so-called *podium sweep*. If a country can take home gold, silver, and bronze in a single event it may show they're competing mostly with each other. Now, to calculate this we can simply take the rankings, group by event and country, and count how often a single country took home three medals in a single event. For this we'll create a simple stacked barplot.

+
+Show code + ``` r -data %>% +data |> filter(year >= 1960, - ranking %in% seq(3)) %>% - group_by(year, distance, sex, country) %>% - count(year, distance, sex, country, name = "medals_won") %>% - filter(medals_won == 3) %>% - mutate(sweeps = medals_won / 3) %>% + ranking %in% seq(3)) |> + group_by(year, distance, sex, country) |> + count(year, distance, sex, country, name = "medals_won") |> + filter(medals_won == 3) |> + mutate(sweeps = medals_won / 3) |> ggplot(aes(x = year, y = sweeps, fill = country)) + geom_col(key_glyph = "point") + geom_hline(yintercept = 0) + @@ -478,23 +520,28 @@ data %>% panel.grid.minor = element_blank()) ``` +
+

As you might gather from this and the previous plot, the Winter Olympic Games from 2014 were a very good year for the Dutch speed skating team. That one year the Netherlands had *four* podium sweeps. For one of these four podium sweeps (the [women's 1500 m](https://en.wikipedia.org/wiki/Speed_skating_at_the_2014_Winter_Olympics_–_Women%27s_1500_metres)) the fourth place was also a Dutch woman (Marrit Leenstra), a first in the Winter Olympics.

+
+Show code + ``` r -data %>% - mutate(distance = fct_relevel(distance, ~ event_lims)) %>% +data |> + mutate(distance = fct_relevel(distance, ~ event_lims)) |> filter(str_detect(comment, "OR"), - distance != "Combined") %>% - group_by(distance, sex) %>% - arrange(year) %>% - mutate(no = row_number()) %>% + distance != "Combined") |> + group_by(distance, sex) |> + arrange(year) |> + mutate(no = row_number()) |> ggplot(aes(x = year, y = no, color = distance)) + geom_vline(xintercept = c(1940, 1944), linetype = "dashed", color = "grey92") + - geom_step(size = 1.5, alpha = 0.4, show.legend = FALSE) + + geom_step(linewidth = 1.5, alpha = 0.4, show.legend = FALSE) + geom_point(size = 4, alpha = 0.75, stroke = 0) + - ggrepel::geom_text_repel(data = . %>% filter(no == max(no)), + ggrepel::geom_text_repel(data = . |> filter(no == max(no)), aes(label = country), show.legend = FALSE, seed = 2, color = "#333333", size = 4, family = "custom", fontface = "bold") + @@ -520,21 +567,28 @@ data %>% panel.grid.minor = element_blank()) ``` +
+

Next, I want to highlight one athlete in particular. The Dutch team is a powerhouse of speed skating, but a team is still made up of individual athletes. And one of those athletes deserves some special attention: Ireen Wüst. She is one of the most successful Winter Olympic athletes ever and the most successful speed skater of all time. As of the time of writing (9/2/2022) she has won 6 gold, 5 silver, and 1 bronze medal across 5 Winter Olympic Games. She's the only Olympian (Winter or Summer) to win individual gold in 5 different Olympic Games. So let's look at her performance.

Let's extract all events where Ireen Wüst participated. One caveat here is that we can't just look for her name in the `athlete` column, because as we saw before, the team pursuits don't register individual names on the website. Lucky for us, Ireen Wüst participated in all team pursuit events (only held since 2006), so we'll extract all instances where the Dutch team pursuit team participated. Since the 2022 Olympics are already underway and Ireen has already won a gold medal in her first event, I'll add a row manually to include this data too.

+
+Show code + ``` r -data_wust <- data %>% +data_wust <- data |> filter(str_detect(athlete, "Ireen") | str_detect(title, "Women's Team pursuit") & - country == "NED") %>% + country == "NED") |> add_row(tibble(year = 2022, distance = "1500 m", sex = "Women", ranking = 1, - comment = "OR")) %>% + comment = "OR")) |> glimpse() ``` +
+ Rows: 18 Columns: 10 $ title "Women's 1000 m - Speed Skating Torino 2006 Winter Olympics",… @@ -550,23 +604,26 @@ data_wust <- data %>% So Ireen participated in 18 events across 5 Olympic Games. She participated in all events apart from the 500 m and the 5000 m. Now, let's see how often she'll take home a medal if she shows up at the start. For this we can calculate a win rate. Let's count per year how many medals she won, and then we can calculate a percentage and create a barplot. +
+Show code + ``` r -data_wust %>% - group_by(year, medals_won = ranking %in% seq(3)) %>% - mutate(medals_won = ifelse(medals_won, "medals_won", "medals_not_won")) %>% - count() %>% - ungroup() %>% - complete(year, medals_won, fill = list(n = 0)) %>% - pivot_wider(names_from = medals_won, values_from = n) %>% +data_wust |> + group_by(year, medals_won = ranking %in% seq(3)) |> + mutate(medals_won = ifelse(medals_won, "medals_won", "medals_not_won")) |> + count() |> + ungroup() |> + complete(year, medals_won, fill = list(n = 0)) |> + pivot_wider(names_from = medals_won, values_from = n) |> mutate(total_events = medals_won + medals_not_won, perc_won = medals_won / total_events, perc_won_label = str_glue("{round(perc_won * 100,1)}%"), perc_won_label = ifelse(year == 2022, str_glue("{perc_won_label}*"), perc_won_label), year = as_factor(year), - year = fct_rev(year)) %>% + year = fct_rev(year)) |> ggplot(aes(x = perc_won, y = year)) + geom_segment(aes(x = 0, xend = perc_won, yend = year), - size = 10, lineend = "round", color = "#FF9B00") + + linewidth = 10, lineend = "round", color = "#FF9B00") + geom_text(aes(label = perc_won_label), size = 4, family = "custom", hjust = 1) + labs(title = "**Will Ireen win a medal if she shows up?**", subtitle = "Of all the events Ireen Wüst participated in, how often did she win a medal (of any color)?
2022 Olympics is still ongoing, shown is win rate **so far***", @@ -584,22 +641,27 @@ data_wust %>% panel.grid.major.y = element_blank()) ``` +
+

With the caveat that Ireen has only participated in one event in 2022 (as of the time of writing, 9/2/2022), there has been one instance where she took home a medal in every single event she participated in. The Sochi Olympics in 2014 were successful for the Dutch team and for Ireen Wüst individually too.

Finally, we can also visualize the individual medals she won. Again, I'll take some artistic liberty here by creating a sort-of bar plot, but built with `geom_point()` in the shape and color of the medals.

+
+Show code + ``` r -data_wust %>% - filter(ranking %in% seq(3)) %>% +data_wust |> + filter(ranking %in% seq(3)) |> mutate(ranking_color = case_when(ranking == 1 ~ "#F8CC46", ranking == 2 ~ "#DFDFE7", ranking == 3 ~ "#D8B581"), - label = str_glue("{sex}'s {distance}")) %>% - group_by(year) %>% - arrange(ranking) %>% - mutate(y = row_number()) %>% + label = str_glue("{sex}'s {distance}")) |> + group_by(year) |> + arrange(ranking) |> + mutate(y = row_number()) |> ggplot(aes(x = year, y = -y)) + geom_point(aes(color = ranking_color), size = 12) + geom_text(aes(label = label), size = 4, family = "custom", hjust = 0.1) + @@ -626,28 +688,8 @@ data_wust %>% panel.grid = element_blank()) ``` +
+

Of course the Olympics are still ongoing, but I had a lot of fun collecting and visualizing this data. Again, not all numbers might correspond to official IOC records (as detailed [here](https://www.olympiandatabase.com/index.php?id=13738&L=1)), and I'll welcome any feedback on the code in this post. I'll use these posts as a creative outlet for data visualization ideas that my current professional work doesn't allow for. Since this is my own website and these posts aren't always very serious, I have some creative liberty. I enjoy writing these posts and they get the creative juices flowing. I hope that those not interested in speed skating at least found the data wrangling process and visualization useful. I enjoy reading other people's blogposts since I usually learn a new function or approach, so I hope I can do the same for others. The Winter Olympics happen every four years so I won't get much opportunity to do this again any time soon, but I might update this post later with the latest data. - - diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/index.qmd b/content/blog/2022-dutch-performance-olympic-speed-skating/index.qmd index 8ff24da..2294ff4 100644 --- a/content/blog/2022-dutch-performance-olympic-speed-skating/index.qmd +++ b/content/blog/2022-dutch-performance-olympic-speed-skating/index.qmd @@ -1,16 +1,13 @@ --- title: Dutch performance at Olympic speed skating -author: Daniel Roelfs -date: "2022-02-09" +date: 2022-02-09 +description: Dutch performance at Olympic speed skating slug: dutch-performance-at-olympic-speed-skating categories: - miscellaneous tags: - data visualization - R -description: "Dutch performance at Olympic speed skating" -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -28,6 +25,8 @@ First we'll load the packages, as usual, we'll use the `{tidyverse}` package. Fo ```{r} #| label: pkgs #| message: false +#| code-fold: true +#| code-summary: "Show code" library(tidyverse) library(ggtext) @@ -44,6 +43,8 @@ Before we've used the `{rvest}` package to scrape websites, but since then I've ```{python, code=readLines("./scrape_data.py")} #| label: py-scrape-script #| eval: false +#| code-fold: true +#| code-summary: "Show code" ``` I said before that the data is neatly organized, which is true except for a few instances. The individual events are simple tables with a ranking and time for each athlete. The team pursuits are a bit more complicated: since team pursuit events are a direct competition with qualifying rounds and knock-out rounds, the table is structured differently. In this case we're just interested in the final ranking (so we dismiss the semi- and quarter-finals). The final ranking is split across two columns, so we stitch those together. For some reason the men's team pursuit from 2018 lists only the medal winners, and not in the same format as the other team pursuit events. One advantage here is that they list individual skaters too, but since this is the only time individual skaters are listed among the team pursuits it's still not very useful. It just meant we have to create another few lines in the "if else" statement to parse the json. In the html, the podium places aren't denoted with a numeric list, but rather with a gold, silver, and bronze badge. Since the python script doesn't parse those, we add those back here (except for the 1928 Men's 10.000 m event, which was canceled due to bad weather).
@@ -51,36 +52,38 @@ I said before that the data is neatly organized, which is true except for a few ```{r} #| label: define-parsing-function #| message: false +#| code-fold: true +#| code-summary: "Show code" parse_json <- function(json) { - t_df <- jsonlite::fromJSON(json) %>% - as_tibble() %>% - unnest() %>% + t_df <- jsonlite::fromJSON(json) |> + as_tibble() |> + unnest() |> janitor::clean_names() %>% slice(seq(3, nrow(.) - 2)) if (str_detect(json, "Men's Team pursuit 2018")) { - t_df_out <- t_df %>% - filter(is.na(x0)) %>% + t_df_out <- t_df |> + filter(is.na(x0)) |> rename(ranking = x0, athlete = x1, country = x3, time = x4, - comment = x5) %>% + comment = x5) |> mutate(ranking = rep(seq(3), each = 4), - ranking = str_glue("{ranking}.")) %>% - fill(country, time, comment) %>% - group_by(ranking) %>% - mutate(athlete = toString(athlete)) %>% - ungroup() %>% - distinct() %>% + ranking = str_glue("{ranking}.")) |> + fill(country, time, comment) |> + group_by(ranking) |> + mutate(athlete = toString(athlete)) |> + ungroup() |> + distinct() |> select(-x2) } else if (str_detect(json, "Men's Team pursuit|Women's Team pursuit")) { - t_df_tp <- t_df %>% + t_df_tp <- t_df |> rename(ranking = x0, country = x1, time = x3, @@ -88,25 +91,25 @@ parse_json <- function(json) { ranking2 = x5, country2 = x6, time2 = x8, - comment2 = x9) %>% + comment2 = x9) |> select(seq(10), - -c(x2,x7)) %>% + -c(x2,x7)) |> slice(seq(0, min(which(nchar(ranking) > 3)) - 1)) - t_df_out <- bind_rows(t_df_tp %>% + t_df_out <- bind_rows(t_df_tp |> select(seq(4)), - t_df_tp %>% - select(seq(5,last_col())) %>% + t_df_tp |> + select(seq(5,last_col())) |> rename_with( ~ c("ranking","country","time","comment"))) } else { - t_df <- t_df %>% + t_df <- t_df |> rename(ranking = x0, athlete = x1, country = x3, time = x4, - comment = x5) %>% + comment = x5) |> select(-x2) if (str_detect(json, "Men's 10000 m 1928", negate = TRUE)) { @@ -131,23 +134,25 @@ Then we'll also create two vectors that contain the breaks we'll use later for t #| label: load-json-data #| message: false #| warning: false +#| code-fold: true +#| code-summary: "Show code" -data_load <- jsonlite::fromJSON("./all_events.json") %>% +data_load <- jsonlite::fromJSON("./all_events.json") |> mutate(df = map(table, ~ parse_json(.x))) -data <- data_load %>% - select(-table) %>% - unnest(df) %>% - group_by(year, distance, sex) %>% - mutate(ranking = ifelse(str_detect(ranking, "-"), NA, ranking)) %>% - fill(ranking) %>% - ungroup() %>% +data <- data_load |> + select(-table) |> + unnest(df) |> + group_by(year, distance, sex) |> + mutate(ranking = ifelse(str_detect(ranking, "-"), NA, ranking)) |> + fill(ranking) |> + ungroup() |> mutate(ranking = parse_number(ranking), ranking = ifelse(str_detect(time, "d"), NA, ranking), comment = ifelse(str_detect(time, "d"), time, comment), - time = parse_number(time)) %>% - filter(nchar(country) < 4) %>% - arrange(year) %>% + time = parse_number(time)) |> + filter(nchar(country) < 4) |> + arrange(year) |> glimpse() game_years <- unique(data$year) @@ -160,18 +165,20 @@ Then we can finally create some plots. 
Not all speed skating events were present ```{r} #| label: events-timeline #| message: false - -data %>% - select(year, distance, sex) %>% - distinct() %>% - mutate(distance = fct_relevel(distance, ~ event_lims)) %>% - group_by(distance, sex) %>% - arrange(year) %>% +#| code-fold: true +#| code-summary: "Show code" + +data |> + select(year, distance, sex) |> + distinct() |> + mutate(distance = fct_relevel(distance, ~ event_lims)) |> + group_by(distance, sex) |> + arrange(year) |> summarise(first_year = min(year), - last_year = max(year)) %>% + last_year = max(year)) |> ggplot(aes(x = first_year, y = distance)) + geom_segment(aes(xend = last_year, yend = distance, color = distance), - size = 8, lineend = "round", alpha = 0.4) + + linewidth = 8, lineend = "round", alpha = 0.4) + geom_point(data = . %>% filter(first_year == last_year), aes(color = distance), size = 8, alpha = 0.5) + @@ -195,22 +202,24 @@ Now, let's dive into the medals. First let's create a simple barplot with the to ```{r} #| label: n-medals #| warning: false +#| code-fold: true +#| code-summary: "Show code" -data %>% +data |> filter(year >= 1960, - ranking %in% seq(3)) %>% + ranking %in% seq(3)) |> mutate(country_long = countrycode::countrycode(country, origin = "ioc", destination = "country.name"), country_long = case_when(str_detect(country, "URS") ~ "Soviet Union", str_detect(country, "GDR") ~ "East Germany", str_detect(country, "FRG") ~ "West Germany", str_detect(country, "OAR") ~ "Olympic Athletes from Russia", TRUE ~ country_long), - country_label = str_glue("{country_long} ({country})")) %>% - count(country_label, sort = TRUE) %>% - mutate(highlight_col = ifelse(str_detect(country_label, "NED"), "#FF9B00", "grey")) %>% + country_label = str_glue("{country_long} ({country})")) |> + count(country_label, sort = TRUE) |> + mutate(highlight_col = ifelse(str_detect(country_label, "NED"), "#FF9B00", "grey")) |> ggplot(aes(x = n, y = reorder(country_label, n))) + geom_col(aes(fill = highlight_col)) + - geom_vline(xintercept = 0, size = 1) + + geom_vline(xintercept = 0, linewidth = 1) + geom_richtext(data = tibble(), aes(x = 24, y = 15, label = "Total number of medals won per country since 1960"), family = "custom", size = 7, fontface = "bold", hjust = 0, @@ -236,20 +245,22 @@ Let's look at how this distribution is spread out across the different Olympic e #| message: false #| fig-height: 6 #| out.width: 100% +#| code-fold: true +#| code-summary: "Show code" -data %>% +data |> filter(ranking %in% seq(3), - year >= 1960) %>% - group_by(year) %>% - mutate(total_medals = n()) %>% - group_by(year, country) %>% + year >= 1960) |> + group_by(year) |> + mutate(total_medals = n()) |> + group_by(year, country) |> summarise(medals_won = n(), - total_medals = first(total_medals)) %>% + total_medals = first(total_medals)) |> mutate(perc_won = medals_won / total_medals, perc_label = str_glue("{round(perc_won * 100)}%"), highlight_col = ifelse(country == "NED", "#FF9B00", "grey"), - country = tidytext::reorder_within(country, perc_won, year)) %>% - slice_max(perc_won, n = 5) %>% + country = tidytext::reorder_within(country, perc_won, year)) |> + slice_max(perc_won, n = 5) |> ggplot(aes(x = perc_won, y = country)) + geom_col(aes(fill = highlight_col)) + geom_text(aes(label = perc_label), family = "custom", @@ -278,15 +289,17 @@ But of course, not all medals are created equal. 
In Olympic rankings or medal ta #| label: medal-table #| message: false #| warning: false +#| code-fold: true +#| code-summary: "Show code" -data %>% +data |> filter(year >= 1960, - ranking %in% seq(3)) %>% - group_by(country, ranking) %>% - summarise(n_medals = n()) %>% - ungroup() %>% - complete(country, ranking) %>% - replace_na(list(n_medals = 0)) %>% + ranking %in% seq(3)) |> + group_by(country, ranking) |> + summarise(n_medals = n()) |> + ungroup() |> + complete(country, ranking) |> + replace_na(list(n_medals = 0)) |> mutate(country_long = countrycode::countrycode(country, origin = "ioc", destination = "country.name"), country_long = case_when(str_detect(country, "URS") ~ "Soviet Union", str_detect(country, "GDR") ~ "East Germany", @@ -298,11 +311,11 @@ data %>% rank_mult = case_when(ranking == 1 ~ 100, ranking == 2 ~ 10, ranking == 3 ~ 1), - rank_score = n_medals * rank_mult) %>% - group_by(country) %>% - mutate(country_rank = sum(rank_score)) %>% - ungroup() %>% - slice_max(country_rank, n = 30) %>% + rank_score = n_medals * rank_mult) |> + group_by(country) |> + mutate(country_rank = sum(rank_score)) |> + ungroup() |> + slice_max(country_rank, n = 30) |> ggplot(aes(x = ranking, y = reorder(country_label, country_rank), fill = ranking_color, alpha = n_medals)) + geom_point(shape = 21, size = 10, stroke = 0, show.legend = FALSE) + @@ -325,14 +338,16 @@ To show that a country is dominant in a particular competition it helps to show ```{r} #| label: podium-sweeps +#| code-fold: true +#| code-summary: "Show code" -data %>% +data |> filter(year >= 1960, - ranking %in% seq(3)) %>% - group_by(year, distance, sex, country) %>% - count(year, distance, sex, country, name = "medals_won") %>% - filter(medals_won == 3) %>% - mutate(sweeps = medals_won / 3) %>% + ranking %in% seq(3)) |> + group_by(year, distance, sex, country) |> + count(year, distance, sex, country, name = "medals_won") |> + filter(medals_won == 3) |> + mutate(sweeps = medals_won / 3) |> ggplot(aes(x = year, y = sweeps, fill = country)) + geom_col(key_glyph = "point") + geom_hline(yintercept = 0) + @@ -363,19 +378,21 @@ As you might gather, from this and the previous plot, the Winter Olympic Games f ```{r} #| label: olympic-records #| eval: false +#| code-fold: true +#| code-summary: "Show code" -data %>% - mutate(distance = fct_relevel(distance, ~ event_lims)) %>% +data |> + mutate(distance = fct_relevel(distance, ~ event_lims)) |> filter(str_detect(comment, "OR"), - distance != "Combined") %>% - group_by(distance, sex) %>% - arrange(year) %>% - mutate(no = row_number()) %>% + distance != "Combined") |> + group_by(distance, sex) |> + arrange(year) |> + mutate(no = row_number()) |> ggplot(aes(x = year, y = no, color = distance)) + geom_vline(xintercept = c(1940, 1944), linetype = "dashed", color = "grey92") + - geom_step(size = 1.5, alpha = 0.4, show.legend = FALSE) + + geom_step(linewidth = 1.5, alpha = 0.4, show.legend = FALSE) + geom_point(size = 4, alpha = 0.75, stroke = 0) + - ggrepel::geom_text_repel(data = . %>% filter(no == max(no)), + ggrepel::geom_text_repel(data = . |> filter(no == max(no)), aes(label = country), show.legend = FALSE, seed = 2, color = "#333333", size = 4, family = "custom", fontface = "bold") + @@ -405,16 +422,18 @@ Next, I want to highlight one athlete in particular. 
The Dutch team is a powerho ```{r} #| label: ireen-wust-data +#| code-fold: true +#| code-summary: "Show code" -data_wust <- data %>% +data_wust <- data |> filter(str_detect(athlete, "Ireen") | str_detect(title, "Women's Team pursuit") & - country == "NED") %>% + country == "NED") |> add_row(tibble(year = 2022, distance = "1500 m", sex = "Women", ranking = 1, - comment = "OR")) %>% + comment = "OR")) |> glimpse() ``` @@ -422,23 +441,25 @@ So Ireen participated in 18 events across 5 Olympic Games. She participated in a ```{r} #| label: wust-win-rate - -data_wust %>% - group_by(year, medals_won = ranking %in% seq(3)) %>% - mutate(medals_won = ifelse(medals_won, "medals_won", "medals_not_won")) %>% - count() %>% - ungroup() %>% - complete(year, medals_won, fill = list(n = 0)) %>% - pivot_wider(names_from = medals_won, values_from = n) %>% +#| code-fold: true +#| code-summary: "Show code" + +data_wust |> + group_by(year, medals_won = ranking %in% seq(3)) |> + mutate(medals_won = ifelse(medals_won, "medals_won", "medals_not_won")) |> + count() |> + ungroup() |> + complete(year, medals_won, fill = list(n = 0)) |> + pivot_wider(names_from = medals_won, values_from = n) |> mutate(total_events = medals_won + medals_not_won, perc_won = medals_won / total_events, perc_won_label = str_glue("{round(perc_won * 100,1)}%"), perc_won_label = ifelse(year == 2022, str_glue("{perc_won_label}*"), perc_won_label), year = as_factor(year), - year = fct_rev(year)) %>% + year = fct_rev(year)) |> ggplot(aes(x = perc_won, y = year)) + geom_segment(aes(x = 0, xend = perc_won, yend = year), - size = 10, lineend = "round", color = "#FF9B00") + + linewidth = 10, lineend = "round", color = "#FF9B00") + geom_text(aes(label = perc_won_label), size = 4, family = "custom", hjust = 1) + labs(title = "**Will Ireen win a medal if she shows up?**", subtitle = "Of all the events Ireen Wüst participated in, how often did she win a medal (of any color)?
2022 Olympics is still ongoing, shown is win rate **so far***", @@ -462,16 +483,18 @@ Finally, we can also visualize the individual medals she won. Again, I'll take s ```{r} #| label: wust-medals +#| code-fold: true +#| code-summary: "Show code" -data_wust %>% - filter(ranking %in% seq(3)) %>% +data_wust |> + filter(ranking %in% seq(3)) |> mutate(ranking_color = case_when(ranking == 1 ~ "#F8CC46", ranking == 2 ~ "#DFDFE7", ranking == 3 ~ "#D8B581"), - label = str_glue("{sex}'s {distance}")) %>% - group_by(year) %>% - arrange(ranking) %>% - mutate(y = row_number()) %>% + label = str_glue("{sex}'s {distance}")) |> + group_by(year) |> + arrange(ranking) |> + mutate(y = row_number()) |> ggplot(aes(x = year, y = -y)) + geom_point(aes(color = ranking_color), size = 12) + geom_text(aes(label = label), size = 4, family = "custom", hjust = 0.1) + @@ -499,26 +522,3 @@ data_wust %>% ``` Of course the Olympics are still ongoing, but I had a lot of fun collecting and visualizing this data. Again, not all numbers might correspond to official IOC records (as detailed [here](https://www.olympiandatabase.com/index.php?id=13738&L=1)), and I'll welcome any feedback on the code in this post. I'll use these posts as a creative outlet for data visualization ideas that my current professional work doesn't allow for. Since this is my own website and these posts aren't always very serious, I have some creative liberty. I enjoy writing these posts and they get the creative juices flowing. I hope that those not interested in speed skating at least found the data wrangling process and visualization useful. I enjoy reading other people's blogposts since I usually learn a new function or approach, so I hope I can do the same for others. The Winter Olympics happen every four years so I won't get much opportunity to do this again any time soon, but I might update this post later with the latest data.
- - - \ No newline at end of file diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/events-timeline-1.png b/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/events-timeline-1.png deleted file mode 100644 index 0f0cf43..0000000 Binary files a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/events-timeline-1.png and /dev/null differ diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/medal-table-1.png b/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/medal-table-1.png deleted file mode 100644 index d30e7f8..0000000 Binary files a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/medal-table-1.png and /dev/null differ diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/n-medals-1.png b/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/n-medals-1.png deleted file mode 100644 index 8d90fb8..0000000 Binary files a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/n-medals-1.png and /dev/null differ diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/n-medals-per-game-1.png b/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/n-medals-per-game-1.png deleted file mode 100644 index bbb5d2c..0000000 Binary files a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/n-medals-per-game-1.png and /dev/null differ diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/podium-sweeps-1.png b/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/podium-sweeps-1.png deleted file mode 100644 index ef6dc53..0000000 Binary files a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/podium-sweeps-1.png and /dev/null differ diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/wust-medals-1.png b/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/wust-medals-1.png deleted file mode 100644 index d36f74a..0000000 Binary files a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/wust-medals-1.png and /dev/null differ diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/wust-win-rate-1.png b/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/wust-win-rate-1.png deleted file mode 100644 index ebd1ac3..0000000 Binary files a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/figure-gfm/wust-win-rate-1.png and /dev/null differ diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/libs/quarto-ojs/esbuild-bundle.js b/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/libs/quarto-ojs/esbuild-bundle.js deleted file mode 100644 index 12aa00b..0000000 --- a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/libs/quarto-ojs/esbuild-bundle.js +++ /dev/null @@ -1,2464 +0,0 @@ -// quarto-ojs.js -import { - Interpreter as Interpreter2 -} from "https://cdn.skypack.dev/@alex.garcia/unofficial-observablehq-compiler@0.6.0-alpha.9"; -import { - Inspector as Inspector4, - Runtime as Runtime2, - RuntimeError as RuntimeError2 -} from 
"https://cdn.skypack.dev/@observablehq/runtime@4.18.3"; - -// stdlib.js -var e = {}; -var t = {}; -function n(e2) { - return new Function("d", "return {" + e2.map(function(e3, t2) { - return JSON.stringify(e3) + ": d[" + t2 + '] || ""'; - }).join(",") + "}"); -} -function r(e2) { - var t2 = Object.create(null), n2 = []; - return e2.forEach(function(e3) { - for (var r2 in e3) - r2 in t2 || n2.push(t2[r2] = r2); - }), n2; -} -function o(e2, t2) { - var n2 = e2 + "", r2 = n2.length; - return r2 < t2 ? new Array(t2 - r2 + 1).join(0) + n2 : n2; -} -function i(e2) { - var t2, n2 = e2.getUTCHours(), r2 = e2.getUTCMinutes(), i2 = e2.getUTCSeconds(), a2 = e2.getUTCMilliseconds(); - return isNaN(e2) ? "Invalid Date" : ((t2 = e2.getUTCFullYear()) < 0 ? "-" + o(-t2, 6) : t2 > 9999 ? "+" + o(t2, 6) : o(t2, 4)) + "-" + o(e2.getUTCMonth() + 1, 2) + "-" + o(e2.getUTCDate(), 2) + (a2 ? "T" + o(n2, 2) + ":" + o(r2, 2) + ":" + o(i2, 2) + "." + o(a2, 3) + "Z" : i2 ? "T" + o(n2, 2) + ":" + o(r2, 2) + ":" + o(i2, 2) + "Z" : r2 || n2 ? "T" + o(n2, 2) + ":" + o(r2, 2) + "Z" : ""); -} -function a(o2) { - var a2 = new RegExp('["' + o2 + "\n\r]"), s2 = o2.charCodeAt(0); - function u2(n2, r2) { - var o3, i2 = [], a3 = n2.length, u3 = 0, l3 = 0, c3 = a3 <= 0, f3 = false; - function d2() { - if (c3) - return t; - if (f3) - return f3 = false, e; - var r3, o4, i3 = u3; - if (n2.charCodeAt(i3) === 34) { - for (; u3++ < a3 && n2.charCodeAt(u3) !== 34 || n2.charCodeAt(++u3) === 34; ) - ; - return (r3 = u3) >= a3 ? c3 = true : (o4 = n2.charCodeAt(u3++)) === 10 ? f3 = true : o4 === 13 && (f3 = true, n2.charCodeAt(u3) === 10 && ++u3), n2.slice(i3 + 1, r3 - 1).replace(/""/g, '"'); - } - for (; u3 < a3; ) { - if ((o4 = n2.charCodeAt(r3 = u3++)) === 10) - f3 = true; - else if (o4 === 13) - f3 = true, n2.charCodeAt(u3) === 10 && ++u3; - else if (o4 !== s2) - continue; - return n2.slice(i3, r3); - } - return c3 = true, n2.slice(i3, a3); - } - for (n2.charCodeAt(a3 - 1) === 10 && --a3, n2.charCodeAt(a3 - 1) === 13 && --a3; (o3 = d2()) !== t; ) { - for (var h2 = []; o3 !== e && o3 !== t; ) - h2.push(o3), o3 = d2(); - r2 && (h2 = r2(h2, l3++)) == null || i2.push(h2); - } - return i2; - } - function l2(e2, t2) { - return e2.map(function(e3) { - return t2.map(function(t3) { - return f2(e3[t3]); - }).join(o2); - }); - } - function c2(e2) { - return e2.map(f2).join(o2); - } - function f2(e2) { - return e2 == null ? "" : e2 instanceof Date ? i(e2) : a2.test(e2 += "") ? '"' + e2.replace(/"/g, '""') + '"' : e2; - } - return { parse: function(e2, t2) { - var r2, o3, i2 = u2(e2, function(e3, i3) { - if (r2) - return r2(e3, i3 - 1); - o3 = e3, r2 = t2 ? 
function(e4, t3) { - var r3 = n(e4); - return function(n2, o4) { - return t3(r3(n2), o4, e4); - }; - }(e3, t2) : n(e3); - }); - return i2.columns = o3 || [], i2; - }, parseRows: u2, format: function(e2, t2) { - return t2 == null && (t2 = r(e2)), [t2.map(f2).join(o2)].concat(l2(e2, t2)).join("\n"); - }, formatBody: function(e2, t2) { - return t2 == null && (t2 = r(e2)), l2(e2, t2).join("\n"); - }, formatRows: function(e2) { - return e2.map(c2).join("\n"); - }, formatRow: c2, formatValue: f2 }; -} -var s = a(","); -var u = s.parse; -var l = s.parseRows; -var c = a(" "); -var f = c.parse; -var d = c.parseRows; -function h(e2) { - for (var t2 in e2) { - var n2, r2, o2 = e2[t2].trim(); - if (o2) - if (o2 === "true") - o2 = true; - else if (o2 === "false") - o2 = false; - else if (o2 === "NaN") - o2 = NaN; - else if (isNaN(n2 = +o2)) { - if (!(r2 = o2.match(/^([-+]\d{2})?\d{4}(-\d{2}(-\d{2})?)?(T\d{2}:\d{2}(:\d{2}(\.\d{3})?)?(Z|[-+]\d{2}:\d{2})?)?$/))) - continue; - m && r2[4] && !r2[7] && (o2 = o2.replace(/-/g, "/").replace(/T/, " ")), o2 = new Date(o2); - } else - o2 = n2; - else - o2 = null; - e2[t2] = o2; - } - return e2; -} -var m = new Date("2019-01-01T00:00").getHours() || new Date("2019-07-01T00:00").getHours(); -var p = new Map(); -var w = []; -var v = w.map; -var y = w.some; -var g = w.hasOwnProperty; -var b = "https://cdn.jsdelivr.net/npm/"; -var x = /^((?:@[^/@]+\/)?[^/@]+)(?:@([^/]+))?(?:\/(.*))?$/; -var j = /^\d+\.\d+\.\d+(-[\w-.+]+)?$/; -var E = /\.[^/]*$/; -var P = ["unpkg", "jsdelivr", "browser", "main"]; -var RequireError = class extends Error { - constructor(e2) { - super(e2); - } -}; -function C(e2) { - const t2 = x.exec(e2); - return t2 && { name: t2[1], version: t2[2], path: t2[3] }; -} -function A(e2) { - const t2 = `${b}${e2.name}${e2.version ? `@${e2.version}` : ""}/package.json`; - let n2 = p.get(t2); - return n2 || p.set(t2, n2 = fetch(t2).then((e3) => { - if (!e3.ok) - throw new RequireError("unable to load package.json"); - return e3.redirected && !p.has(e3.url) && p.set(e3.url, n2), e3.json(); - })), n2; -} -RequireError.prototype.name = RequireError.name; -var N = L(async function(e2, t2) { - if (e2.startsWith(b) && (e2 = e2.substring(b.length)), /^(\w+:)|\/\//i.test(e2)) - return e2; - if (/^[.]{0,2}\//i.test(e2)) - return new URL(e2, t2 == null ? location : t2).href; - if (!e2.length || /^[\s._]/.test(e2) || /\s$/.test(e2)) - throw new RequireError("illegal name"); - const n2 = C(e2); - if (!n2) - return `${b}${e2}`; - if (!n2.version && t2 != null && t2.startsWith(b)) { - const e3 = await A(C(t2.substring(b.length))); - n2.version = e3.dependencies && e3.dependencies[n2.name] || e3.peerDependencies && e3.peerDependencies[n2.name]; - } - if (n2.path && !E.test(n2.path) && (n2.path += ".js"), n2.path && n2.version && j.test(n2.version)) - return `${b}${n2.name}@${n2.version}/${n2.path}`; - const r2 = await A(n2); - return `${b}${r2.name}@${r2.version}/${n2.path || function(e3) { - for (const t3 of P) { - const n3 = e3[t3]; - if (typeof n3 == "string") - return E.test(n3) ? 
n3 : `${n3}.js`; - } - }(r2) || "index.js"}`; -}); -function L(e2) { - const t2 = new Map(), n2 = a2(null); - let r2, o2 = 0; - function i2(e3) { - if (typeof e3 != "string") - return e3; - let n3 = t2.get(e3); - return n3 || t2.set(e3, n3 = new Promise((t3, n4) => { - const i3 = document.createElement("script"); - i3.onload = () => { - try { - t3(w.pop()(a2(e3))); - } catch (e4) { - n4(new RequireError("invalid module")); - } - i3.remove(), o2--, o2 === 0 && (window.define = r2); - }, i3.onerror = () => { - n4(new RequireError("unable to load module")), i3.remove(), o2--, o2 === 0 && (window.define = r2); - }, i3.async = true, i3.src = e3, o2 === 0 && (r2 = window.define, window.define = k), o2++, document.head.appendChild(i3); - })), n3; - } - function a2(t3) { - return (n3) => Promise.resolve(e2(n3, t3)).then(i2); - } - function s2(e3) { - return arguments.length > 1 ? Promise.all(v.call(arguments, n2)).then(O) : n2(e3); - } - return s2.alias = function(t3) { - return L((n3, r3) => n3 in t3 && (r3 = null, typeof (n3 = t3[n3]) != "string") ? n3 : e2(n3, r3)); - }, s2.resolve = e2, s2; -} -function O(e2) { - const t2 = {}; - for (const n2 of e2) - for (const e3 in n2) - g.call(n2, e3) && (n2[e3] == null ? Object.defineProperty(t2, e3, { get: R(n2, e3) }) : t2[e3] = n2[e3]); - return t2; -} -function R(e2, t2) { - return () => e2[t2]; -} -function $2(e2) { - return (e2 += "") === "exports" || e2 === "module"; -} -function k(e2, t2, n2) { - const r2 = arguments.length; - r2 < 2 ? (n2 = e2, t2 = []) : r2 < 3 && (n2 = t2, t2 = typeof e2 == "string" ? [] : e2), w.push(y.call(t2, $2) ? (e3) => { - const r3 = {}, o2 = { exports: r3 }; - return Promise.all(v.call(t2, (t3) => (t3 += "") === "exports" ? r3 : t3 === "module" ? o2 : e3(t3))).then((e4) => (n2.apply(null, e4), o2.exports)); - } : (e3) => Promise.all(v.call(t2, e3)).then((e4) => typeof n2 == "function" ? 
n2.apply(null, e4) : n2)); -} -function T(e2, t2, n2) { - return { resolve: (r2 = n2) => `https://cdn.jsdelivr.net/npm/${e2}@${t2}/${r2}` }; -} -k.amd = {}; -var U = T("d3", "7.0.1", "dist/d3.min.js"); -var q = T("@observablehq/inputs", "0.9.1", "dist/inputs.min.js"); -var M = T("@observablehq/plot", "0.2.0", "dist/plot.umd.min.js"); -var S = T("@observablehq/graphviz", "0.2.1", "dist/graphviz.min.js"); -var _ = T("@observablehq/highlight.js", "2.0.0", "highlight.min.js"); -var D = T("@observablehq/katex", "0.11.1", "dist/katex.min.js"); -var F = T("lodash", "4.17.21", "lodash.min.js"); -var B = T("htl", "0.3.0", "dist/htl.min.js"); -var z = T("jszip", "3.7.1", "dist/jszip.min.js"); -var H = T("marked", "0.3.12", "marked.min.js"); -var W = T("sql.js", "1.6.1", "dist/sql-wasm.js"); -var I = T("vega", "5.20.2", "build/vega.min.js"); -var Z = T("vega-lite", "5.1.0", "build/vega-lite.min.js"); -var Q = T("vega-lite-api", "5.0.0", "build/vega-lite-api.min.js"); -var V = T("apache-arrow", "4.0.1", "Arrow.es2015.min.js"); -var J = T("arquero", "4.8.4", "dist/arquero.min.js"); -var X = T("topojson-client", "3.1.0", "dist/topojson-client.min.js"); -function Y(e2) { - const t2 = {}; - for (const [n2, r2] of e2) - t2[n2] = r2; - return t2; -} -async function G(e2) { - return (await e2(W.resolve()))({ locateFile: (e3) => W.resolve(`dist/${e3}`) }); -} -var SQLiteDatabaseClient = class { - constructor(e2) { - Object.defineProperties(this, { _db: { value: e2 } }); - } - static async open(e2) { - const [t2, n2] = await Promise.all([G(N), Promise.resolve(e2).then(K)]); - return new SQLiteDatabaseClient(new t2.Database(n2)); - } - async query(e2, t2) { - return await async function(e3, t3, n2) { - const [r2] = await e3.exec(t3, n2); - if (!r2) - return []; - const { columns: o2, values: i2 } = r2, a2 = i2.map((e4) => Y(e4.map((e5, t4) => [o2[t4], e5]))); - return a2.columns = o2, a2; - }(this._db, e2, t2); - } - async queryRow(e2, t2) { - return (await this.query(e2, t2))[0] || null; - } - async explain(e2, t2) { - return ee("pre", { className: "observablehq--inspect" }, [te((await this.query(`EXPLAIN QUERY PLAN ${e2}`, t2)).map((e3) => e3.detail).join("\n"))]); - } - async describe(e2) { - const t2 = await (e2 === void 0 ? this.query("SELECT name FROM sqlite_master WHERE type = 'table'") : this.query("SELECT * FROM pragma_table_info(?)", [e2])); - if (!t2.length) - throw new Error("Not found"); - const { columns: n2 } = t2; - return ee("table", { value: t2 }, [ee("thead", [ee("tr", n2.map((e3) => ee("th", [te(e3)])))]), ee("tbody", t2.map((e3) => ee("tr", n2.map((t3) => ee("td", [te(e3[t3])])))))]); - } -}; -function K(e2) { - return typeof e2 == "string" ? fetch(e2).then(K) : e2 instanceof Response || e2 instanceof Blob ? e2.arrayBuffer().then(K) : e2 instanceof ArrayBuffer ? new Uint8Array(e2) : e2; -} -function ee(e2, t2, n2) { - arguments.length === 2 && (n2 = t2, t2 = void 0); - const r2 = document.createElement(e2); - if (t2 !== void 0) - for (const e3 in t2) - r2[e3] = t2[e3]; - if (n2 !== void 0) - for (const e3 of n2) - r2.appendChild(e3); - return r2; -} -function te(e2) { - return document.createTextNode(e2); -} -async function ne(e2) { - const t2 = await fetch(await e2.url()); - if (!t2.ok) - throw new Error(`Unable to load file: ${e2.name}`); - return t2; -} -async function re(e2, t2, { array: n2 = false, typed: r2 = false } = {}) { - const o2 = await e2.text(); - return (t2 === " " ? n2 ? d : f : n2 ? 
l : u)(o2, r2 && h); -} -var oe = class { - constructor(e2) { - Object.defineProperty(this, "name", { value: e2, enumerable: true }); - } - async blob() { - return (await ne(this)).blob(); - } - async arrayBuffer() { - return (await ne(this)).arrayBuffer(); - } - async text() { - return (await ne(this)).text(); - } - async json() { - return (await ne(this)).json(); - } - async stream() { - return (await ne(this)).body; - } - async csv(e2) { - return re(this, ",", e2); - } - async tsv(e2) { - return re(this, " ", e2); - } - async image() { - const e2 = await this.url(); - return new Promise((t2, n2) => { - const r2 = new Image(); - new URL(e2, document.baseURI).origin !== new URL(location).origin && (r2.crossOrigin = "anonymous"), r2.onload = () => t2(r2), r2.onerror = () => n2(new Error(`Unable to load file: ${this.name}`)), r2.src = e2; - }); - } - async arrow() { - const [e2, t2] = await Promise.all([N(V.resolve()), ne(this)]); - return e2.Table.from(t2); - } - async sqlite() { - return SQLiteDatabaseClient.open(ne(this)); - } - async zip() { - const [e2, t2] = await Promise.all([N(z.resolve()), this.arrayBuffer()]); - return new ZipArchive(await e2.loadAsync(t2)); - } - async xml(e2 = "application/xml") { - return new DOMParser().parseFromString(await this.text(), e2); - } - async html() { - return this.xml("text/html"); - } -}; -var FileAttachment = class extends oe { - constructor(e2, t2) { - super(t2), Object.defineProperty(this, "_url", { value: e2 }); - } - async url() { - return await this._url + ""; - } -}; -function ie(e2) { - throw new Error(`File not found: ${e2}`); -} -function ae(e2) { - return Object.assign((t2) => { - const n2 = e2(t2 += ""); - if (n2 == null) - throw new Error(`File not found: ${t2}`); - return new FileAttachment(n2, t2); - }, { prototype: FileAttachment.prototype }); -} -var ZipArchive = class { - constructor(e2) { - Object.defineProperty(this, "_", { value: e2 }), this.filenames = Object.keys(e2.files).filter((t2) => !e2.files[t2].dir); - } - file(e2) { - const t2 = this._.file(e2 += ""); - if (!t2 || t2.dir) - throw new Error(`file not found: ${e2}`); - return new ZipArchiveEntry(t2); - } -}; -var ZipArchiveEntry = class extends oe { - constructor(e2) { - super(e2.name), Object.defineProperty(this, "_", { value: e2 }), Object.defineProperty(this, "_url", { writable: true }); - } - async url() { - return this._url || (this._url = this.blob().then(URL.createObjectURL)); - } - async blob() { - return this._.async("blob"); - } - async arrayBuffer() { - return this._.async("arraybuffer"); - } - async text() { - return this._.async("text"); - } - async json() { - return JSON.parse(await this.text()); - } -}; -var se = { math: "http://www.w3.org/1998/Math/MathML", svg: "http://www.w3.org/2000/svg", xhtml: "http://www.w3.org/1999/xhtml", xlink: "http://www.w3.org/1999/xlink", xml: "http://www.w3.org/XML/1998/namespace", xmlns: "http://www.w3.org/2000/xmlns/" }; -var ue = 0; -function le(e2) { - this.id = e2, this.href = new URL(`#${e2}`, location) + ""; -} -le.prototype.toString = function() { - return "url(" + this.href + ")"; -}; -var ce = { canvas: function(e2, t2) { - var n2 = document.createElement("canvas"); - return n2.width = e2, n2.height = t2, n2; -}, context2d: function(e2, t2, n2) { - n2 == null && (n2 = devicePixelRatio); - var r2 = document.createElement("canvas"); - r2.width = e2 * n2, r2.height = t2 * n2, r2.style.width = e2 + "px"; - var o2 = r2.getContext("2d"); - return o2.scale(n2, n2), o2; -}, download: function(e2, t2 = "untitled", n2 = 
"Save") { - const r2 = document.createElement("a"), o2 = r2.appendChild(document.createElement("button")); - async function i2() { - await new Promise(requestAnimationFrame), URL.revokeObjectURL(r2.href), r2.removeAttribute("href"), o2.textContent = n2, o2.disabled = false; - } - return o2.textContent = n2, r2.download = t2, r2.onclick = async (t3) => { - if (o2.disabled = true, r2.href) - return i2(); - o2.textContent = "Saving\u2026"; - try { - const t4 = await (typeof e2 == "function" ? e2() : e2); - o2.textContent = "Download", r2.href = URL.createObjectURL(t4); - } catch (e3) { - o2.textContent = n2; - } - if (t3.eventPhase) - return i2(); - o2.disabled = false; - }, r2; -}, element: function(e2, t2) { - var n2, r2 = e2 += "", o2 = r2.indexOf(":"); - o2 >= 0 && (r2 = e2.slice(0, o2)) !== "xmlns" && (e2 = e2.slice(o2 + 1)); - var i2 = se.hasOwnProperty(r2) ? document.createElementNS(se[r2], e2) : document.createElement(e2); - if (t2) - for (var a2 in t2) - o2 = (r2 = a2).indexOf(":"), n2 = t2[a2], o2 >= 0 && (r2 = a2.slice(0, o2)) !== "xmlns" && (a2 = a2.slice(o2 + 1)), se.hasOwnProperty(r2) ? i2.setAttributeNS(se[r2], a2, n2) : i2.setAttribute(a2, n2); - return i2; -}, input: function(e2) { - var t2 = document.createElement("input"); - return e2 != null && (t2.type = e2), t2; -}, range: function(e2, t2, n2) { - arguments.length === 1 && (t2 = e2, e2 = null); - var r2 = document.createElement("input"); - return r2.min = e2 = e2 == null ? 0 : +e2, r2.max = t2 = t2 == null ? 1 : +t2, r2.step = n2 == null ? "any" : n2 = +n2, r2.type = "range", r2; -}, select: function(e2) { - var t2 = document.createElement("select"); - return Array.prototype.forEach.call(e2, function(e3) { - var n2 = document.createElement("option"); - n2.value = n2.textContent = e3, t2.appendChild(n2); - }), t2; -}, svg: function(e2, t2) { - var n2 = document.createElementNS("http://www.w3.org/2000/svg", "svg"); - return n2.setAttribute("viewBox", [0, 0, e2, t2]), n2.setAttribute("width", e2), n2.setAttribute("height", t2), n2; -}, text: function(e2) { - return document.createTextNode(e2); -}, uid: function(e2) { - return new le("O-" + (e2 == null ? "" : e2 + "-") + ++ue); -} }; -var fe = { buffer: function(e2) { - return new Promise(function(t2, n2) { - var r2 = new FileReader(); - r2.onload = function() { - t2(r2.result); - }, r2.onerror = n2, r2.readAsArrayBuffer(e2); - }); -}, text: function(e2) { - return new Promise(function(t2, n2) { - var r2 = new FileReader(); - r2.onload = function() { - t2(r2.result); - }, r2.onerror = n2, r2.readAsText(e2); - }); -}, url: function(e2) { - return new Promise(function(t2, n2) { - var r2 = new FileReader(); - r2.onload = function() { - t2(r2.result); - }, r2.onerror = n2, r2.readAsDataURL(e2); - }); -} }; -function de() { - return this; -} -function he(e2, t2) { - let n2 = false; - if (typeof t2 != "function") - throw new Error("dispose is not a function"); - return { [Symbol.iterator]: de, next: () => n2 ? { done: true } : (n2 = true, { done: false, value: e2 }), return: () => (n2 = true, t2(e2), { done: true }), throw: () => ({ done: n2 = true }) }; -} -function me(e2) { - let t2, n2, r2 = false; - const o2 = e2(function(e3) { - n2 ? (n2(e3), n2 = null) : r2 = true; - return t2 = e3; - }); - if (o2 != null && typeof o2 != "function") - throw new Error(typeof o2.then == "function" ? 
"async initializers are not supported" : "initializer returned something, but not a dispose function"); - return { [Symbol.iterator]: de, throw: () => ({ done: true }), return: () => (o2 != null && o2(), { done: true }), next: function() { - return { done: false, value: r2 ? (r2 = false, Promise.resolve(t2)) : new Promise((e3) => n2 = e3) }; - } }; -} -function pe(e2) { - switch (e2.type) { - case "range": - case "number": - return e2.valueAsNumber; - case "date": - return e2.valueAsDate; - case "checkbox": - return e2.checked; - case "file": - return e2.multiple ? e2.files : e2.files[0]; - case "select-multiple": - return Array.from(e2.selectedOptions, (e3) => e3.value); - default: - return e2.value; - } -} -var we = { disposable: he, filter: function* (e2, t2) { - for (var n2, r2 = -1; !(n2 = e2.next()).done; ) - t2(n2.value, ++r2) && (yield n2.value); -}, input: function(e2) { - return me(function(t2) { - var n2 = function(e3) { - switch (e3.type) { - case "button": - case "submit": - case "checkbox": - return "click"; - case "file": - return "change"; - default: - return "input"; - } - }(e2), r2 = pe(e2); - function o2() { - t2(pe(e2)); - } - return e2.addEventListener(n2, o2), r2 !== void 0 && t2(r2), function() { - e2.removeEventListener(n2, o2); - }; - }); -}, map: function* (e2, t2) { - for (var n2, r2 = -1; !(n2 = e2.next()).done; ) - yield t2(n2.value, ++r2); -}, observe: me, queue: function(e2) { - let t2; - const n2 = [], r2 = e2(function(e3) { - n2.push(e3), t2 && (t2(n2.shift()), t2 = null); - return e3; - }); - if (r2 != null && typeof r2 != "function") - throw new Error(typeof r2.then == "function" ? "async initializers are not supported" : "initializer returned something, but not a dispose function"); - return { [Symbol.iterator]: de, throw: () => ({ done: true }), return: () => (r2 != null && r2(), { done: true }), next: function() { - return { done: false, value: n2.length ? Promise.resolve(n2.shift()) : new Promise((e3) => t2 = e3) }; - } }; -}, range: function* (e2, t2, n2) { - e2 = +e2, t2 = +t2, n2 = (o2 = arguments.length) < 2 ? (t2 = e2, e2 = 0, 1) : o2 < 3 ? 1 : +n2; - for (var r2 = -1, o2 = 0 | Math.max(0, Math.ceil((t2 - e2) / n2)); ++r2 < o2; ) - yield e2 + r2 * n2; -}, valueAt: function(e2, t2) { - if (!(!isFinite(t2 = +t2) || t2 < 0 || t2 != t2 | 0)) { - for (var n2, r2 = -1; !(n2 = e2.next()).done; ) - if (++r2 === t2) - return n2.value; - } -}, worker: function(e2) { - const t2 = URL.createObjectURL(new Blob([e2], { type: "text/javascript" })), n2 = new Worker(t2); - return he(n2, () => { - n2.terminate(), URL.revokeObjectURL(t2); - }); -} }; -function ve(e2, t2) { - return function(n2) { - var r2, o2, i2, a2, s2, u2, l2, c2, f2 = n2[0], d2 = [], h2 = null, m2 = -1; - for (s2 = 1, u2 = arguments.length; s2 < u2; ++s2) { - if ((r2 = arguments[s2]) instanceof Node) - d2[++m2] = r2, f2 += ""; - else if (Array.isArray(r2)) { - for (l2 = 0, c2 = r2.length; l2 < c2; ++l2) - (o2 = r2[l2]) instanceof Node ? (h2 === null && (d2[++m2] = h2 = document.createDocumentFragment(), f2 += ""), h2.appendChild(o2)) : (h2 = null, f2 += o2); - h2 = null; - } else - f2 += r2; - f2 += n2[s2]; - } - if (h2 = e2(f2), ++m2 > 0) { - for (i2 = new Array(m2), a2 = document.createTreeWalker(h2, NodeFilter.SHOW_COMMENT, null, false); a2.nextNode(); ) - o2 = a2.currentNode, /^o:/.test(o2.nodeValue) && (i2[+o2.nodeValue.slice(2)] = o2); - for (s2 = 0; s2 < m2; ++s2) - (o2 = i2[s2]) && o2.parentNode.replaceChild(d2[s2], o2); - } - return h2.childNodes.length === 1 ? 
h2.removeChild(h2.firstChild) : h2.nodeType === 11 ? ((o2 = t2()).appendChild(h2), o2) : h2; - }; -} -var ye = ve(function(e2) { - var t2 = document.createElement("template"); - return t2.innerHTML = e2.trim(), document.importNode(t2.content, true); -}, function() { - return document.createElement("span"); -}); -function ge(e2) { - let t2; - Object.defineProperties(this, { generator: { value: me((e3) => { - t2 = e3; - }) }, value: { get: () => e2, set: (n2) => t2(e2 = n2) } }), e2 !== void 0 && t2(e2); -} -function* be() { - for (; ; ) - yield Date.now(); -} -var xe = new Map(); -function je(e2, t2) { - var n2; - return (n2 = xe.get(e2 = +e2)) ? n2.then(() => t2) : (n2 = Date.now()) >= e2 ? Promise.resolve(t2) : function(e3, t3) { - var n3 = new Promise(function(n4) { - xe.delete(t3); - var r2 = t3 - e3; - if (!(r2 > 0)) - throw new Error("invalid time"); - if (r2 > 2147483647) - throw new Error("too long to wait"); - setTimeout(n4, r2); - }); - return xe.set(t3, n3), n3; - }(n2, e2).then(() => t2); -} -var Ee = { delay: function(e2, t2) { - return new Promise(function(n2) { - setTimeout(function() { - n2(t2); - }, e2); - }); -}, tick: function(e2, t2) { - return je(Math.ceil((Date.now() + 1) / e2) * e2, t2); -}, when: je }; -function Pe(e2, t2) { - if (/^(\w+:)|\/\//i.test(e2)) - return e2; - if (/^[.]{0,2}\//i.test(e2)) - return new URL(e2, t2 == null ? location : t2).href; - if (!e2.length || /^[\s._]/.test(e2) || /\s$/.test(e2)) - throw new Error("illegal name"); - return "https://unpkg.com/" + e2; -} -function Ce(e2) { - return e2 == null ? N : L(e2); -} -var Ae = ve(function(e2) { - var t2 = document.createElementNS("http://www.w3.org/2000/svg", "g"); - return t2.innerHTML = e2.trim(), t2; -}, function() { - return document.createElementNS("http://www.w3.org/2000/svg", "g"); -}); -var Ne = String.raw; -function Le() { - return me(function(e2) { - var t2 = e2(document.body.clientWidth); - function n2() { - var n3 = document.body.clientWidth; - n3 !== t2 && e2(t2 = n3); - } - return window.addEventListener("resize", n2), function() { - window.removeEventListener("resize", n2); - }; - }); -} -var Oe = Object.assign(function(e2) { - const t2 = Ce(e2); - var n2; - Object.defineProperties(this, (n2 = { FileAttachment: () => ie, Arrow: () => t2(V.resolve()), Inputs: () => t2(q.resolve()), Mutable: () => ge, Plot: () => t2(M.resolve()), SQLite: () => G(t2), SQLiteDatabaseClient: () => SQLiteDatabaseClient, _: () => t2(F.resolve()), aq: () => t2.alias({ "apache-arrow": V.resolve() })(J.resolve()), d3: () => t2(U.resolve()), dot: () => t2(S.resolve()), htl: () => t2(B.resolve()), html: () => ye, md: () => function(e3) { - return e3(H.resolve()).then(function(t3) { - return ve(function(n3) { - var r2 = document.createElement("div"); - r2.innerHTML = t3(n3, { langPrefix: "" }).trim(); - var o2 = r2.querySelectorAll("pre code[class]"); - return o2.length > 0 && e3(_.resolve()).then(function(t4) { - o2.forEach(function(n4) { - function r3() { - t4.highlightBlock(n4), n4.parentNode.classList.add("observablehq--md-pre"); - } - t4.getLanguage(n4.className) ? 
r3() : e3(_.resolve("async-languages/index.js")).then((r4) => { - if (r4.has(n4.className)) - return e3(_.resolve("async-languages/" + r4.get(n4.className))).then((e4) => { - t4.registerLanguage(n4.className, e4); - }); - }).then(r3, r3); - }); - }), r2; - }, function() { - return document.createElement("div"); - }); - }); - }(t2), now: be, require: () => t2, resolve: () => Pe, svg: () => Ae, tex: () => function(e3) { - return Promise.all([e3(D.resolve()), (t3 = D.resolve("dist/katex.min.css"), new Promise(function(e4, n3) { - var r2 = document.createElement("link"); - r2.rel = "stylesheet", r2.href = t3, r2.onerror = n3, r2.onload = e4, document.head.appendChild(r2); - }))]).then(function(e4) { - var t4 = e4[0], n3 = r2(); - function r2(e5) { - return function() { - var n4 = document.createElement("div"); - return t4.render(Ne.apply(String, arguments), n4, e5), n4.removeChild(n4.firstChild); - }; - } - return n3.options = r2, n3.block = r2({ displayMode: true }), n3; - }); - var t3; - }(t2), topojson: () => t2(X.resolve()), vl: () => async function(e3) { - const [t3, n3, r2] = await Promise.all([I, Z, Q].map((t4) => e3(t4.resolve()))); - return r2.register(t3, n3); - }(t2), width: Le, DOM: ce, Files: fe, Generators: we, Promises: Ee }, Y(Object.entries(n2).map(Re)))); -}, { resolve: N.resolve }); -function Re([e2, t2]) { - return [e2, { value: t2, writable: true, enumerable: true }]; -} - -// pandoc-code-decorator.js -var PandocCodeDecorator = class { - constructor(node) { - this._node = node; - this._spans = []; - this.normalizeCodeRange(); - this.initializeEntryPoints(); - } - normalizeCodeRange() { - const n2 = this._node; - const lines = n2.querySelectorAll("code > span"); - for (const line of lines) { - Array.from(line.childNodes).filter((n22) => n22.nodeType === n22.TEXT_NODE).forEach((n22) => { - const newSpan = document.createElement("span"); - newSpan.textContent = n22.wholeText; - n22.replaceWith(newSpan); - }); - } - } - initializeEntryPoints() { - const lines = this._node.querySelectorAll("code > span"); - let result = []; - let offset = this._node.parentElement.dataset.sourceOffset && -Number(this._node.parentElement.dataset.sourceOffset) || 0; - for (const line of lines) { - let lineNumber = Number(line.id.split("-").pop()); - let column = 1; - Array.from(line.childNodes).filter((n2) => n2.nodeType === n2.ELEMENT_NODE && n2.nodeName === "SPAN").forEach((n2) => { - result.push({ - offset, - line: lineNumber, - column, - node: n2 - }); - offset += n2.textContent.length; - column += n2.textContent.length; - }); - offset += 1; - } - this._elementEntryPoints = result; - } - locateEntry(offset) { - let candidate; - if (offset === Infinity) - return void 0; - for (let i2 = 0; i2 < this._elementEntryPoints.length; ++i2) { - const entry = this._elementEntryPoints[i2]; - if (entry.offset > offset) { - return { entry: candidate, index: i2 - 1 }; - } - candidate = entry; - } - if (offset < candidate.offset + candidate.node.textContent.length) { - return { entry: candidate, index: this._elementEntryPoints.length - 1 }; - } else { - return void 0; - } - } - offsetToLineColumn(offset) { - let entry = this.locateEntry(offset); - if (entry === void 0) { - const entries = this._elementEntryPoints; - const last = entries[entries.length - 1]; - return { - line: last.line, - column: last.column + Math.min(last.node.textContent.length, offset - last.offset) - }; - } - return { - line: entry.entry.line, - column: entry.entry.column + offset - entry.entry.offset - }; - } - *spanSelection(start, 
end) { - this.ensureExactSpan(start, end); - const startEntry = this.locateEntry(start); - const endEntry = this.locateEntry(end); - if (startEntry === void 0) { - return; - } - const startIndex = startEntry.index; - const endIndex = endEntry && endEntry.index || this._elementEntryPoints.length; - for (let i2 = startIndex; i2 < endIndex; ++i2) { - yield this._elementEntryPoints[i2]; - } - } - decorateSpan(start, end, classes) { - for (const entryPoint of this.spanSelection(start, end)) { - for (const cssClass of classes) { - entryPoint.node.classList.add(cssClass); - } - } - } - clearSpan(start, end, classes) { - for (const entryPoint of this.spanSelection(start, end)) { - for (const cssClass of classes) { - entryPoint.node.classList.remove(cssClass); - } - } - } - ensureExactSpan(start, end) { - const splitEntry = (entry, offset) => { - const newSpan = document.createElement("span"); - for (const cssClass of entry.node.classList) { - newSpan.classList.add(cssClass); - } - const beforeText = entry.node.textContent.slice(0, offset - entry.offset); - const afterText = entry.node.textContent.slice(offset - entry.offset); - entry.node.textContent = beforeText; - newSpan.textContent = afterText; - entry.node.after(newSpan); - this._elementEntryPoints.push({ - column: entry.column + offset - entry.offset, - line: entry.line, - node: newSpan, - offset - }); - this._elementEntryPoints.sort((a2, b2) => a2.offset - b2.offset); - }; - const startEntry = this.locateEntry(start); - if (startEntry !== void 0 && startEntry.entry.offset != start) { - splitEntry(startEntry.entry, start); - } - const endEntry = this.locateEntry(end); - if (endEntry !== void 0 && endEntry.entry.offset !== end) { - splitEntry(endEntry.entry, end); - } - } - clearSpan(start, end, classes) { - this.ensureExactSpan(start, end); - const startEntry = this.locateEntry(start); - const endEntry = this.locateEntry(end); - if (startEntry === void 0) { - return; - } - const startIndex = startEntry.index; - const endIndex = endEntry && endEntry.index || this._elementEntryPoints.length; - for (let i2 = startIndex; i2 < endIndex; ++i2) { - for (const cssClass of classes) { - this._elementEntryPoints[i2].node.classList.remove(cssClass); - } - } - } -}; - -// quarto-observable-shiny.js -import { - Inspector as Inspector2 -} from "https://cdn.skypack.dev/@observablehq/runtime@4.18.3"; -import { - button -} from "https://cdn.skypack.dev/@observablehq/inputs@0.10.4"; - -// quarto-inspector.js -import { - Inspector -} from "https://cdn.skypack.dev/@observablehq/runtime@4.18.3"; -var QuartoInspector = class extends Inspector { - constructor(node, cellAst) { - super(node); - this._cellAst = cellAst; - } - rejected(error) { - return super.rejected(error); - } -}; - -// quarto-observable-shiny.js -var shinyInputVars = new Set(); -var shinyInitialValue = {}; -function extendObservableStdlib(lib) { - class NamedVariableOutputBinding extends Shiny.OutputBinding { - constructor(name, change) { - super(); - this._name = name; - this._change = change; - } - find(scope) { - return $(scope).find("#" + this._name); - } - getId(el) { - return el.id; - } - renderValue(_el, data) { - this._change(data); - } - onValueError(el, err) { - const group = `Shiny error in ${el.id}`; - console.groupCollapsed(`%c${group}`, "color:red"); - console.log(`${err.message}`); - console.log(`call: ${err.call}`); - console.groupEnd(group); - } - } - $(document).on("shiny:connected", function(_event) { - Object.entries(shinyInitialValue).map(([k2, v2]) => { - 
window.Shiny.setInputValue(k2, v2); - }); - shinyInitialValue = {}; - }); - lib.shinyInput = function() { - return (name) => { - shinyInputVars.add(name); - window._ojs.ojsConnector.mainModule.value(name).then((val) => { - if (window.Shiny && window.Shiny.setInputValue) { - window.Shiny.setInputValue(name, val); - } else { - shinyInitialValue[name] = val; - } - }); - }; - }; - lib.shinyOutput = function() { - return function(name) { - const dummySpan = document.createElement("div"); - dummySpan.id = name; - dummySpan.classList.add("ojs-variable-writer"); - window._ojs.shinyElementRoot.appendChild(dummySpan); - return lib.Generators.observe((change) => { - Shiny.outputBindings.register(new NamedVariableOutputBinding(name, change)); - }); - }; - }; -} -var ShinyInspector = class extends QuartoInspector { - constructor(node) { - super(node); - } - fulfilled(value, name) { - if (shinyInputVars.has(name) && window.Shiny) { - if (window.Shiny.setInputValue === void 0) { - shinyInitialValue[name] = value; - } else { - window.Shiny.setInputValue(name, value); - } - } - return super.fulfilled(value, name); - } -}; -var { Generators } = new Oe(); -var OjsButtonInput = class { - find(_scope) { - return document.querySelectorAll(".ojs-inputs-button"); - } - init(el, change) { - const btn = button(el.textContent); - el.innerHTML = ""; - el.appendChild(btn); - const obs = Generators.input(el.firstChild); - (async function() { - await obs.next().value; - for (const x2 of obs) { - change(await x2); - } - })(); - return { - onSetValue: (_value) => { - }, - dispose: () => { - obs.return(); - } - }; - } -}; -function initOjsShinyRuntime() { - const valueSym = Symbol("value"); - const callbackSym = Symbol("callback"); - const instanceSym = Symbol("instance"); - class BindingAdapter extends Shiny.InputBinding { - constructor(x2) { - super(); - this.x = x2; - } - find(scope) { - const matches = this.x.find(scope); - return $(matches); - } - getId(el) { - if (this.x.getId) { - return this.x.getId(el); - } else { - return super.getId(el); - } - } - initialize(el) { - const changeHandler = (value) => { - el[valueSym] = value; - el[callbackSym](); - }; - const instance = this.x.init(el, changeHandler); - el[instanceSym] = instance; - } - getValue(el) { - return el[valueSym]; - } - setValue(el, value) { - el[valueSym] = value; - el[instanceSym].onSetValue(value); - } - subscribe(el, callback) { - el[callbackSym] = callback; - } - unsubscribe(el) { - el[instanceSym].dispose(); - } - } - class InspectorOutputBinding extends Shiny.OutputBinding { - find(scope) { - return $(scope).find(".observablehq-inspector"); - } - getId(el) { - return el.id; - } - renderValue(el, data) { - new Inspector2(el).fulfilled(data); - } - } - if (window.Shiny === void 0) { - console.warn("Shiny runtime not found; Shiny features won't work."); - return false; - } - Shiny.inputBindings.register(new BindingAdapter(new OjsButtonInput())); - Shiny.outputBindings.register(new InspectorOutputBinding()); - Shiny.addCustomMessageHandler("ojs-export", ({ name }) => { - window._ojs.ojsConnector.mainModule.redefine(name, window._ojs.ojsConnector.library.shinyOutput()(name)); - Shiny.bindAll(document.body); - }); - return true; -} - -// ojs-connector.js -import { - Interpreter -} from "https://cdn.skypack.dev/@alex.garcia/unofficial-observablehq-compiler@0.6.0-alpha.9"; -import { - Inspector as Inspector3, - Runtime, - RuntimeError -} from "https://cdn.skypack.dev/@observablehq/runtime@4.18.3"; - -// observablehq-parser.js -import { getLineInfo, 
TokContext, tokTypes as tt, Parser } from "https://cdn.skypack.dev/acorn@7"; -import { ancestor } from "https://cdn.skypack.dev/acorn-walk@7"; -import { make } from "https://cdn.skypack.dev/acorn-walk@7"; -import { simple } from "https://cdn.skypack.dev/acorn-walk@7"; -var globals_default = new Set([ - "Array", - "ArrayBuffer", - "atob", - "AudioContext", - "Blob", - "Boolean", - "BigInt", - "btoa", - "clearInterval", - "clearTimeout", - "console", - "crypto", - "CustomEvent", - "DataView", - "Date", - "decodeURI", - "decodeURIComponent", - "devicePixelRatio", - "document", - "encodeURI", - "encodeURIComponent", - "Error", - "escape", - "eval", - "fetch", - "File", - "FileList", - "FileReader", - "Float32Array", - "Float64Array", - "Function", - "Headers", - "Image", - "ImageData", - "Infinity", - "Int16Array", - "Int32Array", - "Int8Array", - "Intl", - "isFinite", - "isNaN", - "JSON", - "Map", - "Math", - "NaN", - "Number", - "navigator", - "Object", - "parseFloat", - "parseInt", - "performance", - "Path2D", - "Promise", - "Proxy", - "RangeError", - "ReferenceError", - "Reflect", - "RegExp", - "cancelAnimationFrame", - "requestAnimationFrame", - "Set", - "setInterval", - "setTimeout", - "String", - "Symbol", - "SyntaxError", - "TextDecoder", - "TextEncoder", - "this", - "TypeError", - "Uint16Array", - "Uint32Array", - "Uint8Array", - "Uint8ClampedArray", - "undefined", - "unescape", - "URIError", - "URL", - "WeakMap", - "WeakSet", - "WebSocket", - "Worker", - "window" -]); -var walk_default = make({ - Import() { - }, - ViewExpression(node, st, c2) { - c2(node.id, st, "Identifier"); - }, - MutableExpression(node, st, c2) { - c2(node.id, st, "Identifier"); - } -}); -function isScope(node) { - return node.type === "FunctionExpression" || node.type === "FunctionDeclaration" || node.type === "ArrowFunctionExpression" || node.type === "Program"; -} -function isBlockScope(node) { - return node.type === "BlockStatement" || node.type === "ForInStatement" || node.type === "ForOfStatement" || node.type === "ForStatement" || isScope(node); -} -function declaresArguments(node) { - return node.type === "FunctionExpression" || node.type === "FunctionDeclaration"; -} -function findReferences(cell, globals) { - const ast = { type: "Program", body: [cell.body] }; - const locals = new Map(); - const globalSet = new Set(globals); - const references = []; - function hasLocal(node, name) { - const l2 = locals.get(node); - return l2 ? 
l2.has(name) : false; - } - function declareLocal(node, id) { - const l2 = locals.get(node); - if (l2) - l2.add(id.name); - else - locals.set(node, new Set([id.name])); - } - function declareClass(node) { - if (node.id) - declareLocal(node, node.id); - } - function declareFunction(node) { - node.params.forEach((param) => declarePattern(param, node)); - if (node.id) - declareLocal(node, node.id); - } - function declareCatchClause(node) { - if (node.param) - declarePattern(node.param, node); - } - function declarePattern(node, parent) { - switch (node.type) { - case "Identifier": - declareLocal(parent, node); - break; - case "ObjectPattern": - node.properties.forEach((node2) => declarePattern(node2, parent)); - break; - case "ArrayPattern": - node.elements.forEach((node2) => node2 && declarePattern(node2, parent)); - break; - case "Property": - declarePattern(node.value, parent); - break; - case "RestElement": - declarePattern(node.argument, parent); - break; - case "AssignmentPattern": - declarePattern(node.left, parent); - break; - default: - throw new Error("Unrecognized pattern type: " + node.type); - } - } - function declareModuleSpecifier(node) { - declareLocal(ast, node.local); - } - ancestor(ast, { - VariableDeclaration: (node, parents) => { - let parent = null; - for (let i2 = parents.length - 1; i2 >= 0 && parent === null; --i2) { - if (node.kind === "var" ? isScope(parents[i2]) : isBlockScope(parents[i2])) { - parent = parents[i2]; - } - } - node.declarations.forEach((declaration) => declarePattern(declaration.id, parent)); - }, - FunctionDeclaration: (node, parents) => { - let parent = null; - for (let i2 = parents.length - 2; i2 >= 0 && parent === null; --i2) { - if (isScope(parents[i2])) { - parent = parents[i2]; - } - } - declareLocal(parent, node.id); - declareFunction(node); - }, - Function: declareFunction, - ClassDeclaration: (node, parents) => { - let parent = null; - for (let i2 = parents.length - 2; i2 >= 0 && parent === null; i2--) { - if (isScope(parents[i2])) { - parent = parents[i2]; - } - } - declareLocal(parent, node.id); - }, - Class: declareClass, - CatchClause: declareCatchClause, - ImportDefaultSpecifier: declareModuleSpecifier, - ImportSpecifier: declareModuleSpecifier, - ImportNamespaceSpecifier: declareModuleSpecifier - }, walk_default); - function identifier(node, parents) { - let name = node.name; - if (name === "undefined") - return; - for (let i2 = parents.length - 2; i2 >= 0; --i2) { - if (name === "arguments") { - if (declaresArguments(parents[i2])) { - return; - } - } - if (hasLocal(parents[i2], name)) { - return; - } - if (parents[i2].type === "ViewExpression") { - node = parents[i2]; - name = `viewof ${node.id.name}`; - } - if (parents[i2].type === "MutableExpression") { - node = parents[i2]; - name = `mutable ${node.id.name}`; - } - } - if (!globalSet.has(name)) { - if (name === "arguments") { - throw Object.assign(new SyntaxError(`arguments is not allowed`), { node }); - } - references.push(node); - } - } - ancestor(ast, { - VariablePattern: identifier, - Identifier: identifier - }, walk_default); - function checkConst(node, parents) { - if (!node) - return; - switch (node.type) { - case "Identifier": - case "VariablePattern": { - for (const parent of parents) { - if (hasLocal(parent, node.name)) { - return; - } - } - if (parents[parents.length - 2].type === "MutableExpression") { - return; - } - throw Object.assign(new SyntaxError(`Assignment to constant variable ${node.name}`), { node }); - } - case "ArrayPattern": { - for (const element of 
node.elements) { - checkConst(element, parents); - } - return; - } - case "ObjectPattern": { - for (const property of node.properties) { - checkConst(property, parents); - } - return; - } - case "Property": { - checkConst(node.value, parents); - return; - } - case "RestElement": { - checkConst(node.argument, parents); - return; - } - } - } - function checkConstArgument(node, parents) { - checkConst(node.argument, parents); - } - function checkConstLeft(node, parents) { - checkConst(node.left, parents); - } - ancestor(ast, { - AssignmentExpression: checkConstLeft, - AssignmentPattern: checkConstLeft, - UpdateExpression: checkConstArgument, - ForOfStatement: checkConstLeft, - ForInStatement: checkConstLeft - }, walk_default); - return references; -} -function findFeatures(cell, featureName) { - const ast = { type: "Program", body: [cell.body] }; - const features = new Map(); - const { references } = cell; - simple(ast, { - CallExpression: (node) => { - const { callee, arguments: args } = node; - if (callee.type !== "Identifier" || callee.name !== featureName || references.indexOf(callee) < 0) - return; - if (args.length !== 1 || !(args[0].type === "Literal" && /^['"]/.test(args[0].raw) || args[0].type === "TemplateLiteral" && args[0].expressions.length === 0)) { - throw Object.assign(new SyntaxError(`${featureName} requires a single literal string argument`), { node }); - } - const [arg] = args; - const name = arg.type === "Literal" ? arg.value : arg.quasis[0].value.cooked; - const location2 = { start: arg.start, end: arg.end }; - if (features.has(name)) - features.get(name).push(location2); - else - features.set(name, [location2]); - } - }, walk_default); - return features; -} -var SCOPE_FUNCTION = 2; -var SCOPE_ASYNC = 4; -var SCOPE_GENERATOR = 8; -var STATE_START = Symbol("start"); -var STATE_MODIFIER = Symbol("modifier"); -var STATE_FUNCTION = Symbol("function"); -var STATE_NAME = Symbol("name"); -var CellParser = class extends Parser { - constructor(options, ...args) { - super(Object.assign({ ecmaVersion: 12 }, options), ...args); - } - enterScope(flags) { - if (flags & SCOPE_FUNCTION) - ++this.O_function; - return super.enterScope(flags); - } - exitScope() { - if (this.currentScope().flags & SCOPE_FUNCTION) - --this.O_function; - return super.exitScope(); - } - parseForIn(node, init) { - if (this.O_function === 1 && node.await) - this.O_async = true; - return super.parseForIn(node, init); - } - parseAwait() { - if (this.O_function === 1) - this.O_async = true; - return super.parseAwait(); - } - parseYield(noIn) { - if (this.O_function === 1) - this.O_generator = true; - return super.parseYield(noIn); - } - parseImport(node) { - this.next(); - node.specifiers = this.parseImportSpecifiers(); - if (this.type === tt._with) { - this.next(); - node.injections = this.parseImportSpecifiers(); - } - this.expectContextual("from"); - node.source = this.type === tt.string ? this.parseExprAtom() : this.unexpected(); - return this.finishNode(node, "ImportDeclaration"); - } - parseImportSpecifiers() { - const nodes = []; - const identifiers = new Set(); - let first = true; - this.expect(tt.braceL); - while (!this.eat(tt.braceR)) { - if (first) { - first = false; - } else { - this.expect(tt.comma); - if (this.afterTrailingComma(tt.braceR)) - break; - } - const node = this.startNode(); - node.view = this.eatContextual("viewof"); - node.mutable = node.view ? 
false : this.eatContextual("mutable"); - node.imported = this.parseIdent(); - this.checkUnreserved(node.imported); - this.checkLocal(node.imported); - if (this.eatContextual("as")) { - node.local = this.parseIdent(); - this.checkUnreserved(node.local); - this.checkLocal(node.local); - } else { - node.local = node.imported; - } - this.checkLVal(node.local, "let"); - if (identifiers.has(node.local.name)) { - this.raise(node.local.start, `Identifier '${node.local.name}' has already been declared`); - } - identifiers.add(node.local.name); - nodes.push(this.finishNode(node, "ImportSpecifier")); - } - return nodes; - } - parseExprAtom(refDestructuringErrors) { - return this.parseMaybeKeywordExpression("viewof", "ViewExpression") || this.parseMaybeKeywordExpression("mutable", "MutableExpression") || super.parseExprAtom(refDestructuringErrors); - } - startCell() { - this.O_function = 0; - this.O_async = false; - this.O_generator = false; - this.strict = true; - this.enterScope(SCOPE_FUNCTION | SCOPE_ASYNC | SCOPE_GENERATOR); - } - finishCell(node, body, id) { - if (id) - this.checkLocal(id); - node.id = id; - node.body = body; - node.async = this.O_async; - node.generator = this.O_generator; - this.exitScope(); - return this.finishNode(node, "Cell"); - } - parseCell(node, eof) { - const lookahead = new CellParser({}, this.input, this.start); - let token = lookahead.getToken(); - let body = null; - let id = null; - this.startCell(); - if (token.type === tt._import && lookahead.getToken().type !== tt.parenL) { - body = this.parseImport(this.startNode()); - } else if (token.type !== tt.eof && token.type !== tt.semi) { - if (token.type === tt.name) { - if (token.value === "viewof" || token.value === "mutable") { - token = lookahead.getToken(); - if (token.type !== tt.name) { - lookahead.unexpected(); - } - } - token = lookahead.getToken(); - if (token.type === tt.eq) { - id = this.parseMaybeKeywordExpression("viewof", "ViewExpression") || this.parseMaybeKeywordExpression("mutable", "MutableExpression") || this.parseIdent(); - token = lookahead.getToken(); - this.expect(tt.eq); - } - } - if (token.type === tt.braceL) { - body = this.parseBlock(); - } else { - body = this.parseExpression(); - if (id === null && (body.type === "FunctionExpression" || body.type === "ClassExpression")) { - id = body.id; - } - } - } - this.semicolon(); - if (eof) - this.expect(tt.eof); - return this.finishCell(node, body, id); - } - parseTopLevel(node) { - return this.parseCell(node, true); - } - toAssignable(node, isBinding, refDestructuringErrors) { - return node.type === "MutableExpression" ? node : super.toAssignable(node, isBinding, refDestructuringErrors); - } - checkLocal(id) { - const node = id.id || id; - if (globals_default.has(node.name) || node.name === "arguments") { - this.raise(node.start, `Identifier '${node.name}' is reserved`); - } - } - checkUnreserved(node) { - if (node.name === "viewof" || node.name === "mutable") { - this.raise(node.start, `Unexpected keyword '${node.name}'`); - } - return super.checkUnreserved(node); - } - checkLVal(expr, bindingType, checkClashes) { - return super.checkLVal(expr.type === "MutableExpression" ? expr.id : expr, bindingType, checkClashes); - } - unexpected(pos) { - this.raise(pos != null ? pos : this.start, this.type === tt.eof ? 
"Unexpected end of input" : "Unexpected token"); - } - parseMaybeKeywordExpression(keyword, type) { - if (this.isContextual(keyword)) { - const node = this.startNode(); - this.next(); - node.id = this.parseIdent(); - return this.finishNode(node, type); - } - } -}; -var o_tmpl = new TokContext("`", true, true, (parser) => readTemplateToken.call(parser)); -function readTemplateToken() { - out: - for (; this.pos < this.input.length; this.pos++) { - switch (this.input.charCodeAt(this.pos)) { - case 92: { - if (this.pos < this.input.length - 1) - ++this.pos; - break; - } - case 36: { - if (this.input.charCodeAt(this.pos + 1) === 123) { - if (this.pos === this.start && this.type === tt.invalidTemplate) { - this.pos += 2; - return this.finishToken(tt.dollarBraceL); - } - break out; - } - break; - } - } - } - return this.finishToken(tt.invalidTemplate, this.input.slice(this.start, this.pos)); -} -function parseModule(input, { globals } = {}) { - const program = ModuleParser.parse(input); - for (const cell of program.cells) { - parseReferences(cell, input, globals); - parseFeatures(cell, input, globals); - } - return program; -} -var ModuleParser = class extends CellParser { - parseTopLevel(node) { - if (!node.cells) - node.cells = []; - while (this.type !== tt.eof) { - const cell = this.parseCell(this.startNode()); - cell.input = this.input; - node.cells.push(cell); - } - this.next(); - return this.finishNode(node, "Program"); - } -}; -function parseReferences(cell, input, globals = globals_default) { - if (!cell.body) { - cell.references = []; - } else if (cell.body.type === "ImportDeclaration") { - cell.references = cell.body.injections ? cell.body.injections.map((i2) => i2.imported) : []; - } else { - try { - cell.references = findReferences(cell, globals); - } catch (error) { - if (error.node) { - const loc = getLineInfo(input, error.node.start); - error.message += ` (${loc.line}:${loc.column})`; - error.pos = error.node.start; - error.loc = loc; - delete error.node; - } - throw error; - } - } - return cell; -} -function parseFeatures(cell, input) { - if (cell.body && cell.body.type !== "ImportDeclaration") { - try { - cell.fileAttachments = findFeatures(cell, "FileAttachment"); - cell.databaseClients = findFeatures(cell, "DatabaseClient"); - cell.secrets = findFeatures(cell, "Secret"); - } catch (error) { - if (error.node) { - const loc = getLineInfo(input, error.node.start); - error.message += ` (${loc.line}:${loc.column})`; - error.pos = error.node.start; - error.loc = loc; - delete error.node; - } - throw error; - } - } else { - cell.fileAttachments = new Map(); - cell.databaseClients = new Map(); - cell.secrets = new Map(); - } - return cell; -} - -// ojs-connector.js -var EmptyInspector = class { - pending() { - } - fulfilled(_value, _name) { - } - rejected(_error, _name) { - } -}; -function es6ImportAsObservableModule(m2) { - return function(runtime, observer) { - const main = runtime.module(); - Object.keys(m2).forEach((key) => { - const v2 = m2[key]; - main.variable(observer(key)).define(key, [], () => v2); - }); - return main; - }; -} -async function defaultResolveImportPath(path) { - const extractPath = (path2) => { - let source2 = path2; - let m3; - if (m3 = /\.js(\?|$)/i.exec(source2)) { - source2 = source2.slice(0, m3.index); - } - if (m3 = /^[0-9a-f]{16}$/i.test(source2)) { - source2 = `d/${source2}`; - } - if (m3 = /^https:\/\/(api\.|beta\.|)observablehq\.com\//i.exec(source2)) { - source2 = source2.slice(m3[0].length); - } - return source2; - }; - const source = 
extractPath(path); - const metadataURL = `https://api.observablehq.com/document/${source}`; - const moduleURL = `https://api.observablehq.com/${source}.js?v=3`; - const m2 = await import(moduleURL); - return m2.default; -} -function importPathResolver(paths, localResolverMap) { - function importRootPath(path) { - const { runtimeToRoot } = paths; - if (!runtimeToRoot) { - return path; - } else { - return `${runtimeToRoot}/${path}`; - } - } - function importRelativePath(path) { - const { runtimeToDoc } = paths; - if (!runtimeToDoc) { - return path; - } else { - return `${runtimeToDoc}/${path}`; - } - } - function fetchRootPath(path) { - const { docToRoot } = paths; - if (!docToRoot) { - return path; - } else { - return `${docToRoot}/${path}`; - } - } - function fetchRelativePath(path) { - return path; - } - return async (path) => { - const isLocalModule = path.startsWith("/") || path.startsWith("."); - const isImportFromObservableWebsite = path.match(/^https:\/\/(api\.|beta\.|)observablehq\.com\//i); - if (!isLocalModule || isImportFromObservableWebsite) { - return defaultResolveImportPath(path); - } - let importPath, fetchPath; - let moduleType; - if (window._ojs.selfContained) { - const resolved = localResolverMap.get(path); - if (resolved === void 0) { - throw new Error(`missing local file ${path} in self-contained mode`); - } - importPath = resolved; - fetchPath = resolved; - const mimeType = resolved.match(/data:(.*);base64/)[1]; - switch (mimeType) { - case "application/javascript": - moduleType = "js"; - break; - case "application/ojs-javascript": - moduleType = "ojs"; - break; - default: - throw new Error(`unrecognized MIME type ${mimeType}`); - } - } else { - const resourceURL = new URL(path, window.location); - moduleType = resourceURL.pathname.match(/\.(ojs|js|qmd)$/)[1]; - if (path.startsWith("/")) { - importPath = importRootPath(path); - fetchPath = fetchRootPath(path); - } else { - importPath = importRelativePath(path); - fetchPath = fetchRelativePath(path); - } - } - if (moduleType === "js") { - try { - const m2 = await import(importPath); - return es6ImportAsObservableModule(m2); - } catch (e2) { - console.error(e2); - throw e2; - } - } else if (moduleType === "ojs") { - return importOjsFromURL(fetchPath); - } else if (moduleType === "qmd") { - const htmlPath = `${fetchPath.slice(0, -4)}.html`; - const response = await fetch(htmlPath); - const text = await response.text(); - return createOjsModuleFromHTMLSrc(text); - } else { - throw new Error(`internal error, unrecognized module type ${moduleType}`); - } - }; -} -function createOjsModuleFromHTMLSrc(text) { - const parser = new DOMParser(); - const doc = parser.parseFromString(text, "text/html"); - const staticDefns = []; - for (const el of doc.querySelectorAll('script[type="ojs-define"]')) { - staticDefns.push(el.text); - } - const ojsSource = []; - for (const content of doc.querySelectorAll('script[type="ojs-module-contents"]')) { - for (const cell of JSON.parse(content.text).contents) { - ojsSource.push(cell.source); - } - } - return createOjsModuleFromSrc(ojsSource.join("\n"), staticDefns); -} -function createOjsModuleFromSrc(src, staticDefns = []) { - return (runtime, _observer) => { - const newModule = runtime.module(); - const interpreter = window._ojs.ojsConnector.interpreter; - const _cells = interpreter.module(src, newModule, (_name) => new EmptyInspector()); - for (const defn of staticDefns) { - for (const { name, value } of JSON.parse(defn).contents) { - window._ojs.ojsConnector.define(name, newModule)(value); - } 
- } - return newModule; - }; -} -async function importOjsFromURL(path) { - const r2 = await fetch(path); - const src = await r2.text(); - return createOjsModuleFromSrc(src); -} -var OJSConnector = class { - constructor({ paths, inspectorClass, library, allowPendingGlobals = false }) { - this.library = library || new Oe(); - this.localResolverMap = new Map(); - this.pendingGlobals = {}; - this.allowPendingGlobals = allowPendingGlobals; - this.runtime = new Runtime(this.library, (name) => this.global(name)); - this.mainModule = this.runtime.module(); - this.interpreter = new Interpreter({ - module: this.mainModule, - resolveImportPath: importPathResolver(paths, this.localResolverMap) - }); - this.inspectorClass = inspectorClass || Inspector3; - this.mainModuleHasImports = false; - this.mainModuleOutstandingImportCount = 0; - this.chunkPromises = []; - } - global(name) { - if (typeof window[name] !== "undefined") { - return window[name]; - } - if (!this.allowPendingGlobals) { - return void 0; - } - if (!this.pendingGlobals.hasOwnProperty(name)) { - const info = {}; - info.promise = new Promise((resolve, reject) => { - info.resolve = resolve; - info.reject = reject; - }); - this.pendingGlobals[name] = info; - } - return this.pendingGlobals[name].promise; - } - killPendingGlobals() { - this.allowPendingGlobals = false; - for (const [name, { reject }] of Object.entries(this.pendingGlobals)) { - reject(new RuntimeError(`${name} is not defined`)); - } - } - setLocalResolver(map) { - for (const [key, value] of Object.entries(map)) { - this.localResolverMap.set(key, value); - } - } - define(name, module = void 0) { - if (!module) { - module = this.mainModule; - } - let change; - const obs = this.library.Generators.observe((change_) => { - change = change_; - }); - module.variable().define(name, obs); - return change; - } - watch(name, k2, module = void 0) { - if (!module) { - module = this.mainModule; - } - module.variable({ - fulfilled: (x2) => k2(x2, name) - }).define([name], (val) => val); - } - async value(val, module = void 0) { - if (!module) { - module = this.mainModule; - } - const result = await module.value(val); - return result; - } - finishInterpreting() { - return Promise.all(this.chunkPromises); - } - interpretWithRunner(src, runner) { - try { - const parse = parseModule(src); - const chunkPromise = Promise.all(parse.cells.map(runner)); - this.chunkPromises.push(chunkPromise); - return chunkPromise; - } catch (error) { - return Promise.reject(error); - } - } - waitOnImports(cell, promise) { - if (cell.body.type !== "ImportDeclaration") { - return promise; - } else { - this.mainModuleHasImports = true; - this.mainModuleOutstandingImportCount++; - return promise.then((result) => { - this.mainModuleOutstandingImportCount--; - if (this.mainModuleOutstandingImportCount === 0) { - this.clearImportModuleWait(); - } - return result; - }); - } - } - interpretQuiet(src) { - const runCell = (cell) => { - const cellSrc = src.slice(cell.start, cell.end); - const promise = this.interpreter.module(cellSrc, void 0, (_name) => new EmptyInspector()); - return this.waitOnImports(cell, promise); - }; - return this.interpretWithRunner(src, runCell); - } -}; - -// quarto-ojs.js -var makeDevhostErrorClickHandler = (line, column) => { - return function() { - if (!window.quartoDevhost) { - return false; - } - window.quartoDevhost.openInputFile(line, column, true); - return false; - }; -}; -if (Object.fromEntries === void 0) { - Object.fromEntries = function(obj) { - const result = {}; - for (const [key, value] 
of obj) { - result[key] = value; - } - return result; - }; -} -function calloutBlock(opts) { - const { - type, - heading, - message, - onclick - } = opts; - const outerBlock = document.createElement("div"); - outerBlock.classList.add(`callout-${type}`, "callout", "callout-style-default", "callout-captioned"); - const header = document.createElement("div"); - header.classList.add("callout-header", "d-flex", "align-content-center"); - const iconContainer = document.createElement("div"); - iconContainer.classList.add("callout-icon-container"); - const icon = document.createElement("i"); - icon.classList.add("callout-icon"); - iconContainer.appendChild(icon); - header.appendChild(iconContainer); - const headingDiv = document.createElement("div"); - headingDiv.classList.add("callout-caption-container", "flex-fill"); - if (typeof heading === "string") { - headingDiv.innerText = heading; - } else { - headingDiv.appendChild(heading); - } - header.appendChild(headingDiv); - outerBlock.appendChild(header); - const container = document.createElement("div"); - container.classList.add("callout-body-container", "callout-body"); - if (typeof message === "string") { - const p2 = document.createElement("p"); - p2.innerText = message; - container.appendChild(p2); - } else { - container.append(message); - } - outerBlock.appendChild(container); - if (onclick) { - outerBlock.onclick = onclick; - outerBlock.style.cursor = "pointer"; - } - return outerBlock; -} -var kQuartoModuleWaitClass = "ojs-in-a-box-waiting-for-module-import"; -var QuartoOJSConnector = class extends OJSConnector { - constructor(opts) { - super(opts); - } - clearImportModuleWait() { - const array = Array.from(document.querySelectorAll(`.${kQuartoModuleWaitClass}`)); - for (const node of array) { - node.classList.remove(kQuartoModuleWaitClass); - } - } - finishInterpreting() { - return super.finishInterpreting().then(() => { - if (this.mainModuleHasImports) { - this.clearImportModuleWait(); - } - }); - } - locatePreDiv(cellDiv, ojsDiv) { - let preDiv; - for (const candidate of cellDiv.querySelectorAll("pre.sourceCode")) { - if (candidate.compareDocumentPosition(ojsDiv) & ojsDiv.DOCUMENT_POSITION_FOLLOWING) { - preDiv = candidate; - } else { - break; - } - } - return preDiv; - } - findCellOutputDisplay(ojsDiv) { - while (ojsDiv && !ojsDiv.classList.contains("cell-output-display")) { - ojsDiv = ojsDiv.parentElement; - } - if (!ojsDiv) { - throw new Error("Internal error: couldn't find output display div"); - } - return ojsDiv; - } - setPreDivClasses(preDiv, hasErrors) { - if (!hasErrors) { - preDiv.classList.remove("numberSource"); - if (preDiv._hidden === true) { - preDiv.parentElement.classList.add("hidden"); - } - } else { - preDiv.classList.add("numberSource"); - if (preDiv.parentElement.classList.contains("hidden")) { - preDiv._hidden = true; - preDiv.parentElement.classList.remove("hidden"); - } - } - } - clearErrorPinpoints(cellDiv, ojsDiv) { - const preDiv = this.locatePreDiv(cellDiv, ojsDiv); - if (preDiv === void 0) { - return; - } - this.setPreDivClasses(preDiv, false); - let startingOffset = 0; - if (preDiv.parentElement.dataset.sourceOffset) { - startingOffset = -Number(preDiv.parentElement.dataset.sourceOffset); - } - for (const entryPoint of preDiv._decorator.spanSelection(startingOffset, Infinity)) { - const { node } = entryPoint; - node.classList.remove("quarto-ojs-error-pinpoint"); - node.onclick = null; - } - } - decorateOjsDivWithErrorPinpoint(ojsDiv, start, end, line, column) { - const cellOutputDisplay = 
this.findCellOutputDisplay(ojsDiv); - if (cellOutputDisplay._errorSpans === void 0) { - cellOutputDisplay._errorSpans = []; - } - cellOutputDisplay._errorSpans.push({ - start, - end, - line, - column - }); - } - decorateSource(cellDiv, ojsDiv) { - this.clearErrorPinpoints(cellDiv, ojsDiv); - const preDiv = this.locatePreDiv(cellDiv, ojsDiv); - if (preDiv === void 0) { - return; - } - let div = preDiv.parentElement.nextElementSibling; - let foundErrors = false; - while (div !== null && div.classList.contains("cell-output-display")) { - for (const errorSpan of div._errorSpans || []) { - for (const entryPoint of preDiv._decorator.spanSelection(errorSpan.start, errorSpan.end)) { - const { node } = entryPoint; - node.classList.add("quarto-ojs-error-pinpoint"); - node.onclick = makeDevhostErrorClickHandler(errorSpan.line, errorSpan.column); - } - foundErrors = true; - } - div = div.nextElementSibling; - } - this.setPreDivClasses(preDiv, foundErrors); - } - clearError(ojsDiv) { - const cellOutputDisplay = this.findCellOutputDisplay(ojsDiv); - cellOutputDisplay._errorSpans = []; - } - signalError(cellDiv, ojsDiv, ojsAst) { - const buildCallout = (ojsDiv2) => { - let onclick; - const inspectChild = ojsDiv2.querySelector(".observablehq--inspect"); - let [heading, message] = inspectChild.textContent.split(": "); - if (heading === "RuntimeError") { - heading = "OJS Runtime Error"; - if (message.match(/^(.+) is not defined$/)) { - const [varName, ...rest] = message.split(" "); - const p2 = document.createElement("p"); - const tt2 = document.createElement("tt"); - tt2.innerText = varName; - p2.appendChild(tt2); - p2.appendChild(document.createTextNode(" " + rest.join(" "))); - message = p2; - const preDiv = this.locatePreDiv(cellDiv, ojsDiv2); - if (preDiv !== void 0) { - preDiv.classList.add("numberSource"); - const missingRef = ojsAst.references.find((n2) => n2.name === varName); - if (missingRef !== void 0) { - const { line, column } = preDiv._decorator.offsetToLineColumn(missingRef.start); - const headingSpan = document.createElement("span"); - const headingTextEl = document.createTextNode(`${heading} (line ${line}, column ${column}) `); - headingSpan.appendChild(headingTextEl); - if (window.quartoDevhost) { - const clicker = document.createElement("a"); - clicker.href = "#"; - clicker.innerText = "(source)"; - onclick = makeDevhostErrorClickHandler(line, column); - headingSpan.appendChild(clicker); - } - heading = headingSpan; - this.decorateOjsDivWithErrorPinpoint(ojsDiv2, missingRef.start, missingRef.end, line, column); - } - } - } else if (message.match(/^(.+) could not be resolved$/) || message.match(/^(.+) is defined more than once$/)) { - const [varName, ...rest] = message.split(" "); - const p2 = document.createElement("p"); - const tt2 = document.createElement("tt"); - tt2.innerText = varName; - p2.appendChild(tt2); - p2.appendChild(document.createTextNode(" " + rest.join(" "))); - message = p2; - } else if (message === "circular definition") { - const p2 = document.createElement("p"); - p2.appendChild(document.createTextNode("circular definition")); - message = p2; - } else { - throw new Error(`Internal error, could not parse OJS error message "${message}"`); - } - } else { - heading = "OJS Error"; - const p2 = document.createElement("p"); - p2.appendChild(document.createTextNode(inspectChild.textContent)); - message = p2; - } - const callout = calloutBlock({ - type: "important", - heading, - message, - onclick - }); - ojsDiv2.appendChild(callout); - }; - buildCallout(ojsDiv); - } - 
interpret(src, elementGetter, elementCreator) { - const that = this; - const observer = (targetElement, ojsAst) => { - return (name) => { - const element = typeof elementCreator === "function" ? elementCreator() : elementCreator; - targetElement.appendChild(element); - if (ojsAst.id && ojsAst.id.type === "ViewExpression" && !name.startsWith("viewof ")) { - element.classList.add("quarto-ojs-hide"); - } - let cellDiv = targetElement; - let cellOutputDisplay; - while (cellDiv !== null && !cellDiv.classList.contains("cell")) { - cellDiv = cellDiv.parentElement; - if (cellDiv && cellDiv.classList.contains("cell-output-display")) { - cellOutputDisplay = cellDiv; - } - } - const forceShowDeclarations = !(cellDiv && cellDiv.dataset.output !== "all"); - const config = { childList: true }; - const callback = function(mutationsList) { - for (const mutation of mutationsList) { - const ojsDiv = mutation.target; - if (!forceShowDeclarations) { - Array.from(mutation.target.childNodes).filter((n2) => { - return n2.classList.contains("observablehq--inspect") && !n2.parentNode.classList.contains("observablehq--error") && n2.parentNode.parentNode.dataset.nodetype !== "expression"; - }).forEach((n2) => n2.classList.add("quarto-ojs-hide")); - Array.from(mutation.target.childNodes).filter((n2) => { - return n2.classList.contains("observablehq--inspect") && !n2.parentNode.classList.contains("observablehq--error") && n2.parentNode.parentNode.dataset.nodetype === "expression"; - }).forEach((n2) => n2.classList.remove("quarto-ojs-hide")); - } - if (ojsDiv.classList.contains("observablehq--error")) { - ojsDiv.querySelector(".observablehq--inspect").style.display = "none"; - if (ojsDiv.querySelectorAll(".callout-important").length === 0) { - that.signalError(cellDiv, ojsDiv, ojsAst); - } - } else { - that.clearError(ojsDiv); - if (ojsDiv.parentNode.dataset.nodetype !== "expression" && !forceShowDeclarations && Array.from(ojsDiv.childNodes).every((n2) => n2.classList.contains("observablehq--inspect"))) { - ojsDiv.classList.add("quarto-ojs-hide"); - } - } - that.decorateSource(cellDiv, ojsDiv); - for (const added of mutation.addedNodes) { - const result = added.querySelectorAll("code.javascript"); - if (result.length !== 1) { - continue; - } - if (result[0].textContent.trim().startsWith("import")) { - ojsDiv.classList.add("quarto-ojs-hide"); - } - } - } - if (cellOutputDisplay) { - const children = Array.from(cellOutputDisplay.querySelectorAll("div.observablehq")); - if (children.every((n2) => { - return n2.classList.contains("quarto-ojs-hide"); - })) { - cellOutputDisplay.classList.add("quarto-ojs-hide"); - } else { - cellOutputDisplay.classList.remove("quarto-ojs-hide"); - } - } - }; - new MutationObserver(callback).observe(element, config); - element.classList.add(kQuartoModuleWaitClass); - return new this.inspectorClass(element, ojsAst); - }; - }; - const runCell = (cell) => { - const targetElement = typeof elementGetter === "function" ? 
elementGetter() : elementGetter; - const cellSrc = src.slice(cell.start, cell.end); - const promise = this.interpreter.module(cellSrc, void 0, observer(targetElement, cell)); - return this.waitOnImports(cell, promise); - }; - return this.interpretWithRunner(src, runCell); - } -}; -function createRuntime() { - const quartoOjsGlobal = window._ojs; - const isShiny = window.Shiny !== void 0; - if (isShiny) { - quartoOjsGlobal.hasShiny = true; - initOjsShinyRuntime(); - const span = document.createElement("span"); - window._ojs.shinyElementRoot = span; - document.body.appendChild(span); - } - const lib = new Oe(); - if (isShiny) { - extendObservableStdlib(lib); - } - function transpose(df) { - const keys = Object.keys(df); - return df[keys[0]].map((v2, i2) => Object.fromEntries(keys.map((key) => [key, df[key][i2] || void 0]))).filter((v2) => Object.values(v2).every((e2) => e2 !== void 0)); - } - lib.transpose = () => transpose; - const mainEl = document.querySelector("main"); - function width() { - return lib.Generators.observe(function(change) { - var width2 = change(mainEl.clientWidth); - function resized() { - var w2 = mainEl.clientWidth; - if (w2 !== width2) - change(width2 = w2); - } - window.addEventListener("resize", resized); - return function() { - window.removeEventListener("resize", resized); - }; - }); - } - lib.width = width; - Array.from(document.querySelectorAll("span.co")).filter((n2) => n2.textContent === "//| echo: fenced").forEach((n2) => { - const lineSpan = n2.parentElement; - const lineBreak = lineSpan.nextSibling; - if (lineBreak) { - const nextLineSpan = lineBreak.nextSibling; - if (nextLineSpan) { - const lineNumber = Number(nextLineSpan.id.split("-")[1]); - nextLineSpan.style = `counter-reset: source-line ${lineNumber - 1}`; - } - } - const sourceDiv = lineSpan.parentElement.parentElement.parentElement; - const oldOffset = Number(sourceDiv.dataset.sourceOffset); - sourceDiv.dataset.sourceOffset = oldOffset - "//| echo: fenced\n".length; - lineSpan.remove(); - lineBreak.remove(); - }); - const layoutDivs = Array.from(document.querySelectorAll("div.quarto-layout-panel div[id]")); - function layoutWidth() { - return lib.Generators.observe(function(change) { - const ourWidths = Object.fromEntries(layoutDivs.map((div) => [div.id, div.clientWidth])); - change(ourWidths); - function resized() { - let changed = false; - for (const div of layoutDivs) { - const w2 = div.clientWidth; - if (w2 !== ourWidths[div.id]) { - ourWidths[div.id] = w2; - changed = true; - } - } - if (changed) { - change(ourWidths); - } - } - window.addEventListener("resize", resized); - return function() { - window.removeEventListener("resize", resized); - }; - }); - } - lib.layoutWidth = layoutWidth; - let localResolver = {}; - function fileAttachmentPathResolver(n2) { - if (localResolver[n2]) { - return localResolver[n2]; - } - if (n2.startsWith("/")) { - if (quartoOjsGlobal.paths.docToRoot === "") { - return `.${n2}`; - } else { - return `${quartoOjsGlobal.paths.docToRoot}${n2}`; - } - } else { - return n2; - } - } - lib.FileAttachment = () => ae(fileAttachmentPathResolver); - const ojsConnector = new QuartoOJSConnector({ - paths: quartoOjsGlobal.paths, - inspectorClass: isShiny ? 
ShinyInspector : QuartoInspector, - library: lib, - allowPendingGlobals: isShiny - }); - quartoOjsGlobal.ojsConnector = ojsConnector; - if (isShiny) { - $(document).one("shiny:idle", () => { - $(document).one("shiny:message", () => { - setTimeout(() => { - ojsConnector.killPendingGlobals(); - }, 0); - }); - }); - } - const subfigIdMap = new Map(); - function getSubfigId(elementId) { - if (!subfigIdMap.has(elementId)) { - subfigIdMap.set(elementId, 0); - } - let nextIx = subfigIdMap.get(elementId); - nextIx++; - subfigIdMap.set(elementId, nextIx); - return `${elementId}-${nextIx}`; - } - const sourceNodes = document.querySelectorAll("pre.sourceCode code.sourceCode"); - const decorators = Array.from(sourceNodes).map((n2) => { - n2 = n2.parentElement; - const decorator = new PandocCodeDecorator(n2); - n2._decorator = decorator; - return decorator; - }); - decorators.forEach((n2) => { - if (n2._node.parentElement.dataset.syntaxErrorPosition === void 0) { - return; - } - const offset = Number(n2._node.parentElement.dataset.syntaxErrorPosition); - n2.decorateSpan(offset, offset + 1, ["quarto-ojs-error-pinpoint"]); - }); - const result = { - setLocalResolver(obj) { - localResolver = obj; - ojsConnector.setLocalResolver(obj); - }, - finishInterpreting() { - return ojsConnector.finishInterpreting(); - }, - async value(name) { - await this.finishInterpreting(); - const result2 = await ojsConnector.value(name); - return result2; - }, - interpretLenient(src, targetElementId, inline) { - return result.interpret(src, targetElementId, inline).catch(() => { - }); - }, - interpret(src, targetElementId, inline) { - let targetElement; - const getElement = () => { - targetElement = document.getElementById(targetElementId); - let subFigId; - if (!targetElement) { - subFigId = getSubfigId(targetElementId); - targetElement = document.getElementById(subFigId); - if (!targetElement) { - throw new Error("Ran out of quarto subfigures."); - } - } - return targetElement; - }; - const makeElement = () => { - return document.createElement(inline ? 
"span" : "div"); - }; - return ojsConnector.interpret(src, getElement, makeElement).catch((e2) => { - let cellDiv = targetElement; - let cellOutputDisplay; - while (cellDiv !== null && !cellDiv.classList.contains("cell")) { - cellDiv = cellDiv.parentElement; - if (cellDiv && cellDiv.classList.contains("cell-output-display")) { - cellOutputDisplay = cellDiv; - } - } - const ojsDiv = targetElement.querySelector(".observablehq"); - for (const div of ojsDiv.querySelectorAll(".callout")) { - div.remove(); - } - const messagePre = document.createElement("pre"); - messagePre.innerText = e2.stack; - const callout = calloutBlock({ - type: "important", - heading: `${e2.name}: ${e2.message}`, - message: messagePre - }); - ojsDiv.appendChild(callout); - ojsConnector.clearError(ojsDiv); - ojsConnector.clearErrorPinpoints(cellDiv, ojsDiv); - return e2; - }); - }, - interpretQuiet(src) { - return ojsConnector.interpretQuiet(src); - }, - interpretFromScriptTags() { - for (const el of document.querySelectorAll("script[type='ojs-module-contents']")) { - for (const call of JSON.parse(el.text).contents) { - switch (call.methodName) { - case "interpret": - this.interpret(call.source, call.cellName, call.inline); - break; - case "interpretLenient": - this.interpretLenient(call.source, call.cellName, call.inline); - break; - case "interpretQuiet": - this.interpretQuiet(call.source); - break; - default: - throw new Error(`Don't know how to call method ${call.methodName}`); - } - } - } - for (const el of document.querySelectorAll("script[type='ojs-define']")) { - for (const { name, value } of JSON.parse(el.text).contents) { - ojsConnector.define(name)(value); - } - } - } - }; - return result; -} -window._ojs = { - ojsConnector: void 0, - paths: {}, - hasShiny: false, - shinyElementRoot: void 0 -}; -window._ojs.runtime = createRuntime(); -export { - createRuntime -}; diff --git a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/libs/quarto-ojs/quarto-ojs.css b/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/libs/quarto-ojs/quarto-ojs.css deleted file mode 100644 index ea13764..0000000 --- a/content/blog/2022-dutch-performance-olympic-speed-skating/index_files/libs/quarto-ojs/quarto-ojs.css +++ /dev/null @@ -1,132 +0,0 @@ -span.ojs-inline span div { - display: inline-block; -} - -/* add some breathing room between display outputs and text especially */ -div.cell + section, div.cell + h1, div.cell + h2, div.cell + h3, div.cell + h4, div.cell + h5, div.cell + h6, div.cell + p { - margin-top: 1rem; -} - -.observablehq .observablehq--inspect { - font-family: var(--bs-font-monospace); - font-size: 0.8em; -} - -.observablehq--field { - margin-left: 1rem; -} - -.observablehq--caret { - margin-right: 2px; - vertical-align: baseline; -} - -.observablehq--collapsed, -.observablehq--expanded.observablehq--inspect a { - cursor: pointer; -} - -/* classes directly from observable's runtime */ -.observablehq--key, -.observablehq--index { - color: var(--quarto-hl-dt-color); -} - -.observablehq--string { - color: var(--quarto-hl-st-color); -} - -.observablehq--bigint, -.observablehq--date, -.observablehq--number, -.observablehq--regexp, -.observablehq--symbol { - color: var(--quarto-hl-dv-color); -} - -.observablehq--null, -.observablehq--boolean, -.observablehq--undefined, -.observablehq--keyword { - color: var(--quarto-hl-kw-color); -} - -/* In addition, their import statements specifically come highlighted by hljs. - (probably some legacy feature of theirs?) We handle those here as well. 
- - Just to be on the safe side, we select on observable's 'md-pre' - class as well, in case someone else uses hljs and wishes to put - their own highlighting. - - TODO Note that to make our highlighting consistent, we're - overriding the "im" class to present like a keyword. I should make - sure this looks right everywhere, but I don't know how to test it - comprehensively. -*/ - -code.javascript span.im { - color: var(--quarto-hl-kw-color); -} - -pre.observablehq--md-pre span.hljs-keyword { - color: var(--quarto-hl-kw-color); -} - -pre.observablehq--md-pre span.hljs-string { - color: var(--quarto-hl-st-color); -} - -pre.observablehq--md-pre .span.hljs-date, -pre.observablehq--md-pre .span.hljs-number, -pre.observablehq--md-pre .span.hljs-regexp, -pre.observablehq--md-pre .span.hljs-symbol { - color: var(--quarto-hl-dv-color); -} - -/* Other general niceties, but it's possible that we should do this on a page-by-page basis */ - -input { - vertical-align: middle; -} - -input[type="radio"], -input[type="checkbox"] { - margin: 0px 0px 3px 0px; -} - -.observable-in-a-box-waiting-for-module-import { - visibility: hidden; -} - -/* play nicely w/ sidebar layout */ -.panel-sidebar .observablehq > form[class^="oi-"] { - flex-wrap: wrap !important; -} - -/* likely that this only makes sense with bootstrap. TODO check with charles */ -.observablehq table { - font-size: 0.9em !important; -} - -.quarto-ojs-hide { - display: none; -} - -.quarto-ojs-error-pinpoint { - border-bottom: 2px dotted #e51400; - font-weight: 700; - cursor: pointer; -} - -code span.quarto-ojs-error-pinpoint { - color: inherit; -} - -.observablehq--error .observablehq--inspect { - font-size: 0.875em !important; -} - -.observablehq--error .callout { - margin-bottom: 0; - margin-top: 0; -} diff --git a/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/tst-boxplot-1.png b/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/tst-boxplot-1.png index 9cd0136..f2cdabd 100644 Binary files a/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/tst-boxplot-1.png and b/content/blog/2022-everything-is-a-linear-model/index.markdown_strict_files/figure-markdown_strict/tst-boxplot-1.png differ diff --git a/content/blog/2022-everything-is-a-linear-model/index.md b/content/blog/2022-everything-is-a-linear-model/index.md index b1c1e28..46bfcb2 100644 --- a/content/blog/2022-everything-is-a-linear-model/index.md +++ b/content/blog/2022-everything-is-a-linear-model/index.md @@ -1,7 +1,7 @@ --- title: Everything is a Linear Model -author: Daniel Roelfs -date: "2022-03-18" +date: 2022-03-18 +description: Everything is a Linear Model slug: everything-is-a-linear-model categories: - statistics @@ -9,9 +9,6 @@ tags: - statistics - R - data visualization -description: "Everything is a Linear Model" -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -35,15 +32,19 @@ set.seed(2022) # One-Sample T-test -Let's start simple with the One-Sample T-test. This test can be used to test how the mean value of your sample measure differs from a reference number. Throughout this page, I'll throw around a bunch of formulas, which, dependending on your background, can either be informative or confusing. The formula for a One-Sample T-test is: +Let's start simple with the One-Sample T-test. This test can be used to test how the mean value of your sample measure differs from a reference number. 
Throughout this page, I'll throw around a bunch of formulas, which, depending on your background, can either be informative or confusing. The formula for a One-Sample T-test is: -`$$t = \frac{\overline{x} - \mu}{\frac{\sigma}{\sqrt{n}}} = \frac{sample~mean - population~mean}{\frac{standard~deviation}{\sqrt{sample~size}}}$$` +$$ +t = \frac{\overline{x} - \mu}{\frac{\sigma}{\sqrt{n}}} = \frac{sample~mean - population~mean}{\frac{standard~deviation}{\sqrt{sample~size}}} +$$ -What this says is that the effect size (*t*) is equal to the sample mean minus the population mean (or reference number) and you divide it by the standard deviation of the sample divided by the square root of the sample size. This formula will output the *t*-value that you would usually report when doing a T-test. The formula requires the standard deviation (`$\sigma$`) of the sample values, which is: +What this says is that the effect size (*t*) is equal to the sample mean minus the population mean (or reference number) and you divide it by the standard deviation of the sample divided by the square root of the sample size. This formula will output the *t*-value that you would usually report when doing a T-test. The formula requires the standard deviation (*σ*) of the sample values, which is: -`$$\sigma = \sqrt{\frac{\sum\limits_{i=1}^n{(x_{i} - \overline{x})^2}}{n - 1}}$$` +$$ +\sigma = \sqrt{\frac{\sum\limits\_{i=1}^n{(x\_{i} - \overline{x})^2}}{n - 1}} +$$ -In this formula, you'd subtract the average across the sample values from each individual value, square it, and sum all these resulting values. This sum you would then divide by the size of the sample minus one (or the degrees of freedom), and take the square root of the whole thing. This will give the standard deviation (`$\sigma$`). Alright, let's now consider a study where we collected blood samples from a number of patients and measured for instance sodium levels in the blood. We don't have a control group for this study, but we know from medical textbooks that the reference value for sodium in healthy individuals for the age and sex distribution in our sample is for instance 2.5 mmol/L. Then we measure the sodium levels for 30 patients, we can simulate some fake measurements by generating a random sequence of values with a mean of 3 and a standard deviation of 1.2. +In this formula, you'd subtract the average across the sample values from each individual value, square it, and sum all these resulting values. This sum you would then divide by the size of the sample minus one (or the degrees of freedom), and take the square root of the whole thing. This will give the standard deviation (*σ*). Alright, let's now consider a study where we collected blood samples from a number of patients and measured for instance sodium levels in the blood. We don't have a control group for this study, but we know from medical textbooks that the reference value for sodium in healthy individuals for the age and sex distribution in our sample is for instance 2.5 mmol/L. Then we measure the sodium levels for 30 patients, we can simulate some fake measurements by generating a random sequence of values with a mean of 3 and a standard deviation of 1.2. 
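As a quick aside before the post's own simulation chunk below, the standard-deviation formula above can be sanity-checked against base R's `sd()`; this is just an illustrative sketch with made-up numbers, independent of the blood-sample example:

``` r
# a few made-up sample values, purely to check the formula against sd()
x <- c(2.1, 3.4, 2.9, 3.8, 2.6)
n_x <- length(x)

# square root of the summed squared deviations from the mean,
# divided by the degrees of freedom (n - 1)
sqrt(sum((x - mean(x))^2) / (n_x - 1))

# base R's sd() returns the identical value
sd(x)
```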
``` r ref_concentration <- 2.5 @@ -54,7 +55,9 @@ concentration <- rnorm(n, mean = 3, sd = 1.25) If we then implement these values in the formulas earlier, we get the following result for the standard deviation: -`$$\sigma = \sqrt{\frac{\sum\limits_{i=1}^n{|x_{i} - \overline{x}|^2}}{n - 1}} = \sqrt{\frac{\sum\limits_{i=1}^{30}{|concentration_{i} - 2.855|^2}}{30 - 1}} = 1.157$$` +$$ +\sigma = \sqrt{\frac{\sum\limits\_{i=1}^n{\|x\_{i} - \overline{x}\|^2}}{n - 1}} = \sqrt{\frac{\sum\limits\_{i=1}^{30}{\|concentration\_{i} - 2.855\|^2}}{30 - 1}} = 1.157 +$$ this formula would like like this when implemented in R: @@ -66,7 +69,9 @@ sqrt( sum(abs(concentration - mean(concentration))^2) / (n - 1) ) But of course in any normal setting, you'd use the `sd()` function, which will give the same result as the code above, but I just wanted to show it for illustrative purposes. Anywhere else I'll use the `sd()` function. Now let's calculate the *t*-value. In formula form this would look like this: -`$$t = \frac{\overline{x} - \mu}{\frac{\sigma}{\sqrt{n}}} = \frac{2.855 - 2.5}{\frac{1.157}{\sqrt{30}}} = 1.681$$` +$$ +t = \frac{\overline{x} - \mu}{\frac{\sigma}{\sqrt{n}}} = \frac{2.855 - 2.5}{\frac{1.157}{\sqrt{30}}} = 1.681 +$$ So just to over this formula again, you take the mean of your sample, subtract the reference number, and you divide this number by the standard deviation of your sample divided by the square root of the sample size. Or in R-terms: @@ -110,13 +115,13 @@ t.test(concentration - ref_concentration, mu = 0) mean of x 0.3550862 -So now back to the premise of this exercise, how is a T-test the same as a linear model? Like we showed before, subtracting the reference value from the sample values and adding that to a T-test comparing the values to 0 is equivalent to comparing the sample values to the reference value. Now let's consider what a linear model does. You might recall from high-school mathematics that the formula for a straight line is always some form of `$y = ax + c$`, the linear model formula is somewhat similar: +So now back to the premise of this exercise, how is a T-test the same as a linear model? Like we showed before, subtracting the reference value from the sample values and adding that to a T-test comparing the values to 0 is equivalent to comparing the sample values to the reference value. Now let's consider what a linear model does. You might recall from high-school mathematics that the formula for a straight line is always some form of *y* = \*a\*\*x* + *c\*, the linear model formula is somewhat similar: -`$$Y_{i} = \beta_{0} + \beta_{1}x + \epsilon_{i}$$` +*Y**i* = *β*0 + *β*1*x* + *ϵ**i* -In this formula `$Y_{i}$` is the dependent variable, `$x$` is the independent variable. `$\beta_{0}$` is equivalent to the intercept at the y-axis, similar to `$c$` in the formula for a straight line. `$\beta_{1}$` is the slope (equivalent to `$a$` in the formula earlier). Finally, the `$\epsilon_{i}$` is the random error term. +In this formula *Y**i* is the dependent variable, *x* is the independent variable. *β*0 is equivalent to the intercept at the y-axis, similar to *c* in the formula for a straight line. *β*1 is the slope (equivalent to *a* in the formula earlier). Finally, the *ϵ**i* is the random error term. -Now let's build the linear model. Remember that the formula for the linear model included this term: `$\beta_{1}x$`. In this case, since we only have one sample, we don't have any value to multiply our value to, so we multiply it by 1. 
If we wanted to correlate two variables, for instance concentration with age, we would substitute the 1 with a continuous variable, i.e. age, but in this case we correlate all sample values with 1. Since we still want to compare our value to 0, we subtract the reference value from our sample values like we did before for the `t.test()`. Let's build the linear model. +Now let's build the linear model. Remember that the formula for the linear model included this term: *β*1*x*. In this case, since we only have one sample, we don't have any value to multiply our value to, so we multiply it by 1. If we wanted to correlate two variables, for instance concentration with age, we would substitute the 1 with a continuous variable, i.e. age, but in this case we correlate all sample values with 1. Since we still want to compare our value to 0, we subtract the reference value from our sample values like we did before for the `t.test()`. Let's build the linear model. ``` r ost_model <- lm((concentration - ref_concentration) ~ 1) @@ -153,16 +158,18 @@ data <- tibble( The formula for an Two-Sample T-test is very similar to that of the One-Sample T-test, with the added factor of the second set of sample values. The formula for an Two-Sample T-test is as follows: -`$$t = \frac{(\overline{x_1} - \overline{x_2})}{\sqrt{\frac{\sigma_1^2}{n_1} + \frac{\sigma_2^2}{n_2}}} = \frac{(3.838073 - 5.455809)}{\sqrt{\frac{1.343565^2}{30} + \frac{1.69711^2}{30}}} = -4.093524$$` +$$ +t = \frac{(\overline{x_1} - \overline{x_2})}{\sqrt{\frac{\sigma_1^2}{n_1} + \frac{\sigma_2^2}{n_2}}} = \frac{(3.838073 - 5.455809)}{\sqrt{\frac{1.343565^2}{30} + \frac{1.69711^2}{30}}} = -4.093524 +$$ -It's a bit too complex to describe in a sentence, but the definitions are perhaps familiar: `$\overline{x}$` for group means, `$\sigma$` for group standard deviations, and `$n$` for group size. I find that the simplest way to implement this in R is by first separating the groups and then adding them in the formula. +It's a bit too complex to describe in a sentence, but the definitions are perhaps familiar: $\overline{x}$ for group means, *σ* for group standard deviations, and *n* for group size. I find that the simplest way to implement this in R is by first separating the groups and then adding them in the formula. ``` r -g1 <- data %>% - filter(group == "HC") %>% +g1 <- data |> + filter(group == "HC") |> pull(concentration) -g2 <- data %>% - filter(group == "PAT") %>% +g2 <- data |> + filter(group == "PAT") |> pull(concentration) (mean(g1) - mean(g2)) / sqrt( (sd(g1)^2 / length(g1)) + (sd(g2)^2 / length(g2)) ) @@ -205,8 +212,8 @@ ggplot(data, aes(x = group, y = concentration, fill = group)) + Great! Now we have a visual representation of the data. Now, since the T-test compares means, we can might also add a point indicating the mean for both groups. Let's look just at the jittered points and add a line connecting the two mean values. ``` r -mean_concentration <- data %>% - group_by(group) %>% +mean_concentration <- data |> + group_by(group) |> summarise(mean_conc = mean(concentration)) ggplot(data, aes(x = group)) + @@ -214,13 +221,10 @@ ggplot(data, aes(x = group)) + geom_point(data = mean_concentration, aes(y = mean_conc), color = "violet", size = 5) + geom_line(data = mean_concentration, aes(y = mean_conc), group = 1, - size = 2, color = "violet") + + linewidth = 2, color = "violet") + theme_minimal() ``` - Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0. - ℹ Please use `linewidth` instead. 
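The `size` to `linewidth` change in the hunk above (and the warning it drops from the rendered output) follows ggplot2 3.4.0, which deprecated `size` for controlling line thickness in line-based geoms. A minimal sketch of the new argument, using toy data rather than the post's `data` tibble:

``` r
library(ggplot2)

# toy data, purely for illustration
df <- data.frame(x = 1:10, y = (1:10)^2)

# ggplot2 >= 3.4.0: `linewidth` sets line thickness; passing `size` to
# geom_line() still works but emits the deprecation warning shown above
ggplot(df, aes(x = x, y = y)) +
  geom_line(linewidth = 2, color = "violet") +
  theme_minimal()
```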
- You might see where we are going with this. The parameters from the linear model will describe the angle of the diagonal line and I'll illustrate this further down. Let's get the values from the linear model: @@ -251,19 +255,38 @@ summary(tst_model) First of all, let's look at the `groupPAT`, there we find the same *t*-value as we did when we ran the T-tests earlier, although with the sign flipped. I'll show later why that is. -Now, back to the plot. The x-axis has two conditions: `HC` and `PAT`, but let's imagine those values are `0` and `1`. Let's now also throw back to the time we recalled the formula for a straight line: `$y = ax + c$`. In this context we only have two x-values, `HC` and `PAT` or `0` and `1`. Then we can obtain `$y$` in the formula by solving the equation when `$x$` is equal to `0`, in that case `$y$` becomes just the mean concentration of the healthy controls, or the big magenta dot in the previous plot, and that is a value we can calculate. Remember that `0` in the formula below stands for `HC`. That looks something like this: +Now, back to the plot. The x-axis has two conditions: `HC` and `PAT`, but let's imagine those values are `0` and `1`. Let's now also throw back to the time we recalled the formula for a straight line: $y = ax + c$. In this context we only have two x-values, `HC` and `PAT` or `0` and `1`. Then we can obtain *y* in the formula by solving the equation when *x* is equal to `0`, in that case *y* becomes just the mean concentration of the healthy controls, or the big magenta dot in the previous plot, and that is a value we can calculate. Remember that `0` in the formula below stands for `HC`. That looks something like this: -`$$ \begin{eqnarray} \overline{x}_{0} &=& a \times 0 + c \\ c &=& \overline{x}_{0} \\ &=& 4.162838 \end{eqnarray} $$` +$$ +\begin{eqnarray} +\overline{x}\_{0} &=& a \times 0 + c \newline +c &=& \overline{x}\_{0} \newline + &=& 4.162838 +\end{eqnarray} +$$ -So that's the constant our formula. If we look back at the output from the `lm()` function, we see that this value is represented as the `Estimate` of the `(Intercept)` row! Let's also solve `$a$`. Remember that `$a$` represents the slope of the line. How do we get the slope? The slope is basically nothing more than the difference between the mean values of `HC` and `PAT`, but let's solve it in a more elegant way, by using the same formula we used to find `$c$`. We'll use the same coding as before, `0` for `HC` and `1` for `PAT`. Remember that `$c$` is equal to the mean value of `HC` (aka `$\overline{x}_{0}$`). +So that's the constant in our formula. If we look back at the output from the `lm()` function, we see that this value is represented as the `Estimate` of the `(Intercept)` row! Let's also solve *a*. Remember that *a* represents the slope of the line. How do we get the slope? The slope is basically nothing more than the difference between the mean values of `HC` and `PAT`, but let's solve it in a more elegant way, by using the same formula we used to find *c*. We'll use the same coding as before, `0` for `HC` and `1` for `PAT`. Remember that *c* is equal to the mean value of `HC` (aka $\overline{x}_{0}$).
-`$$ \begin{eqnarray} \overline{x}_{1} &=& a \times 1 + c \\ &=& a + \overline{x}_{0} \\ a &=& \overline{x}_{1} - \overline{x}_{0} \\ &=& 6.746285 - 4.162838 \\ &=& 2.583447 \end{eqnarray} $$` +$$ +\begin{eqnarray} +\overline{x}\_{1} &=& a \times 1 + c \newline + &=& a + \overline{x}\_{0} \newline +a &=& \overline{x}\_{1} - \overline{x}\_{0} \newline + &=& 6.746285 - 4.162838 \newline + &=& 2.583447 +\end{eqnarray} +$$ -And then we find that `$a$` is equal to the `Estimate` column for the `groupPAT` row. +And then we find that *a* is equal to the `Estimate` column for the `groupPAT` row. We can reverse engineer the *t*-value too using just the output from the `lm()` function. One can imagine that if one would plot a situation where the null hypothesis (H0) is true, the slope of that line would be 0 since then there's no difference between the mean of the two groups (inb4 the angry statisticians: it's more complicated than that but let me use this for now). We'll take the difference between our observed slope, or the slope of the alternative hypothesis (H0), and the slope of the null hypothesis, which is 0, and divide that by the standard error of the sampling distribution, which is given by the `lm()` function as the `Std. Error` of the `groupPAT` row: -`$$t = \frac{slope\ of\ regression\ line\ at\ H_{a} - slope\ of\ regression\ line\ at\ H_{0}}{standard\ error\ of\ sampling\ distribution} = \frac{1.6177 - 0}{0.3952} = 4.093$$` +$$ +\begin{eqnarray} +t &=& \frac{slope~of~regression~line~at~H\_{a} - slope~of~regression~line~at~H\_{0}}{standard~error~of~sampling~distribution}\newline +&=& \frac{1.6177 - 0}{0.3952} = 4.093 +\end{eqnarray} +$$ Which as you'll notice is one thousandths-decimal place off, which is due to rounding errors. `lm()` reports up to 4 decimal points while it uses more for the calculation. And now we've come full circle, because the slope of the regression line is nothing more than the difference between the mean of the second group minor the mean of the first group. Now we can go back to the figure we made earlier and see how all these values relate: @@ -281,7 +304,7 @@ data <- tibble(score = round(c(rnorm(n, mean = 75, sd = 30), rnorm(n, mean = 60, sd = 35), rnorm(n, mean = 30, sd = 17), rnorm(n, mean = 45, sd = 32))), - group = rep(c("SCZ", "BD", "MDD", "ASD"), each = n)) %>% + group = rep(c("SCZ", "BD", "MDD", "ASD"), each = n)) |> mutate(group = as_factor(group)) ``` @@ -329,14 +352,14 @@ summary(anova_lm_model) The first thing you might notice is that the *F*-statistic and the *p*-value are the same in both models. ``` r -ref_mean <- data %>% - filter(group == "SCZ") %>% - pull(score) %>% +ref_mean <- data |> + filter(group == "SCZ") |> + pull(score) |> mean() -anova_group_means <- data %>% - group_by(group) %>% - summarise(score = mean(score)) %>% +anova_group_means <- data |> + group_by(group) |> + summarise(score = mean(score)) |> mutate(ref_mean = ref_mean, mean_adj = score - ref_mean) @@ -350,32 +373,31 @@ ggplot(data, aes(x = group, y = score - ref_mean)) + theme_minimal() ``` - Warning in do_once((if (is_R_CMD_check()) stop else warning)("The function - xfun::isFALSE() will be deprecated in the future. Please ", : The function - xfun::isFALSE() will be deprecated in the future. Please consider using - base::isFALSE(x) or identical(x, FALSE) instead. - Oh, would you look at that! The differences between the group means and the reference mean (in this case SCZ) correspond with the `Estimate` of the linear model! 
Let's also see if we can reproduce the sum of squares from the ANOVA summary. We'll go a bit more in depth into the sum of squares further down, but I just wanted to go over a few formulas and calculations: -`$$ \begin{eqnarray} total~sum~of~squares &=& \sum\limits_{j=1}^{J} n_{j} \times (\overline{x}_{j} - \mu)^2 \\ residual~sum~of~squares &=& \sum\limits_{j=1}^{J} \sum\limits_{i=1}^{n_{j}} (y_{ij} - \overline{y}_{j})^2 \end{eqnarray} $$` +$$ +\begin{eqnarray} +total~sum~of~squares &=& \sum\limits\_{j=1}^{J} n\_{j} \times (\overline{x}\_{j} - \mu)^2 \newline +residual~sum~of~squares &=& \sum\limits\_{j=1}^{J} \sum\limits\_{i=1}^{n\_{j}} (y\_{ij} - \overline{y}\_{j})^2 +\end{eqnarray} +$$ -Just briefly, the first formula takes the mean value for the group in question, subtracts the overall mean (or grand mean) and squares the result. Then it multiplies this number by the sample size in this group. In this case we'll only do it for the first group since that's the one listed in the `summary(aov_model)` output. -The second formula calculates the residual sum of squares (or sum of squared error), we'll come back to this later. In essence it substracts the group mean from each of the individual values, squares it, and sums it first within the group, and then sums it again across the groups. +Just briefly, the first formula takes the mean value for the group in question, subtracts the overall mean (or grand mean) and squares the result. Then it multiplies this number by the sample size in this group. In this case we'll only do it for the first group since that's the one listed in the `summary(aov_model)` output. The second formula calculates the residual sum of squares (or sum of squared error), we'll come back to this later. In essence it substracts the group mean from each of the individual values, squares it, and sums it first within the group, and then sums it again across the groups. We can do both calculations in one go with the following quick code: ``` r -data %>% - mutate(overall_mean = mean(score)) %>% - group_by(group) %>% +data |> + mutate(overall_mean = mean(score)) |> + group_by(group) |> summarise(group_mean = mean(score), group_n = n(), overall_mean = first(overall_mean), sq_group = group_n * (group_mean - overall_mean)^2, - sq_error = sum((score - group_mean)^2)) %>% - ungroup() %>% + sq_error = sum((score - group_mean)^2)) |> + ungroup() |> summarise(ss_group = sum(sq_group), ss_error = sum(sq_error)) ``` @@ -397,11 +419,11 @@ I don't think I need a lot of effort to convince anyone that a linear model is a n <- 20 data <- tibble(age = round(runif(n = n, min = 18, max = 60)), sex = factor(sample(c("Male", "Female"), size = n, replace = TRUE), - levels = c("Male", "Female"))) %>% + levels = c("Male", "Female"))) |> mutate(measure = 1e-2 * age + sqrt(1e-2) * rnorm(n)) ``` -We've used the formula for a straight line in previous sections (`$y = ax + c$`), we can apply it here too, but instead of the difference in the mean between two groups, the slope of the line (denoted by `$a$`) is now derived from the slope at which the line has the least distance to all points, referred to as the best fit. 
We will plot this later, but first we should maybe just run the linear model: +We've used the formula for a straight line in previous sections (*y* = \*a\*\*x* + *c*), we can apply it here too, but instead of the difference in the mean between two groups, the slope of the line (denoted by *a\*) is now derived from the slope at which the line has the least distance to all points, referred to as the best fit. We will plot this later, but first we should maybe just run the linear model: ``` r lm_model <- lm(measure ~ age, data = data) @@ -446,13 +468,15 @@ The line in the figure above shows the line that best fits the points with a rib Back to our data. We know that a linear models fits a line that "predicts" outcome based on some other variable. This is heavily simplified, but it'll make clear what we'll do next. So what we did before with the best fit line was create one line that best fits all the data points, but now we want to relate that back to our data points. What would our values be if they would be exactly on this line? To get this, all we have to do is calculate the difference between the current data point and the value of the best fit line at the corresponding "predictor" value. We could do it by hand, but since this section is quite long already, I'll skip straight to the R function, which is appropriately called `predict.lm()`. It takes the linear model we created with the `lm()` function earlier as input. It outputs a vector with the predicted values based on the model. ``` r -data <- data %>% +data <- data |> mutate(measure_pred = predict.lm(lm_model)) ``` With that we can quickly calculate the residual standard error (oversimplified, it's a measure of how well a regression model fits a dataset). The formula for the residual standard error is this: -`$$Residual~standard~error = \sqrt{\frac{\sum(observed - predicted)^2}{degrees~of~freedom}}$$` +$$ +Residual~standard~error = \sqrt{\frac{\sum(observed - predicted)^2}{degrees~of~freedom}} +$$ or in R terms (the degrees of freedom is 18 here, too complicated to explain for now): @@ -465,7 +489,7 @@ sqrt(sum((data$measure - data$measure_pred)^2) / 18) So that checks out. What we can then also do is calculate the difference between the observed and the predicted values values, this is called the residual: ``` r -data <- data %>% +data <- data |> mutate(residual = measure - measure_pred) ``` @@ -473,7 +497,7 @@ We can check that this is correct too by comparing the residuals we calculated w ``` r tibble(residual_manual = data$residual, - residual_lm = residuals(lm_model)) %>% + residual_lm = residuals(lm_model)) |> glimpse() ``` @@ -500,13 +524,20 @@ You might have noticed now that the size of the arrow is defined as the differen The "sum" part of "sum of squared error" refers to the sum of the areas of those squares. Simply, you sum the square of the sides. You can also look at it in mathematical form: -$$\sum(residual~or~difference~with~regression~line^2)$$ +$$ +\sum{(residual~or~difference~with~regression~line^2)} +$$ We'll use this formula again a bit later to calculate the R2. In order to calculate the squared regression coefficient, we should also calculate the mean value of the measure across all points. This is necessary because the squared regression coefficient is defined as a perfect correlation (i.e. 
a correlation coefficient of 1) minus the explained variance divided by the total variance, or in formula form: -`$$R^2 = perfect~correlation - \frac{explained~variance}{total~variance} = 1 - \frac{\sum(difference~with~regression~line^2)}{\sum(difference~with~mean~value^2)}$$` +$$ +\begin{eqnarray} +R^2 &=& perfect~correlation - \frac{explained~variance}{total~variance} \newline +&=& 1 - \frac{\sum(difference~with~regression~line^2)}{\sum(difference~with~mean~value^2)} +\end{eqnarray} +$$ Explained variance is defined here as the sum of squared error. You might notice the sum symbols and the squares, so you might guess that this formula is also some kind of sum of squares, and it is! As we already discovered, the numerator in this formula is the sum of squared error, the denominator is referred to as the sum of squared total. And the composite of those two is referred to as the sum of squared regression. Making three different sum of squares. @@ -519,14 +550,16 @@ We've already plotted the sum of squared error, now we'll also illustrate sum of We already calculated the difference with the regression line, then to calculate the difference with the mean is simple: ``` r -data <- data %>% +data <- data |> mutate(measure_mean = mean(measure), difference_mean = measure - measure_mean) ``` Sidenote, if you wanted to calculate the total variance the formula for that would look like this: -`$$S^2 = \frac{\sum(x_{i} - \overline{x})^2}{n - 1}$$` +$$ +S^2 = \frac{\sum(x\_{i} - \overline{x})^2}{n - 1} +$$ Notice how the numerator is the same calculation as the sum of squared total, then divided by the sample size minus 1 (like the degrees of freedom). @@ -561,48 +594,41 @@ Session info for reproducibility purposes sessionInfo() ``` - R version 4.2.1 (2022-06-23) - Platform: x86_64-apple-darwin17.0 (64-bit) - Running under: macOS Big Sur ... 
10.16 + R version 4.3.0 (2023-04-21) + Platform: x86_64-apple-darwin20 (64-bit) + Running under: macOS 14.0 Matrix products: default - BLAS: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRblas.0.dylib - LAPACK: /Library/Frameworks/R.framework/Versions/4.2/Resources/lib/libRlapack.dylib + BLAS: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRblas.0.dylib + LAPACK: /Library/Frameworks/R.framework/Versions/4.3-x86_64/Resources/lib/libRlapack.dylib; LAPACK version 3.11.0 locale: [1] en_US.UTF-8/en_US.UTF-8/en_US.UTF-8/C/en_US.UTF-8/en_US.UTF-8 + time zone: Europe/Oslo + tzcode source: internal + attached base packages: - [1] stats graphics grDevices utils datasets methods base + [1] stats graphics grDevices datasets utils methods base other attached packages: - [1] patchwork_1.1.2 forcats_0.5.2 stringr_1.5.0 dplyr_1.0.10 - [5] purrr_0.3.5 readr_2.1.3 tidyr_1.2.1 tibble_3.2.1 - [9] ggplot2_3.4.2 tidyverse_1.3.2 + [1] patchwork_1.1.2 lubridate_1.9.2 forcats_1.0.0 stringr_1.5.0 + [5] dplyr_1.1.3 purrr_1.0.2 readr_2.1.4 tidyr_1.3.0 + [9] tibble_3.2.1 ggplot2_3.4.3 tidyverse_2.0.0 loaded via a namespace (and not attached): - [1] Rcpp_1.0.10 lattice_0.20-45 lubridate_1.8.0 - [4] assertthat_0.2.1 digest_0.6.31 utf8_1.2.3 - [7] R6_2.5.1 cellranger_1.1.0 backports_1.4.1 - [10] reprex_2.0.2 evaluate_0.21 httr_1.4.4 - [13] pillar_1.9.0 rlang_1.1.1 googlesheets4_1.0.1 - [16] readxl_1.4.1 rstudioapi_0.14 Matrix_1.5-1 - [19] rmarkdown_2.22 splines_4.2.1 labeling_0.4.2 - [22] googledrive_2.0.0 munsell_0.5.0 gridtext_0.1.5 - [25] broom_1.0.1 compiler_4.2.1 modelr_0.1.9 - [28] xfun_0.39 pkgconfig_2.0.3 mgcv_1.8-40 - [31] scico_1.3.1 htmltools_0.5.5 ggtext_0.1.2 - [34] tidyselect_1.2.0 fansi_1.0.4 crayon_1.5.2 - [37] tzdb_0.3.0 dbplyr_2.2.1 withr_2.5.0 - [40] commonmark_1.8.1 grid_4.2.1 nlme_3.1-157 - [43] jsonlite_1.8.5 gtable_0.3.3 lifecycle_1.0.3 - [46] DBI_1.1.3 magrittr_2.0.3 scales_1.2.1 - [49] cli_3.6.1 stringi_1.7.12 farver_2.1.1 - [52] fs_1.6.2 xml2_1.3.3 ellipsis_0.3.2 - [55] generics_0.1.3 vctrs_0.6.2 tools_4.2.1 - [58] glue_1.6.2 markdown_1.3 hms_1.1.2 - [61] fastmap_1.1.1 yaml_2.3.7 colorspace_2.1-0 - [64] gargle_1.2.1 rvest_1.0.3 knitr_1.43 - [67] haven_2.5.1 + [1] utf8_1.2.3 generics_0.1.3 renv_0.17.3 xml2_1.3.4 + [5] lattice_0.21-9 stringi_1.7.12 hms_1.1.3 digest_0.6.33 + [9] magrittr_2.0.3 evaluate_0.22 grid_4.3.0 timechange_0.2.0 + [13] fastmap_1.1.1 Matrix_1.6-1.1 jsonlite_1.8.7 ggtext_0.1.2 + [17] mgcv_1.9-0 fansi_1.0.4 scales_1.2.1 scico_1.4.0 + [21] cli_3.6.1 rlang_1.1.1 splines_4.3.0 commonmark_1.9.0 + [25] munsell_0.5.0 withr_2.5.1 yaml_2.3.7 tools_4.3.0 + [29] tzdb_0.4.0 colorspace_2.1-0 vctrs_0.6.3 R6_2.5.1 + [33] lifecycle_1.0.3 pkgconfig_2.0.3 pillar_1.9.0 gtable_0.3.4 + [37] glue_1.6.2 Rcpp_1.0.11 xfun_0.40 tidyselect_1.2.0 + [41] rstudioapi_0.14 knitr_1.44 farver_2.1.1 nlme_3.1-163 + [45] htmltools_0.5.6 rmarkdown_2.25 labeling_0.4.3 compiler_4.3.0 + [49] markdown_1.7 gridtext_0.1.5 diff --git a/content/blog/2022-everything-is-a-linear-model/index.qmd b/content/blog/2022-everything-is-a-linear-model/index.qmd index 92043ae..2591a27 100644 --- a/content/blog/2022-everything-is-a-linear-model/index.qmd +++ b/content/blog/2022-everything-is-a-linear-model/index.qmd @@ -1,7 +1,7 @@ --- title: Everything is a Linear Model -author: Daniel Roelfs -date: "2022-03-18" +date: 2022-03-18 +description: Everything is a Linear Model slug: everything-is-a-linear-model categories: - statistics @@ -9,9 +9,6 @@ tags: - statistics - R - data visualization 
-description: "Everything is a Linear Model" -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -38,15 +35,19 @@ set.seed(2022) # One-Sample T-test -Let's start simple with the One-Sample T-test. This test can be used to test how the mean value of your sample measure differs from a reference number. Throughout this page, I'll throw around a bunch of formulas, which, dependending on your background, can either be informative or confusing. The formula for a One-Sample T-test is: +Let's start simple with the One-Sample T-test. This test can be used to test how the mean value of your sample measure differs from a reference number. Throughout this page, I'll throw around a bunch of formulas, which, depending on your background, can either be informative or confusing. The formula for a One-Sample T-test is: -`$$t = \frac{\overline{x} - \mu}{\frac{\sigma}{\sqrt{n}}} = \frac{sample~mean - population~mean}{\frac{standard~deviation}{\sqrt{sample~size}}}$$` +$$ +t = \frac{\overline{x} - \mu}{\frac{\sigma}{\sqrt{n}}} = \frac{sample~mean - population~mean}{\frac{standard~deviation}{\sqrt{sample~size}}} +$$ -What this says is that the effect size (_t_) is equal to the sample mean minus the population mean (or reference number) and you divide it by the standard deviation of the sample divided by the square root of the sample size. This formula will output the _t_-value that you would usually report when doing a T-test. The formula requires the standard deviation (`$\sigma$`) of the sample values, which is: +What this says is that the effect size (*t*) is equal to the sample mean minus the population mean (or reference number) and you divide it by the standard deviation of the sample divided by the square root of the sample size. This formula will output the *t*-value that you would usually report when doing a T-test. The formula requires the standard deviation (*σ*) of the sample values, which is: -`$$\sigma = \sqrt{\frac{\sum\limits_{i=1}^n{(x_{i} - \overline{x})^2}}{n - 1}}$$` +$$ +\sigma = \sqrt{\frac{\sum\limits\_{i=1}^n{(x\_{i} - \overline{x})^2}}{n - 1}} +$$ -In this formula, you'd subtract the average across the sample values from each individual value, square it, and sum all these resulting values. This sum you would then divide by the size of the sample minus one (or the degrees of freedom), and take the square root of the whole thing. This will give the standard deviation (`$\sigma$`). Alright, let's now consider a study where we collected blood samples from a number of patients and measured for instance sodium levels in the blood. We don't have a control group for this study, but we know from medical textbooks that the reference value for sodium in healthy individuals for the age and sex distribution in our sample is for instance 2.5 mmol/L. Then we measure the sodium levels for 30 patients, we can simulate some fake measurements by generating a random sequence of values with a mean of 3 and a standard deviation of 1.2. +In this formula, you'd subtract the average across the sample values from each individual value, square it, and sum all these resulting values. This sum you would then divide by the size of the sample minus one (or the degrees of freedom), and take the square root of the whole thing. This will give the standard deviation (*σ*). Alright, let's now consider a study where we collected blood samples from a number of patients and measured for instance sodium levels in the blood. 
We don't have a control group for this study, but we know from medical textbooks that the reference value for sodium in healthy individuals for the age and sex distribution in our sample is for instance 2.5 mmol/L. Then we measure the sodium levels for 30 patients, we can simulate some fake measurements by generating a random sequence of values with a mean of 3 and a standard deviation of 1.2. ```{r} #| label: ost-setup @@ -59,7 +60,9 @@ concentration <- rnorm(n, mean = 3, sd = 1.25) If we then implement these values in the formulas earlier, we get the following result for the standard deviation: -`$$\sigma = \sqrt{\frac{\sum\limits_{i=1}^n{|x_{i} - \overline{x}|^2}}{n - 1}} = \sqrt{\frac{\sum\limits_{i=1}^{30}{|concentration_{i} - 2.855|^2}}{30 - 1}} = 1.157$$` +$$ +\sigma = \sqrt{\frac{\sum\limits\_{i=1}^n{\|x\_{i} - \overline{x}\|^2}}{n - 1}} = \sqrt{\frac{\sum\limits\_{i=1}^{30}{\|concentration\_{i} - 2.855\|^2}}{30 - 1}} = 1.157 +$$ this formula would like like this when implemented in R: @@ -69,9 +72,11 @@ this formula would like like this when implemented in R: sqrt( sum(abs(concentration - mean(concentration))^2) / (n - 1) ) ``` -But of course in any normal setting, you'd use the `sd()` function, which will give the same result as the code above, but I just wanted to show it for illustrative purposes. Anywhere else I'll use the `sd()` function. Now let's calculate the _t_-value. In formula form this would look like this: +But of course in any normal setting, you'd use the `sd()` function, which will give the same result as the code above, but I just wanted to show it for illustrative purposes. Anywhere else I'll use the `sd()` function. Now let's calculate the *t*-value. In formula form this would look like this: -`$$t = \frac{\overline{x} - \mu}{\frac{\sigma}{\sqrt{n}}} = \frac{2.855 - 2.5}{\frac{1.157}{\sqrt{30}}} = 1.681$$` +$$ +t = \frac{\overline{x} - \mu}{\frac{\sigma}{\sqrt{n}}} = \frac{2.855 - 2.5}{\frac{1.157}{\sqrt{30}}} = 1.681 +$$ So just to over this formula again, you take the mean of your sample, subtract the reference number, and you divide this number by the standard deviation of your sample divided by the square root of the sample size. Or in R-terms: @@ -81,7 +86,7 @@ So just to over this formula again, you take the mean of your sample, subtract t (mean(concentration) - ref_concentration) / (sd(concentration) / sqrt(n)) ``` -Now we can compare this to the `t.test()` function and then we'd find the same _t_-value (barring some rounding and digit cutoffs). In this function, since we're not comparing two samples, we set the population mean (`mu`) we want to compare to as the reference concentration (the default value for a One-Sample T-test is 0). What the `mu` option does is nothing else than subtract the reference value from all values. By doing this it centers all the values relative to 0, so if we'd run `t.test(concentration - ref_concentration)`, we'd get the same result, obviously with a different mean and the values of the confidence interval have changed, although the range stays the same. +Now we can compare this to the `t.test()` function and then we'd find the same *t*-value (barring some rounding and digit cutoffs). In this function, since we're not comparing two samples, we set the population mean (`mu`) we want to compare to as the reference concentration (the default value for a One-Sample T-test is 0). What the `mu` option does is nothing else than subtract the reference value from all values. 
By doing this it centers all the values relative to 0, so if we'd run `t.test(concentration - ref_concentration)`, we'd get the same result, obviously with a different mean and the values of the confidence interval have changed, although the range stays the same. ```{r} #| label: ost-test @@ -90,13 +95,13 @@ t.test(concentration, mu = ref_concentration) t.test(concentration - ref_concentration, mu = 0) ``` -So now back to the premise of this exercise, how is a T-test the same as a linear model? Like we showed before, subtracting the reference value from the sample values and adding that to a T-test comparing the values to 0 is equivalent to comparing the sample values to the reference value. Now let's consider what a linear model does. You might recall from high-school mathematics that the formula for a straight line is always some form of `$y = ax + c$`, the linear model formula is somewhat similar: +So now back to the premise of this exercise, how is a T-test the same as a linear model? Like we showed before, subtracting the reference value from the sample values and adding that to a T-test comparing the values to 0 is equivalent to comparing the sample values to the reference value. Now let's consider what a linear model does. You might recall from high-school mathematics that the formula for a straight line is always some form of *y* = *a**x* + *c*, the linear model formula is somewhat similar: -`$$Y_{i} = \beta_{0} + \beta_{1}x + \epsilon_{i}$$` +*Y**i* = *β*0 + *β*1*x* + *ϵ**i* -In this formula `$Y_{i}$` is the dependent variable, `$x$` is the independent variable. `$\beta_{0}$` is equivalent to the intercept at the y-axis, similar to `$c$` in the formula for a straight line. `$\beta_{1}$` is the slope (equivalent to `$a$` in the formula earlier). Finally, the `$\epsilon_{i}$` is the random error term. +In this formula *Y**i* is the dependent variable, *x* is the independent variable. *β*0 is equivalent to the intercept at the y-axis, similar to *c* in the formula for a straight line. *β*1 is the slope (equivalent to *a* in the formula earlier). Finally, the *ϵ**i* is the random error term. -Now let's build the linear model. Remember that the formula for the linear model included this term: `$\beta_{1}x$`. In this case, since we only have one sample, we don't have any value to multiply our value to, so we multiply it by 1. If we wanted to correlate two variables, for instance concentration with age, we would substitute the 1 with a continuous variable, i.e. age, but in this case we correlate all sample values with 1. Since we still want to compare our value to 0, we subtract the reference value from our sample values like we did before for the `t.test()`. Let's build the linear model. +Now let's build the linear model. Remember that the formula for the linear model included this term: *β*1*x*. In this case, since we only have one sample, we don't have any value to multiply our value to, so we multiply it by 1. If we wanted to correlate two variables, for instance concentration with age, we would substitute the 1 with a continuous variable, i.e. age, but in this case we correlate all sample values with 1. Since we still want to compare our value to 0, we subtract the reference value from our sample values like we did before for the `t.test()`. Let's build the linear model. ```{r} #| label: ost-lm @@ -105,7 +110,7 @@ ost_model <- lm((concentration - ref_concentration) ~ 1) summary(ost_model) ``` -Again, we find the same _t_- and _p_-value as when we ran the `t.test()`! 
How exciting is that! We now have three ways to obtain the same values. Later I'll go into what the `Residuals`, `Estimate` and `Std. Error` mean when running comparing group means with a linear model. +Again, we find the same *t*- and *p*-value as when we ran the `t.test()`! How exciting is that! We now have three ways to obtain the same values. Later I'll go into what the `Residuals`, `Estimate` and `Std. Error` mean when running comparing group means with a linear model. # Two-Sample T-test @@ -123,18 +128,20 @@ data <- tibble( The formula for an Two-Sample T-test is very similar to that of the One-Sample T-test, with the added factor of the second set of sample values. The formula for an Two-Sample T-test is as follows: -`$$t = \frac{(\overline{x_1} - \overline{x_2})}{\sqrt{\frac{\sigma_1^2}{n_1} + \frac{\sigma_2^2}{n_2}}} = \frac{(3.838073 - 5.455809)}{\sqrt{\frac{1.343565^2}{30} + \frac{1.69711^2}{30}}} = -4.093524$$` +$$ +t = \frac{(\overline{x_1} - \overline{x_2})}{\sqrt{\frac{\sigma_1^2}{n_1} + \frac{\sigma_2^2}{n_2}}} = \frac{(3.838073 - 5.455809)}{\sqrt{\frac{1.343565^2}{30} + \frac{1.69711^2}{30}}} = -4.093524 +$$ -It's a bit too complex to describe in a sentence, but the definitions are perhaps familiar: `$\overline{x}$` for group means, `$\sigma$` for group standard deviations, and `$n$` for group size. I find that the simplest way to implement this in R is by first separating the groups and then adding them in the formula. +It's a bit too complex to describe in a sentence, but the definitions are perhaps familiar: $\overline{x}$ for group means, *σ* for group standard deviations, and *n* for group size. I find that the simplest way to implement this in R is by first separating the groups and then adding them in the formula. ```{r} #| label: tst-t-calc -g1 <- data %>% - filter(group == "HC") %>% +g1 <- data |> + filter(group == "HC") |> pull(concentration) -g2 <- data %>% - filter(group == "PAT") %>% +g2 <- data |> + filter(group == "PAT") |> pull(concentration) (mean(g1) - mean(g2)) / sqrt( (sd(g1)^2 / length(g1)) + (sd(g2)^2 / length(g2)) ) @@ -148,7 +155,7 @@ Then running the regular T-test is easy. t.test(g1, g2) ``` -Look at that! We find the same _t_-value! Before we move on to the linear model, I first want to do some plotting, it will help us visualize how the linear model applies here later. Let's make a boxplot: +Look at that! We find the same *t*-value! Before we move on to the linear model, I first want to do some plotting, it will help us visualize how the linear model applies here later. Let's make a boxplot: ```{r} #| label: tst-boxplot @@ -167,8 +174,8 @@ Great! Now we have a visual representation of the data. Now, since the T-test co ```{r} #| label: tst-plot -mean_concentration <- data %>% - group_by(group) %>% +mean_concentration <- data |> + group_by(group) |> summarise(mean_conc = mean(concentration)) ggplot(data, aes(x = group)) + @@ -176,7 +183,7 @@ ggplot(data, aes(x = group)) + geom_point(data = mean_concentration, aes(y = mean_conc), color = "violet", size = 5) + geom_line(data = mean_concentration, aes(y = mean_conc), group = 1, - size = 2, color = "violet") + + linewidth = 2, color = "violet") + theme_minimal() ``` @@ -189,35 +196,40 @@ tst_model <- lm(concentration ~ group, data = data) summary(tst_model) ``` -First of all, let's look at the `groupPAT`, there we find the same _t_-value as we did when we ran the T-tests earlier, although with the sign flipped. I'll show later why that is. 
+First of all, let's look at the `groupPAT`, there we find the same *t*-value as we did when we ran the T-tests earlier, although with the sign flipped. I'll show later why that is. -Now, back to the plot. The x-axis has two conditions: `HC` and `PAT`, but let's imagine those values are `0` and `1`. Let's now also throw back to the time we recalled the formula for a straight line: `$y = ax + c$`. In this context we only have two x-values, `HC` and `PAT` or `0` and `1`. Then we can obtain `$y$` in the formula by solving the equation when `$x$` is equal to `0`, in that case `$y$` becomes just the mean concentration of the healthy controls, or the big magenta dot in the previous plot, and that is a value we can calculate. Remember that `0` in the formula below stands for `HC`. That looks something like this: +Now, back to the plot. The x-axis has two conditions: `HC` and `PAT`, but let's imagine those values are `0` and `1`. Let's now also throw back to the time we recalled the formula for a straight line: *y* = *a**x* + *c*. In this context we only have two x-values, `HC` and `PAT` or `0` and `1`. Then we can obtain *y* in the formula by solving the equation when *x* is equal to `0`, in that case *y* becomes just the mean concentration of the healthy controls, or the big magenta dot in the previous plot, and that is a value we can calculate. Remember that `0` in the formula below stands for `HC`. That looks something like this: -`$$ +$$ \begin{eqnarray} -\overline{x}_{0} &=& a \times 0 + c \\ -c &=& \overline{x}_{0} \\ +\overline{x}\_{0} &=& a \times 0 + c \newline +c &=& \overline{x}\_{0} \newline &=& 4.162838 \end{eqnarray} -$$` +$$ -So that's the constant our formula. If we look back at the output from the `lm()` function, we see that this value is represented as the `Estimate` of the `(Intercept)` row! Let's also solve `$a$`. Remember that `$a$` represents the slope of the line. How do we get the slope? The slope is basically nothing more than the difference between the mean values of `HC` and `PAT`, but let's solve it in a more elegant way, by using the same formula we used to find `$c$`. We'll use the same coding as before, `0` for `HC` and `1` for `PAT`. Remember that `$c$` is equal to the mean value of `HC` (aka `$\overline{x}_{0}$`). +So that's the constant our formula. If we look back at the output from the `lm()` function, we see that this value is represented as the `Estimate` of the `(Intercept)` row! Let's also solve *a*. Remember that *a* represents the slope of the line. How do we get the slope? The slope is basically nothing more than the difference between the mean values of `HC` and `PAT`, but let's solve it in a more elegant way, by using the same formula we used to find *c*. We'll use the same coding as before, `0` for `HC` and `1` for `PAT`. Remember that *c* is equal to the mean value of `HC` (aka $\overline{x}_{0}$). -`$$ +$$ \begin{eqnarray} -\overline{x}_{1} &=& a \times 1 + c \\ - &=& a + \overline{x}_{0} \\ -a &=& \overline{x}_{1} - \overline{x}_{0} \\ - &=& 6.746285 - 4.162838 \\ +\overline{x}\_{1} &=& a \times 1 + c \newline + &=& a + \overline{x}\_{0} \newline +a &=& \overline{x}\_{1} - \overline{x}\_{0} \newline + &=& 6.746285 - 4.162838 \newline &=& 2.583447 \end{eqnarray} -$$` +$$ -And then we find that `$a$` is equal to the `Estimate` column for the `groupPAT` row. +And then we find that *a* is equal to the `Estimate` column for the `groupPAT` row. -We can reverse engineer the _t_-value too using just the output from the `lm()` function. 
One can imagine that if one would plot a situation where the null hypothesis (H0) is true, the slope of that line would be 0 since then there's no difference between the mean of the two groups (inb4 the angry statisticians: it's more complicated than that but let me use this for now). We'll take the difference between our observed slope, or the slope of the alternative hypothesis (H0), and the slope of the null hypothesis, which is 0, and divide that by the standard error of the sampling distribution, which is given by the `lm()` function as the `Std. Error` of the `groupPAT` row: +We can reverse engineer the *t*-value too using just the output from the `lm()` function. One can imagine that if one would plot a situation where the null hypothesis (H0) is true, the slope of that line would be 0 since then there's no difference between the mean of the two groups (inb4 the angry statisticians: it's more complicated than that but let me use this for now). We'll take the difference between our observed slope, or the slope of the alternative hypothesis (H0), and the slope of the null hypothesis, which is 0, and divide that by the standard error of the sampling distribution, which is given by the `lm()` function as the `Std. Error` of the `groupPAT` row: -`$$t = \frac{slope\ of\ regression\ line\ at\ H_{a} - slope\ of\ regression\ line\ at\ H_{0}}{standard\ error\ of\ sampling\ distribution} = \frac{1.6177 - 0}{0.3952} = 4.093$$` +$$ +\begin{eqnarray} +t &=& \frac{slope~of~regression~line~at~H\_{a} - slope~of~regression~line~at~H\_{0}}{standard~error~of~sampling~distribution}\newline +&=& \frac{1.6177 - 0}{0.3952} = 4.093 +\end{eqnarray} +$$ Which as you'll notice is one thousandths-decimal place off, which is due to rounding errors. `lm()` reports up to 4 decimal points while it uses more for the calculation. And now we've come full circle, because the slope of the regression line is nothing more than the difference between the mean of the second group minor the mean of the first group. Now we can go back to the figure we made earlier and see how all these values relate: @@ -230,11 +242,11 @@ ggplot(data, aes(x = group)) + geom_point(data = mean_concentration, aes(y = mean_conc), color = "violet", size = 5) + geom_line(data = mean_concentration, aes(y = mean_conc), group = 1, - size = 2, color = "violet") + + linewidth = 2, color = "violet") + geom_segment(data = NULL, aes(x = 0.4, xend = 0.925, y = mean(g1), yend = mean(g1)), - color = "grey", size = 0.2) + + color = "grey", linewidth = 0.2) + geom_segment(data = NULL, aes(x = 0.4, xend = 1.925, y = mean(g2), yend = mean(g2)), - color = "grey", size = 0.2) + + color = "grey", linewidth = 0.2) + geom_text(data = data.frame(), aes(x = 1.45, y = 0.18 + (mean(g1) + mean(g2))/2), label = "a = 1.6177", angle = 21) + scale_y_continuous(breaks = round(c(seq(2.5,10,2.5), mean(g1), mean(g2)), 4), @@ -256,7 +268,7 @@ data <- tibble(score = round(c(rnorm(n, mean = 75, sd = 30), rnorm(n, mean = 60, sd = 35), rnorm(n, mean = 30, sd = 17), rnorm(n, mean = 45, sd = 32))), - group = rep(c("SCZ", "BD", "MDD", "ASD"), each = n)) %>% + group = rep(c("SCZ", "BD", "MDD", "ASD"), each = n)) |> mutate(group = as_factor(group)) ``` @@ -278,19 +290,19 @@ anova_lm_model <- lm(score ~ group, data = data) summary(anova_lm_model) ``` -The first thing you might notice is that the _F_-statistic and the _p_-value are the same in both models. +The first thing you might notice is that the *F*-statistic and the *p*-value are the same in both models. 
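One way to make that equivalence concrete: calling `anova()` on the `lm()` fit rebuilds the classic ANOVA table. A small sketch, assuming the `data` tibble with `score` and `group` constructed above (the fits mirror the post's `aov_model` and `anova_lm_model`):

``` r
# both fits describe the same one-way design: score explained by group
aov_model <- aov(score ~ group, data = data)
anova_lm_model <- lm(score ~ group, data = data)

summary(aov_model)     # Df, Sum Sq, Mean Sq, F value, Pr(>F)
anova(anova_lm_model)  # the same table, derived from the linear model fit
```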
```{r} #| label: aov-plot -ref_mean <- data %>% - filter(group == "SCZ") %>% - pull(score) %>% +ref_mean <- data |> + filter(group == "SCZ") |> + pull(score) |> mean() -anova_group_means <- data %>% - group_by(group) %>% - summarise(score = mean(score)) %>% +anova_group_means <- data |> + group_by(group) |> + summarise(score = mean(score)) |> mutate(ref_mean = ref_mean, mean_adj = score - ref_mean) @@ -306,30 +318,29 @@ ggplot(data, aes(x = group, y = score - ref_mean)) + Oh, would you look at that! The differences between the group means and the reference mean (in this case SCZ) correspond with the `Estimate` of the linear model! Let's also see if we can reproduce the sum of squares from the ANOVA summary. We'll go a bit more in depth into the sum of squares further down, but I just wanted to go over a few formulas and calculations: -`$$ +$$ \begin{eqnarray} -total~sum~of~squares &=& \sum\limits_{j=1}^{J} n_{j} \times (\overline{x}_{j} - \mu)^2 \\ -residual~sum~of~squares &=& \sum\limits_{j=1}^{J} \sum\limits_{i=1}^{n_{j}} (y_{ij} - \overline{y}_{j})^2 +total~sum~of~squares &=& \sum\limits\_{j=1}^{J} n\_{j} \times (\overline{x}\_{j} - \mu)^2 \newline +residual~sum~of~squares &=& \sum\limits\_{j=1}^{J} \sum\limits\_{i=1}^{n\_{j}} (y\_{ij} - \overline{y}\_{j})^2 \end{eqnarray} -$$` +$$ -Just briefly, the first formula takes the mean value for the group in question, subtracts the overall mean (or grand mean) and squares the result. Then it multiplies this number by the sample size in this group. In this case we'll only do it for the first group since that's the one listed in the `summary(aov_model)` output. -The second formula calculates the residual sum of squares (or sum of squared error), we'll come back to this later. In essence it substracts the group mean from each of the individual values, squares it, and sums it first within the group, and then sums it again across the groups. +Just briefly, the first formula takes the mean value for the group in question, subtracts the overall mean (or grand mean) and squares the result. Then it multiplies this number by the sample size in this group. In this case we'll only do it for the first group since that's the one listed in the `summary(aov_model)` output. The second formula calculates the residual sum of squares (or sum of squared error), we'll come back to this later. In essence it substracts the group mean from each of the individual values, squares it, and sums it first within the group, and then sums it again across the groups. We can do both calculations in one go with the following quick code: ```{r} #| label: aov-calc-ss -data %>% - mutate(overall_mean = mean(score)) %>% - group_by(group) %>% +data |> + mutate(overall_mean = mean(score)) |> + group_by(group) |> summarise(group_mean = mean(score), group_n = n(), overall_mean = first(overall_mean), sq_group = group_n * (group_mean - overall_mean)^2, - sq_error = sum((score - group_mean)^2)) %>% - ungroup() %>% + sq_error = sum((score - group_mean)^2)) |> + ungroup() |> summarise(ss_group = sum(sq_group), ss_error = sum(sq_error)) ``` @@ -338,7 +349,7 @@ Now look back at the output from `summary(aov_model)` and we'll find the same va # A linear model is a linear model -Well that's a statement of unequaled wisdom, isn't it? No wonder they give us doctorates to talk about this stuff. +Well that's a statement of unequaled wisdom, isn't it? No wonder they give us doctorates to talk about this stuff. 
I don't think I need a lot of effort to convince anyone that a linear model is a linear model. Actually, I'm so convinced that you are aware that a linear model is a linear model that I wanted to talk about something else instead. Instead, I wanted to dive into residuals and R2. Before we start, let's first simulate some data. We'll create an age column, a sex column, and a measure column. We'll make it so that the measure column correlates with the age column. @@ -348,11 +359,11 @@ I don't think I need a lot of effort to convince anyone that a linear model is a n <- 20 data <- tibble(age = round(runif(n = n, min = 18, max = 60)), sex = factor(sample(c("Male", "Female"), size = n, replace = TRUE), - levels = c("Male", "Female"))) %>% + levels = c("Male", "Female"))) |> mutate(measure = 1e-2 * age + sqrt(1e-2) * rnorm(n)) ``` -We've used the formula for a straight line in previous sections (`$y = ax + c$`), we can apply it here too, but instead of the difference in the mean between two groups, the slope of the line (denoted by `$a$`) is now derived from the slope at which the line has the least distance to all points, referred to as the best fit. We will plot this later, but first we should maybe just run the linear model: +We've used the formula for a straight line in previous sections (*y* = *a**x* + *c*), we can apply it here too, but instead of the difference in the mean between two groups, the slope of the line (denoted by *a*) is now derived from the slope at which the line has the least distance to all points, referred to as the best fit. We will plot this later, but first we should maybe just run the linear model: ```{r} #| label: lm @@ -373,20 +384,22 @@ ggplot(data, aes(x = age, y = measure)) + theme_minimal() ``` -The line in the figure above shows the line that best fits the points with a ribbon indicating the standard error. +The line in the figure above shows the line that best fits the points with a ribbon indicating the standard error. -Back to our data. We know that a linear models fits a line that "predicts" outcome based on some other variable. This is heavily simplified, but it'll make clear what we'll do next. So what we did before with the best fit line was create one line that best fits all the data points, but now we want to relate that back to our data points. What would our values be if they would be exactly on this line? To get this, all we have to do is calculate the difference between the current data point and the value of the best fit line at the corresponding "predictor" value. We could do it by hand, but since this section is quite long already, I'll skip straight to the R function, which is appropriately called `predict.lm()`. It takes the linear model we created with the `lm()` function earlier as input. It outputs a vector with the predicted values based on the model. +Back to our data. We know that a linear model fits a line that "predicts" the outcome based on some other variable. This is heavily simplified, but it'll make clear what we'll do next. So what we did before with the best fit line was create one line that best fits all the data points, but now we want to relate that back to our data points. What would our values be if they would be exactly on this line? To get this, all we have to do is calculate the difference between the current data point and the value of the best fit line at the corresponding "predictor" value. 
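As a quick illustration of what "on the line" means, here's a by-hand sketch for a single data point. It assumes the fitted object is called `lm_model` and uses the formula `measure ~ age`, as in the plot above.

```{r}
# Sketch: predicted value for the first row, straight from the fitted coefficients
b <- coef(lm_model)  # named vector with "(Intercept)" and "age"

pred_first <- b[["(Intercept)"]] + b[["age"]] * data$age[1]
data$measure[1] - pred_first  # difference between the observed and the predicted value
```

Doing this for every row is exactly what the function below automates.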
We could do it by hand, but since this section is quite long already, I'll skip straight to the R function, which is appropriately called `predict.lm()`. It takes the linear model we created with the `lm()` function earlier as input. It outputs a vector with the predicted values based on the model. ```{r} #| label: lm-predict -data <- data %>% +data <- data |> mutate(measure_pred = predict.lm(lm_model)) ``` With that we can quickly calculate the residual standard error (oversimplified, it's a measure of how well a regression model fits a dataset). The formula for the residual standard error is this: -`$$Residual~standard~error = \sqrt{\frac{\sum(observed - predicted)^2}{degrees~of~freedom}}$$` +$$ +Residual~standard~error = \sqrt{\frac{\sum(observed - predicted)^2}{degrees~of~freedom}} +$$ or in R terms (the degrees of freedom is 18 here, too complicated to explain for now): @@ -401,7 +414,7 @@ So that checks out. What we can then also do is calculate the difference between ```{r} #| label: lm-calc-residual -data <- data %>% +data <- data |> mutate(residual = measure - measure_pred) ``` @@ -411,7 +424,7 @@ We can check that this is correct too by comparing the residuals we calculated w #| label: lm-compare-residuals tibble(residual_manual = data$residual, - residual_lm = residuals(lm_model)) %>% + residual_lm = residuals(lm_model)) |> glimpse() ``` @@ -434,8 +447,9 @@ ggplot(data, aes(x = age)) + geom_smooth(aes(y = measure), method = 'lm', color = "grey30") + geom_point(aes(y = measure), size = 4, alpha = 0.8) + geom_point(aes(y = measure_pred), alpha = 0.5, size = 2.5) + - #geom_smooth(aes(y = measure_pred), method = 'lm', color = "red", size = 0.5) + - geom_segment(aes(xend = age, y = measure, yend = measure_pred), arrow = arrow(length = unit(4, "points")), color = "black", alpha = 0.8, show.legend = FALSE) + + geom_segment(aes(xend = age, y = measure, yend = measure_pred), + arrow = arrow(length = unit(4, "points")), + color = "black", alpha = 0.8, show.legend = FALSE) + scale_x_continuous(limits = c(18,60)) + scico::scale_color_scico_d() + theme_minimal() @@ -452,8 +466,9 @@ ggplot(data, aes(x = age)) + geom_smooth(aes(y = measure), method = 'lm', color = "grey30") + geom_point(aes(y = measure), size = 4, alpha = 0.8) + geom_point(aes(y = measure_pred), alpha = 0.5, size = 2.5) + - #geom_smooth(aes(y = measure_pred), method = 'lm', color = "red", size = 0.5) + - geom_segment(aes(xend = age, y = measure, yend = measure_pred), arrow = arrow(length = unit(4, "points")), color = "black", alpha = 0.8, show.legend = FALSE) + + geom_segment(aes(xend = age, y = measure, yend = measure_pred), + arrow = arrow(length = unit(4, "points")), + color = "black", alpha = 0.8, show.legend = FALSE) + geom_rect(data = . %>% filter(age < 35 & age > 30), aes(xmin = age, xmax = age - (1.6 * residual * age), ymin = measure_pred, ymax = measure), @@ -467,18 +482,24 @@ ggplot(data, aes(x = age)) + theme_minimal() ``` - The "sum" part of "sum of squared error" refers to the sum of the areas of those squares. Simply, you sum the square of the sides. You can also look at it in mathematical form: -$$\sum(residual~or~difference~with~regression~line^2)$$ +$$ +\sum{(residual~or~difference~with~regression~line^2)} +$$ We'll use this formula again a bit later to calculate the R2. In order to calculate the squared regression coefficient, we should also calculate the mean value of the measure across all points. This is necessary because the squared regression coefficient is defined as a perfect correlation (i.e. 
a correlation coefficient of 1) minus the unexplained variance divided by the total variance, or in formula form: -`$$R^2 = perfect~correlation - \frac{explained~variance}{total~variance} = 1 - \frac{\sum(difference~with~regression~line^2)}{\sum(difference~with~mean~value^2)}$$` +$$ +\begin{eqnarray} +R^2 &=& perfect~correlation - \frac{unexplained~variance}{total~variance} \newline +&=& 1 - \frac{\sum(difference~with~regression~line^2)}{\sum(difference~with~mean~value^2)} +\end{eqnarray} +$$ -Explained variance is defined here as the sum of squared error. You might notice the sum symbols and the squares, so you might guess that this formula is also some kind of sum of squares, and it is! As we already discovered, the numerator in this formula is the sum of squared error, the denominator is referred to as the sum of squared total. And the composite of those two is referred to as the sum of squared regression. Making three different sum of squares. +The unexplained variance is defined here as the sum of squared error (the residuals). You might notice the sum symbols and the squares, so you might guess that this formula is also some kind of sum of squares, and it is! As we already discovered, the numerator in this formula is the sum of squared error, the denominator is referred to as the sum of squared total. And the composite of those two is referred to as the sum of squared regression. That makes three different sums of squares. Important to notice here is that the error term has switched from the difference between the values and the group mean (as in ANOVA) to the difference between the values and the regression line. Where in the linear model the predicted value is given by the regression line, in the ANOVA it is represented by the group mean instead. @@ -491,26 +512,30 @@ We've already plotted the sum of squared error, now we'll also illustrate sum of ggplot(data, aes(x = age)) + geom_smooth(aes(y = measure), method = 'lm', color = "grey95", alpha = 0.1) + - geom_hline(yintercept = mean(data$measure), color = "grey30", size = 1) + + geom_hline(yintercept = mean(data$measure), color = "grey30", linewidth = 1) + geom_point(aes(y = measure), size = 4, alpha = 0.8) + geom_point(aes(y = mean(measure)), alpha = 0.5, size = 2.5) + - geom_segment(aes(xend = age, y = measure, yend = mean(measure)), arrow = arrow(length = unit(4, "points")), color = "black", alpha = 0.8, show.legend = FALSE) + + geom_segment(aes(xend = age, y = measure, yend = mean(measure)), + arrow = arrow(length = unit(4, "points")), + color = "black", alpha = 0.8, show.legend = FALSE) + theme_minimal() ``` -We already calculated the difference with the regression line, then to calculate the difference with the mean is simple: +We already calculated the difference with the regression line; calculating the difference with the mean is just as simple: ```{r} #| label: lm-calc-differences -data <- data %>% +data <- data |> mutate(measure_mean = mean(measure), difference_mean = measure - measure_mean) ``` Sidenote: if you wanted to calculate the total variance, the formula for that would look like this: -`$$S^2 = \frac{\sum(x_{i} - \overline{x})^2}{n - 1}$$` +$$ +S^2 = \frac{\sum(x\_{i} - \overline{x})^2}{n - 1} +$$ Notice how the numerator is the same calculation as the sum of squared total, then divided by the sample size minus 1 (like the degrees of freedom). @@ -522,7 +547,7 @@ To calculate the squared regression coefficient (R2) from the formula 1 - sum(data$residual^2) / sum(data$difference_mean^2) ``` -And there we have it, the regression coefficient R2! 
You can check that it's the same by scrolling up to where we ran `summary(lm_model)` and you'll find the same number. We could also calculate the F-statistic and the _t_- and _p_-values, but I think this tutorial has drained enough cognitive energy. For this last section, I hope it's become clear what we mean when we talk about "residuals", "sum of squares", and "variance" in the context of linear models. I also hope it's enlightened you a bit on what a linear model does and how it works. +And there we have it, the regression coefficient R2! You can check that it's the same by scrolling up to where we ran `summary(lm_model)` and you'll find the same number. We could also calculate the F-statistic and the *t*- and *p*-values, but I think this tutorial has drained enough cognitive energy. For this last section, I hope it's become clear what we mean when we talk about "residuals", "sum of squares", and "variance" in the context of linear models. I also hope it's enlightened you a bit on what a linear model does and how it works. # Conclusion @@ -530,11 +555,11 @@ There's many more things we could go over, multiple linear regression, non-param ## Resources -- [Common statistical tests are linear models (or: how to teach stats) - Jonas Kristoffer Lindeløv](https://lindeloev.github.io/tests-as-linear/) -- [The Linear Regression Family in R - Athanasia Mowinckel](https://drmowinckels.netlify.app/blog/2020-06-24-the-linear-regression-family-in-r/) -- [STAT 415: Introduction to Mathematical Statistics - Penn State Department of Statistics](https://online.stat.psu.edu/stat415/lesson/13/13.2) -- [ANOVA for Regression - Rahul Pathak](https://towardsdatascience.com/anova-for-regression-fdb49cf5d684) -- [Explaining the `lm` summary in R - Learn by Marketing](https://www.learnbymarketing.com/tutorials/explaining-the-lm-summary-in-r/) +- [Common statistical tests are linear models (or: how to teach stats) - Jonas Kristoffer Lindeløv](https://lindeloev.github.io/tests-as-linear/) +- [The Linear Regression Family in R - Athanasia Mowinckel](https://drmowinckels.netlify.app/blog/2020-06-24-the-linear-regression-family-in-r/) +- [STAT 415: Introduction to Mathematical Statistics - Penn State Department of Statistics](https://online.stat.psu.edu/stat415/lesson/13/13.2) +- [ANOVA for Regression - Rahul Pathak](https://towardsdatascience.com/anova-for-regression-fdb49cf5d684) +- [Explaining the `lm` summary in R - Learn by Marketing](https://www.learnbymarketing.com/tutorials/explaining-the-lm-summary-in-r/)
Session info for reproducibility purposes ```{r} diff --git a/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/aov-plot-1.png b/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/aov-plot-1.png deleted file mode 100644 index cf63fd7..0000000 Binary files a/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/aov-plot-1.png and /dev/null differ diff --git a/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/lm-plot-1.png b/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/lm-plot-1.png deleted file mode 100644 index 98864a3..0000000 Binary files a/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/lm-plot-1.png and /dev/null differ diff --git a/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/lm-plot-error-1.png b/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/lm-plot-error-1.png deleted file mode 100644 index 5dc51f4..0000000 Binary files a/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/lm-plot-error-1.png and /dev/null differ diff --git a/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/lm-plot-mean-1.png b/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/lm-plot-mean-1.png deleted file mode 100644 index 4deddf7..0000000 Binary files a/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/lm-plot-mean-1.png and /dev/null differ diff --git a/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/lm-plot-squares-1.png b/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/lm-plot-squares-1.png deleted file mode 100644 index a8f558b..0000000 Binary files a/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/lm-plot-squares-1.png and /dev/null differ diff --git a/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/tst-boxplot-1.png b/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/tst-boxplot-1.png deleted file mode 100644 index 408e5cd..0000000 Binary files a/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/tst-boxplot-1.png and /dev/null differ diff --git a/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/tst-plot-1.png b/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/tst-plot-1.png deleted file mode 100644 index 5d04183..0000000 Binary files a/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/tst-plot-1.png and /dev/null differ diff --git a/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/tst-plot-w-annot-1.png b/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/tst-plot-w-annot-1.png deleted file mode 100644 index 0816ab1..0000000 Binary files a/content/blog/2022-everything-is-a-linear-model/index_files/figure-gfm/tst-plot-w-annot-1.png and /dev/null differ diff --git a/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-top-authors-1.png b/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-top-authors-1.png index 8aa70ae..3824614 100644 Binary files a/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-top-authors-1.png and b/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-top-authors-1.png differ diff --git 
a/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-top-weeks-1.png b/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-top-weeks-1.png index 4c83362..77a5fb6 100644 Binary files a/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-top-weeks-1.png and b/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-top-weeks-1.png differ diff --git a/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-trajectory-1.png b/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-trajectory-1.png index a440dea..1e24046 100644 Binary files a/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-trajectory-1.png and b/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-trajectory-1.png differ diff --git a/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-trajectory-adaptations-1.png b/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-trajectory-adaptations-1.png index 6ba1983..c0204fd 100644 Binary files a/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-trajectory-adaptations-1.png and b/content/blog/2023-nyt-books-api/index.markdown_strict_files/figure-markdown_strict/plot-trajectory-adaptations-1.png differ diff --git a/content/blog/2023-nyt-books-api/index.md b/content/blog/2023-nyt-books-api/index.md index 38302d3..1a1df34 100644 --- a/content/blog/2023-nyt-books-api/index.md +++ b/content/blog/2023-nyt-books-api/index.md @@ -1,15 +1,13 @@ --- -title: "Analysing the NYT Best Sellers list using an API" -author: Daniel Roelfs -date: "2023-09-17" +title: Analysing the NYT Best Sellers list using an API +date: 2023-09-17 +description: Analysing the NYT Best Sellers list using an API slug: analysing-the-nyt-best-sellers-list-using-an-api categories: - data science tags: - Python - R -description: "Analysing the NYT Best Sellers list using an API" -format: hugo execute: fig.retina: 2 fig.align: center @@ -169,25 +167,24 @@ data <- read_csv("./data/nyt_list_fiction.csv") |> glimpse() ``` - Rows: 3915 Columns: 7 + Rows: 3900 Columns: 6 ── Column specification ──────────────────────────────────────────────────────── Delimiter: "," chr (2): title, author - dbl (4): rank, rank_last_week, rank_diff, weeks_on_list + dbl (3): rank, rank_last_week, weeks_on_list date (1): list_publication_date ℹ Use `spec()` to retrieve the full column specification for this data. ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message. 
- Rows: 3,915 - Columns: 7 + Rows: 3,900 + Columns: 6 $ rank 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 1… - $ rank_last_week NA, 1, NA, NA, NA, 3, NA, 6, NA, 4, 5, 9, 8, 12,… - $ rank_diff NA, -1, NA, NA, NA, -3, NA, -2, NA, -6, -6, -3, … - $ title "Leverage In Death", "Crazy Rich Asians", "In Hi… - $ author "JD Robb", "Kevin Kwan", "Danielle Steel", "Chri… - $ weeks_on_list 1, 13, 1, 1, 1, 5, 1, 4, 1, 15, 4, 14, 28, 13, 1… - $ list_publication_date 2018-09-23, 2018-09-23, 2018-09-23, 2018-09-23,… + $ rank_last_week NA, NA, NA, 5, 1, 4, 8, NA, 10, NA, 3, NA, 9, NA… + $ title "Vince Flynn: Red War", "An Absolutely Remarkabl… + $ author "Kyle Mills", "Hank Green", "Kate Atkinson", "Ke… + $ weeks_on_list 1, 1, 1, 16, 2, 3, 3, 1, 4, 1, 31, 15, 8, 1, 7, … + $ list_publication_date 2018-10-14, 2018-10-14, 2018-10-14, 2018-10-14,… As mentioned, the dataset we downloaded does not include all history of the Best Sellers list due to the API rate limit. Instead we specified that we want to look about 5 years back, let's see what the earliest to latest dates in the dataset are. @@ -200,7 +197,7 @@ data |> # A tibble: 1 × 2 from to - 1 2018-09-23 2023-09-17 + 1 2018-10-14 2023-10-01 The purpose of this post is mostly to show the implementation of an API call in Python, but I want to make some plots anyway because it's fun. Let's first look at what book has spend the longest on the Best Sellers list. Just to make it easier to determine the book, we'll create a composite variable of both the book author and title before plotting and take the 20 books that have the most occurences on the list using the `count()` function and then use the `slice_max()` to limit the dataset to 20. diff --git a/content/blog/2023-nyt-books-api/index.qmd b/content/blog/2023-nyt-books-api/index.qmd index 4591bff..946fbf7 100644 --- a/content/blog/2023-nyt-books-api/index.qmd +++ b/content/blog/2023-nyt-books-api/index.qmd @@ -1,15 +1,13 @@ --- -title: "Analysing the NYT Best Sellers list using an API" -author: Daniel Roelfs -date: "2023-09-17" +title: Analysing the NYT Best Sellers list using an API +date: 2023-09-17 +description: Analysing the NYT Best Sellers list using an API slug: analysing-the-nyt-best-sellers-list-using-an-api categories: - data science tags: - Python - R -description: "Analysing the NYT Best Sellers list using an API" -format: hugo execute: fig.retina: 2 fig.align: center diff --git a/content/blog/2023-sunrise-sunset-differences/index.markdown_strict_files/figure-markdown_strict/plot-map-1.png b/content/blog/2023-sunrise-sunset-differences/index.markdown_strict_files/figure-markdown_strict/plot-map-1.png index bc6bf4b..a836da8 100644 Binary files a/content/blog/2023-sunrise-sunset-differences/index.markdown_strict_files/figure-markdown_strict/plot-map-1.png and b/content/blog/2023-sunrise-sunset-differences/index.markdown_strict_files/figure-markdown_strict/plot-map-1.png differ diff --git a/content/blog/2023-sunrise-sunset-differences/index.md b/content/blog/2023-sunrise-sunset-differences/index.md index b7453a6..21ca6c0 100644 --- a/content/blog/2023-sunrise-sunset-differences/index.md +++ b/content/blog/2023-sunrise-sunset-differences/index.md @@ -1,7 +1,7 @@ --- title: Difference in sunset times in Europe -author: Daniel Roelfs -date: "2023-04-06" +date: 2023-04-06 +description: Difference in sunset times in Europe slug: difference-in-sunset-times-in-europe categories: - data science @@ -9,9 +9,6 @@ tags: - Python - R - data visualization -description: "Difference in sunset times in Europe" 
-thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -89,7 +86,7 @@ def get_sun_data( Get sunset data from location ''' - geolocator = Nominatim(user_agent="geoapiExercises") + geolocator = Nominatim(user_agent="sunset-sunrise-app") df = pd.DataFrame() for i, city in enumerate(cities): @@ -196,7 +193,7 @@ data <- parse_sun_data(reticulate::py$df) data |> ggplot(aes(x = date, y = sunset, color = city, group = city)) + - geom_line(size = 2, lineend = "round", key_glyph = "point") + + geom_line(linewidth = 2, lineend = "round", key_glyph = "point") + labs(x = NULL, y = "Sunset time", color = NULL) + @@ -205,9 +202,6 @@ data |> theme_custom() ``` - Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0. - ℹ Please use `linewidth` instead. - As expected, the latitude (how far north or south a location is) has the most to say about sunset times in summer in winter. Oslo has both the earliest and latest sunsets in winter and summer. And also predictable, the longitude (how far east or west a location is) merely shifts the curve up or down. Madrid has later sunsets in summer than Warsaw (which is further north) but also later sunsets in winter and a flatter curve overall. Oslo has a much steeper curve than e.g. Amsterdam while simultaneously also having the curve slightly shifted due to Oslo being further east than Amsterdam. The plot below shows the locations of of the cities in this analysis. @@ -217,7 +211,7 @@ rnaturalearth::ne_countries(scale = "medium", returnclass = "sf", continent = "Europe") |> ggplot() + - geom_sf(color = "grey60", fill = "#DDD5C7", size = 0.1) + + geom_sf(color = "grey60", fill = "#DDD5C7", linewidth = 0.1) + geom_point(data = data |> distinct(city, lat, long), aes(x = long, y = lat, color = city), shape = 18, size = 4) + @@ -230,6 +224,15 @@ rnaturalearth::ne_countries(scale = "medium", panel.grid.major = element_line(size = 0.1)) ``` + The legacy packages maptools, rgdal, and rgeos, underpinning the sp package, + which was just loaded, will retire in October 2023. + Please refer to R-spatial evolution reports for details, especially + https://r-spatial.org/r/2023/05/15/evolution4.html. + It may be desirable to make the sf package available; + package maintainers should consider adding sf to Suggests:. + The sp package is now running under evolution status 2 + (status 2 uses the sf package in place of rgdal) + Warning: The `size` argument of `element_line()` is deprecated as of ggplot2 3.4.0. ℹ Please use the `linewidth` argument instead. @@ -259,6 +262,9 @@ parse_sun_data(reticulate::py$df_deux) |> theme_custom() ``` + Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0. + ℹ Please use `linewidth` instead. + As you see, in neither cities 12:00 in the afternoon is perfectly in the middle between sunrise and sunset, although this trend is slightly enhanced for Oslo (again due to longitude differences). We can also see that the length of days in Oslo is longer in summer particularly because the mornings start earlier. We'll get back to the distance between sunset and dusk, which is also a lot larger in Oslo which means darkness sets in much later in Oslo despite sunset happening around the same time. This means it's lighter outside longer than in Amsterdam. @@ -287,11 +293,6 @@ parse_sun_data(reticulate::py$df_deux) |> legend.position = c(0.85, 0.85)) ``` - Warning in do_once((if (is_R_CMD_check()) stop else warning)("The function - xfun::isFALSE() will be deprecated in the future. 
Please ", : The function - xfun::isFALSE() will be deprecated in the future. Please consider using - base::isFALSE(x) or identical(x, FALSE) instead. - Now I'm curious when the largest difference occurs and when the days are roughly equal. Let's select the relevant columns and get the day length in Oslo and Amsterdam next to each other using the `pivot_wider()` function so we can easily compare. Let's then select the largest and smallest differences. @@ -342,8 +343,8 @@ Finally, let's look at the effect of longitude (how far east or west a place is) ``` r data |> ggplot(aes(x = date, y = noon, group = city, color = city)) + - geom_hline(yintercept = hms::hms(hours = 12), size = 1) + - geom_line(size = 2, key_glyph = "point") + + geom_hline(yintercept = hms::hms(hours = 12), linewidth = 1) + + geom_line(linewidth = 2, key_glyph = "point") + labs(title = "What time is solar noon?", color = NULL) + scale_x_date(expand = expansion(add = 0)) + diff --git a/content/blog/2023-sunrise-sunset-differences/index.qmd b/content/blog/2023-sunrise-sunset-differences/index.qmd index 306548a..272ca35 100644 --- a/content/blog/2023-sunrise-sunset-differences/index.qmd +++ b/content/blog/2023-sunrise-sunset-differences/index.qmd @@ -1,7 +1,7 @@ --- title: Difference in sunset times in Europe -author: Daniel Roelfs -date: "2023-04-06" +date: 2023-04-06 +description: Difference in sunset times in Europe slug: difference-in-sunset-times-in-europe categories: - data science @@ -9,9 +9,6 @@ tags: - Python - R - data visualization -description: "Difference in sunset times in Europe" -thumbnail: images/avatar.png -format: hugo execute: fig.retina: 2 fig.align: center @@ -96,7 +93,7 @@ def get_sun_data( Get sunset data from location ''' - geolocator = Nominatim(user_agent="geoapiExercises") + geolocator = Nominatim(user_agent="sunset-sunrise-app") df = pd.DataFrame() for i, city in enumerate(cities): @@ -182,7 +179,7 @@ data <- parse_sun_data(reticulate::py$df) data |> ggplot(aes(x = date, y = sunset, color = city, group = city)) + - geom_line(size = 2, lineend = "round", key_glyph = "point") + + geom_line(linewidth = 2, lineend = "round", key_glyph = "point") + labs(x = NULL, y = "Sunset time", color = NULL) + @@ -200,7 +197,7 @@ rnaturalearth::ne_countries(scale = "medium", returnclass = "sf", continent = "Europe") |> ggplot() + - geom_sf(color = "grey60", fill = "#DDD5C7", size = 0.1) + + geom_sf(color = "grey60", fill = "#DDD5C7", linewidth = 0.1) + geom_point(data = data |> distinct(city, lat, long), aes(x = long, y = lat, color = city), shape = 18, size = 4) + @@ -300,8 +297,8 @@ Finally, let's look at the effect of longitude (how far east or west a place is) data |> ggplot(aes(x = date, y = noon, group = city, color = city)) + - geom_hline(yintercept = hms::hms(hours = 12), size = 1) + - geom_line(size = 2, key_glyph = "point") + + geom_hline(yintercept = hms::hms(hours = 12), linewidth = 1) + + geom_line(linewidth = 2, key_glyph = "point") + labs(title = "What time is solar noon?", color = NULL) + scale_x_date(expand = expansion(add = 0)) + diff --git a/content/blog/_index.md b/content/blog/_index.md new file mode 100644 index 0000000..ee163da --- /dev/null +++ b/content/blog/_index.md @@ -0,0 +1,3 @@ +--- +title: "Blog" +--- \ No newline at end of file diff --git a/content/cv/index.md b/content/cv/index.md index afc473f..a6f5de4 100644 --- a/content/cv/index.md +++ b/content/cv/index.md @@ -1,10 +1,21 @@ --- -description: "cv" -slug: "cv" -thumbnail: "images/avatar.png" -title: "Curriculum Vitæ" -author: 
"Daniel Roelfs" -format: hugo +title: Curriculum Vitae +description: Curriculum Vitae +hidetopnav: true --- + diff --git a/content/cv/index.qmd b/content/cv/index.qmd index 141aed3..6e2d650 100644 --- a/content/cv/index.qmd +++ b/content/cv/index.qmd @@ -1,12 +1,27 @@ --- -description: "cv" -slug: "cv" -thumbnail: "images/avatar.png" -title: "Curriculum Vitæ" -author: "Daniel Roelfs" -format: hugo +title: Curriculum Vitae +description: Curriculum Vitae +hidetopnav: true --- +```{css} +#| label: style +#| echo: FALSE + +body > * { + margin-left: 3rem; +} + +#content { + width: 38rem; + padding-bottom: 4rem; +} + +iframe { + margin-left: -1rem; +} +``` + ```{r} #| label: iframe #| echo: FALSE diff --git a/content/photography/index.md b/content/photography/index.md index a343766..3f3d710 100644 --- a/content/photography/index.md +++ b/content/photography/index.md @@ -1,14 +1,26 @@ --- -description: "photography" -slug: "photography" -thumbnail: "images/avatar.png" -title: "Photography" -author: "Daniel Roelfs" -format: hugo +title: Photography +description: Photography +hidetopnav: true --- + + In my free time I enjoy photography. Since I live in Norway, every hike is an opportunity to see some amazing landscapes. Besides that my interests also extend to cityscapes, lifestyle pictures, and portraits. See the website for my photography here: photography.danielroelfs.com - + diff --git a/content/photography/index.qmd b/content/photography/index.qmd index 3a18180..0f758ea 100644 --- a/content/photography/index.qmd +++ b/content/photography/index.qmd @@ -1,12 +1,27 @@ --- -description: "photography" -slug: "photography" -thumbnail: "images/avatar.png" -title: "Photography" -author: "Daniel Roelfs" -format: hugo +title: Photography +description: Photography +hidetopnav: true --- +```{css} +#| label: style +#| echo: FALSE + +body > * { + margin-left: 3rem; +} + +#content { + width: 38rem; + padding-bottom: 4rem; +} + +iframe { + margin-left: -1rem; +} +``` + In my free time I enjoy photography. Since I live in Norway, every hike is an opportunity to see some amazing landscapes. Besides that my interests also extend to cityscapes, lifestyle pictures, and portraits. See the website for my photography here: photography.danielroelfs.com @@ -16,6 +31,6 @@ See the website for my photography here: -ul li p, ul li a { +ul li:before { + content: ""; + margin: 0; +} + +ul li p { + margin-left: 0; +} + +ul li p, .sidenote { font-size: 80%; line-height: 1.4; } + +ul li a { + padding: 0 2px 0 2px; +} -Selection of publications. For a full list see my [Google Scholar profile](https://scholar.google.com/citations?user=QmVQcsAAAAAJ&hl=en). +Selection of publications. For a full list see my [Google Scholar profile](https://scholar.google.com/citations?user=QmVQcsAAAAAJ&hl=en) + +{{< sidenote >}} +2023 +{{< /sidenote >}} - **D Roelfs**, O Frei, D van der Meer, E Tissink, A Shadrin, D Alnæs, OA Andreassen, LT Westlye, T Kaufmann (2023) [***Shared genetic architecture between mental health and the brain functional connectome in the UK Biobank***](https://doi.org/10.1186/s12888-023-04905-7). *BMC Psychiatry*. doi: [10.1186/s12888-023-04905-7](https://doi.org/10.1186/s12888-023-04905-7) preprint: [10.1101/2022.06.24.22276846](https://doi.org/10.1101/2022.06.24.22276846) @@ -23,16 +37,28 @@ Selection of publications. 
For a full list see my [Google Scholar profile](https - LS Sæther, T Ueland, B Haatveit, LA Maglanoc, A Szabo, S Djurovic, P Aukrust, **D Roelfs**, C Mohn, MBEG Ormerud, TV Lagerberg, NE Steen, I Melle, OA Andreassen, T Ueland (2023) [***Inflammation and cognition in severe mental illness: patterns of covariation and subgroups***](https://doi.org/10.1038/s41380-022-01924-w). *Molecular Psychiatry*. doi: [10.1038/s41380-022-01924-w](https://doi.org/10.1038/s41380-022-01924-w) preprint: [10.1101/2022.08.25.22279209](https://doi.org/10.1101/2022.08.25.22279209) +{{< sidenote >}} +2022 +{{< /sidenote >}} + - G Hindley, O Frei, AA Shadrin, W Cheng, KS O'Connoll, R Icick, N Parker, S Bahrami, N Karadag, **D Roelfs**, B Holen, A Lin, CC Fan, S Djurovic, AM Dale, OB Smeland, OA Andreassen (2022) [***Charting the Landscape of Genetic Overlap Between Mental Disorders and Related Traits Beyond Genetic Correlation***](https://doi.org/10.1176/appi.ajp.21101051). *The American Journal of Psychiatry*. doi: [10.1176/appi.ajp.21101051](https://doi.org/10.1176/appi.ajp.21101051) - D van der Meer, T Kaufmann, AA Shadrin, C Makowski, O Frei, **D Roelfs**, J Monereo Sanchez, DEJ Linden, J Rokicki, D Alnæs, C de Leeuw, WK Thompson, R Loughnan, C Chieh Fan, PM Thompson, LT Westlye, OA Andreassen, AM Dale (2022) [***The genetic architecture of human cortical folding***](https://doi.org/10.1126/sciadv.abj9446). *Science Advances*. doi: [10.1101/ 2021.01.13.426555](https://doi.org/10.1126/sciadv.abj9446) preprint: [10.1101/ 2021.01.13.426555](https://doi.org/10.1101/2021.01.13.426555) +{{< sidenote >}} +2021 +{{< /sidenote >}} + - LS Sæther, **D Roelfs**, T Moberget, OA Andreassen, T Elvsåshagen, EG Jönsson, A Vaskinn (2021) [***Exploring neurophysiological markers of visual perspective taking: Methodological considerations***](https://doi.org/10.1016/j.ijpsycho.2020.12.006). *International Journal of Psychophysiology*. doi: [10.1016/j.ijpsycho.2020.12.006](https://doi.org/10.1016/j.ijpsycho.2020.12.006) - **PREPRINT** **D Roelfs**, D van der Meer, D Alnæs, O Frei, R Loughnan, CC Fan, AM Dale, OA Andreassen, LT Westlye, T Kaufmann (2021) [***Genetic overlap between multivariate measures of human functional brain connectivity and psychiatric disorders***](https://doi.org/10.1101/2021.06.15.21258954). *medRxiv*. doi: [10.1101/2021.06.15.21258954](https://doi.org/10.1101/2021.06.15.21258954) - **D Roelfs**, D Alnæs, O Frei, D van der Meer, OB Smeland, OA Andreassen, LT Westlye, T Kaufmann (2021) [***Phenotypically independent profiles relevant to mental health are genetically correlated***](https://doi.org/10.1038/s41398-021-01313-x). *Translational Psychiatry*. doi: [10.1038/s41398-021-01313-x](https://doi.org/10.1038/s41398-021-01313-x) preprint: [10.1101/2020.03.30.20045591](https://doi.org/10.1101/2020.03.30.20045591) +{{< sidenote >}} +2020 +{{< /sidenote >}} + - I Voldsbekk, I Groote, N Zak, **D Roelfs**, O Geier, P Due-Tønnesen, LL Løkken, M Strømstad, TY Blakstvedt, YS Kuiper, T Elvsåshagen, LT Westlye, A Bjørnerud, II Maximov (2020) [***Sleep and sleep deprivation differentially alter white matter microstructure: A mixed model design utilising advanced diffusion modelling***](https://doi.org/10.1016/j.neuroimage.2020.117540). *NeuroImage*. 
doi: [j.neuroimage.2020.117540](https://doi.org/10.1016/j.neuroimage.2020.117540) preprint: [10.1101/ 2020.08.24.259432](https://doi.org/10.1101/2020.08.24.259432) - M Valstad, T Moberget, **D Roelfs**, NB Slapø, CMF Timpe, D Beck, G Richard, LS Sæther, B Haatveit, KA Skaug, JE Nordvik, C Hatlestad-Hall, GT Einevoll, T Mãki-Marttunen, LT Westlye, EG Jõnsson, OA Andreassen, T Elvsåshagen (2020) [***Experience-dependent modulation of the visual evoked potential: Testing effect sizes, retention over time, and associations with age in 415 healthy individuals***](https://doi.org/10.1016/j.neuroimage.2020.117302). *NeuroImage*. doi: [10.1016/j.neuroimage.2020.117302](https://doi.org/10.1016/j.neuroimage.2020.117302) diff --git a/content/publications/index.qmd b/content/publications/index.qmd index 30770c6..4f9a8fa 100644 --- a/content/publications/index.qmd +++ b/content/publications/index.qmd @@ -1,24 +1,39 @@ --- -description: "publications" -slug: "publications" -thumbnail: "images/avatar.png" -title: "Publications" -author: "Daniel Roelfs" -format: hugo +title: Publications +description: Publications engine: knitr +hidetopnav: true --- ```{css} #| label: style #| echo: FALSE -ul li p, ul li a { +ul li:before { + content: ""; + margin: 0; +} + +ul li p { + margin-left: 0; +} + +ul li p, .sidenote { font-size: 80%; line-height: 1.4; } + +ul li a { + padding: 0 2px 0 2px; +} ``` -Selection of publications. For a full list see my [Google Scholar profile](https://scholar.google.com/citations?user=QmVQcsAAAAAJ&hl=en). +Selection of publications. For a full list see my [Google Scholar profile](https://scholar.google.com/citations?user=QmVQcsAAAAAJ&hl=en) + +{{{< sidenote >}}} +2023 +{{{< /sidenote >}}} + - **D Roelfs**, O Frei, D van der Meer, E Tissink, A Shadrin, D Alnæs, OA Andreassen, LT Westlye, T Kaufmann (2023) [***Shared genetic architecture between mental health and the brain functional connectome in the UK Biobank***](https://doi.org/10.1186/s12888-023-04905-7). _BMC Psychiatry_. doi: [10.1186/s12888-023-04905-7](https://doi.org/10.1186/s12888-023-04905-7) preprint: [10.1101/2022.06.24.22276846](https://doi.org/10.1101/2022.06.24.22276846) @@ -26,9 +41,17 @@ Selection of publications. For a full list see my [Google Scholar profile](https - LS Sæther, T Ueland, B Haatveit, LA Maglanoc, A Szabo, S Djurovic, P Aukrust, **D Roelfs**, C Mohn, MBEG Ormerud, TV Lagerberg, NE Steen, I Melle, OA Andreassen, T Ueland (2023) [***Inflammation and cognition in severe mental illness: patterns of covariation and subgroups***](https://doi.org/10.1038/s41380-022-01924-w). _Molecular Psychiatry_. doi: [10.1038/s41380-022-01924-w](https://doi.org/10.1038/s41380-022-01924-w) preprint: [10.1101/2022.08.25.22279209](https://doi.org/10.1101/2022.08.25.22279209) +{{{< sidenote >}}} +2022 +{{{< /sidenote >}}} + - G Hindley, O Frei, AA Shadrin, W Cheng, KS O'Connoll, R Icick, N Parker, S Bahrami, N Karadag, **D Roelfs**, B Holen, A Lin, CC Fan, S Djurovic, AM Dale, OB Smeland, OA Andreassen (2022) [***Charting the Landscape of Genetic Overlap Between Mental Disorders and Related Traits Beyond Genetic Correlation***](https://doi.org/10.1176/appi.ajp.21101051). _The American Journal of Psychiatry_. 
doi: [10.1176/appi.ajp.21101051](https://doi.org/10.1176/appi.ajp.21101051) - D van der Meer, T Kaufmann, AA Shadrin, C Makowski, O Frei, **D Roelfs**, J Monereo Sanchez, DEJ Linden, J Rokicki, D Alnæs, C de Leeuw, WK Thompson, R Loughnan, C Chieh Fan, PM Thompson, LT Westlye, OA Andreassen, AM Dale (2022) [***The genetic architecture of human cortical folding***](https://doi.org/10.1126/sciadv.abj9446). _Science Advances_. doi: [10.1101/ 2021.01.13.426555](https://doi.org/10.1126/sciadv.abj9446) preprint: [10.1101/ 2021.01.13.426555](https://doi.org/10.1101/2021.01.13.426555) + +{{{< sidenote >}}} +2021 +{{{< /sidenote >}}} - LS Sæther, **D Roelfs**, T Moberget, OA Andreassen, T Elvsåshagen, EG Jönsson, A Vaskinn (2021) [***Exploring neurophysiological markers of visual perspective taking: Methodological considerations***](https://doi.org/10.1016/j.ijpsycho.2020.12.006). _International Journal of Psychophysiology_. doi: [10.1016/j.ijpsycho.2020.12.006](https://doi.org/10.1016/j.ijpsycho.2020.12.006) @@ -36,6 +59,10 @@ Selection of publications. For a full list see my [Google Scholar profile](https - **D Roelfs**, D Alnæs, O Frei, D van der Meer, OB Smeland, OA Andreassen, LT Westlye, T Kaufmann (2021) [***Phenotypically independent profiles relevant to mental health are genetically correlated***](https://doi.org/10.1038/s41398-021-01313-x). _Translational Psychiatry_. doi: [10.1038/s41398-021-01313-x](https://doi.org/10.1038/s41398-021-01313-x) preprint: [10.1101/2020.03.30.20045591](https://doi.org/10.1101/2020.03.30.20045591) +{{{< sidenote >}}} +2020 +{{{< /sidenote >}}} + - I Voldsbekk, I Groote, N Zak, **D Roelfs**, O Geier, P Due-Tønnesen, LL Løkken, M Strømstad, TY Blakstvedt, YS Kuiper, T Elvsåshagen, LT Westlye, A Bjørnerud, II Maximov (2020) [***Sleep and sleep deprivation differentially alter white matter microstructure: A mixed model design utilising advanced diffusion modelling***](https://doi.org/10.1016/j.neuroimage.2020.117540). _NeuroImage_. doi: [j.neuroimage.2020.117540](https://doi.org/10.1016/j.neuroimage.2020.117540) preprint: [10.1101/ 2020.08.24.259432](https://doi.org/10.1101/2020.08.24.259432) - M Valstad, T Moberget, **D Roelfs**, NB Slapø, CMF Timpe, D Beck, G Richard, LS Sæther, B Haatveit, KA Skaug, JE Nordvik, C Hatlestad-Hall, GT Einevoll, T Mãki-Marttunen, LT Westlye, EG Jõnsson, OA Andreassen, T Elvsåshagen (2020) [***Experience-dependent modulation of the visual evoked potential: Testing effect sizes, retention over time, and associations with age in 415 healthy individuals***](https://doi.org/10.1016/j.neuroimage.2020.117302). _NeuroImage_. doi: [10.1016/j.neuroimage.2020.117302](https://doi.org/10.1016/j.neuroimage.2020.117302) diff --git a/layouts/index.html b/layouts/index.html new file mode 100644 index 0000000..d810723 --- /dev/null +++ b/layouts/index.html @@ -0,0 +1,44 @@ +{{ define "main" }} +

{{ .Title | markdownify }}

+
+{{ if .Site.Params.intro }} + avatar +

{{ .Site.Params.intro | markdownify }}

+
+{{ end }} +
pages
+{{ range .Site.Menus.main }} + {{- if ne .Name "Home" -}} +

{{ .Name | lower | markdownify }}

+ {{ end }} +{{ end }} +
+
socials
+ +{{ end }} diff --git a/layouts/partials/page.html b/layouts/partials/page.html deleted file mode 100644 index f72c622..0000000 --- a/layouts/partials/page.html +++ /dev/null @@ -1,9 +0,0 @@ -
-
-
- {{ if not .Params.hidetitle }}

{{ .Title }}

{{ end }} -
- - {{ .Content }} -
-
\ No newline at end of file diff --git a/layouts/shortcodes/blogdown/postref.html b/layouts/shortcodes/blogdown/postref.html deleted file mode 100644 index 1b33d19..0000000 --- a/layouts/shortcodes/blogdown/postref.html +++ /dev/null @@ -1 +0,0 @@ -{{ if eq (getenv "BLOGDOWN_POST_RELREF") "true" }}{{ .Page.RelPermalink }}{{ else }}{{ .Page.Permalink }}{{ end }} \ No newline at end of file diff --git a/content/about/avatar.png b/static/avatar.png similarity index 100% rename from content/about/avatar.png rename to static/avatar.png diff --git a/themes/typography b/themes/typography new file mode 160000 index 0000000..d3ab135 --- /dev/null +++ b/themes/typography @@ -0,0 +1 @@ +Subproject commit d3ab1351cdd7495e19192b5d66ce0763505597a9