rfordatascience · jonthegeek · Nov 30, 2024 · Nov 28, 2024 · Nov 28, 2024 · Nov 29, 2024
diff --git a/data/curated/jamesbeard/book.csv b/data/curated/jamesbeard/book.csv
diff --git a/data/curated/jamesbeard/book.md b/data/curated/jamesbeard/book.md
@@ -0,0 +1,8 @@
+|variable    |class     |description                           |
+|:-----------|:---------|:-------------------------------------|
+|subcategory |character |The type of book. These subcategories have changed over time, and many appear to overlap. |
+|rank        |character |Whether the person is a "Winner" or "Nominee". |
+|year        |character |The year of the award. |
+|name        |character |The name of the award winner or nominee. |
+|title       |character |The title of the book. |
+|publisher   |character |The publisher of the book, if supplied. Some publishers are listed multiple times with slightly different names. |
diff --git a/data/curated/jamesbeard/broadcast_media.csv b/data/curated/jamesbeard/broadcast_media.csv
diff --git a/data/curated/jamesbeard/broadcast_media.md b/data/curated/jamesbeard/broadcast_media.md
@@ -0,0 +1,9 @@
+|variable    |class     |description                           |
+|:-----------|:---------|:-------------------------------------|
+|subcategory |character |The type of media. These subcategories have changed over time, and many appear to overlap. |
+|rank        |character |Whether the person is a "Winner" or "Nominee". |
+|year        |character |The year of the award. |
+|name        |character |The name of the award winner or nominee. |
+|show        |character |The name of the show, if applicable. |
+|affiliation |character |Where to find the show. |
+|title       |character |Title of the specific episode. |
diff --git a/data/curated/jamesbeard/cleaning.R b/data/curated/jamesbeard/cleaning.R
@@ -0,0 +1,238 @@
+# Based on data by PythonCoderUnicorn:
+# https://github.com/PythonCoderUnicorn/JamesBeardAward/tree/main
+library(tidyverse)
+library(httr2)
+
+# Base URL that the award search requests are sent to.
+url <- "https://www.jamesbeard.org/awards/search"
+
+# All award categories considered.
+all_categories <- c(
+  "Book", "Broadcast Media", "Humanitarian of the Year", "Journalism",
+  "Leadership", "Lifetime Achievement", "Restaurant & Chef",
+  "Who's Who of Food & Beverage in America"
+)
+
+# Only certain categories appear to be searchable. I may come back to dig for
+# the hidden ones, though. The searchable set is `all_categories` minus the
+# three categories listed here (setdiff() keeps the original order).
+categories <- setdiff(
+  all_categories,
+  c(
+    "Humanitarian of the Year",
+    "Lifetime Achievement",
+    "Who's Who of Food & Beverage in America"
+  )
+)
+
+# Pagination callback passed as `next_req` to httr2::req_perform_iterative().
+# It reads the `href` attribute of the `a[rel="next"]` link in the page that
+# was just fetched.
+#
+# resp: the response for the page that was just fetched.
+# req:  the request that produced `resp` (not used here).
+#
+# Returns a new request for the linked URL, or NULL when no such `href` is
+# found.
+next_page <- function(resp, req) {
+  next_link <- rvest::html_element(
+    httr2::resp_body_html(resp),
+    'a[rel="next"]'
+  )
+  next_url <- rvest::html_attr(next_link, "href")
+
+  if (is.na(next_url)) {
+    NULL
+  } else {
+    httr2::request(next_url)
+  }
+}
+
+# Data can be different by category, and a few individual cases are laid out
+# incorrectly, unfortunately. So fetch data by category, and then we'll process
+# that data differently, and semi-manually correct strange cases.
+#
+# `all_data` ends up with one row per recipient element and three columns:
+# `category`, `name`, and `extraction` (a list column holding the non-empty
+# text pieces found for that recipient).
+
+all_data <- purrr::map(
+  categories,
+  \(category) {
+    # Turn one `.c-award-recipient` element into a one-row tibble.
+    parse_recipient <- function(recipient) {
+      details <- rvest::html_text2(
+        rvest::html_elements(
+          recipient,
+          ".c-award-recipient__award, .c-award-recipient__text"
+        )
+      )
+      recipient_name <- rvest::html_text2(
+        rvest::html_element(recipient, ".c-award-recipient__name")
+      )
+      tibble::tibble(
+        category = category,
+        name = recipient_name,
+        # Keep only the non-empty text pieces.
+        extraction = list(details[nchar(details) > 0])
+      )
+    }
+
+    # Turn one page of results into a tibble of its recipients.
+    parse_page <- function(resp) {
+      cards <- rvest::html_elements(
+        httr2::resp_body_html(resp),
+        ".c-award-recipient"
+      )
+      purrr::list_rbind(purrr::map(cards, parse_recipient))
+    }
+
+    # The query parameter is `categories[<category>]=1`.
+    query <- rlang::set_names(
+      1,
+      paste0("categories[", category, "]")
+    )
+    first_req <- httr2::req_url_query(httr2::request(url), !!!query)
+
+    # Follow the "next" links (via next_page()) for up to 1000 requests.
+    pages <- httr2::req_perform_iterative(
+      first_req,
+      next_req = next_page,
+      max_reqs = 1000
+    )
+
+    httr2::resps_data(pages, parse_page)
+  },
+  .progress = TRUE
+) |>
+  purrr::list_rbind()
+
+# saveRDS(all_data, "data/curated/jamesbeard/all_data.rds")
+# all_data <- readRDS("data/curated/jamesbeard/all_data.rds")
+
+# Other variables and functions for cleaning ----
+# Lookup sets passed as `values` to use_values() below: the rank labels to look
+# for, and every year from 1967 through the current calendar year.
+ranks <- c("Winner", "Nominee", "Semifinalist")
+years <- seq(1967, lubridate::year(lubridate::today()))
+
+# For each row of `df`, return the first value (scanning columns left to
+# right) that is one of `values`, or NA when no column holds such a value.
+#
+# df:     a data frame whose columns are searched.
+# values: the set of values to look for (matched with `%in%`).
+#
+# Returns a vector with one element per row of `df`.
+find_values <- function(df, values) {
+  # Replace everything that is not one of `values` with NA, column by column.
+  keep_only_matches <- function(column) {
+    replace(column, !(column %in% values), NA_character_)
+  }
+  matches_by_column <- purrr::map(df, keep_only_matches)
+
+  # coalesce() then picks the first non-missing entry across the columns.
+  dplyr::coalesce(!!!matches_by_column)
+}
+
+# Move known values out of a set of columns and into a new column.
+#
+# data:    the data frame to modify.
+# new_col: name of the column that receives the matched values.
+# cols:    tidyselect specification of the columns to search.
+# values:  the set of values to look for.
+#
+# Returns `data` with `new_col` added (see find_values()), every occurrence of
+# `values` in `cols` replaced by NA (see blank_values()), and any column that
+# is entirely empty afterwards removed.
+use_values <- function(data, new_col, cols, values) {
+  # Collect the matches before the source columns are blanked.
+  matched <- find_values(dplyr::select(data, {{ cols }}), values)
+
+  updated <- dplyr::mutate(
+    data,
+    {{ new_col }} := matched,
+    dplyr::across({{ cols }}, \(column) blank_values(column, values))
+  )
+
+  janitor::remove_empty(updated, which = "cols")
+}
+
+# Replace every element of `col` that is one of `values` with NA.
+#
+# col:    the vector to clean.
+# values: the set of values to blank out (matched with `%in%`).
+#
+# Returns `col` with the matching elements set to NA_character_.
+blank_values <- function(col, values) {
+  replace(col, col %in% values, NA_character_)
+}
+
+# Leadership ----
+# Spread each recipient's `extraction` list into `extraction_*` columns, pull
+# the rank and the year into their own columns, and keep `extraction_2` as the
+# affiliation.
+leadership <- all_data |>
+  dplyr::filter(category == "Leadership") |>
+  tidyr::unnest_wider(extraction, names_sep = "_") |>
+  use_values(
+    new_col = "rank",
+    cols = tidyselect::starts_with("extraction_"),
+    values = ranks
+  ) |>
+  use_values(
+    new_col = "year",
+    cols = tidyselect::starts_with("extraction_"),
+    values = years
+  ) |>
+  dplyr::select(
+    "rank", "year", "name",
+    "affiliation" = "extraction_2"
+  )
+
+# Journalism ----
+# Build `journalism` from the scraped "Journalism" rows: spread each
+# recipient's `extraction` list into `extraction_1`, `extraction_2`, ...
+# columns, pull the known values (category name, rank, year) into their own
+# columns, then assign what is left to subcategory, affiliation, and title.
+journalism <- 
+  all_data |> 
+  dplyr::filter(category == "Journalism") |> 
+  tidyr::unnest_wider(extraction, names_sep = "_") |> 
+  # Find where "Journalism" is so we can get rid of those values.
+  use_values(
+    "full_category_name",
+    tidyselect::starts_with("extraction_"),
+    values = "Journalism"
+  ) |> 
+  # Each use_values() call copies the matching values into the new column,
+  # blanks them in the extraction_* columns, and drops any column that ends up
+  # entirely empty.
+  use_values("rank", tidyselect::starts_with("extraction_"), values = ranks) |> 
+  use_values("year", tidyselect::starts_with("extraction_"), values = years) |> 
+  # The newspaper, etc, appears to be the second-to-last thing that's left.
+  # coalesce() prefers extraction_3 and falls back to extraction_2. The two
+  # na_if() calls then blank that value wherever it still sits, so it is not
+  # reused as the title. The steps run in order and each one relies on the
+  # `affiliation` column created first.
+  # NOTE(review): the code picks the last non-missing of extraction_3 /
+  # extraction_2, while the comment above says "second-to-last" -- confirm.
+  dplyr::mutate(
+    affiliation = dplyr::coalesce(extraction_3, extraction_2),
+    extraction_3 = dplyr::na_if(extraction_3, affiliation),
+    extraction_2 = dplyr::na_if(extraction_2, affiliation),
+  ) |> 
+  # `title` is whatever remains in extraction_2 after the affiliation has been
+  # blanked out.
+  dplyr::select(
+    "subcategory" = "extraction_1",
+    "rank",
+    "year",
+    "name",
+    "affiliation",
+    "title" = "extraction_2"
+  )
+
+# Broadcast Media ----
+# Build `broadcast_media` from the scraped "Broadcast Media" rows: spread each
+# recipient's `extraction` list into `extraction_*` columns, pull the known
+# values (category name, rank, year) into their own columns, then assign what
+# is left to subcategory, show, affiliation, and title.
+broadcast_media <-
+  all_data |> 
+  dplyr::filter(category == "Broadcast Media") |> 
+  tidyr::unnest_wider(extraction, names_sep = "_") |> 
+  # Find where the category is so we can get rid of those values.
+  use_values(
+    "full_category_name",
+    tidyselect::starts_with("extraction_"),
+    values = "Broadcast Media"
+  ) |> 
+  # Each use_values() call copies the matching values into the new column,
+  # blanks them in the extraction_* columns, and drops any column that ends up
+  # entirely empty.
+  use_values("rank", tidyselect::starts_with("extraction_"), values = ranks) |> 
+  use_values("year", tidyselect::starts_with("extraction_"), values = years) |> 
+  # The affiliation appears to be the second-to-last thing that's left.
+  # coalesce() takes the first non-missing of extraction_4, extraction_3,
+  # extraction_2 (checked in that order). The na_if() calls then blank that
+  # value wherever it still sits in those columns.
+  # NOTE(review): the code picks the last non-missing of these columns, while
+  # the comment above says "second-to-last" -- confirm.
+  dplyr::mutate(
+    affiliation = dplyr::coalesce(extraction_4, extraction_3, extraction_2),
+    extraction_4 = dplyr::na_if(extraction_4, affiliation),
+    extraction_3 = dplyr::na_if(extraction_3, affiliation),
+    extraction_2 = dplyr::na_if(extraction_2, affiliation),
+  ) |> 
+  # Same pattern for the title: take what is left in extraction_3, else
+  # extraction_2, then blank it. This must run after the affiliation step
+  # above, because it relies on the affiliation having been removed.
+  dplyr::mutate(
+    title = dplyr::coalesce(extraction_3, extraction_2),
+    extraction_3 = dplyr::na_if(extraction_3, title),
+    extraction_2 = dplyr::na_if(extraction_2, title),
+  ) |> 
+  # `show` is whatever remains in extraction_2 once the affiliation and the
+  # title have been blanked out.
+  dplyr::select(
+    "subcategory" = "extraction_1",
+    "rank",
+    "year",
+    "name",
+    "show" = "extraction_2",
+    "affiliation",
+    "title"
+  )
+
+# Book ----
+# Spread each recipient's `extraction` list into `extraction_*` columns, pull
+# the category name, rank, and year into their own columns, then name the
+# columns that remain.
+book <- all_data |>
+  dplyr::filter(category == "Book") |>
+  tidyr::unnest_wider(extraction, names_sep = "_") |>
+  # Find where the category is so we can get rid of those values.
+  use_values(
+    new_col = "full_category_name",
+    cols = tidyselect::starts_with("extraction_"),
+    values = "Book"
+  ) |>
+  use_values(
+    new_col = "rank",
+    cols = tidyselect::starts_with("extraction_"),
+    values = ranks
+  ) |>
+  use_values(
+    new_col = "year",
+    cols = tidyselect::starts_with("extraction_"),
+    values = years
+  ) |>
+  dplyr::select(
+    "subcategory" = "extraction_1",
+    "rank", "year", "name",
+    "title" = "extraction_2",
+    "publisher" = "extraction_3"
+  )
+
+# Restaurant & Chef ----
+# Spread each recipient's `extraction` list into `extraction_*` columns, pull
+# the category name, rank, and year into their own columns, then name the
+# columns that remain.
+restaurant_and_chef <- all_data |>
+  dplyr::filter(category == "Restaurant & Chef") |>
+  tidyr::unnest_wider(extraction, names_sep = "_") |>
+  # Find where the category is so we can get rid of those values.
+  use_values(
+    new_col = "full_category_name",
+    cols = tidyselect::starts_with("extraction_"),
+    values = "Restaurant & Chef"
+  ) |>
+  use_values(
+    new_col = "rank",
+    cols = tidyselect::starts_with("extraction_"),
+    values = ranks
+  ) |>
+  use_values(
+    new_col = "year",
+    cols = tidyselect::starts_with("extraction_"),
+    values = years
+  ) |>
+  dplyr::select(
+    "subcategory" = "extraction_1",
+    "rank", "year", "name",
+    "restaurant" = "extraction_2",
+    "city" = "extraction_3"
+  )
diff --git a/data/curated/jamesbeard/instructions.md b/data/curated/jamesbeard/instructions.md
@@ -0,0 +1,30 @@
+## Prepare the dataset
+
+These instructions are for preparing a dataset using the R programming language.
+We hope to provide instructions for other programming languages eventually.
+
+If you have not yet set up your computer for submitting a dataset, please see the full instructions at <https://github.com/rfordatascience/tidytuesday/blob/master/.github/pr_instructions.md>.
+
+1.  `cleaning.R`: Modify the `cleaning.R` file to get and clean the data.
+    -   Write the code to download and clean the data in `cleaning.R`.
+    -   If you're getting the data from a github repo, remember to use the 'raw' version of the URL.
+    -   This script should result in one or more data.frames, with descriptive variable names (eg `players` and `teams`, not `df1` and `df2`).
+
+2.  `saving.R`: Use `saving.R` to save your datasets. This process creates both the `.csv` file(s) and the data dictionary template file(s) for your datasets. **Don't save the CSV files using a separate process because we also need the data dictionaries.**
+    -   Run the first line of `saving.R` to create the functions we'll use to save your dataset.
+    -   Provide the name of your directory as `dir_name`.
+    -   Use `ttsave()` for each dataset you created in `cleaning.R`, substituting the name of the dataset for `YOUR_DATASET_DF`.
+
+3.  `{dataset}.md`: Edit the `{dataset}.md` files to describe your datasets (where `{dataset}` is the name of the dataset). These files are created by `saving.R`. There should be one file for each of your datasets. You most likely only need to edit the "description" column to provide a description of each variable.
+
+4.  `intro.md`: Edit the `intro.md` file to describe your dataset. You don't need to add a `# Title` at the top; this is just a paragraph or two to introduce the week.
+
+5.  Find at least one image for your dataset. These often come from the article about your dataset. If you can't find an image, create an example data visualization, and save the images in your folder as `png` files.
+
+6.  `meta.yaml`: Edit `meta.yaml` to provide information about your dataset and how we can credit you. You can delete lines from the `credit` block that do not apply to you.
+
+### Submit your pull request with the data
+
+1.  Commit the changes in this folder to your branch. In RStudio, you can do this on the "Git" tab (the "Commit" button).
+
+2.  Submit a pull request to <https://github.com/rfordatascience/tidytuesday>. In R, you can do this with `usethis::pr_push()`, and then follow the instructions in your browser.
diff --git a/data/curated/jamesbeard/intro.md b/data/curated/jamesbeard/intro.md
@@ -0,0 +1,9 @@
+This week we're exploring the [James Beard Awards](https://www.jamesbeard.org/awards)! [Wikipedia tells us](https://en.wikipedia.org/wiki/James_Beard_Foundation_Award):
+
+> The James Beard Foundation Awards are annual awards presented by the James Beard Foundation to recognize chefs, restaurateurs, authors and journalists in the United States.
+
+Thank you to [PythonCoderUnicorn](https://github.com/PythonCoderUnicorn) for the dataset suggestion!
+
+- How have the subcategories of the various awards changed over time?
+- Has anybody won in multiple categories?
+- Which restaurants have the most winners? Which newspapers or networks?
diff --git a/data/curated/jamesbeard/james_beard_logo.png b/data/curated/jamesbeard/james_beard_logo.png