Simpsons dataset for Tidy Tuesday #785

Open · wants to merge 6 commits into main
119 changes: 119 additions & 0 deletions data/curated/simpsons/cleaning.R
@@ -0,0 +1,119 @@
###_____________________________________________________________________________
### The Simpsons data!
### Script to clean the data sourced from Kaggle
###_____________________________________________________________________________

# packages
library(httr)
library(tidyverse)
library(jsonlite)

# Define the metadata URL and fetch it
metadata_url <- "https://www.kaggle.com/datasets/prashant111/the-simpsons-dataset/croissant/download"
response <- httr::GET(metadata_url)

# Ensure the request succeeded
if (httr::http_status(response)$category != "Success") {
stop("Failed to fetch metadata.")
}

# Parse the metadata
metadata <- httr::content(response, as = "parsed", type = "application/json")
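
# Optional: inspect the top level of the Croissant metadata to see which
# fields are available (str() truncates nested lists at max.level)
utils::str(metadata, max.level = 1)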

# Locate the ZIP file URL
distribution <- metadata$distribution
zip_url <- NULL

for (file in distribution) {
  # identical() avoids an error if encodingFormat is missing (NULL) for an entry
  if (identical(file$encodingFormat, "application/zip")) {
    zip_url <- file$contentUrl
    break
  }
}

if (is.null(zip_url)) {
stop("No ZIP file URL found in the metadata.")
}

# Download the ZIP file
temp_file <- base::tempfile(fileext = ".zip")
utils::download.file(zip_url, temp_file, mode = "wb")

# Unzip into a dedicated directory so stray files in tempdir() aren't
# picked up by the CSV search below (and so it is safe to delete recursively)
unzip_dir <- base::file.path(base::tempdir(), "simpsons_unzip")
utils::unzip(temp_file, exdir = unzip_dir)

# Locate the CSV files within the extracted contents (search subfolders too)
csv_file <- list.files(
  unzip_dir,
  pattern = "\\.csv$",
  full.names = TRUE,
  recursive = TRUE
)

if (length(csv_file) < 4) {
  stop("Expected four CSV files in the unzipped contents.")
}

# Read the CSVs into data frames. list.files() returns paths in alphabetical
# order, so the indices below map to: characters, episodes, locations,
# script_lines.
simpsons_characters <- read_csv(csv_file[1])
simpsons_episodes <- read_csv(csv_file[2])
simpsons_locations <- read_csv(csv_file[3])
simpsons_script_lines <- read_csv(csv_file[4])

# Explore the data
glimpse(simpsons_characters)
glimpse(simpsons_episodes)
glimpse(simpsons_locations)
glimpse(simpsons_script_lines)

# Clean up temporary files (optional)
unlink(c(temp_file, unzip_dir), recursive = TRUE)

###_____________________________________________________________________________
# Problems with the Data!

# The script lines are of great interest, but that file is too large for
# Tidy Tuesday. We need to reduce its size so that all of the files can be
# used together for a more robust analysis.
# Let's filter the episodes down to the years 2010-2016, and then keep only
# the script lines that correspond with those episodes.

###_____________________________________________________________________________

# filter episodes to those airing from 2010 onward (the data runs through 2016)
simpsons_episodes_filter <- simpsons_episodes |>
  filter(original_air_year >= 2010)

# get episode ids of interest
new_episode_id <- simpsons_episodes_filter$id

# filter script lines to only include lines for these episodes

simpsons_script_lines_filter <- simpsons_script_lines |>
filter(episode_id %in% new_episode_id)
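
# Optional sanity check (a rough in-memory estimate, not the CSV size):
# confirm the filtered script lines are substantially smaller before exporting
format(utils::object.size(simpsons_script_lines), units = "MB")
format(utils::object.size(simpsons_script_lines_filter), units = "MB")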

# export

# characters
write_csv(
x = simpsons_characters,
file = paste0(getwd(), "/data/curated/simpsons/simpsons_characters.csv")
)

# episodes
write_csv(
x = simpsons_episodes_filter,
file = paste0(getwd(), "/data/curated/simpsons/simpsons_episodes.csv")
)

# locations
write_csv(
x = simpsons_locations,
file = paste0(getwd(), "/data/curated/simpsons/simpsons_locations.csv")
)

# script lines
write_csv(
x = simpsons_script_lines_filter,
file = paste0(getwd(), "/data/curated/simpsons/simpsons_script_lines.csv")
)

################################################################################
### End ########################################################################
################################################################################
30 changes: 30 additions & 0 deletions data/curated/simpsons/instructions.md
@@ -0,0 +1,30 @@
## Prepare the dataset

These instructions are for preparing a dataset using the R programming language.
We hope to provide instructions for other programming languages eventually.

If you have not yet set up your computer for submitting a dataset, please see the full instructions at <https://github.com/rfordatascience/tidytuesday/blob/main/.github/pr_instructions.md>.

1. `cleaning.R`: Modify the `cleaning.R` file to get and clean the data.
- Write the code to download and clean the data in `cleaning.R`.
- If you're getting the data from a GitHub repo, remember to use the 'raw' version of the URL (see the example after this list).
- This script should result in one or more data.frames, with descriptive variable names (e.g. `players` and `teams`, not `df1` and `df2`).

2. `saving.R`: Use `saving.R` to save your datasets. This process creates both the `.csv` file(s) and the data dictionary template file(s) for your datasets. **Don't save the CSV files using a separate process because we also need the data dictionaries.**
- Run the first line of `saving.R` to create the functions we'll use to save your dataset.
- Provide the name of your directory as `dir_name`.
- Use `ttsave()` for each dataset you created in `cleaning.R`, substituting the name of each dataset for `YOUR_DATASET_DF`.

3. `{dataset}.md`: Edit the `{dataset}.md` files to describe your datasets (where `{dataset}` is the name of the dataset). These files are created by `saving.R`. There should be one file for each of your datasets. You most likely only need to edit the "description" column to provide a description of each variable.

4. `intro.md`: Edit the `intro.md` file to describe your dataset. You don't need to add a `# Title` at the top; this is just a paragraph or two to introduce the week.

5. Find at least one image for your dataset. These often come from the article about your dataset. If you can't find an image, create an example data visualization. Save the images in your folder as `png` files.

6. `meta.yaml`: Edit `meta.yaml` to provide information about your dataset and how we can credit you. You can delete lines from the `credit` block that do not apply to you.
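
For step 1, a hypothetical "raw" GitHub URL looks like this (owner, repo, and path are placeholders):

```
https://raw.githubusercontent.com/OWNER/REPO/main/path/to/data.csv
```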

### Submit your pull request with the data

1. Commit the changes with this folder to your branch. In RStudio, you can do this on the "Git" tab (the "Commit" button).

2. Submit a pull request to <https://github.com/rfordatascience/tidytuesday>. In R, you can do this with `usethis::pr_push()`, and then follow the instructions in your browser.
12 changes: 12 additions & 0 deletions data/curated/simpsons/intro.md
@@ -0,0 +1,12 @@
This week, we are going to explore the Simpsons Dataset from [Kaggle](https://www.kaggle.com/datasets/prashant111/the-simpsons-dataset). Many thanks to [Prashant Banerjee](https://www.kaggle.com/prashant111) for making this dataset available to the public. The Simpsons Dataset comprises four files that contain the characters, locations, episode details, and script lines for approximately 600 Simpsons episodes. Please note that the episodes and script lines have been filtered to include only episodes from 2010 to 2016, to keep file sizes within Tidy Tuesday guidelines!

Here is some history on the Simpsons Dataset from the author:

> Originally, this dataset was scraped by Todd Schneider for his post *The Simpsons by the Data*, for which he made the scraper available on GitHub. Kaggle user William Cukierski used the scraper to upload the data set, which has been rehosted here.

Thanks to [Nicolas Foss Ed.D., MS](https://github.com/nicolasfoss) for curating this week's dataset!

* Which character has the most spoken lines across all episodes, and how has their dialogue volume changed over the seasons?
* What are the most frequently used locations in the series, and do specific locations correspond to higher IMDb ratings for episodes?
* Is there a relationship between the number of U.S. viewers (in millions) and the IMDb ratings or votes for episodes?
* What are the most commonly used words or phrases in the dialogue across the series, and do they differ by character or location?
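
As a starting point for the first question, here is a minimal sketch; column names such as `raw_character_text` and `speaking_line` are assumptions based on the Kaggle script-lines file:

```r
library(tidyverse)

simpsons_script_lines <- read_csv("simpsons_script_lines.csv")

# Keep only rows flagged as spoken dialogue, then count lines per character.
# speaking_line is assumed to parse as a logical column.
simpsons_script_lines |>
  filter(speaking_line) |>
  count(raw_character_text, sort = TRUE) |>
  slice_head(n = 10)
```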
17 changes: 17 additions & 0 deletions data/curated/simpsons/meta.yaml
@@ -0,0 +1,17 @@
title: Donuts, Data, and D'oh - A Deep Dive into The Simpsons
article:
title: The Simpsons Dataset
url: https://www.kaggle.com/datasets/prashant111/the-simpsons-dataset
data_source:
title: The Simpsons Dataset
url: https://www.kaggle.com/datasets/prashant111/the-simpsons-dataset
images:
- file: simpsons_line.png
alt: >
Line graph titled 'The Simpsons Avg Views Over the Years' showing the average U.S. viewership for The Simpsons from its peak around 30 million viewers in the early 1990s to a gradual decline below 5 million by the 2010s. Note: this data reflects viewership in the United States only.
credit:
post: Nicolas Foss, Ed.D., MS with Iowa HHS
bluesky:
linkedin: https://www.linkedin.com/in/nicolas-foss
mastodon:
github: https://github.com/nicolasfoss
12 changes: 12 additions & 0 deletions data/curated/simpsons/saving.R
@@ -0,0 +1,12 @@
# Run this
source("data/curated/curation_scripts.R")

# Fill in the name of the folder you created in "curated", then run this.
dir_name <- "simpsons"

# Run this for each of your datasets, replacing YOUR_DATASET_DF with the name of
# a data.frame from cleaning.R.
ttsave(simpsons_characters, dir_name = dir_name)
ttsave(simpsons_episodes_filter, dir_name = dir_name)
ttsave(simpsons_locations, dir_name = dir_name)
ttsave(simpsons_script_lines_filter, dir_name = dir_name)
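
# Note (per instructions.md): ttsave() writes both the CSV and a data
# dictionary template ({dataset}.md) for each data frame saved above.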