forked from herndonj/dvs_cal
-
Notifications
You must be signed in to change notification settings - Fork 0
/
01_gather_and_wrangle.R
144 lines (126 loc) · 7.69 KB
/
01_gather_and_wrangle.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
# ---
# title: "Create CSV file of Future Workshop"
# subtitle: "Data are from the LibCal API, parsed by rvest and dplyr"
# date: "`r Sys.Date()`"
# output: html_notebook
# ---
# Undertook this revision because the integration of online and in-person workshops made Joel's script ineffective. Nonetheless, borrowed heavily from his previous work.
# Run this script to harvest data from the SpringShare LibCal API.
# This script is called by `02_make_main_workshop_list.Rmd`
Sys.setenv(TZ="America/New_York")
library(rvest)
library(clock)
library(tidyverse)
library(fs)
# Download the listing of future workshops (HTML) from the LibCal API.
dvs_cal <- read_html(
  "https://api3.libcal.com/api_events.php?iid=971&m=upc&cid=3819&c=&d=25858&l=50&target=_blank"
)

## Wrangle data
# Anchor elements found under `.s-lc-ea-treg`, then the `href` of each one.
nregistration <- dvs_cal |> html_nodes(".s-lc-ea-treg a")
registration_list <- nregistration |> html_attr("href")

# Parse every `.cat25858` node as an HTML table. The result is a list with one
# table per matched node.
by_workshop <- html_table(html_nodes(dvs_cal, ".cat25858"))
# Reshape the scraped tables into a wide data frame, one row per workshop.
#
# Each `.cat25858` table arrives in long form: a field label in `X1` and its
# value in `X2`. The API delivers inconsistent information based on workshop
# type (i.e. online workshops report less data than in-person workshops), so
# every row is tagged with the position of the table it came from and the pivot
# is done per workshop. A field that a workshop does not report becomes NA on
# that workshop's row. Pooling all values per field and unnesting them instead
# only works when every workshop reports exactly the same fields: otherwise the
# unnest fails on a size mismatch, or a field reported by a single workshop is
# silently recycled onto every row.
my_df <- dvs_cal |>
  html_nodes(".cat25858") |>
  html_table() |>
  # `.id` records which table (workshop) each long-format row belongs to.
  bind_rows(.id = "ronum") |>
  # The "Location:" field is not carried into the wide table.
  filter(X1 != "Location:") |>
  pivot_wider(id_cols = ronum, names_from = X1, values_from = X2) |>
  janitor::clean_names() |>
  # Replace the table tag with a plain integer row number (first column).
  mutate(ronum = row_number()) |>
  # mutate(location = if_else(ronum == 24, by_workshop[[24]][["X2"]][5], "NA"), .after = campus) |>
  # mutate(location = if_else(ronum == 22, by_workshop[[22]][["X2"]][5], location)) |>
  select(!c(contains("campus"), categories))
# my_df <- tibble(by_workshop) %>%
# tidyr::unnest(cols = everything()) %>%
# mutate(X1 = if_else(X2 == "n/a", "Location:", X1)) %>% # this line
# mutate(X2 = if_else(X2 == "n/a", "Online", X2)) %>% # this line
# tidyr::pivot_wider(names_from = X1, values_from = X2, values_fn = list) %>% # `values_fn = list` suppresses warnings
# janitor::clean_names() %>%
# # select(-c("campus", "categories")) %>%
# select(!c(contains("campus"), categories)) %>%
# unnest(cols = everything())
## This was a workaround because row 22 was a HYBRID workshop!!!
# my_df <- bind_rows(
# tibble(by_workshop[1:20]) %>%
# tidyr::unnest(cols = everything()) %>%
# mutate(X1 = if_else(X2 == "n/a", "Location:", X1)) %>% # this line
# mutate(X2 = if_else(X2 == "n/a", "Online", X2)) %>% # this line
# tidyr::pivot_wider(names_from = X1, values_from = X2, values_fn = list) %>% # `values_fn = list` suppresses warnings
# janitor::clean_names() %>%
# select(!c(contains("campus"), categories)) %>%
# unnest(cols = everything())
# ,
# tibble(by_workshop[21]) %>%
# tidyr::unnest(cols = everything()) %>%
# mutate(X1 = if_else(X2 == "n/a", "Location:", X1)) %>% # this line
# mutate(X2 = if_else(X2 == "n/a", "Online", X2)) %>% # this line
# tidyr::pivot_wider(names_from = X1, values_from = X2, values_fn = list) %>% # `values_fn = list` suppresses warnings
# janitor::clean_names() %>%
# select(!c(contains("campus"), categories)) %>%
# unnest(cols = everything()) |>
# filter(location != "Online")
# ,
# tibble(by_workshop[22]) %>%
# tidyr::unnest(cols = everything()) %>%
# mutate(X1 = if_else(X2 == "n/a", "Location:", X1)) %>% # this line
# mutate(X2 = if_else(X2 == "n/a", "Online", X2)) %>% # this line
# tidyr::pivot_wider(names_from = X1, values_from = X2, values_fn = list) %>% # `values_fn = list` suppresses warnings
# janitor::clean_names() %>%
# select(!c(contains("campus"), categories)) %>%
# unnest(cols = everything())
# )
# Add the scraped registration URLs to the data frame as the `registration`
# column (values are matched to rows by position).
my_df[["registration"]] <- registration_list
# Transform the data to look just like the original from Joel.
# my_df <- my_df %>%
# mutate(workshop_id = str_extract(registration, "(?<=event/)\\d+")) %>%
# mutate(workshop_begins = date_time_parse(glue::glue("{date} {str_extract(time, '.*[ap]m(?= - )')}"),
# "America/New_York",
# format = "%a, %b %d, %Y %I:%M%p")) %>%
# mutate(begins_display = date_format(workshop_begins, format = "%I:%M %p")) %>%
# mutate(workshop_ends = date_time_parse(glue::glue("{date} {str_extract(time, '(?<= - ).*[ap]m')}"),
# "America/New_York",
# format = "%a, %b %d, %Y %I:%M%p")) %>%
# mutate(ends_display = date_format(workshop_ends, format = "%I:%M %p")) %>%
# mutate(time_flyer = str_to_lower(glue::glue("{begins_display} - {ends_display}"))) %>%
# mutate(duration = as.numeric(workshop_ends - workshop_begins) * 60) %>%
# mutate(workshop_duration_minutes = as.character(duration)) %>%
# # mutate(description = str_extract(description, ".*(?<=\\.)")) %>% # take only the first paragraph
# # mutate(description = str_remove(description, "^[Online] ")) |>
# mutate(description = str_remove(description, "\\[Online\\]\\s")) |>
# mutate(registration_link = str_extract(registration, ".*(?=\\?)")) %>%
# mutate(date = date_format(workshop_begins, format = "%F")) %>%
# mutate(day = date_format(workshop_begins, format = "%a")) %>%
# mutate(day_flyer = date_format(workshop_begins, format = "%a, %h %d")) %>%
# # mutate(online_in_person_flyer = if_else(location == "NA", "Online", "In-Person")) |>
# # mutate(location = if_else(location == "NA", "Online", "In-Person")) |>
# arrange(workshop_begins, workshop_id)
# Derive the id, schedule, display, and registration columns for each workshop,
# then sort the rows by start time.
#
# Reads the `registration`, `date`, `time`, and `description` columns of
# `my_df` and overwrites `my_df` with the enriched, sorted data frame.
my_df <- my_df |>
  mutate(
    # Numeric id that follows "event/" in the registration URL.
    workshop_id = str_extract(registration, "(?<=event/)\\d+"),
    # `time` holds "<start> - <end>"; each side is combined with `date` and
    # parsed as a date-time in the America/New_York zone.
    workshop_begins = date_time_parse(
      glue::glue("{date} {str_extract(time, '.*[ap]m(?= - )')}"),
      "America/New_York",
      format = "%a, %b %d, %Y %I:%M%p"
    ),
    begins_display = date_format(workshop_begins, format = "%I:%M %p"),
    workshop_ends = date_time_parse(
      glue::glue("{date} {str_extract(time, '(?<= - ).*[ap]m')}"),
      "America/New_York",
      format = "%a, %b %d, %Y %I:%M%p"
    ),
    ends_display = date_format(workshop_ends, format = "%I:%M %p"),
    time_flyer = str_to_lower(glue::glue("{begins_display} - {ends_display}")),
    # Ask for minutes explicitly. Subtracting two date-times lets R pick the
    # unit for the whole vector from the shortest difference, so a single
    # workshop shorter than an hour (or one lasting a day or more) would make a
    # fixed "* 60" conversion wrong for every row.
    duration = as.numeric(
      difftime(workshop_ends, workshop_begins, units = "mins")
    ),
    workshop_duration_minutes = as.character(duration)
  ) |>
  mutate(
    # Keep the leading text up to a final period (take only the first
    # paragraph).
    description = str_extract(description, ".*(?<=\\.)"),
    # Leading "[Online] " / "[In-person] " tag, stored exactly as matched
    # (brackets and trailing whitespace included); NA when neither is present.
    online_in_person_flyer = coalesce(
      str_extract(description, "^\\[Online\\]\\s"),
      str_extract(description, "^\\[In[\\s-]person\\]\\s")
    ),
    # Strip the tag from the description once it has been captured.
    description = str_remove(description, "^\\[Online\\]\\s"),
    description = str_remove(description, "^\\[In[\\s-]person\\]\\s")
  ) |>
  mutate(
    # URL without its query string (everything before the last "?").
    registration_link = str_extract(registration, ".*(?=\\?)"),
    # `date` is replaced by the parsed start date in ISO form.
    date = date_format(workshop_begins, format = "%F"),
    day = date_format(workshop_begins, format = "%a"),
    day_flyer = date_format(workshop_begins, format = "%a, %h %d"),
    # Drop a trailing "?hs=a" from the full registration URL.
    registration = str_remove(registration, "\\?hs=a$")
  ) |>
  arrange(workshop_begins, workshop_id)