-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathget_data.R
67 lines (53 loc) · 2.41 KB
/
get_data.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
# Downloads the Helsinki bicycle counter data, keeps the "Baana" counter,
# joins it with weather observations fetched through simplefmi and writes the
# result to "data.feather".
#
# Run this script in a fresh R session. It intentionally does not call
# rm(list = ls()): that would wipe the workspace of whoever sources the
# script, while leaving loaded packages and options untouched.
#
# simplefmi is available on GitHub:
#   devtools::install_github("paasim/simplefmi")
# An API key is also required to use the FMI weather API, see
# http://en.ilmatieteenlaitos.fi/open-data-manual-fmi-wfs-services
library(tidyverse)
library(forcats)
library(stringr)
library(feather)
library(lubridate)
library(simplefmi)
# Download the cycling data. The date column is kept as a string and every
# other column is parsed as an integer.
data_url <- "http://www.hel.fi/hel2/tietokeskus/data/helsinki/ksv/Helsingin_pyorailijamaarat.csv"
df1 <- read_csv2(
  data_url,
  col_names = TRUE,
  col_types = cols("Päivämäärä" = "c", .default = "i"),
  locale = locale(encoding = "ISO-8859-1", decimal_mark = ",")
)
# Keep only the Baana counter, drop every observation before 2014 (the format
# before that is not consistent with the format after that) and remove
# duplicated rows. The year is the first run of four digits in the date string.
df_onlybaana <- df1 %>%
  select(one_of(c("Päivämäärä", "Baana"))) %>%
  setNames(c("date", "count")) %>%
  filter(as.numeric(str_extract(date, "(?=\\D*\\d+\\D*)\\d{4}")) >= 2014) %>%
  distinct()
# Build a translator used to turn parts of the date string into other values.
#
# vals:  replacement values, coerced to character; one per entry of targs
# targs: patterns to look for; they become the names of the replacement
#        vector that is handed to str_replace_all()
#
# Returns a function of a character vector x in which every match of a
# pattern from targs is replaced by the corresponding entry of vals.
gen_map <- function(vals, targs) {
  replacements <- setNames(as.character(vals), targs)
  function(x) {
    str_replace_all(x, replacements)
  }
}
# Names of the columns the date string is split into.
varnames <- c("wkday", "day", "mon", "year", "hr")
# Date names of the Finnish locale and weekday abbreviations of the default
# locale.
loc <- locale(date_names = date_names_lang("fi"))
days_en <- locale()[["date_names"]][["day_ab"]]
# Translators: Finnish weekday abbreviation -> default-locale abbreviation,
# and Finnish month name -> month number. str_sub(end = -6) drops the last
# five characters of each Finnish month name before it is used as a pattern.
day_map <- gen_map(days_en, loc[["date_names"]][["day_ab"]])
mon_map <- gen_map(
  seq_len(12),
  str_sub(loc[["date_names"]][["mon"]], end = -6)
)
# Build the per-day series: split the date string into the columns named in
# varnames, rebuild a Date from year / month / day, translate the weekday into
# a factor and sum the counts that share the same date. The "hr" part is
# dropped by transmute().
df_dates <- separate(df_onlybaana, "date", varnames, sep = " ") %>%
  transmute(date = make_date(year, mon_map(mon), day),
            # levels: days_en with its first entry moved to the end
            wkday = parse_factor(day_map(wkday), levels = days_en[c(2:7, 1)]),
            count = count) %>%
  group_by(date, wkday) %>%
  summarise(count = sum(count)) %>%
  ungroup()
n <- nrow(df_dates)
# Fail early with a clear message: without any rows there is no date range to
# fetch weather data for.
if (n == 0L) {
  stop("no observations left after filtering, cannot build the data set",
       call. = FALSE)
}
# add zero centered index for days; seq_len() instead of 1:n, because 1:n
# yields c(1, 0) when n is 0
df_dates$day_ind <- seq_len(n) - round(n / 2)
# Get the respective weather data.
# The FMI API key is read from the first line of the file "apik". Reading a
# single line keeps a trailing blank line in that file from turning the key
# into a vector of several strings; warn = FALSE silences the warning about a
# missing final newline.
fmi_apikey <- readLines("apik", n = 1L, warn = FALSE)
if (length(fmi_apikey) != 1L || !nzchar(fmi_apikey)) {
  stop("the API key file \"apik\" is empty", call. = FALSE)
}
# station id for kaisaniemi from
# http://en.ilmatieteenlaitos.fi/observation-stations
station_id <- "100971"
# request the observations between the dates in the first and the n-th row of
# df_dates
weather <- fmi_download(fmi_apikey, df_dates$date[1], df_dates$date[n],
                        station_id, hourly = FALSE)
# Combine the data frames: the weather observations are the left side of the
# join, so every weather row is kept and the counts are matched by date.
df_all <- weather %>%
  mutate(rain = pmax(rain, 0)) %>% # rain = -1 => no rain, so clamp at zero
  left_join(df_dates, by = "date") %>%
  select(date, wkday, day_ind, temp, rain, count) # reorder the columns
# store the combined data set
write_feather(df_all, "data.feather")