-
Notifications
You must be signed in to change notification settings - Fork 0
/
basics.R
118 lines (75 loc) · 2.82 KB
/
basics.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
# load packages ---------
library(tidyverse)
library(here)
library(skimr)
library(janitor)
# read in data -----------
# The file path is assembled by here() from the "data" folder and file name,
# then piped straight into read_csv().
beaches <- here("data", "sydneybeaches.csv") %>%
  read_csv()
# exploring the data --------------
# View() opens a data viewer, so it is only called in an interactive session;
# a non-interactive run (e.g. Rscript) skips it and carries on with the
# text summaries below.
if (interactive()) {
  View(beaches)
}

# Text summaries of the raw data, from coarse to detailed.
dim(beaches)      # dimensions of the table
str(beaches)      # base R structure listing
glimpse(beaches)  # tidyverse structure listing
head(beaches)     # first rows
tail(beaches)     # last rows
summary(beaches)  # per-column summary
skim(beaches)     # skimr per-column summary
# tidying columns -------------------
glimpse(beaches)

# Preview the column names in upper and lower case. The results are only
# printed, never assigned, so `beaches` itself is unchanged.
# rename_with() is used in place of the superseded select_all().
rename_with(beaches, toupper)
rename_with(beaches, tolower)

# clean_names() comes from janitor (loaded at the top of the script)
cleanbeaches <- clean_names(beaches)
names(cleanbeaches)

# for rename use newname = oldname
cleanbeaches <- rename(cleanbeaches, beachbugs = enterococci_cfu_100ml)

# reorder columns: council, site and beachbugs first; everything() keeps
# all remaining columns after them (printed only, not assigned)
select(cleanbeaches, council, site, beachbugs, everything())

# pipe %>%
# Same cleaning steps as above, written as one chain starting from the raw
# data; this overwrites the step-by-step `cleanbeaches` built earlier.
cleanbeaches <- beaches %>%
  clean_names() %>%
  rename(beachbugs = enterococci_cfu_100ml)

# NOTE(review): this path is relative to the current working directory,
# whereas the other file paths in this script go through here("data", ...).
# Confirm that is the intended output location.
write_csv(cleanbeaches, "cleanbeaches.csv")
# sorting and filtering ------------
# which beach has the most extreme levels of bugs
# (all rows, highest beachbugs first)
worstbugs <- arrange(cleanbeaches, desc(beachbugs))

# same ordering, restricted to the Coogee Beach rows
worstcoogee <- arrange(
  filter(cleanbeaches, site == "Coogee Beach"),
  desc(beachbugs)
)

# let's compare max bug values across different beaches
# (printed only: Coogee and Bondi rows, highest beachbugs first)
arrange(
  filter(cleanbeaches, site %in% c("Coogee Beach", "Bondi Beach")),
  desc(beachbugs)
)
# group_by and summarise ------------
# Per-site summary statistics of beachbugs; NA values are dropped from each
# statistic via na.rm = TRUE. The result is printed, not assigned.
cleanbeaches %>%
  group_by(site) %>%
  summarise(
    maxbug = max(beachbugs, na.rm = TRUE),
    meanbugs = mean(beachbugs, na.rm = TRUE),
    medianbugs = median(beachbugs, na.rm = TRUE),
    sdbugs = sd(beachbugs, na.rm = TRUE)
  )

# let's compare councils
cleanbeaches %>% distinct(council)

# Mean and median beachbugs for every council/site combination.
# summarise() only peels off the last grouping variable, so without the
# explicit ungroup() the result would stay grouped by council and any later
# verb applied to councilbysite would silently operate per council.
councilbysite <- cleanbeaches %>%
  group_by(council, site) %>%
  summarise(
    meanbugs = mean(beachbugs, na.rm = TRUE),
    medianbugs = median(beachbugs, na.rm = TRUE)
  ) %>%
  ungroup()
# compute new variables ------------
glimpse(cleanbeaches)

# split date into day / month / year columns; remove = FALSE keeps the
# original date column alongside the new ones
testdate <- cleanbeaches %>%
  separate(date, c("day", "month", "year"), remove = FALSE)

# paste council and site into one council_site column, keeping the originals
# (printed only, not assigned)
cleanbeaches %>% unite(council_site, council:site, remove = FALSE)

# use mutate to transform the beachbugs data
summary(cleanbeaches)
# NOTE(review): log() returns -Inf for a value of 0 -- confirm whether
# beachbugs can be zero and whether that is acceptable downstream.
cleanbeaches %>% mutate(logbeachbugs = log(beachbugs))

# use mutate to compute new numeric variable
# NOTE(review): lag() here runs over the whole table in its current row
# order, so the difference is taken against the previous row regardless of
# site -- confirm that is intended.
cleanbeaches %>% mutate(beachbugsdiff = beachbugs - lag(beachbugs))

# use mutate to compute new logical variable
cleanbeaches %>% mutate(buggier = beachbugs > mean(beachbugs, na.rm = TRUE))

# overall mean of beachbugs, ignoring NAs
meanbugs <- mean(cleanbeaches$beachbugs, na.rm = TRUE)

# Combine the steps above into one pipeline.
# buggier_all compares each row with the overall mean; buggier_site is
# computed after group_by(site), so it compares each row with its own
# site's mean. The final ungroup() returns a plain tibble so that later
# operations on cleanbeaches_new are not silently performed per site.
cleanbeaches_new <- cleanbeaches %>%
  separate(date, c("day", "month", "year"), remove = FALSE) %>%
  mutate(logbeachbugs = log(beachbugs)) %>%
  mutate(beachbugsdiff = beachbugs - lag(beachbugs)) %>%
  mutate(buggier_all = beachbugs > mean(beachbugs, na.rm = TRUE)) %>%
  group_by(site) %>%
  mutate(buggier_site = beachbugs > mean(beachbugs, na.rm = TRUE)) %>%
  ungroup()
# write cleaned data to .csv -------------
# Save the augmented table under the "data" folder, with the path built by here().
cleanbeaches_new %>%
  write_csv(here("data", "cleanbeaches_new.csv"))