forked from ourcodingclub/CC-Qualit
-
Notifications
You must be signed in to change notification settings - Fork 0
/
qual_example_script.R
198 lines (148 loc) · 6.23 KB
/
qual_example_script.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
# Packages ----
library(ggplot2)
library(dplyr)
library(tidyr)
library(RColorBrewer)
library(tidytext)
library(R.utils)
library(wordcloud)
# set working directory ----
setwd("~/Downloads/CC-Qualit-master")
# Load data ----
# The survey responses
sust_data <- read.csv("sust_behaviour.csv")
# A lookup table which connects each column in `sust_data` to the actual question on the survey
sust_lookup <- read.csv("sust_lookup.csv")
# A list of boring and non-useful words, bundled with `tidytext`
data(stop_words)
# Inspect the data ----
head(sust_data)
str(sust_data)
head(sust_lookup)
# Inspect a column
unique(sust_data$sustainability_daily_think)
str(sust_data$sustainability_daily_think)
# Fix data ----
# Make column an ordered factor
sust_data$sustainability_daily_think <- factor(sust_data$sustainability_daily_think,
levels=c("Never", "Rarely", "Sometimes", "Often", "All the time"),
ordered=TRUE)
# Remove NAs from a column
sust_data$energy_action_n <- nchar(as.character(sust_data$energy_action))
# Diverging stacked bar chart ----
# Create a summary dataframe of likert responses to a single question
sust_think_summ_wide <- sust_data %>%
group_by(gender, sustainability_daily_think) %>%
tally() %>%
mutate(perc = n / sum(n) * 100) %>%
dplyr::select(-n) %>%
group_by(gender) %>%
spread(sustainability_daily_think, perc)
sust_think_summ_hi_lo <- sust_think_summ_wide %>%
mutate(midlow = Sometimes / 2,
midhigh = Sometimes / 2) %>%
dplyr::select(gender, Never, Rarely, midlow, midhigh, Often, `All the time`) %>%
gather(key = response, value = perc, 2:7) %>%
`colnames<-`(c("gender", "response", "perc"))
# Split data into high and low groups
sust_think_summ_hi <- sust_think_summ_hi_lo %>%
filter(response %in% c("All the time", "Often", "midhigh")) %>%
mutate(response = factor(response, levels = c("All the time", "Often", "midhigh")))
sust_think_summ_lo <- sust_think_summ_hi_lo %>%
filter(response %in% c("midlow", "Rarely", "Never")) %>%
mutate(response = factor(response, levels = c("Never", "Rarely", "midlow")))
# Use RColorBrewer to store a preset diverging colour palette as a vector of colour codes
legend_pal <- brewer.pal(name = "RdBu", n = 5)
# Duplicate the middle value, remember that "Sometimes" is actually two groups, "midhigh" and "midlow"
legend_pal <- insert(legend_pal, ats = 3, legend_pal[3])
# Replace the ugly white colour for "Sometimes" with a pleasant dishwater grey
legend_pal <- gsub("#F7F7F7", "#9C9C9C", legend_pal)
# Assign names to the vector based on the colours we want for each group
names(legend_pal) <- c("All the time", "Often", "midhigh", "midlow", "Rarely", "Never" )
# Make plot
ggplot() +
geom_bar(data=sust_think_summ_hi, aes(x = gender, y=perc, fill = response), stat="identity") +
geom_bar(data=sust_think_summ_lo, aes(x = gender, y=-perc, fill = response), stat="identity") +
geom_hline(yintercept = 0, color =c("black")) +
scale_fill_manual(values = legend_pal,
breaks = c("All the time", "Often", "midhigh", "Rarely", "Never"),
labels = c("All the time", "Often", "Sometimes", "Rarely", "Never")) +
coord_flip() +
labs(x = "Gender", y = "Percentage of respondents (%)") +
ggtitle(sust_lookup$survey_question[sust_lookup$column_title == "sustainability_daily_think"]) +
theme_classic()
# Basic stacked bar chart ----
# Count the number of actions performed by counting characters
sust_data$energy_action_n <- nchar(as.character(sust_data$energy_action))
# Create a colour palette
male_female_pal <- c("#0389F0", "#E30031")
names(male_female_pal) <- c("Male", "Female")
# Create plot
ggplot(sust_data, aes(x =energy_action_n, fill = gender)) +
geom_bar() +
scale_fill_manual(values = male_female_pal) +
scale_x_continuous(breaks = seq(1:8)) +
theme_classic()
# Bubble plot ----
# Create tally by age and how often thinking about actions
sust_bubble <- sust_data %>%
group_by(age, sustainability_daily_think) %>%
tally()
# Create plot
ggplot(sust_bubble, aes(x = age, y = sustainability_daily_think)) +
geom_point(aes(size = n)) +
theme_classic()
# Text mining ----
# Explore a column
sust_data$energy_action_comment
# Create gathered comments
sust_comm_gather <- sust_data %>%
dplyr::select(id, gender, energy_action_comment,
food_action_comment, water_action_comment,
waste_action_comment, other_action_comment) %>%
gather(action, comment, -id, -gender) %>%
mutate(comment = as.character(comment))
# Transform to one word per row and remove boring words
sust_comm_tidy <- sust_comm_gather %>%
group_by(gender) %>%
unnest_tokens(output = comment_word,
input = comment) %>%
anti_join(stop_words, by = c("comment_word" = "word")) %>%
count(comment_word, sort = TRUE) %>%
filter(n > 10) %>%
filter(!is.na(comment_word))
# Create plot by gender
ggplot(sust_comm_tidy, aes(x = comment_word, y = n, fill = gender)) +
geom_bar(stat = "identity") +
coord_flip() +
scale_fill_manual(values = male_female_pal) +
theme_classic()
# The same, but only for one question
tidy_energy_often_comment <- sust_data %>%
mutate(energy_action_comment = as.character(energy_action_comment)) %>%
unnest_tokens(output = energy_action_comment_word,
input = energy_action_comment) %>%
anti_join(stop_words, by = c("energy_action_comment_word" = "word")) %>%
count(energy_action_comment_word, sort = TRUE)
tidy_energy_often_comment_summ <- tidy_energy_often_comment %>%
filter(n > 10) %>%
filter(!is.na(energy_action_comment_word)) %>%
mutate(energy_action_comment_word = reorder(energy_action_comment_word, n ))
ggplot(tidy_energy_often_comment_summ, aes(x = energy_action_comment_word, y = n)) +
geom_col() +
xlab(NULL) +
coord_flip() +
theme_classic()
# Wordcloud
tidy_energy_often_comment %>%
with(wordcloud(words = energy_action_comment_word, freq = n, max.words = 100))
# Statistical analysis ----
# Chi squared
gender_think_chi <- chisq.test(sust_data$gender, sust_data$sustainability_daily_think)
gender_think_chi
# Poisson regression
energy_action_pois <- glm(energy_action_n ~ gender, family = "poisson", data = sust_data)
summary(energy_action_pois)
# Multi-variate poisson
energy_action_pois_int <- glm(energy_action_n ~ gender * age, family = "poisson", data = sust_data)
summary(energy_action_pois_int)