forked from andrewheiss/Harry-Potter-aggression
-
Notifications
You must be signed in to change notification settings - Fork 0
/
harry_potter_aggression.R
180 lines (152 loc) · 6.68 KB
/
harry_potter_aggression.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
#----------------
# Load packages
#----------------
library(RCurl)
library(dplyr)
library(tidyr)
library(stringr)
library(ggplot2)
library(grid)
library(Cairo)
#-------------------
# Helper functions
#-------------------
# Nice clean, stripped-down ggplot2 theme built on top of theme_bw().
#
# Args:
#   base_size:   base font size, forwarded to theme_bw()
#   base_family: base font family, forwarded to theme_bw()
#
# Returns:
#   The theme_bw() theme with the overrides below added to it: border, axis
#   lines, grid lines and ticks are blanked, panel and strip backgrounds are
#   white, and the legend sits at the bottom.
theme_clean <- function(base_size=12, base_family="Source Sans Pro Light") {
  # Font used for every title-like element (titles, axis titles, facet strips)
  semibold <- "Source Sans Pro Semibold"

  theme_bw(base_size, base_family) +
    theme(panel.background=element_rect(fill="#ffffff", colour=NA),
          axis.title.x=element_text(vjust=-0.2),
          axis.title.y=element_text(vjust=1.5),
          title=element_text(vjust=1.2, family=semibold),
          panel.border=element_blank(), axis.line=element_blank(),
          panel.grid=element_blank(), axis.ticks=element_blank(),
          legend.position="bottom",
          axis.title=element_text(size=rel(0.8), family=semibold),
          strip.text=element_text(size=rel(1), family=semibold),
          strip.background=element_rect(fill="#ffffff", colour=NA),
          # panel.margin.y was deprecated in ggplot2 2.2.0; panel.spacing.y is
          # its replacement and takes the same unit
          panel.spacing.y=unit(1.5, "lines"))
}
# Pad each label with a trailing space to fake a right margin/padding.
#
# Args:
#   x: vector of labels (anything paste0() can turn into text)
#
# Returns:
#   Character vector with the padding appended to every element.
add.spaces <- function(x) {
  # paste0() already yields a character vector, so no extra coercion is needed
  padded <- paste0(x, " ")
  padded
}
# Pull every piece of text that belongs to one book.
#
# Args:
#   x: index into the global `nice.books` vector (callers pass the book as a
#      number/factor rather than as its title)
#
# Returns:
#   The `text` column of the rows of the global `full.text` whose `book`
#   equals the selected title.
get_chunk <- function(x) {
  # Translate the index into the human-readable title used in full.text
  wanted <- nice.books[x]
  matching.rows <- filter(full.text, book==wanted)
  matching.rows$text
}
# Flag which elements of a text vector mention a name.
#
# Args:
#   x:     character vector of text to search
#   regex: regular expression describing the name
#
# Returns:
#   Logical vector, one entry per element of `x`, TRUE where `regex` matches.
#   Callers sum() the result to obtain a count of matching elements.
count.name <- function(x, regex) {
  grepl(pattern=regex, x=x)
}
# Normalise scraped book titles so they match the aggression data.
#
# Args:
#   x: character vector of titles as they appear in the scraped full text
#
# Returns:
#   Lower-cased titles with the first "Harry Potter and " removed and
#   "sorcerers" rewritten as "sorcerer's".
clean.book.names <- function(x) {
  without.prefix <- sub("Harry Potter and ", "", x)
  lowered <- tolower(without.prefix)
  gsub("sorcerers", "sorcerer's", lowered)
}
#------------
# Load data
#------------
# URL to the fantastic data (CSV export of the aggression spreadsheet)
data.url <- "https://docs.google.com/spreadsheets/d/1heSMqYzYnL5bS0xiZ2waReUMLKIxHce5MMsQxzhgaA8/export?format=csv&gid=384008866"

# Book names, used as labels for the per-book columns in the order those
# columns appear in the spreadsheet
nice.books <- c("The Sorcerer's Stone", "The Chamber of Secrets",
                "The Prisoner of Azkaban", "The Goblet of Fire",
                "The Order of the Phoenix", "The Half-Blood Prince",
                "The Deathly Hallows")

# Load and rearrange the data into long form: one row per character and book.
# read.csv(text=...) opens and closes its own connection, so nothing is left
# open the way a bare textConnection() would be.
hp <- read.csv(text=getURL(data.url), as.is=TRUE) %>%
  select(1:13) %>%  # Only get the first 13 columns
  select(-c(g_e_m_n, evil, creature, tot)) %>%  # Get rid of other columns
  gather(book, aggressions, -c(Name, abb)) %>%
  # gather() may return `book` as character (no levels) or as a factor
  # depending on the tidyr version; first-appearance order equals the column
  # order in both cases, so derive the levels from that.
  mutate(book = factor(book, levels=unique(as.character(book)),
                       labels=nice.books, ordered=TRUE),
         book.rev = factor(book, levels=rev(levels(book)), ordered=TRUE))
# Load and clean full text from Harry Potter books: lower-case the text, map
# the scraped titles onto the same ordered factor used by `hp`, and drop rows
# whose text is empty or a lone newline
full.text <- read.csv("hp-corpus/hp-full.csv", stringsAsFactors=FALSE) %>%
  mutate(text = tolower(text)) %>%
  mutate(book = factor(clean.book.names(book),
                       levels=tolower(levels(hp$book)),
                       labels=levels(hp$book),
                       ordered=TRUE)) %>%
  filter(text != "", text != "\n")

# Load search strings (the abb and search columns are not needed)
searches <- read.csv("names_with_regex.csv", stringsAsFactors=FALSE) %>%
  select(-abb, -search)
# Calculate book length: number of rows (paragraphs) of full.text per book
num.paragraphs <- full.text %>% group_by(book) %>% summarize(paragraphs = n())

# For every character/book pair, count the paragraphs of that book in which
# the character's regex matches.
# count.name() wraps grepl(), which is already vectorised over the text, so it
# is called once per row on the whole chunk instead of once per paragraph via
# sapply(). That is much faster and also avoids sapply() returning list() for
# an empty chunk, which sum() cannot handle.
mentions <- hp %>%
  left_join(searches, by="Name") %>%
  rowwise() %>%
  summarize(mentions = sum(count.name(get_chunk(book), regex)),
            Name = Name, book = book) %>%
  mutate(book = factor(book, labels=nice.books, ordered=TRUE))
# Combine all the fancy new data: attach paragraph counts and mention counts
# to the aggression data, derive the per-mention rates, and keep only rows
# with more than 10 mentions and fewer than 1.1 aggressions per mention
hp.full <- hp %>%
  left_join(num.paragraphs, by="book") %>%
  left_join(mentions, by=c("Name", "book")) %>%
  mutate(agg.per.mention = ifelse(mentions == 0, 0, aggressions / mentions)) %>%
  mutate(mentions.per.p = mentions / paragraphs) %>%
  mutate(agg.weighted = aggressions / mentions.per.p) %>%
  filter(agg.per.mention < 1.1, mentions > 10)

# Save the combined data (without the plotting-only reversed book factor)
write.csv(x=select(hp.full, -book.rev),
          file="harry_potter_aggression_full.csv",
          row.names=FALSE)
#------------
# Plot data
#------------
# Find most aggressive characters: the five largest totals across all books
most.aggressive <- hp.full %>%
  group_by(Name) %>%
  summarize(Aggressions = sum(aggressions)) %>%
  arrange(desc(Aggressions)) %>%
  head(5)

# Rearrange data for plotting: keep only those five characters, order them by
# rank, and build a reversed, space-padded copy of the name for the legend
plot.data <- hp.full %>%
  filter(Name %in% most.aggressive$Name) %>%
  mutate(Name = factor(Name, levels=most.aggressive$Name, ordered=TRUE)) %>%
  mutate(Name.rev = factor(Name, levels=rev(levels(Name)),
                           labels=add.spaces(rev(levels(Name))),
                           ordered=TRUE)) %>%
  group_by(Name.rev, book.rev) %>%
  summarize(aggr = sum(aggressions))

# Plot top 5 as dodged horizontal bars, with thin white rules every 10
# aggressions standing in for grid lines
nice.plot <- ggplot(plot.data, aes(x=book.rev, y=aggr, fill=Name.rev)) +
  geom_bar(stat="identity", position="dodge") +
  geom_hline(yintercept=seq(10, 50, by=10), colour="#ffffff", size=0.25) +
  coord_flip() +
  labs(x=NULL, y="Instances of aggression",
       title="Most aggressive characters in the Harry Potter series") +
  scale_fill_manual(values=c("#915944", "#9BAB9C", "#692521", "#CAAA17", "#B51616"),
                    guide=guide_legend(reverse=TRUE), name="") +
  theme_clean() +
  theme(legend.key.size = unit(0.5, "lines"))

# Save plot as PNG and as a Cairo PDF
ggsave(filename="images/top_5.png", plot=nice.plot,
       width=7, height=5, units="in")
ggsave(filename="images/top_5.pdf", plot=nice.plot,
       width=7, height=5, units="in", device=cairo_pdf)
# Plot the characters that are most likely to be aggressive when mentioned:
# the 15 highest mean aggressions-per-mention, best first
plot.data <- hp.full %>%
  group_by(Name) %>%
  summarize(aggr = mean(agg.per.mention)) %>%
  arrange(desc(aggr)) %>% head(15) %>%
  mutate(Name = factor(Name, levels=Name, ordered=TRUE),
         # `levels` is spelled out in full; the previous `level=` only worked
         # through partial argument matching
         Name.rev = factor(Name, levels=rev(levels(Name)), ordered=TRUE)) %>%
  select(-Name)

# Get the data from the original top 5 aggressive characters. Their labels
# are space-padded by add.spaces(), so they stay distinct factor levels even
# when the same character is also among the 15 above.
original.plot <- hp.full %>%
  filter(Name %in% most.aggressive$Name) %>%
  mutate(Name = factor(Name, levels=most.aggressive$Name, ordered=TRUE),
         Name.rev = factor(Name, levels=rev(levels(Name)),
                           labels=add.spaces(rev(levels(Name))), ordered=TRUE)) %>%
  group_by(Name.rev) %>%
  summarize(aggr = mean(agg.per.mention))

# Combine the two sets of data; the 15 per-mention rows come first, then the
# original top 5
combined.plot.data <- rbind(plot.data, original.plot)

# Plot! The red line at 15.5 is meant to separate the first 15 bars from the
# original top 5 (assumes rbind() keeps the two level sets in that order).
# NOTE(review): ggplot2 >= 3.4.0 prefers `linewidth` over `size` for lines;
# `size` is kept so older installs still honour it.
full.plot <- ggplot(combined.plot.data, aes(x=Name.rev, y=aggr)) +
  geom_bar(stat="identity", position="dodge") +
  geom_hline(yintercept=seq(0.1, 0.5, by=0.1), colour="#ffffff", size=0.25) +
  geom_vline(xintercept=15.5, colour="red", size=1) +
  labs(x=NULL, y="Probability of being aggressive when mentioned") +
  coord_flip() + theme_clean()

# Save plot as PNG and as a Cairo PDF
ggsave(full.plot, filename="images/adjusted.png", width=7, height=5, units="in")
ggsave(full.plot, filename="images/adjusted.pdf", width=7, height=5, units="in",
       device=cairo_pdf)