-
Notifications
You must be signed in to change notification settings - Fork 3
/
stopwords_homer.R
179 lines (153 loc) · 6.3 KB
/
stopwords_homer.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
# ----------------+
# Stopwords: Homer
# ----------------+
source("~/Documents/github/helpers/R/packages_ab.R")
# Iliad -------------------------------------------------------------------

# Raw text, one element per line of the input file. Anything enclosed in
# curly braces is blanked out (presumably editorial markup -- TODO confirm),
# runs of whitespace are collapsed and a single leading space is dropped.
iliad <-
  read_lines("data/corpus_test/greek_iliade.txt") %>%
  str_replace_all("\\{.+?\\}", " ") %>%
  str_replace_all("\\s+", " ") %>%
  str_replace_all("^\\s", "")

# One row per token. The tibble is built from `iliad` only: the previous
# `iliad %>% data_frame(text = iliad)` passed the vector twice, which added a
# redundant `.` column holding the full line that unnest_tokens() then
# repeated for every token. tibble() replaces the deprecated data_frame().
iliad_df <-
  tibble(text = iliad) %>%
  unnest_tokens(word, text)

# Interactive check: word frequencies, most frequent first.
iliad_df %>% count(word, sort = TRUE)

# Lemmatised text, tokenised the same way as the raw text.
iliad_lem <-
  read_lines("data/corpus_test/greek_iliade_perseus_lem.txt") %>%
  str_replace_all("\n+", " ")

iliad_lem_df <-
  tibble(text = iliad_lem) %>%
  unnest_tokens(word, text)

# Interactive check: lemma frequencies, most frequent first.
iliad_lem_df %>% count(word, sort = TRUE)

# Top 100 word forms vs. the TLG top 100 -----------------------------------
iliad_top_100 <-
  iliad_df %>%
  count(word, sort = TRUE) %>%
  slice(1:100)

tlg_top_100 <- read_lines("data/tlg_data/tlg_top_100.txt")

# Frequent in the Iliad but absent from the TLG list.
iliad_100_not_in_tlg_100 <- setdiff(iliad_top_100$word, tlg_top_100)
write_lines(
  iliad_100_not_in_tlg_100,
  "data/homer_data/iliad_100_not_in_tlg_100.txt"
)

# Top 1000 word forms vs. the TLG top 1000 ---------------------------------
iliad_top_1000 <-
  iliad_df %>%
  count(word, sort = TRUE) %>%
  slice(1:1000)
write_lines(
  iliad_top_1000$word,
  "data/homer_data/iliad_top_1000.txt"
)

tlg_top_1000 <- read_lines("data/tlg_data/tlg_top_1000.txt")

iliad_1000_not_in_tlg_1000 <-
  setdiff(iliad_top_1000$word, tlg_top_1000)
write_lines(
  iliad_1000_not_in_tlg_1000,
  "data/homer_data/iliad_1000_not_in_tlg_1000.txt"
)

# Top 1000 lemmas vs. the lemmatised TLG top 1000 --------------------------
iliad_lem_top_1000 <-
  iliad_lem_df %>%
  count(word, sort = TRUE) %>%
  slice(1:1000)

tlg_lem_top_1000 <-
  read_lines("data/tlg_data/tlg_top_1000_lemmatised_corrected.txt")

iliad_lem_1000_not_in_tlg_lem_1000 <-
  setdiff(iliad_lem_top_1000$word, tlg_lem_top_1000)
write_lines(
  iliad_lem_1000_not_in_tlg_lem_1000,
  "data/homer_data/iliad_lem_1000_not_in_tlg_lem_1000.txt"
)

# Candidate selections -----------------------------------------------------
# The `*_selection.txt` files are only read here, never written by this
# script; presumably they are hand-curated from the lists saved above --
# TODO confirm.
iliad_candidates_100 <-
  read_lines("data/homer_data/iliad_100_not_in_tlg_100_selection.txt")
iliad_candidates_1000 <-
  read_lines("data/homer_data/iliad_1000_not_in_tlg_1000_selection.txt")

# Interactive check: top-100 selections missing from the top-1000 selection.
# NOTE(review): unlike the Odyssey section, the top-100 selection is not
# merged into the combined list below; that is only harmless if this
# difference is empty -- confirm.
setdiff(iliad_candidates_100, iliad_candidates_1000)

iliad_lem_candidates_1000 <-
  read_lines("data/homer_data/iliad_lem_1000_not_in_tlg_lem_1000_selection.txt")

# Interactive checks: entries found in only one of the two selections.
sort(setdiff(iliad_lem_candidates_1000, iliad_candidates_1000))
sort(setdiff(iliad_candidates_1000, iliad_lem_candidates_1000))

# Word-form and lemma selections combined, sorted.
iliad_all_candidates_1000 <-
  sort(union(iliad_candidates_1000, iliad_lem_candidates_1000))
write_lines(
  iliad_all_candidates_1000,
  "data/homer_data/iliad_all_candidates_1000.txt"
)

# Candidates not yet in the current Greek stopword list --------------------
current_greek <- read_lines("stopwords_greek.txt")

# Entries starting with `#` and empty entries are dropped. str_subset() keeps
# the result a character vector even when nothing is left, whereas
# str_extract_all() + unlist() could collapse to NULL.
iliad_candidates_not_in_current_greek <-
  setdiff(iliad_all_candidates_1000, current_greek) %>%
  sort() %>%
  str_subset("^[^#]")
write_lines(
  iliad_candidates_not_in_current_greek,
  "data/homer_data/iliad_candidates_not_in_current_greek.txt"
)
# Odyssey -----------------------------------------------------------------

# Raw text, one element per line of the input file. Anything enclosed in
# curly braces is blanked out (presumably editorial markup -- TODO confirm),
# runs of whitespace are collapsed and a single leading space is dropped.
odyssey <-
  read_lines("data/corpus_test/greek_odyssee.txt") %>%
  str_replace_all("\\{.+?\\}", " ") %>%
  str_replace_all("\\s+", " ") %>%
  str_replace_all("^\\s", "")

# One row per token. The tibble is built from `odyssey` only: the previous
# `odyssey %>% data_frame(text = odyssey)` passed the vector twice, which
# added a redundant `.` column holding the full line that unnest_tokens()
# then repeated for every token. tibble() replaces the deprecated
# data_frame().
odyssey_df <-
  tibble(text = odyssey) %>%
  unnest_tokens(word, text)

# Interactive check: word frequencies, most frequent first.
odyssey_df %>% count(word, sort = TRUE)

# Lemmatised text, tokenised the same way as the raw text.
odyssey_lem <-
  read_lines("data/corpus_test/greek_odyssee_perseus_lem.txt") %>%
  str_replace_all("\n+", " ")

odyssey_lem_df <-
  tibble(text = odyssey_lem) %>%
  unnest_tokens(word, text)

# Interactive check: lemma frequencies, most frequent first.
odyssey_lem_df %>% count(word, sort = TRUE)

# Top 100 word forms vs. the TLG top 100 -----------------------------------
odyssey_top_100 <-
  odyssey_df %>%
  count(word, sort = TRUE) %>%
  slice(1:100)

tlg_top_100 <- read_lines("data/tlg_data/tlg_top_100.txt")

# Frequent in the Odyssey but absent from the TLG list.
odyssey_100_not_in_tlg_100 <-
  setdiff(odyssey_top_100$word, tlg_top_100)
write_lines(
  odyssey_100_not_in_tlg_100,
  "data/homer_data/odyssey_100_not_in_tlg_100.txt"
)

# Top 1000 word forms vs. the TLG top 1000 ---------------------------------
odyssey_top_1000 <-
  odyssey_df %>%
  count(word, sort = TRUE) %>%
  slice(1:1000)
write_lines(
  odyssey_top_1000$word,
  "data/homer_data/odyssey_top_1000.txt"
)

tlg_top_1000 <- read_lines("data/tlg_data/tlg_top_1000.txt")

odyssey_1000_not_in_tlg_1000 <-
  setdiff(odyssey_top_1000$word, tlg_top_1000)
write_lines(
  odyssey_1000_not_in_tlg_1000,
  "data/homer_data/odyssey_1000_not_in_tlg_1000.txt"
)

# Top 1000 lemmas vs. the lemmatised TLG top 1000 --------------------------
odyssey_lem_top_1000 <-
  odyssey_lem_df %>%
  count(word, sort = TRUE) %>%
  slice(1:1000)

tlg_lem_top_1000 <-
  read_lines("data/tlg_data/tlg_top_1000_lemmatised_corrected.txt")

odyssey_lem_1000_not_in_tlg_lem_1000 <-
  setdiff(odyssey_lem_top_1000$word, tlg_lem_top_1000)
write_lines(
  odyssey_lem_1000_not_in_tlg_lem_1000,
  "data/homer_data/odyssey_lem_1000_not_in_tlg_lem_1000.txt"
)

# Candidate selections -----------------------------------------------------
# The `*_selection.txt` files are only read here, never written by this
# script; presumably they are hand-curated from the lists saved above --
# TODO confirm.
odyssey_candidates_100 <-
  read_lines("data/homer_data/odyssey_100_not_in_tlg_100_selection.txt")
odyssey_candidates_1000 <-
  read_lines("data/homer_data/odyssey_1000_not_in_tlg_1000_selection.txt")

# Interactive checks: entries found in only one of the two selections.
setdiff(odyssey_candidates_100, odyssey_candidates_1000)
setdiff(odyssey_candidates_1000, odyssey_candidates_100)

# Top-100 and top-1000 word-form selections combined (top-1000 order first).
odyssey_candidates_union_100_1000 <-
  union(odyssey_candidates_1000, odyssey_candidates_100)
write_lines(
  odyssey_candidates_union_100_1000,
  "data/homer_data/odyssey_candidates_union_100_1000.txt"
)

odyssey_lem_candidates_1000 <-
  read_lines("data/homer_data/odyssey_lem_1000_not_in_tlg_lem_1000_selection.txt")

# Lemma and word-form selections combined, sorted.
odyssey_all_candidates_1000 <-
  sort(union(
    odyssey_lem_candidates_1000,
    odyssey_candidates_union_100_1000
  ))
write_lines(
  odyssey_all_candidates_1000,
  "data/homer_data/odyssey_all_candidates_1000.txt"
)

# Candidates not yet in the current Greek stopword list --------------------
current_greek <- read_lines("stopwords_greek.txt")

# Entries starting with `#` and empty entries are dropped. str_subset() keeps
# the result a character vector even when nothing is left, whereas
# str_extract_all() + unlist() could collapse to NULL.
odyssey_candidates_not_in_current_greek <-
  setdiff(odyssey_all_candidates_1000, current_greek) %>%
  sort() %>%
  str_subset("^[^#]")
write_lines(
  odyssey_candidates_not_in_current_greek,
  "data/homer_data/odyssey_candidates_not_in_current_greek.txt"
)
# Homer : Iliad + Odyssey -------------------------------------------------

# Merge the two per-poem candidate vectors (duplicates removed by union()),
# sort the result and save it.
homer_all_candidates <-
  iliad_candidates_not_in_current_greek %>%
  union(odyssey_candidates_not_in_current_greek) %>%
  sort()

write_lines(
  homer_all_candidates,
  "data/homer_data/homer_all_candidates.txt"
)
# Summarise ---------------------------------------------------------------

# Number of lines in each output file, read back from disk. These are
# top-level expressions, so the counts are shown when run interactively.
"data/homer_data/homer_all_candidates.txt" %>%
  read_lines() %>%
  length()

"data/homer_data/odyssey_candidates_not_in_current_greek.txt" %>%
  read_lines() %>%
  length()

"data/homer_data/iliad_candidates_not_in_current_greek.txt" %>%
  read_lines() %>%
  length()