-
Notifications
You must be signed in to change notification settings - Fork 15
/
Copy pathmain_a3.py
191 lines (147 loc) · 6.59 KB
/
main_a3.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
"""main_a3.py
"""
import re
import os
import math
import nltk
from nltk.corpus import brown
from nltk.corpus import wordnet as wn
from nltk.corpus import PlaintextCorpusReader
from fsa import FSA
# NLTK stoplist with 3136 words (multilingual)
STOPLIST = set(nltk.corpus.stopwords.words())
# Vocabulary with 234,377 English words from NLTK
ENGLISH_VOCABULARY = set(w.lower() for w in nltk.corpus.words.words())
# The five categories from Brown that we are using
BROWN_CATEGORIES = ('adventure', 'fiction', 'government', 'humor', 'news')
# Global place to store Brown vocabularies so you calculate them only once
BROWN_VOCABULARIES = None
def is_content_word(word):
"""A content word is not on the stoplist and its first character is a letter."""
return word.lower() not in STOPLIST and word[0].isalpha()
class Text(object):
def __init__(self, path, name=None):
"""Takes a file path, which is assumed to point to a file or a directory,
extracts and stores the raw text and also stores an instance of nltk.text.Text."""
self.name = name
if os.path.isfile(path):
self.raw = open(path).read()
elif os.path.isdir(path):
corpus = PlaintextCorpusReader(path, '.*.mrg')
self.raw = corpus.raw()
self.text = nltk.text.Text( nltk.word_tokenize(self.raw))
def __len__(self):
return len(self.text)
def __getitem__(self, i):
return self.text[i]
def __str__(self):
name = '' if self.name is None else " '%s'" % self.name
return "<Text%s tokens=%s>" % (name, len(self))
def token_count(self):
"""Just return the length of the text."""
return len(self)
def type_count(self):
"""Returns the type count, with minimal normalization by lower casing."""
# an alternative would be to use the method nltk.text.Text.vocab()
return len(set([w.lower() for w in self.text]))
def sentence_count(self):
"""Return number of sentences, using the simplistic measure of counting period,
exclamation marks and question marks."""
# could also use nltk.sent.tokenize on self.raw
return len([t for t in self.text if t in '.!?'])
def most_frequent_content_words(self):
"""Return a list with the 25 most frequent content words and their
frequencies. The list has (word, frequency) pairs and is ordered
on the frequency."""
dist = nltk.FreqDist([w for w in self.text if is_content_word(w.lower())])
return dist.most_common(n=25)
def most_frequent_bigrams(self, n=25):
"""Return a list with the 25 most frequent bigrams that only contain
content words. The list returned should have pairs where the first
element in the pair is the bigram and the second the frequency, as in
((word1, word2), frequency), these should be ordered on frequency."""
filtered_bigrams = [b for b in list(nltk.bigrams(self.text))
if is_content_word(b[0]) and is_content_word(b[1])]
dist = nltk.FreqDist([b for b in filtered_bigrams])
return dist.most_common(n=n)
def concordance(self, word):
self.text.concordance(word)
## new methods for search part of assignment 3
def search(self, pattern):
return re.finditer(pattern, self.raw)
def find_sirs(self):
answer = set()
for match in self.search(r"\bSir \S+\b"):
answer.add(match.group())
return sorted(answer)
def find_brackets(self):
answer = set()
# use a non-greedy match on the characters between the brackets
for match in self.search(r"([\(\[\{]).+?([\)\]\}])"):
brackets = "%s%s" % (match.group(1), match.group(2))
# this tests for matching pairs
if brackets in ['[]', '{}', '()']:
answer.add(match.group())
return sorted(answer)
def find_roles(self):
answer = set()
for match in re.finditer(r"^([A-Z]{2,}[^\:]+): ", self.raw, re.MULTILINE):
answer.add(match.group(1))
return sorted(answer)
def find_repeated_words(self):
answer = set()
for match in self.search(r"(\w{3,}) \1 \1"):
answer.add(match.group())
return sorted(answer)
def apply_fsa(self, fsa):
i = 0
results = []
while i < len(self):
match = fsa.consume(self.text[i:])
if match:
results.append((i, match))
i += len(match)
else:
i += 1
return results
class Vocabulary():
"""Class to store all information on a vocabulary, where a vocabulary is created
from a text. The vocabulary includes the text, a frequency distribution over
that text, the vocabulary items themselves (as a set) and the sizes of the
vocabulary and the text. We do not store POS and gloss, for those we rely on
WordNet. The vocabulary is contrained to those words that occur in a
standard word list. Vocabulary items are not normalized, except for being in
lower case."""
def __init__(self, text):
self.text = text.text
# keeping the unfiltered list around for statistics
self.all_items = set([w.lower() for w in text])
self.items = self.all_items.intersection(ENGLISH_VOCABULARY)
# restricting the frequency dictionary to vocabulary items
self.fdist = nltk.FreqDist(t.lower() for t in text if t.lower() in self.items)
self.text_size = len(self.text)
self.vocab_size = len(self.items)
def __str__(self):
return "<Vocabulary size=%d text_size=%d>" % (self.vocab_size, self.text_size)
def __len__(self):
return self.vocab_size
def frequency(self, word):
return self.fdist[word]
def pos(self, word):
# do not volunteer the pos for words not in the vocabulary
if word not in self.items:
return None
synsets = wn.synsets(word)
# somewhat arbitrary choice to make unknown words nouns, returning None
# or 'UNKNOWN' would have been fine too.
return synsets[0].pos() if synsets else 'n'
def gloss(self, word):
# do not volunteer the gloss (definition) for words not in the vocabulary
if word not in self.items:
return None
synsets = wn.synsets(word)
# make a difference between None for words not in vocabulary and words
# in the vocabulary that do not have a gloss in WordNet
return synsets[0].definition() if synsets else 'NO DEFINITION'
def kwic(self, word):
self.text.concordance(word)