#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import json
import logging
from collections import defaultdict, OrderedDict
from functools import reduce

import click
from numpy import average
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel

from strephit.commons import parallel
from strephit.commons.io import load_corpus, load_scraped_items

logger = logging.getLogger(__name__)

VERBAL_PREFIXES = {
    'en': 'V',
    'it': 'VER',
}


def get_similarity_scores(verb_token, vectorizer, tf_idf_matrix):
    """ Compute the cosine similarity scores of a given verb token against the input corpus TF-IDF matrix.

        :param str verb_token: Surface form of a verb, e.g., *born*
        :param sklearn.feature_extraction.text.TfidfVectorizer vectorizer: Vectorizer
         used to transform verbs into vectors
        :param tf_idf_matrix: TF-IDF matrix of the corpus, as produced by the vectorizer
        :return: cosine similarity scores against each corpus document
        :rtype: ndarray
    """
    verb_token_vector = vectorizer.transform([verb_token])
    # Here the linear kernel is the same as the cosine similarity, but faster
    # cf. http://scikit-learn.org/stable/modules/metrics.html#cosine-similarity
    scores = linear_kernel(verb_token_vector, tf_idf_matrix)
    logger.debug("Corpus-wide TF-IDF scores for '%s': %s", verb_token, scores)
    logger.debug("Average TF-IDF score for '%s': %f", verb_token, average(scores))
    return scores
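

# Why linear_kernel stands in for cosine similarity here: TfidfVectorizer
# L2-normalizes its rows by default (norm='l2'), so the plain dot product of
# two rows already equals their cosine. A minimal self-contained sketch with
# made-up toy documents, not part of the pipeline:
#
#   >>> import numpy as np
#   >>> from sklearn.feature_extraction.text import TfidfVectorizer
#   >>> from sklearn.metrics.pairwise import linear_kernel, cosine_similarity
#   >>> docs = ['the cat sat', 'the dog sat', 'a bird flew']
#   >>> vec = TfidfVectorizer()
#   >>> matrix = vec.fit_transform(docs)
#   >>> query = vec.transform(['cat'])
#   >>> np.allclose(linear_kernel(query, matrix), cosine_similarity(query, matrix))
#   True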


def produce_lemma_tokens(pos_tagged_path, pos_tag_key, language):
    """ Extracts a map from each verb lemma to all its tokens

        :param str pos_tagged_path: path of the POS-tagged corpus
        :param str pos_tag_key: key holding the POS tag data in each item
        :param str language: language of the corpus
        :return: mapping from lemma to tokens
        :rtype: dict
    """
    corpus = load_scraped_items(pos_tagged_path)
    lemma_tokens = defaultdict(set)
    for item in corpus:
        for token, pos, lemma in item.get(pos_tag_key, []):
            # Only keep verbs, based on the language-specific POS tag prefix
            if pos.startswith(VERBAL_PREFIXES[language]):
                lemma_tokens[lemma.lower()].add(token.lower())
    return lemma_tokens
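

# The resulting map groups the surface forms of each verb under its lemma,
# e.g. (made-up sample) {'bear': {'born', 'bears'}, 'die': {'died', 'dies'}}.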


def compute_tf_idf_matrix(corpus_path, document_key):
    """ Computes the TF-IDF matrix of the corpus

        :param str corpus_path: path of the corpus
        :param str document_key: key holding the textual content in each corpus item
        :return: a vectorizer and the computed matrix
        :rtype: tuple
    """
    corpus = load_corpus(corpus_path, document_key, text_only=True)
    vectorizer = TfidfVectorizer()
    return vectorizer, vectorizer.fit_transform(corpus)
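

# The returned matrix is sparse, with one row per corpus document and one
# column per vocabulary term, e.g. (made-up sizes) 10000 documents over a
# 50000-term vocabulary yield a 10000 x 50000 matrix.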


class TFIDFRanking:
    """ Computes TF-IDF based rankings.

        The first ranking is based on the average TF-IDF score of each lemma
        over the whole corpus; the second one is based on the average standard
        deviation of the TF-IDF scores of each lemma over the whole corpus.
    """

    def __init__(self, vectorizer, verbs, tfidf_matrix):
        self.vectorizer = vectorizer
        self.verbs = verbs
        self.tfidf_matrix = tfidf_matrix

    def score_lemma(self, lemma):
        """ Computes the TF-IDF based scores of a single lemma

            :param str lemma: The lemma to score
            :return: tuple with lemma, average TF-IDF, average of TF-IDF standard deviations
            :rtype: tuple of (str, float, float)
        """
        tf_idfs, st_devs = [], []
        for token in self.verbs[lemma]:
            scores = get_similarity_scores(token, self.vectorizer, self.tfidf_matrix)
            # Drop zero scores, so the average only reflects documents
            # where the token actually occurs
            tf_idfs += filter(None, scores.flatten().tolist())
            st_devs.append(scores.std())
        return lemma, average(tf_idfs), average(st_devs)

    def find_ranking(self, processes=0):
        """ Ranks the verbs

            :param int processes: How many processes to use for parallel ranking
            :return: tuple with the rankings ordered by average TF-IDF and by
             average standard deviation
            :rtype: tuple of (OrderedDict, OrderedDict)
        """
        tfidf_ranking = {}
        stdev_ranking = {}
        for lemma, tfidf, stdev in parallel.map(self.score_lemma, self.verbs, processes):
            tfidf_ranking[lemma] = tfidf
            stdev_ranking[lemma] = stdev
        return (OrderedDict(sorted(tfidf_ranking.items(), key=lambda x: x[1], reverse=True)),
                OrderedDict(sorted(stdev_ranking.items(), key=lambda x: x[1], reverse=True)))
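

# A minimal usage sketch of the TF-IDF machinery above, assuming a
# hypothetical POS-tagged corpus at 'corpus.jsonlines' whose items carry the
# text under 'bio' and the POS tags under the default 'pos_tag' key:
#
#   >>> verbs = produce_lemma_tokens('corpus.jsonlines', 'pos_tag', 'en')
#   >>> vectorizer, matrix = compute_tf_idf_matrix('corpus.jsonlines', 'bio')
#   >>> ranking = TFIDFRanking(vectorizer, verbs, matrix)
#   >>> tfidf_ranking, stdev_ranking = ranking.find_ranking(processes=2)
#   >>> next(iter(tfidf_ranking))  # lemma with the highest average TF-IDF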


class PopularityRanking:
    """ Ranking based on the popularity of each verb: simply counts the
        frequency of each lemma over the whole corpus
    """

    def __init__(self, corpus_path, pos_tag_key):
        # Default to an empty list, so items without POS tag data are skipped
        self.tags = self._flatten(item.get(pos_tag_key, []) for item in load_scraped_items(corpus_path))

    @staticmethod
    def _flatten(iterable):
        for each in iterable:
            for x in each:
                yield x

    @staticmethod
    def _bulkenize(iterable, bulk_size):
        """ Groups the given iterable into lists of `bulk_size` items each """
        acc = []
        for each in iterable:
            acc.append(each)
            if len(acc) == bulk_size:
                yield acc
                acc = []
        if acc:
            yield acc
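
    # For instance, _bulkenize(range(5), 2) yields [0, 1], then [2, 3],
    # then the leftover [4].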

    @staticmethod
    def score_from_tokens(tokens):
        scores = defaultdict(int)
        for token, pos, lemma in tokens:
            # N.B. unlike produce_lemma_tokens, this hardcodes the 'V' prefix,
            # which covers both the English ('V') and Italian ('VER') verbal tags
            if pos.startswith('V'):
                scores[lemma.lower()] += 1
        return scores

    def find_ranking(self, processes=0, bulk_size=10000, normalize=True):
        ranking = defaultdict(int)
        for score in parallel.map(self.score_from_tokens,
                                  self._bulkenize(self.tags, bulk_size),
                                  processes):
            for k, v in score.items():
                ranking[k] += v

        ranking = OrderedDict(sorted(ranking.items(), key=lambda x: x[1], reverse=True))

        if normalize:
            # The ranking is sorted in descending order, so the first value is the maximum
            max_score = float(ranking[next(iter(ranking))])
            for lemma, score in ranking.items():
                ranking[lemma] = score / max_score
        return ranking
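

# For instance (made-up counts), raw frequencies {'be': 1000, 'bear': 200}
# normalize to {'be': 1.0, 'bear': 0.2} once divided by the top score.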


def harmonic_ranking(*rankings):
    """ Combines individual rankings through a smoothed, harmonic-mean-like
        formula to obtain a final ranking

        :param rankings: individual ranking dictionaries
        :return: the new, combined ranking
    """
    def product(x, y):
        return x * y

    def plus(x, y):
        return x + y

    def get(k):
        # N.B. assumes every lemma appears in all the rankings
        return (r[k] for r in rankings)

    lemmas = reduce(lambda x, y: x.union(y), (set(r) for r in rankings))
    # n * (x1 * ... * xn) / (1 + x1 + ... + xn): like the harmonic mean,
    # it rewards lemmas that score well in every ranking, with a +1
    # smoothing term at the denominator
    return OrderedDict(sorted(
        [(l, len(rankings) * reduce(product, get(l)) / (1 + reduce(plus, get(l)))) for l in lemmas],
        key=lambda x: x[1],
        reverse=True
    ))
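

# A worked example of the combination formula with two made-up rankings:
# a lemma scoring x = 0.5 and y = 0.25 gets 2 * (0.5 * 0.25) / (1 + 0.5 + 0.25)
# = 0.25 / 1.75 ~= 0.143, while a lemma scoring 0 in any ranking is pushed
# to 0 regardless of its other scores.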


@click.command()
@click.argument('pos_tagged', type=click.Path(exists=True, dir_okay=False))
@click.argument('document_key')
@click.argument('language')
@click.option('--pos-tag-key', default='pos_tag')
@click.option('--dump-verbs', type=click.File('w'), default='output/verbs.json')
@click.option('--dump-tf-idf', type=click.File('w'), default='output/tf_idf_ranking.json')
@click.option('--dump-stdev', type=click.File('w'), default='output/stdev_ranking.json')
@click.option('--dump-popularity', type=click.File('w'), default='output/popularity_ranking.json')
@click.option('--dump-final', type=click.File('w'), default='output/verb_ranking.json')
@click.option('--processes', '-p', default=0)
def main(pos_tagged, document_key, pos_tag_key, language, dump_verbs, dump_tf_idf,
         dump_stdev, dump_popularity, dump_final, processes):
    """ Computes the three verb rankings: average TF-IDF, average of the
        TF-IDF standard deviation, and popularity.
    """
    logger.info('Computing lemma-to-token map and TF-IDF matrix ...')
    # Run the two independent preprocessing steps in parallel
    lemma_tokens, (vectorizer, tf_idf_matrix) = parallel.execute(
        2,
        produce_lemma_tokens, (pos_tagged, pos_tag_key, language),
        compute_tf_idf_matrix, (pos_tagged, document_key)
    )

    logger.info('Scoring verbs by popularity ...')
    pop_ranking = PopularityRanking(pos_tagged, pos_tag_key).find_ranking(processes)

    logger.info('Scoring verbs by TF-IDF based metrics (average and standard deviation) ...')
    tfidf_ranking, stdev_ranking = TFIDFRanking(vectorizer, lemma_tokens, tf_idf_matrix).find_ranking(processes)

    logger.info('Producing combined final ranking ...')
    final_ranking = harmonic_ranking(pop_ranking, tfidf_ranking, stdev_ranking)

    json.dump(tfidf_ranking, dump_tf_idf, indent=2)
    json.dump(stdev_ranking, dump_stdev, indent=2)
    json.dump(pop_ranking, dump_popularity, indent=2)
    json.dump(final_ranking, dump_final, indent=2)
    logger.info('Dumped all the rankings to %s',
                [dump_tf_idf.name, dump_stdev.name, dump_popularity.name, dump_final.name])

    # Sets are not JSON-serializable, hence the list conversion
    json.dump(lemma_tokens, dump_verbs, default=list, indent=2)
    logger.info("Dumped lemma-to-token map to '%s'", dump_verbs.name)


if __name__ == '__main__':
    main()