from collections import OrderedDict

import matplotlib.pyplot as plt
import numpy as np
from allennlp.commands.elmo import ElmoEmbedder
from sklearn.decomposition import PCA


class Elmo:
    def __init__(self):
        # load the pretrained ELMo model (downloads the weights on first use)
        self.elmo = ElmoEmbedder()

    def get_elmo_vector(self, tokens, layer):
        # embed_sentence returns all three ELMo layers:
        # shape (3, n_tokens, embedding_dim)
        vectors = self.elmo.embed_sentence(tokens)
        # keep one vector per token from the requested layer
        X = np.array([vector for vector in vectors[layer]])
        return X
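
# A hedged usage sketch: with the default ElmoEmbedder weights the vectors
# are 1024-dimensional, so for a 5-token sentence one would expect
#     Elmo().get_elmo_vector("I went to the bank".split(), layer=1).shape
# to evaluate to (5, 1024).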


def dim_reduction(X, n):
    pca = PCA(n_components=n)
    print("size of X: {}".format(X.shape))
    results = pca.fit_transform(X)
    print("size of reduced X: {}".format(results.shape))
    for i, ratio in enumerate(pca.explained_variance_ratio_):
        print("Explained variance ratio of PC-{}: {}".format(i + 1, ratio))
    return results
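
# With n=2 the two printed ratios sum to the total variance retained by the
# 2-D plots below; note that scikit-learn's PCA centers X before projecting.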


def plot(word, token_list, reduced_X, file_name, title):
    fig, ax = plt.subplots()

    # plot the reduced ELMo vectors, one color per sentence
    i = 0
    for j, sentence in enumerate(token_list):
        color = pick_color(j)
        for w in sentence:
            # only plot the word of interest (and its simple inflections)
            if w.lower() in [word, word + 's', word + 'ing', word + 'ed']:
                ax.plot(reduced_X[i, 0], reduced_X[i, 1], color)
            i += 1

    # flatten the sentences into one token stream aligned with the rows of reduced_X
    flat_tokens = []
    for sentence in token_list:
        flat_tokens += sentence

    # annotate each plotted point with its full sentence
    k = 0
    for i, token in enumerate(flat_tokens):
        if token.lower() in [word, word + 's', word + 'ing', word + 'ed']:
            text = ' '.join(token_list[k])
            # bold the word of interest in the sentence
            text = text.replace(token, r"$\bf{" + token + "}$")
            ax.annotate(text, xy=(reduced_X[i, 0], reduced_X[i, 1]))
            k += 1

    ax.set_title(title)
    ax.set_xlabel("PCA 1")
    ax.set_ylabel("PCA 2")
    fig.savefig(file_name, bbox_inches="tight")
    print("{} saved\n".format(file_name))


def pick_color(i):
    # matplotlib format strings: circle markers in red, blue, yellow, green;
    # cyan for any sentence beyond the fourth
    colors = ['ro', 'bo', 'yo', 'go']
    return colors[i] if i < len(colors) else 'co'


if __name__ == "__main__":
    model = Elmo()

    banks = OrderedDict()
    banks[0] = "One can deposit money at the bank"
    banks[1] = "He had a nice walk along the river bank"
    banks[2] = "I withdrew cash from the bank"
    banks[3] = "The river bank was not clean"
    banks[4] = "My wife and I have a joint bank account"

    works = OrderedDict()
    works[0] = "I like this beautiful work by Andy Warhol"
    works[1] = "Employee works hard every day"
    works[2] = "My sister works at Starbucks"
    works[3] = "This amazing work was done in the early nineteenth century"
    works[4] = "Hundreds of people work in this building"

    plants = OrderedDict()
    plants[0] = "The gardener planted some trees in my yard"
    plants[1] = "I plan to plant a Joshua tree tomorrow"
    plants[2] = "My sister planted a seed and hopes it will grow to a tree"
    plants[3] = "This kind of plant only grows in the subtropical region"
    plants[4] = "Most of the plants will die without water"

    words = {
        "bank": banks,
        "work": works,
        "plant": plants
    }

    # contextual vectors from ELMo layers 1 and 2 (layer 0 is the
    # context-independent character-CNN layer, so it is skipped)
    for layer in [1, 2]:
        for word, sentences in words.items():
            print("visualizing word {} using ELMo layer {}".format(word, layer))
            X = np.concatenate([model.get_elmo_vector(tokens=sentences[idx].split(),
                                                      layer=layer)
                                for idx, _ in enumerate(sentences)], axis=0)

            # keep the first 2 principal components
            X_reduce = dim_reduction(X=X, n=2)

            token_list = []
            for _, sentence in sentences.items():
                token_list.append(sentence.split())

            file_name = "{}_elmo_layer_{}.png".format(word, layer)
            title = "Layer {} ELMo vectors of the word {}".format(layer, word)
            plot(word, token_list, X_reduce, file_name, title)
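
# To reproduce the figures (this assumes a pre-1.0 allennlp release, where
# ElmoEmbedder still ships, plus scikit-learn and matplotlib):
#     python elmo_vis.py
# writes six plots, e.g. bank_elmo_layer_1.png and plant_elmo_layer_2.png.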