-
Notifications
You must be signed in to change notification settings - Fork 0
/
optimized_wsd_v2.cpp
210 lines (165 loc) · 6.58 KB
/
optimized_wsd_v2.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
/*
* Simplified Word Sense Disambiguation algorithm written by Ahmed Siddiqui and
* Jordan Kirchner, based on Michael Lesk's simplified algorithm
* (https://en.wikipedia.org/wiki/Lesk_algorithm#Simplified_Lesk_algorithm)
*/
/*
Algorithm:
function SIMPLIFIED LESK(word,sentence) returns best sense of word
best-sense <- most frequent sense for word
max-overlap <- 0
context <- set of words in sentence
for each sense in senses of word do
signature <- set of words in the gloss and examples of sense
overlap <- COMPUTEOVERLAP (signature,context)
if overlap > max-overlap then
max-overlap <- overlap
best-sense <- sense
end return (best-sense)
*/
#include <omp.h>
#include <emmintrin.h>
#include <immintrin.h>
#include "nmmintrin.h"

#include <algorithm>
#include <cctype>
#include <chrono>
#include <climits>
#include <cstddef>
#include <cstdlib>
#include <fstream>
#include <functional>
#include <iostream>
#include <iterator>
#include <set>
#include <sstream>
#include <string>
#include <vector>

#include <nlohmann/json.hpp>
#include <boost/algorithm/string.hpp>

#include "wsd_v2.hpp"
using json = nlohmann::json;
using namespace std;
/*
 * Returns a copy of `str` with every punctuation character removed.
 *
 * A character counts as punctuation when std::ispunct says so. Each character
 * is converted to unsigned char before classification: handing a negative
 * char value (for example a byte of a UTF-8 sequence where char is signed)
 * to the <cctype> functions is undefined behaviour.
 *
 * @param str Text to filter.
 * @return The non-punctuation characters of `str`, in their original order.
 */
std::string remove_punctuation(std::string str) {
    std::string result;
    result.reserve(str.size());  // the output can never be longer than the input
    std::remove_copy_if(str.begin(), str.end(),
                        std::back_inserter(result),
                        [](unsigned char c) { return std::ispunct(c) != 0; });
    return result;
}
/*
 * Computes a case-insensitive integer hash of `str`.
 *
 * The string is lower-cased first and then hashed with
 * std::hash<std::string>, so "Set" and "set" produce the same value.
 * The size_t hash is deliberately narrowed to int to fit the int return
 * type; distinct words may therefore collide.
 *
 * Characters are lower-cased through unsigned char because passing a negative
 * char value to std::tolower is undefined behaviour.
 *
 * @param str Word to hash (taken by value; the copy is lower-cased in place).
 * @return The hash of the lower-cased word, truncated to int.
 */
int hash_string(std::string str) {
    std::transform(str.begin(), str.end(), str.begin(),
                   [](unsigned char c) { return static_cast<char>(std::tolower(c)); });
    return static_cast<int>(std::hash<std::string>{}(str));
}
int compute_overlap(string sense, set<int> context) {
/*
In this function, we want to go tokenize the sense. After that, we want to compute the
*/
int overlap = 0;
set<int> sense_tokens = tokenize_string(sense);
vector<int> vector_sense(sense_tokens.begin(), sense_tokens.end());
vector<int> vector_context(context.begin(), context.end());
auto const sense_len = vector_sense.size();
auto const context_len = vector_context.size();
// Add padding to both vectors for effective SIMD access
while (vector_sense.size() % 4 != 0)
vector_sense.push_back(1);
vector_context.push_back(2);
vector_context.push_back(2);
vector_context.push_back(2);
std::reverse(vector_context.begin(), vector_context.end());
vector_context.push_back(2);
vector_context.push_back(2);
vector_context.push_back(2);
for (int i = 0; i < sense_len; i += 4) {
__m128i simd_sense = _mm_loadu_si128((__m128i const*) &vector_sense[i]);
for (int j = 0; j < context_len - 3; j++) {
__m128i simd_context = _mm_loadu_si128((__m128i const*) &vector_context[i]);
__m128i equality_results = _mm_cmpeq_epi32(simd_sense, simd_context);
equality_results = _mm_hadd_epi32(equality_results, equality_results);
equality_results = _mm_hadd_epi32(equality_results, equality_results);
overlap += _mm_extract_epi32(equality_results, 0) * -1;
}
}
// for (int i = 0; i < sense_len; i++) {
// // cout << hash_word_dictionary[vector_sense[i]] << "\n";
// for (int j = 0; j < context_len; j++) {
// // cout << hash_word_dictionary[vector_context[j]] << "\n";
// if (vector_sense[i] == vector_context[j]) {
// overlap++;
// }
// }
// }
return overlap;
}
vector<string> get_all_senses(string word) {
/*
This function will query dictionary.json and get the definition of the
word. It will then parse through the definition and get all the senses.
It will then store all those senes in the given vector: all_senses
*/
// read a JSON file
string dictionary_name = "/Users/ahmedsiddiqui/Workspace/UVic/Winter_2021/CSC485C/wsd-485c/final_dictionary/";
dictionary_name += word[0];
if (word[1] != '\0')
dictionary_name += word[1];
dictionary_name += ".json";
std::ifstream i(dictionary_name);
json j;
i >> j;
return j[word];
}
/*
 * Builds the context of `word` within `sentence`: the hashes of all tokens in
 * the sentence, with the hash of `word` itself taken out.
 */
set<int> get_word_set(string word, string sentence) {
    const int target_hash = hash_string(word);
    set<int> context = tokenize_string(sentence);
    context.erase(target_hash);  // no-op when the word does not occur in the sentence
    return context;
}
set<int> tokenize_string(string sentence) {
stringstream stream(sentence);
set<int> words;
string tmp;
while (getline(stream, tmp, ' ')) {
words.insert(hash_string(remove_punctuation(tmp)));
}
return words;
}
string simplified_wsd(string word, string sentence) {
string best_sense;
int max_overlap = 0;
set<int> context = get_word_set(word, sentence); // This is the set of words in a sentence excluding the word itself hashed as ints
vector<string> all_senses = get_all_senses(word); // This is all the senses of the word.
vector<int> overlaps(all_senses.size());
// TIMING BEGIN
auto const start = chrono::steady_clock::now();
#pragma omp parallel for
for (int i = 0; i < all_senses.size(); i++)
overlaps[i] = compute_overlap(all_senses[i], context);
for (int i = 0; i < all_senses.size(); i++){
int overlap = overlaps[i];
if (overlap > max_overlap) {
max_overlap = overlap;
best_sense = all_senses[i];
// cout << "best_sense: " << best_sense << "\n";
}
}
auto const end = chrono::steady_clock::now();
cout << "Time to run compute overlap was: " << chrono::duration <double, milli> (end - start).count() << " ms" << endl;
// TIMING END
return best_sense;
}
/*
 * Entry point: disambiguates the word "set" in a fixed example sentence and
 * prints the chosen sense.
 *
 * Usage: <program> [num_threads]
 *   num_threads  Optional positive integer; number of OpenMP threads to use.
 *                When absent or invalid, OpenMP's default is kept.
 */
int main(int argc, char ** argv)
{
    /*
    cout << "Find the best sense of the word 'stock' in the following sentence:\n\tI'm expecting to make a lot of money from the stocks I'm investing in using my bank account.\n";
    cout << "The best sense of the word stock in our example is:\n" << simplified_wsd("stock", "I'm expecting to make a lot of money from the stocks I'm investing in using my bank account.") << "\n";
    */
    // auto start = chrono::steady_clock::now();
    if (argc >= 2)
    {
        // omp_set_num_threads() sets the global number of threads used by OpenMP.
        // If this is not set, it defaults to the number of cores on the machine.
        // The value comes from the first command line argument (`argv[ 1 ]`).
        // It is validated first: omp_set_num_threads() requires a positive
        // count, and non-numeric input would otherwise turn into 0.
        char* parse_end = nullptr;
        const long requested_threads = std::strtol(argv[1], &parse_end, 10);
        const bool parsed_digits = parse_end != argv[1];
        if (parsed_digits && requested_threads > 0 && requested_threads <= INT_MAX) {
            omp_set_num_threads(static_cast<int>(requested_threads));
        } else {
            std::cerr << "Ignoring invalid thread count '" << argv[1]
                      << "'; using the OpenMP default.\n";
        }
    }
    std::cout << simplified_wsd("set", "It was a great day of tennis. Game, set, match");
    // auto end = chrono::steady_clock::now();
    // auto diff = end - start;
    // cout << "Total time to run was: " << chrono::duration <double, milli> (diff).count() << " ms" << endl;
    return 0;
}