-
Notifications
You must be signed in to change notification settings - Fork 0
/
unoptimized_wsd_v1.cpp
132 lines (106 loc) · 4.08 KB
/
unoptimized_wsd_v1.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
/*
* Simplified Word Sense Disambiguation (WSD) algorithm written by Ahmed Siddiqui and
* Jordan Kirchner, based on Michael Lesk's simplified algorithm
* (https://en.wikipedia.org/wiki/Lesk_algorithm#Simplified_Lesk_algorithm)
*/
/*
Algorithm:
function SIMPLIFIED LESK(word, sentence) returns best sense of word
    best-sense <- most frequent sense for word
    max-overlap <- 0
    context <- set of words in sentence
    for each sense in senses of word do
        signature <- set of words in the gloss and examples of sense
        overlap <- COMPUTEOVERLAP(signature, context)
        if overlap > max-overlap then
            max-overlap <- overlap
            best-sense <- sense
    end
    return (best-sense)
*/
#include <fstream>
#include <iostream>
#include <set>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>

#include <nlohmann/json.hpp>
#include <boost/algorithm/string.hpp>

#include "wsd.hpp"
using json = nlohmann::json;
using namespace std;
int compute_overlap(string sense, set<string> context) {
/*
In this function, we want to go tokenize the sense. After that, we want to compute the
*/
auto const cache_line_size = 64u;
auto const tile_size = cache_line_size / 8; // median word size is 4.7 characters, so we expect to easily fit 8 words into the cache.
int overlap = 0;
set<string> sense_tokens = tokenize_string(sense);
std::vector<string> vector_sense(sense_tokens.begin(), sense_tokens.end());
std::vector<string> vector_context(context.begin(), context.end());
auto const n = vector_sense.size();
auto const o = vector_context.size();
for(auto i = 0u; i < n; i++){
for (auto j = 0u; j < o; j++){
if (boost::iequals(vector_sense[i], vector_context[j]))
overlap++;
}
}
return overlap;
}
void get_all_senses(string word, vector<string> &all_senses) {
/*
This function will query dictionary.json and get the definition of the
word. It will then parse through the definition and get all the senses.
It will then store all those senes in the given vector: all_senses
*/
// read a JSON file
std::ifstream i("/Users/ahmedsiddiqui/Workspace/UVic/Winter_2021/CSC485C/wsd-485c/dictionary.json");
json j;
i >> j;
vector<string> definitions = j[word];
for (int i = 0; i < definitions.size(); i++) {
all_senses.push_back(definitions[i]);
}
}
set<string> get_word_set(string word, string sentence) {
set<string> words = tokenize_string(sentence);
words.erase(word);
return words;
}
/*
 * Splits `sentence` into its set of unique whitespace-separated tokens.
 *
 * Runs of spaces (and tabs/newlines) act as a single separator, so the result
 * never contains an empty token. Tokens keep their original case and any
 * attached punctuation.
 *
 * sentence: text to split.
 * returns:  the distinct tokens; empty when `sentence` is empty or all whitespace.
 */
std::set<std::string> tokenize_string(std::string sentence) {
    std::istringstream stream(sentence);
    std::set<std::string> words;
    std::string token;
    // operator>> skips whitespace between tokens. Splitting on a single ' '
    // would emit "" for every extra space, and "" would then match "" when
    // overlaps are counted.
    while (stream >> token) {
        words.insert(token);
    }
    return words;
}
string simplified_wsd(string word, string sentence) {
string best_sense;
int max_overlap = 0;
set<string> context = get_word_set(word, sentence);// This is the set of words in a sentence excluding the word itself.
vector<string> all_senses; // This is all the senses of the word.
get_all_senses(word, all_senses);
for (int i = 0; i < all_senses.size(); i++) {
int overlap = compute_overlap(all_senses[i], context);
if (overlap > max_overlap) {
max_overlap = overlap;
best_sense = all_senses[i];
}
}
return best_sense;
}
/*
 * Entry point: runs the disambiguator once on a fixed sample sentence.
 * The result is discarded; the disabled lines below are the demo output and
 * the timing harness kept for reference.
 */
int main()
{
    /*
    cout << "Find the best sense of the word 'stock' in the following sentence:\n\tI'm expecting to make a lot of money from the stocks I'm investing in using my bank account.\n";
    cout << "The best sense of the word stock in our example is:\n" << simplified_wsd("stock", "I'm expecting to make a lot of money from the stocks I'm investing in using my bank account.") << "\n";
    */
    const std::string target_word = "set";
    const std::string sample_sentence = "It was a great day of tennis. Game, set, match";

    // auto start = chrono::steady_clock::now();
    simplified_wsd(target_word, sample_sentence);
    // auto end = chrono::steady_clock::now();
    // auto diff = end - start;
    // cout << "Total time to run was: " << chrono::duration <double, milli> (diff).count() << " ms" << endl;
    return 0;
}