main_matching.py
import json
import os
from multiprocessing import Pool
from pathlib import Path
from typing import Iterator

import matching.utilities as utl
import matching.DocumentMatching as dm


def article_generator_parallel(matched_article_list: list[tuple[str, str]]) -> Iterator[tuple[str, str, str, str]]:
    """ Generator function that iteratively yields matched article pairs together with their raw texts.

    Args:
        matched_article_list (list[tuple[str, str]]): list of (simple, normal) file path pairs that is iterated through

    Yields:
        Iterator[tuple[str, str, str, str]]: path of the simple file, path of the normal file, raw simple article text,
        raw normal article text. Pairs whose texts are exact copies are skipped.
    """
for simple, normal in matched_article_list:
with open(simple, 'r') as fp:
simple_text = fp.read()
with open(normal, 'r') as fp:
normal_text = fp.read()
# don't process exact copies
if simple_text == normal_text:
continue
        yield simple, normal, simple_text, normal_text


def article_preprocess(simple_text: str, normal_text: str) -> tuple[str, str, list[str], list[str], list[str], list[str]]:
    """ Returns the preprocessed articles.

    Args:
        simple_text (str): original simple text
        normal_text (str): original normal text

    Returns:
        tuple[str, str, list[str], list[str], list[str], list[str]]: preprocessed original simple and normal texts,
        followed by the n-gram and embedding preprocessing variants of the simple article and then of the normal article
    """
simple_original = utl.get_original_text_preprocessed(simple_text)
normal_original = utl.get_original_text_preprocessed(normal_text)
simple_arts = []
normal_arts = []
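    # kwargs_gram and kwargs_embeddings are module-level settings defined in main();
    # their order here determines the order of the preprocessed variants in the return value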
preprocessing_options = [kwargs_gram, kwargs_embeddings]
for kwargs in preprocessing_options:
simple_arts.append(utl.preprocess(simple_text, **kwargs))
normal_arts.append(utl.preprocess(normal_text, **kwargs))
    return simple_original, normal_original, *simple_arts, *normal_arts


def parallel(simple_name: str, normal_name: str, simple_text: str, normal_text: str) -> dict[str, list[str]]:
    """ The actual matching calculation.

    This function is called by main() via multiprocessing.

    Args:
        simple_name (str): name of the simple file
        normal_name (str): name of the normal file
        simple_text (str): text of the simple file
        normal_text (str): text of the normal file

    Returns:
        dict[str, list[str]]: key is the simple filename, value is the list of filenames of the corresponding result files
    """
# get simple/normal text in original, n-gram and embedding form
simple_original, normal_original, simple_gram, simple_embedding, normal_gram, normal_embedding = article_preprocess(
simple_text, normal_text)
simple_file = simple_name.split('/')[-1]
normal_file = normal_name.split('/')[-1]
    # check if the file has already been matched completely
    if simple_file in header:
        finished = True
        for sim_measure in similarity_measures:
            for matching in doc_matchings:
                for sd_threshold in sd_thresholds:
                    # result file for this parameter combination
                    filename = utl.make_matching_path(
                        simple_file, normal_file, sim_measure, matching, sd_threshold)
                    if filename not in header[simple_file]:
                        finished = False
                        break
                if not finished:
                    break
            if not finished:
                break
        # it has already been matched completely
        if finished:
            # no updates need to be done
            return {}

    # create a new entry for the file that collects the result files written below
    header_extension = {simple_file: []}
# start the calculation
for sim_measure in similarity_measures:
if sim_measure == "n_gram":
simple_n_tf = utl.calculate_n_gram_tf(simple_gram, n)
normal_n_tf = utl.calculate_n_gram_tf(normal_gram, n)
sim_matrix = dm.calculate_similarity_matrix(simple_gram, normal_gram, sim_measure, n,
simple_n_tf, normal_n_tf, n_gram_idf)
elif sim_measure == "bag_of_words":
simple_word_tf = utl.calculate_word_tf(simple_gram)
normal_word_tf = utl.calculate_word_tf(normal_gram)
sim_matrix = dm.calculate_similarity_matrix(simple_gram, normal_gram, sim_measure, n,
simple_word_tf, normal_word_tf, word_idf)
else:
sim_matrix = dm.calculate_similarity_matrix(
simple_embedding, normal_embedding, sim_measure)
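        # sim_matrix now holds the pairwise similarities between the units (presumably sentences)
        # of the simple and the normal article under the chosen similarity measure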
for matching in doc_matchings:
for sd_threshold in sd_thresholds:
# get the filename
filename = utl.make_matching_path(
simple_file, normal_file, sim_measure, matching, sd_threshold)
if not os.path.exists(filename):
try:
# calculate the distance according to parameters
results = dm.match_documents(matching, simple_original, normal_original,
sim_matrix, sd_threshold=sd_threshold)
except ValueError as err:
print(
f"ValueError raised by {simple_file} - {normal_file}")
with open("error_log.txt", "a", encoding="utf-8") as fp:
fp.write(
f"simple_file:{simple_file} - normal_file:{normal_file}\n\tsim_measure:{sim_measure} - matching:{matching} - thresh:{sd_threshold}\n")
fp.write(f"{sim_matrix}")
fp.write("\n\n\n#####\n\n\n")
continue
# write the end result of the distance calculation
with open(filename, 'w') as fp:
json.dump(results, fp, ensure_ascii=False, indent=2)
# add the file to the header
header_extension[simple_file].append(filename)
    return header_extension


def main():
    """
    Calculates the matchings for all pairings of similarity measures and alignment methods.

    BEWARE! Even though this calculation is computed in parallel, it takes a lot of time.
    There are also no checkpoints.
    """
global similarity_measures, sd_thresholds, doc_matchings, header, n, word_idf, n_gram_idf, kwargs_gram, kwargs_embeddings
# setup lists for all settings
similarity_measures = ["n_gram", "bag_of_words",
"cosine", "average", "maximum", "max_matching", "CWASA"]
sd_thresholds = [0.0, 1.5]
doc_matchings = ["max", "max_increasing_subsequence"]
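    # every combination of similarity measure, matching strategy and threshold
    # is computed for every article pair further below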
header_file = "results/header_matching.json"
# preprocessing arguments for gram and embeddings
kwargs_gram = utl.make_preprocessing_dict(remove_punctuation=True)
kwargs_embeddings = utl.make_preprocessing_dict(
lowercase=False, remove_punctuation=True)
# output folder setup
if not os.path.exists("results/matched"):
os.makedirs("results/matched")
# check if some matchings have already been calculated
if not Path(header_file).exists():
header = dict()
else:
with open(header_file, 'r') as fp:
header = json.load(fp)
# n for n-gram
n = 4
print("Start working")
articles = utl.get_article_pairs()
unnested_articles = utl.get_unnested_articles(articles)
# concatenate all article names and hash it
idf_article_string = ''.join([art.split('/')[-1]
for art in sorted(list(unnested_articles))])
idf_article_hash = utl.get_hash(idf_article_string)
print("ARTICLE HASH", idf_article_hash)
# check for word idf (and calculate if necessary)
found = False
word_idf = dict()
if Path("results/word_idf.json").exists():
with open("results/word_idf.json", 'r') as fp:
hash, word_idf = json.load(fp)
if hash == idf_article_hash:
found = True
print("Word idf was already computed!")
if not found:
word_idf = utl.calculate_full_word_idf(
unnested_articles, **kwargs_gram)
print("Calculated new word idf")
with open("results/word_idf.json", 'w') as fp:
json.dump([idf_article_hash, word_idf], fp, ensure_ascii=False)
# check for n-gram idf (and calculate if necessary)
found = False
n_gram_idf = dict()
if Path(f"results/{n}_gram_idf.json").exists():
with open(f"results/{n}_gram_idf.json", 'r') as fp:
hash, n_gram_idf = json.load(fp)
if hash == idf_article_hash:
found = True
print("n_gram idf was already computed!")
if not found:
n_gram_idf = utl.calculate_full_n_gram_idf(
unnested_articles, n, **kwargs_gram)
print("Calculated new n gram idf")
with open(f"results/{n}_gram_idf.json", 'w') as fp:
json.dump([idf_article_hash, n_gram_idf], fp, ensure_ascii=False)
# start multi-processed matching calculation
with Pool() as p:
header_extensions = p.starmap(parallel, article_generator_parallel(
articles))
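    # each worker returns a dict that maps a simple filename to the result files it has written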
# merge all results, given a list of many headers
for ext in header_extensions:
for key in ext:
            if key in header:
# merge if the key already exists
header[key] = header[key] + ext[key]
else:
# create a new key
header[key] = ext[key]
# save header with results. This could be moved elsewhere for checkpointing
with open(header_file, 'w') as fp:
        json.dump(header, fp, ensure_ascii=False, indent=2)


if __name__ == "__main__":
    main()