prep_comparison_data.py

"""Compare OCR results to ground truth text, and provide a list of best-match-words (using NLTK fuzzy match ratio) and
a rough baseline comparison score [0, 100] for each OCR platform on each image."""
import string
import sys
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
import pandas as pd
from fuzzywuzzy import fuzz
from utilities.data_loader import save_dataframe_as_csv
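
# NOTE: word_tokenize and the stopword list rely on downloadable NLTK data packages. If they
# are missing, a one-time download is needed (package names can vary by NLTK version):
#   import nltk
#   nltk.download('punkt')
#   nltk.download('stopwords')

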
def main(occurrence_filepath: str) -> pd.DataFrame:
""" Load, populate, and return a pd.DataFrame with OCR text and quality analysis. """
occurrences = pd.read_csv(occurrence_filepath, encoding='UTF-8')
add_ground_truth_text(occurrences)
separate_word_analysis = pd.DataFrame(columns=['barcode', 'word_number', 'ground_truth_token',
'aws_best_match_tokens', 'aws_best_match_score',
'gcv_best_match_tokens', 'gcv_best_match_score'])
print('Gathering OCR accuracy details for %i rows.' % occurrences.shape[0])
for idx, occur_row in occurrences.iterrows():
if pd.isna(occur_row['awsOcrText']):
aws_tokens = [''] # this means that AWS didn't find anything...
else:
aws_tokens = word_tokenize(occur_row['awsOcrText'])
if pd.isna(occur_row['gcvOcrText']):
gcv_tokens = [''] # this means that GCV didn't find anything...
else:
gcv_tokens = word_tokenize(occur_row['gcvOcrText'])
aws_match_results = list()
gcv_match_results = list()
ground_truth_filtered_words: list = preprocess_text(occur_row['labelText'])
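        # For each ground-truth word, find its closest OCR token from each service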
for i, search_word in enumerate(ground_truth_filtered_words):
aws_best_matches_list, aws_best_ratio = fuzzy_match_with_token_list(search_word, aws_tokens)
gcv_best_matches_list, gcv_best_ratio = fuzzy_match_with_token_list(search_word, gcv_tokens)
aws_match_results.append((aws_best_matches_list, aws_best_ratio))
gcv_match_results.append((gcv_best_matches_list, gcv_best_ratio))
swa_row = {'barcode': occur_row['catalogNumber'], 'word_number': i, 'ground_truth_token': search_word,
'aws_best_match_tokens': aws_best_matches_list, 'aws_best_match_score': aws_best_ratio,
'gcv_best_match_tokens': gcv_best_matches_list, 'gcv_best_match_score': gcv_best_ratio}
            # DataFrame.append was removed in pandas 2.0; concatenate a one-row frame instead
            separate_word_analysis = pd.concat([separate_word_analysis, pd.DataFrame([swa_row])],
                                               ignore_index=True)
occurrences.at[idx, 'awsMatchingScore'] = generate_score(occurrences.at[idx, 'catalogNumber'],
'aws', aws_match_results)
occurrences.at[idx, 'gcvMatchingScore'] = generate_score(occurrences.at[idx, 'catalogNumber'],
'gcv', gcv_match_results)
filename = save_dataframe_as_csv('test_results', 'occurrence_with_ocr_and_scores', occurrences)
print('%i row(s) processed, added, and saved to %s' % (occurrences.shape[0], filename))
filename = save_dataframe_as_csv('test_results', 'compare_word_by_word', separate_word_analysis)
print('Word-by-word analysis saved to %s' % filename)
return occurrences


def add_ground_truth_text(occur_data: pd.DataFrame) -> None:
    """Concatenate selected occurrence fields into a newline-separated ground truth string per row."""
    # Ground truth is built from the following extracted fields, '\n'-separated:
    # scientificName + scientificNameAuthorship, recordNumber **ADDED "No."**, verbatimEventDate, habitat,
    # stateProvince, county **ADDED "Co."**, locality, verbatimElevation
for idx, one_line in occur_data.iterrows():
ground_truth_string = ''
data_to_add = [one_line['scientificName'], one_line['scientificNameAuthorship'], one_line['recordNumber'],
one_line['verbatimEventDate'], one_line['habitat'],
one_line['stateProvince'], one_line['county'], one_line['locality'],
one_line['verbatimElevation']
]
for data in data_to_add:
if not pd.isna(data):
                ground_truth_string += str(data) + '\n'  # str() in case a field was parsed as a number
occur_data.at[idx, 'labelText'] = ground_truth_string.strip()
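
# Example (illustrative): a row with scientificName 'Carex aquatilis', recordNumber '1234',
# county 'Larimer', and all other fields empty yields labelText 'Carex aquatilis\n1234\nLarimer'.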


def preprocess_text(text: str) -> list:
    """Tokenize text, then drop English stopwords, punctuation tokens, and single-character tokens."""
word_tokens = word_tokenize(text)
stop_words = set(stopwords.words('english'))
    filtered_words = [w for w in word_tokens if w.lower() not in stop_words]
filtered_words = [word for word in filtered_words if word not in string.punctuation]
filtered_words = [word for word in filtered_words if len(word) >= 2]
return filtered_words
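
# Example (illustrative): preprocess_text('Flora of the United States') drops the stopwords
# 'of' and 'the' and returns ['Flora', 'United', 'States'].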


def fuzzy_match_with_token_list(search_word: str, token_list: list) -> tuple:
    """Return (list of best-matching tokens, best fuzz.ratio score) for search_word against token_list."""
    if not token_list:  # no OCR tokens to compare against
        return [], 0
fuzz_ratio = dict()
for word in token_list:
ratio = fuzz.ratio(search_word.lower(), word.lower())
if ratio in fuzz_ratio.keys():
fuzz_ratio[ratio].append(word)
else:
fuzz_ratio[ratio] = [word]
    # The highest ratio wins; tokens that tie at the best ratio are all returned
    best_ratio, best_matches_list = max(fuzz_ratio.items())
    return best_matches_list, best_ratio
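
# Example (illustrative): fuzzy_match_with_token_list('Quercus', ['Quercus', 'alba'])
# returns (['Quercus'], 100).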


def generate_score(barcode: str, service_name: str, match_results: list) -> float:
    """Score one image/service pair: perfect matches count 1.0, near matches (ratio 61-99) count 0.5,
    normalized by the number of ground-truth words."""
    perfect_matches = [word[0][0] for word in match_results if word[1] == 100]
    near_matches = [word[0][0] for word in match_results if 60 < word[1] < 100]
    num_of_words = len(match_results)
    if num_of_words == 0:  # guard against an empty ground truth, which would divide by zero
        return 0.0
    accuracy_score = (len(perfect_matches) + len(near_matches) / 2) / num_of_words
    # print('Score for %s on service %s: perfect = %i, near match = %i, score = %.2f%%' %
    #       (barcode, service_name, len(perfect_matches), len(near_matches), accuracy_score * 100))
    return accuracy_score
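
# Example (illustrative): three ground-truth words matched at ratios 100, 80, and 40 score
# (1 + 0.5) / 3 ≈ 0.5: full credit for the perfect match, half credit for the near match.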


if __name__ == '__main__':
assert len(sys.argv) == 2, 'Provide 1 argument: filepath for the occurrence_with_ocr file.'
occur = sys.argv[1]
main(occur)
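
# Example invocation (hypothetical filepath):
#   python prep_comparison_data.py test_results/occurrence_with_ocr.csv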