-
Notifications
You must be signed in to change notification settings - Fork 4
/
Copy pathvalidation.py
148 lines (112 loc) · 5.14 KB
/
validation.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
#!/usr/bin/env python
# -*- coding: utf-8 -*-
#
# (C) 2021 Frederico Oliveira fred.santos.oliveira(at)gmail.com
#
#
import argparse
from os import makedirs
from os.path import join, exists, dirname
from textdistance import levenshtein
from tqdm import tqdm
def remove_punctuations(sentence):
    """
    Removes punctuations and unwanted characters from a sentence.

    Parameters:
    sentence (str): Text to be cleaned.

    Returns:
    str: The sentence without any of the punctuation characters listed below
    and without leading/trailing whitespace.
    """
    # The backslash is escaped explicitly: the bare "\," form is an invalid
    # escape sequence that newer Python versions warn about.
    punctuations = '''—!()-[]{};:'"\\,<>./?@#$%^&*_~'''
    # A deletion table removes every listed character in a single pass instead
    # of rebuilding the string one character at a time.
    deletion_table = str.maketrans('', '', punctuations)
    return sentence.translate(deletion_table).strip()
def clear_sentences(sentence):
    """
    Normalizes a sentence: lowercases it and then strips the punctuation
    handled by remove_punctuations.
    """
    return remove_punctuations(sentence.lower())
def _read_lines(filepath):
    """
    Returns the lines of the given file as a list, or None if it cannot be opened/read.
    """
    try:
        with open(filepath) as handle:
            return handle.readlines()
    except KeyboardInterrupt:
        print("KeyboardInterrupt detected!")
        exit()
    except IOError:
        print("Error: File {} does not appear to exist.".format(filepath))
        return None


def create_validation_file(input_file1, input_file2, prefix_filepath, output_file):
    """
    Given two files containing different transcriptions of audio files, this function calculates the similarity
    (normalized levenshtein similarity) between the sentences line by line, saving the result in a third file.

    Parameters:
    input_file1 (str): First filepath. Each line must contain exactly one "|" separator: "filename|text"
    input_file2 (str): Second filepath. Each line must contain exactly one "|" separator: "filename|text"
    prefix_filepath (str): Prefix joined to the filename (taken from the first file) within the output file.
    output_file (str): Output filepath. Its parent folder is created when missing. The file starts with the
    header "filename|subtitle|transcript|similarity", followed by one
    "prefix_filepath/filename|text1|text2|similarity" line per input line.

    Returns:
    bool: True when the output file was written; False when an input file could not be read, the inputs have
    a different number of lines, or the output file could not be created.

    Raises:
    ValueError: If an input line does not split into exactly two fields on "|".
    """
    # Loads the contents of both input files. An empty file yields an empty
    # list, so failure is detected with "is None" rather than truthiness.
    content_file1 = _read_lines(input_file1)
    if content_file1 is None:
        return False
    content_file2 = _read_lines(input_file2)
    if content_file2 is None:
        return False

    # Both files must be the same length, otherwise lines cannot be paired.
    if len(content_file1) != len(content_file2):
        print("Error: File {} has {} lines but File {} has {} lines.".format(
            input_file1, len(content_file1), input_file2, len(content_file2)))
        return False

    # Creates the output folder when needed. dirname() is empty for a bare
    # filename, in which case there is nothing to create.
    output_folderpath = dirname(output_file)
    if output_folderpath and not exists(output_folderpath):
        makedirs(output_folderpath)

    # Only the open() call is guarded, so a failure here returns False
    # without touching a file handle that was never created.
    try:
        o_file = open(output_file, 'w')
    except KeyboardInterrupt:
        print("KeyboardInterrupt detected!")
        exit()
    except IOError:
        print("Error: creating File {} problem.".format(output_file))
        return False

    separator = '|'
    # The context manager closes the output file even if a line is malformed.
    with o_file:
        header = separator.join(['filename', 'subtitle', 'transcript', 'similarity'])
        o_file.write(header + '\n')
        # Iterate over the two files content simultaneously to calculate the similarity between the sentences.
        for line1, line2 in tqdm(zip(content_file1, content_file2), total=len(content_file1)):
            file1, text1 = line1.split(separator)
            # The filename of the second file is not used; the split still validates the line format.
            _, text2 = line2.split(separator)
            # Clears sentences by removing unwanted characters.
            clean_text1 = clear_sentences(text1)
            clean_text2 = clear_sentences(text2)
            filepath = join(prefix_filepath, file1)
            # Calculates the normalized similarity (0-1) between the two cleaned sentences.
            similarity = levenshtein.normalized_similarity(clean_text1, clean_text2)
            # The original (uncleaned) texts are written, only stripped of surrounding whitespace.
            line = separator.join([filepath, text1.strip(), text2.strip(), str(similarity)])
            o_file.write(line + '\n')
    return True
def main():
    """
    Command-line entry point: parses the arguments, resolves the input and
    output paths relative to --base_dir and calls create_validation_file.
    """
    # (flag, keyword arguments) pairs, registered in this order.
    option_specs = (
        ('--base_dir', {'default': './'}),
        ('--input_file1', {'default': 'metadata1.csv', 'help': 'Input first filename'}),
        ('--input_file2', {'default': 'metadata2.csv', 'help': 'Input second filename'}),
        ('--prefix', {'default': '', 'help': 'Prefix to filename on metadata output file.'}),
        ('--output_dir', {'default': 'output', 'help': 'Directory to save distances'}),
        ('--output_file', {'default': 'validation.csv', 'help': 'Output file with the template: "filename, text1, text2, similarity"'}),
    )
    cli = argparse.ArgumentParser()
    for flag, settings in option_specs:
        cli.add_argument(flag, **settings)
    options = cli.parse_args()

    base_dir = options.base_dir
    create_validation_file(
        join(base_dir, options.input_file1),
        join(base_dir, options.input_file2),
        options.prefix,
        join(base_dir, options.output_dir, options.output_file),
    )
# Run the command-line entry point only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()