# auto_correct.py
import time
import numpy as np
from hazm import *
import os
import pandas as pd
# Groups of Persian letters that close_pronunciation() treats as
# interchangeable; presumably letters that sound alike or are commonly
# confused when typing (grouping chosen by the original author).
pronunciations = [
    ['ب', 'پ', 'ت', 'د', 'ط'],
    ['گ', 'ق', 'غ', 'ف', 'ک'],
    ['ز', 'ذ', 'ض', 'ژ', 'ظ'],
    ['ث', 'س', 'ص'],
    ['ج', 'چ'],
    ['ح', 'ه'],
    ['ا', 'ع'],
]

# Largest edit distance at which correct() still looks for a replacement word.
MAX_DIFF = 1
def close_pronunciation(letter: str) -> list:
    """Return the pronunciation group that contains *letter*.

    The module-level ``pronunciations`` table is scanned in order and the
    first group holding ``letter`` is returned (the very same list object
    stored in the table, not a copy).

    Args:
        letter: The character to look up.

    Returns:
        The matching group, or a new empty list when no group contains
        ``letter``.
    """
    matching_groups = (group for group in pronunciations if letter in group)
    return next(matching_groups, [])
def lv(s, t):
    """Return the Levenshtein (edit) distance between *s* and *t*.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn ``s`` into ``t``.  It is
    computed with the classic dynamic-programming table of shape
    ``(len(s) + 1, len(t) + 1)``.

    Args:
        s: First sequence (normally a string).
        t: Second sequence (normally a string).

    Returns:
        The edit distance as a NumPy integer scalar (compares equal to the
        corresponding Python ``int``).
    """
    rows = len(s) + 1
    cols = len(t) + 1
    distance = np.zeros((rows, cols), dtype=int)

    # Base cases: turning a prefix into the empty sequence (or vice versa)
    # costs one edit per element.  The first column and first row are filled
    # independently so they are correct even when one of the inputs is empty
    # (a nested loop over both ranges would skip them in that case).
    distance[:, 0] = np.arange(rows)
    distance[0, :] = np.arange(cols)

    for col in range(1, cols):
        for row in range(1, rows):
            cost = 0 if s[row - 1] == t[col - 1] else 1
            distance[row][col] = min(
                distance[row - 1][col] + 1,         # deletion
                distance[row][col - 1] + 1,         # insertion
                distance[row - 1][col - 1] + cost,  # substitution or match
            )

    # The bottom-right cell always holds the distance between the full inputs.
    return distance[rows - 1][cols - 1]
def correct(word: str) -> str:
    """Return the corpus word that most plausibly corrects *word*.

    The vocabulary is read from ``new_argument_corpse.txt`` (one word per
    line, path relative to the current working directory).  Selection rules,
    in order:

    1. If ``word`` itself is in the vocabulary it is returned unchanged.
    2. For each edit distance from 1 up to ``MAX_DIFF``:
       * exactly one vocabulary word at that distance -> return it;
       * at most three words at that distance -> prefer a same-length word
         whose first differing letter belongs to the same pronunciation
         group as the letter typed (see ``close_pronunciation``); otherwise
         return the first word of a different length.
    3. If nothing qualifies, ``word`` is returned unchanged.

    Args:
        word: The token to correct.

    Returns:
        The chosen replacement, or ``word`` when no suitable candidate exists.

    Raises:
        OSError: If the corpus file cannot be opened.
    """
    # The corpus holds Persian text, so read it as UTF-8 explicitly instead of
    # relying on the platform's default encoding.
    with open('new_argument_corpse.txt', encoding='utf-8') as f:
        # Drop every blank entry (trailing newline, empty lines).  Filtering
        # never raises when the file has no blank line at all.
        vocabulary = [line for line in f.read().split('\n') if line]

    # Exact hit: no need to compute any edit distance.
    if word in vocabulary:
        return word

    # dict keeps corpus order (first occurrence wins for duplicates), which
    # determines the order in which tied candidates are considered below.
    distances = {candidate: lv(candidate, word) for candidate in vocabulary}

    for diff in range(1, MAX_DIFF + 1):
        close_words = [w for w, dist in distances.items() if dist == diff]
        if len(close_words) == 1:
            return close_words[0]
        if len(close_words) <= 3:
            for w in close_words:
                if len(w) != len(word):
                    continue
                # Same length and non-zero distance guarantee at least one
                # position where the two words differ.
                position, letter = next(
                    (idx, ch) for idx, ch in enumerate(w) if ch != word[idx]
                )
                if word[position] in close_pronunciation(letter):
                    return w
            for w in close_words:
                if len(w) != len(word):
                    return w
    return word
def load_lists():
    """Load the four reference word lists stored as CSV files beside this file.

    Each CSV is looked up in the directory containing this module and a
    single named column is taken from it.

    Returns:
        A 4-tuple ``(cities, important_words, events, countries)`` holding,
        respectively, the ``city-fa`` column of ``fa_cities_final2.csv``, the
        ``words`` column of ``important_words.csv``, the ``event`` column of
        ``find important events.csv`` and the ``country`` column of
        ``countries.csv``.
    """
    base_dir = os.path.dirname(os.path.abspath(__file__))

    def read_column(file_name, column):
        # Read one CSV from the module directory and pick a single column.
        return pd.read_csv(os.path.join(base_dir, file_name))[column]

    cities = read_column("fa_cities_final2.csv", 'city-fa')
    important_words = read_column("important_words.csv", 'words')
    events = read_column("find important events.csv", 'event')
    countries = read_column("countries.csv", 'country')
    return cities, important_words, events, countries
def auto_correct(sentence: str):
    """Spell-correct every word of *sentence* and return the corrected text.

    Punctuation and symbol characters (ASCII and Persian) are stripped, the
    remainder is tokenized with ``word_tokenize`` and each token is passed
    through ``correct``.  The token list, the corrected sentence and the
    elapsed time are printed as diagnostics.

    Args:
        sentence: The raw input sentence.

    Returns:
        The corrected words joined by single spaces (an empty string when the
        sentence contains no tokens).
    """
    start = time.time()

    # Remove all symbol characters in a single pass.
    symbols = "!\"#$%&()*+-./;<=>?@[\\]^_`{|}~\n،,؟؛"
    sentence = sentence.translate(str.maketrans('', '', symbols))

    words = word_tokenize(sentence)
    print(words)

    # Every token is corrected, regardless of its part of speech.
    new_sen = ' '.join(correct(w) for w in words)

    end = time.time()
    print(new_sen)
    print(f"Runtime of the correction is {end - start}")
    return new_sen
# Sample run on a Persian sentence; this executes whenever the file is run
# or imported.
# NOTE(review): consider moving this call under an
# `if __name__ == "__main__":` guard so that importing the module does not
# trigger a correction run.
auto_correct("اذان ژهر به در حال حاضر افق تران کی است؟")