-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathstemmer_ua.py
70 lines (57 loc) · 3.18 KB
/
stemmer_ua.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
import re
class UAStemmer:
    """Rule-based suffix stripper that reduces one Ukrainian word to a stem.

    The word is split after its first vowel: the prefix up to and including
    that vowel is kept untouched, and the remainder (``RV``) has known
    endings removed in four ordered steps (see ``stem_word``).

    Usage: ``UAStemmer(word).stem_word()``.

    Attributes:
        word: the word supplied by the caller, unmodified.
        RV: working buffer holding the part of the word after the first
            vowel; it is overwritten by every call to ``s``.
    """

    # Suffix patterns are plain strings passed to ``re``; they are shared by
    # all instances instead of being rebuilt for every word.
    RVRE = r'[аеиіоуяюєї]'  # vowels
    REFLEXIVE = r'(с[иья])$'  # reflexive verb
    ADJECTIVE = r'(ими|ій|ий|а|е|ова|ове|ів|є|їй|єє|еє|я|ім|ем|им|ім|их|іх|ою|йми|іми|' \
                r'у|ю|ого|ому|ої)$'  # adjective
    PARTICIPLE = r'(ий|ого|ому|им|ім|а|ій|у|ою|ій|і|их|йми|их)$'  # participle
    VERB = r'(сь|ся|ив|ать|ять|у|ю|ав|али|учи|ячи|вши|ши|е|ме|ати|яти|є)$'  # verb
    NOUN = r'(а|ев|ов|е|ями|ами|еи|и|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я|і|' \
           r'ові|ї|ею|єю|ою|є|еві|ем|єм|ів|їв|ю)$'  # noun
    # NOTE(review): the last alternative, 'ывшись((?<=[ая])(в|вши|вшись))',
    # can never match: the look-behind demands 'а'/'я' immediately after
    # 'ывшись', where the preceding character is always 'ь'. The pattern is
    # kept byte-for-byte because regrouping it would let a bare 'в' after
    # 'а'/'я' match here before the VERB rule ('ав') is tried, changing the
    # stems of ordinary words. Confirm the intended grouping before touching.
    PERFECTIVE_GERUND = r'(ив|ивши|ившись|ыв|ывши|ывшись((?<=[ая])(в|вши|вшись)))$'
    DERIVATIONAL = r'[^аеиоуюяіїє][аеиоуюяіїє]+[^аеиоуюяіїє]+[аеиоуюяіїє].*(?<=о)сть?$'

    # Apostrophe code points deleted before stemming: ASCII ', U+2019 and
    # U+02BC. Handling only the ASCII form left the other two inside the
    # stem, so the same word stemmed differently depending on typography.
    _APOSTROPHES = str.maketrans('', '', "'\u2019\u02bc")

    def __init__(self, word):
        """Store ``word`` for later processing by ``stem_word``."""
        self.word = word  # word for processing
        # The area of the word after the first vowel. It stays empty when
        # the word has no vowels.
        self.RV = ''

    def s(self, s, reg, to):
        """Substitute ``reg`` with ``to`` in ``s`` and store the result in ``self.RV``.

        Args:
            s: text to process (callers pass the current ``self.RV``).
            reg: regular expression to look for.
            to: replacement text.

        Returns:
            True when the substitution changed the text, False otherwise.
        """
        self.RV = re.sub(reg, to, s)
        return s != self.RV

    def stem_word(self):
        """Return the stem of ``self.word``.

        The word is lower-cased and stripped of apostrophes first. A word
        without any vowel is returned as is (after that normalisation).
        Otherwise the part after the first vowel is run through steps 1-4
        below and re-attached to the untouched prefix.
        """
        word = self.word.lower().translate(self._APOSTROPHES)

        first_vowel = re.search(self.RVRE, word)
        if first_vowel is None:
            # Nothing to strip: there is no region after a first vowel.
            return word

        split_at = first_vowel.end()
        start = word[:split_at]
        self.RV = word[split_at:]

        # Step 1
        # Find the end of PERFECTIVE_GERUND. If it exists, delete it and complete this step.
        #
        # Otherwise, remove the REFLEXIVE ending (if it exists). Then, in the following order,
        # we try to remove the endings: ADJECTIVAL (ADJECTIVE | PARTICIPLE + ADJECTIVE) VERB, NOUN.
        # As soon as one of them is found, the step ends.
        if not self.s(self.RV, self.PERFECTIVE_GERUND, ''):
            self.s(self.RV, self.REFLEXIVE, '')
            if self.s(self.RV, self.ADJECTIVE, ''):
                self.s(self.RV, self.PARTICIPLE, '')
            elif not self.s(self.RV, self.VERB, ''):
                self.s(self.RV, self.NOUN, '')

        # Step 2
        # If the word ends in 'и', delete 'и'.
        self.s(self.RV, 'и$', '')

        # Step 3
        # If there is a DERIVATIONAL ending in RV, delete it.
        if re.search(self.DERIVATIONAL, self.RV):
            self.s(self.RV, 'ость$', '')

        # Step 4
        # One of three options is possible:
        # - If the word ends in 'ь', delete it.
        # - If the word ends in 'нн', delete the last letter.
        # - If the word ends in 'ейше', delete it.
        # NOTE(review): the two follow-up substitutions run only when a
        # trailing 'ь' was just removed, although the list above reads as
        # three independent alternatives. Kept as is so existing stems do not
        # shift - confirm the intended nesting.
        if self.s(self.RV, 'ь$', ''):
            self.s(self.RV, 'нн$', 'н')
            self.s(self.RV, 'ейше?$', '')

        return start + self.RV