-
Notifications
You must be signed in to change notification settings - Fork 0
/
013.chooseana.py
executable file
·88 lines (71 loc) · 3.25 KB
/
013.chooseana.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
#!/usr/bin/env python
# Input: .stem file (tsv: surface form, lemma, pos-tag, morphanas delimited by "||")
# Output: .stem file with only 1 ana in the last column (see heuristics below) and the lemma column is the lemma from the chosen ana
import sys
def get_lemma(ana, surf):
    """Extract the lemma of one morphological analysis string.

    A plain analysis is expected to look like ``lemma/morphinfo``; the text
    before the first '/' is returned. A compound analysis has its members
    joined by '+', and the lemma is assembled by concatenating the part
    before the first '/' of every member.

    The surface form ``surf`` is returned as a fallback whenever no lemma
    can be read off: the analysis has no '/', its lemma part is empty, or
    every member of a compound contributes an empty string.
    """
    if '+' not in ana:
        head, slash, _ = ana.partition('/')
        # No '/' at all, or nothing in front of it: not lemma/morphinfo.
        if not slash or not head:
            return surf
        return head

    # Compound: glue the members' lemma parts together.
    pieces = []
    for member in ana.split('+'):
        pieces.append(member.partition('/')[0])
    assembled = ''.join(pieces)
    return assembled if assembled else surf
def get_minimal_partition(l, k):
    """Return the elements of ``l`` whose key ``k(x)`` is minimal.

    ``l`` is any iterable and ``k`` a one-argument key function whose results
    can be compared with each other. The result is a new list holding every
    element that shares the smallest key value, in the order in which the
    elements appear in ``l``. An empty input yields an empty list.
    """
    # Evaluate the key once per element and keep it next to the element.
    scored = [(k(x), x) for x in l]
    if not scored:
        # Nothing to choose from; avoid indexing into an empty sequence.
        return []
    lowest = min(score for score, _ in scored)
    return [x for score, x in scored if score == lowest]
def _unique(items):
    """Return the elements of ``items`` without duplicates, in first-seen order."""
    seen = set()
    kept = []
    for item in items:
        if item not in seen:
            seen.add(item)
            kept.append(item)
    return kept


def _choose_ana(surf, pos, anas):
    """Pick a single analysis for a token by applying a cascade of filters.

    ``surf`` is the surface form, ``pos`` the PoS-tag and ``anas`` the list of
    candidate analyses. After every step, if exactly one candidate is left it
    is returned immediately. Steps, in order:

    0. drop duplicate analyses (keeping input order, so ties are broken the
       same way on every run);
    1. keep analyses ending with '/' + pos; if none, keep those that merely
       contain pos; if still none, keep all of them;
    2. keep analyses with the fewest '+' (compounding operations);
    3. keep analyses with the fewest '[' (derivation operations);
    4. keep analyses with the longest lemma (text before the first '/');
    5. take the first remaining analysis whose lemma starts with the same
       character as the surface form, or simply the first one if none does.
    """
    candidates = _unique(anas)
    if len(candidates) == 1:
        return candidates[0]

    # 1st filter: PoS-tag agreement, strict first, then relaxed, then give up.
    narrowed = [x for x in candidates if x.endswith('/' + pos)]
    if not narrowed:
        narrowed = [x for x in candidates if pos in x]
    if not narrowed:
        narrowed = candidates

    # 2nd-4th filters: each keeps only the candidates minimising one cost.
    # The lemma length is negated so that the longest lemma is the minimum.
    costs = (
        lambda x: x.count('+'),
        lambda x: x.count('['),
        lambda x: -len(x.split('/')[0]),
    )
    for cost in costs:
        if len(narrowed) == 1:
            return narrowed[0]
        narrowed = get_minimal_partition(narrowed, cost)
    if len(narrowed) == 1:
        return narrowed[0]

    # 5th filter: prefer a lemma whose first character matches the surface
    # form's. Slicing with [:1] keeps this safe for empty lemmas/surfaces.
    for ana in narrowed:
        if ana.split('/')[0][:1] == surf[:1]:
            return ana
    return narrowed[0]


if len(sys.argv) != 2:
    sys.exit('Missing input file name')

# The context manager guarantees the input file is closed on every exit path.
with open(sys.argv[1]) as stem_file:
    for line in stem_file:
        line = line.rstrip()
        # Empty lines and '#' comment lines are passed through unchanged.
        if not line or line.startswith('#'):
            print(line)
            continue
        t = line.split('\t')
        if len(t) != 4:
            # Processing stops at the first malformed line.
            sys.stderr.write('Incorrect line format: ' + line + '\n')
            break
        surf = t[0]
        pos = t[2]
        chosen = _choose_ana(surf, pos, t[3].split('||'))
        # The lemma column is recomputed from the chosen analysis.
        print('\t'.join([surf, get_lemma(chosen, surf), pos, chosen]))