-
Notifications
You must be signed in to change notification settings - Fork 1
/
subword.py
270 lines (204 loc) · 8.28 KB
/
subword.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
'''
subword.py
~~~~~~~~~
This script converts infrequent words into subwords in a given corpus in English,
following the method introduced in Mikolov et al. (2012).
Usage: $ python subword.py <corpus-filename> <W-parameter> <S-parameter>
Yejin Cho ([email protected])
Reference:
Mikolov, et al. (2012), Subword Language Modeling with Neural Networks
Last updated: 2017-05-18
'''
import time
import numpy as np
import re
import sys
def readtext(fname):
    """Read the file `fname` and return its raw lines (newlines kept) as a list."""
    print('Currently reading text from file...')
    with open(fname) as fh:
        lines = fh.readlines()
    print('Lines in text file is read as a list variable')
    return lines
def readtext2word(fname):
    """Read `fname` and return all whitespace-separated tokens, lowercased.

    Tokens are accumulated in file order; a progress line reports how many
    input lines were read and how long it took.
    """
    start_time = time.time()
    wordlist = []
    cnt = 0
    with open(fname) as f:
        for line in f:
            cnt += 1
            # extend with the split tokens directly instead of an inner
            # per-word append loop (same result, idiomatic and faster)
            wordlist.extend(line.lower().split())
    print("Read %d lines: resulting dictionary is %d elements long. Took %f seconds" % (cnt, len(wordlist), time.time() - start_time))
    return wordlist
def filter_by_frequency(wordlist, nfreq):
    """Partition the unigram vocabulary of `wordlist` into the `nfreq` most
    frequent words and the remainder, both ordered by descending count.

    Returns (freq_ind, freq_un, infreq_ind, infreq_un, ind, un, inv) where
    `un`/`ind`/`inv` are the raw outputs of numpy.unique over `wordlist`.
    """
    start_time = time.time()
    un, ind, inv, cou = np.unique(wordlist, return_index=True, return_inverse=True, return_counts=True)
    print("Unigram has %d unique words, Inverse indices are %d long. Took %f seconds" % (len(un), len(inv), time.time() - start_time))
    # Indices into `un` ordered by count, highest first
    order = np.argsort(cou)[::-1]
    # Echo the nfreq most common unigrams and their positions
    print(order[:nfreq], un[order[:nfreq]])
    freq_ind, infreq_ind = order[:nfreq], order[nfreq:]
    freq_un, infreq_un = un[freq_ind], un[infreq_ind]
    return freq_ind, freq_un, infreq_ind, infreq_un, ind, un, inv
def subword_split(infreq_unigram):
    """First-pass subword split of each infrequent word.

    Each word is cut after a vowel when at least two letters have
    accumulated; a trailing single letter is merged into the previous
    piece; any piece of 4+ letters is further split after its first two
    letters.  Words that yield more than one subword are re-joined into a
    single annotated string ("ab+ cd:") and their positions recorded.

    Returns (subword_stack, subword_idx): one single-element list per input
    word, plus the indices of the words that were actually split.
    """
    vowels = {'a', 'e', 'i', 'o', 'u'}  # hoisted: loop-invariant
    subword_stack = []
    for wrd in infreq_unigram:
        pieces = []
        string = ''
        for i, ch in enumerate(wrd):
            string += ch
            # A single-letter word is its own (only) subword
            if len(wrd) == 1:
                pieces.append(string)
                string = ''
            # From the second letter on ...
            if i + 1 != 1:
                # cut after a vowel once at least two letters accumulated
                if ch in vowels and len(string) > 1:
                    pieces.append(string)
                    string = ''
                # at the end of the word, flush what remains
                elif i + 1 == len(wrd):
                    if len(string) == 1:
                        # lone trailing letter joins the previous piece
                        pieces[-1] += string
                    else:
                        pieces.append(string)
                    string = ''
        subwrd = []
        for piece in pieces:
            if len(piece) >= 4:
                # long piece: first two letters, then the rest
                subwrd.append(piece[0:2])
                subwrd.append(piece[2:])
            else:
                subwrd.append(piece)
        print('------------------------------')
        print('Word input: ')
        print('\'' + wrd + '\'\n')
        print('Subword output: ')
        print(subwrd)
        print('------------------------------')
        subword_stack.append(subwrd)
    subword_idx = []
    for p in range(len(subword_stack)):
        item = subword_stack[p]
        if len(item) > 1:
            subword_idx.append(p)
            # BUG FIX: the original marked the terminal subword with
            # `subitem is not item[-1]` — an object-identity test on strings,
            # which is implementation-defined and mis-marks a duplicate
            # subword that happens to share the last element's object.
            # Mark by position instead.
            buffer = ''.join(s + '+ ' for s in item[:-1]) + item[-1] + ':'
            subword_stack[p] = [buffer]
    return subword_stack, subword_idx
def character_split(infreq_unigram):
    """Second-pass split: reduce each (possibly '+ '/':'-annotated) item to
    single characters joined by '+ ', terminating the word with ':'.

    Existing first-pass markup is honoured: the space of a '+ ' marker is
    skipped, a character directly before ':' is not given its own '+', and
    a trailing '+' is collapsed into the final ':'.
    """
    subword_stack = []
    for wrd in infreq_unigram:
        # Drop a dangling '+' left over from first-pass markup
        wrd = re.sub(r'\+$', '', wrd)
        subwrd = ''
        last = len(wrd) - 1
        # BUG FIX: the original compared with `is`/`is not`
        # (e.g. `letter is ':'`, `id is len(wrd)-1`) — identity, not
        # equality.  That only works by accident of CPython interning and
        # silently breaks for positions > 256; replaced with ==/!=.
        # Also renamed loop var `id` (shadowed the builtin).
        for pos, letter in enumerate(wrd):
            if pos != last:
                if pos > 0 and wrd[pos - 1] == '+':
                    # this is the space of a '+ ' marker; skip it
                    continue
                elif pos < last and wrd[pos + 1] == ':':
                    subwrd += letter
                elif letter != '+':
                    subwrd += letter + '+ '
            else:
                if letter == ':':
                    subwrd += letter
                elif letter == '+':
                    subwrd += ':'
                else:
                    subwrd += letter + ':'
        # guard: an input of just '+' reduces to '' (original indexed
        # subwrd[-1] unconditionally and would raise IndexError)
        if subwrd and subwrd[-1] == ' ':
            subwrd = subwrd[:-1]
        print('------------------------------')
        print('Word input: ')
        print('\'' + wrd + '\'\n')
        print('Subword output: ')
        print(subwrd)
        print('------------------------------')
        subword_stack.append(subwrd)
    return subword_stack
def mikolov_subword(fname, w, s):
    """Convert infrequent words in corpus file `fname` into subword units.

    Two-pass scheme after Mikolov et al. (2012): keep the `w` most frequent
    words and split the rest into syllable-like subwords; then keep the `s`
    most frequent resulting units and reduce the remainder to single
    characters.  Returns the subword-level corpus as one string with
    newlines restored from <eos> tokens.
    """
    # Add 1 to W and S to compensate for the <eos> token, which the corpus
    # is assumed to contain for newline recovery; it is removed at the end.
    w += 1
    s += 1
    print('# [STEP 0] Read corpus as wordlist')
    startTime = time.time()
    # progress printout only — the original bound the result to `txt` but
    # never used it before overwriting it in STEP 7
    readtext(fname)
    wordlist1 = readtext2word(fname)
    print('# [STEP 1] Keep W most frequent words')
    _, _, infreq1_ind, infreq1_un, _, full_un1, full_inv1 = filter_by_frequency(wordlist1, w)
    print('# [STEP 2] The first subword split')
    subword_stack, subword_idx = subword_split(infreq1_un)
    print('Flattening subword stack...')
    # one entry per infrequent word (split words were re-joined into a
    # single annotated string by subword_split)
    subword_stack_flat = [y for x in subword_stack for y in x]
    print('# [STEP 3] Replace infrequent words into designed subwords (initial split)')
    full_un1 = full_un1.tolist()
    full_inv1 = full_inv1.tolist()
    infreq1_ind = infreq1_ind.tolist()
    for k in subword_idx:
        print(k)
        print(full_un1[infreq1_ind[k]])
        print(subword_stack_flat[k])
        full_un1[infreq1_ind[k]] = subword_stack_flat[k]
        print(full_un1[infreq1_ind[k]])
    print('# [STEP 4] Keep S most frequent items')
    # reconstruct the corpus with substitutions applied, then re-tokenize
    wordlist2 = list(np.array(full_un1)[full_inv1])
    wordlist2_flat = [tok for entry in wordlist2 for tok in entry.split()]
    _, _, infreq2_ind, infreq2_un, _, full_un2, full_inv2 = filter_by_frequency(wordlist2_flat, s)
    print('# [STEP 5] The second subword split')
    subwrd_stack_char = character_split(infreq2_un)
    print("Final subword generation completed. Took %f seconds" % (time.time() - startTime))
    print('# [STEP 6] Replace infrequent words into designed subwords (final split)')
    full_un2 = full_un2.tolist()
    full_inv2 = full_inv2.tolist()
    infreq2_ind = infreq2_ind.tolist()
    for k in range(len(infreq2_ind)):
        full_un2[infreq2_ind[k]] = subwrd_stack_char[k]
    print('# [STEP 7] Concatenate subwords as regular text')
    txt = " ".join(np.array(full_un2)[full_inv2])
    # FIX: raw strings for the regex patterns — the originals relied on
    # invalid escapes ('\+', '\<') that are deprecation warnings in
    # modern Python.  Semantics unchanged.
    txt = re.sub(r' \+: ', ' ', txt)                 # collapse stray '+:' markers
    txt = re.sub(r'(?<= [a-z]): ', '+ ', txt)        # orphan letter: ':' -> '+'
    txt = re.sub(r' ?<eos> ?', '\n', txt)            # restore newlines
    # (dropped the accidental duplicate full-corpus debug print here)
    print('\n========================================================================')
    print('=> FINAL SUBWORD-LEVEL TEXT OUTPUT:')
    print(txt)
    print('========================================================================\n')
    return txt
if __name__ == '__main__':
    # Usage: python subword.py <corpus-filename> <W-parameter> <S-parameter>
    fname = sys.argv[1]
    w = int(sys.argv[2])
    s = int(sys.argv[3])
    subwordtxt = mikolov_subword(fname, w, s)
    # BUG FIX: `subwordtxt` is a single string.  The original iterated it
    # (`for item in subwordtxt`) character by character, so the "unigram"
    # statistics below were computed over characters, not subword tokens.
    wordlist = subwordtxt.split()
    un, ind, inv, cou = np.unique(wordlist, return_index=True, return_inverse=True, return_counts=True)
    print("Unigram has %d unique items, Inverse indices are %d long." % (len(un), len(inv)))
    print(un)
    # Write the full subword text in one call (the original wrote it
    # character by character; the file contents are identical).
    with open('sub_' + fname, 'w') as f:
        f.write(subwordtxt)