#!/usr/bin/env python3
# -*- coding: utf-8 -*-
# exporter: column @ CoNLL-U -> tier @ Praat TextGrid
# prerequisites: pympi (pympi.Praat), javaobj, python-magic, chardet
#
# authors:
# Sandy Duchemin
# Luigi Liu
# dependencies
import csv, os, argparse, collections, sys, codecs, re, struct, difflib, pympi.Praat, magic, chardet
# try to enable javaobj for Analor file support
javaobj_installed = True
try:
import javaobj
except Exception as e:
javaobj_installed = False
# debug setting
DEBUG_EN = False
INFO_EN = True
WARNING_EN = True
ERR_EN = True
# tools
## 1. visualisation & debug use
def deb_print(x):
if DEBUG_EN: print('[Debug] {}'.format(x))
def info_print(x):
if INFO_EN: print('[Info] {}'.format(x))
def warning_print(x):
if WARNING_EN: print('[Warning] {} !'.format(x))
def err_print(x):
if ERR_EN: print('[Error] {} !!!'.format(x))
def list_of_file_pair_print(conll_tg_pairs,
err_cnt=None,
enc_dict=None,
reverse=False):
if reverse:
conll_tg_pairs = conll_tg_pairs[::-1]
for n, p in enumerate(conll_tg_pairs):
conll, tg = p
info_print(u'{}.\t{:5s} : {}'.format(n, 'CoNLL-U', conll))
string_to_display = u'{}\t{:5s}: {}'.format(' ' * (len(str(n)) + 1),
'TextGrid', tg)
# encoding
if enc_dict:
if tg in enc_dict.keys():
enc = enc_dict[tg]
if enc:
string_to_display += ' [{}]'.format(enc)
info_print(string_to_display)
# error count
if err_cnt:
if conll in err_cnt.keys():
num_err = err_cnt[conll]
if num_err:
info_print('\tnumber of errors: {}'.format(num_err))
## 2. I/O handlers
def listfiles(path):
filenames = None
folder_path = None
if os.path.isdir(path):
filenames = sorted(os.listdir(path)) # multiple files
folder_path = path
elif os.path.isfile(path):
filename = os.path.basename(path)
filenames = [filename] # single file
folder_path = os.path.dirname(path)
return folder_path, filenames
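# usage sketch (hypothetical paths): listfiles('corpus/') would return
# ('corpus/', <sorted list of file names>), listfiles('corpus/a.conll') would
# return ('corpus', ['a.conll']), and any other input yields (None, None).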
# decode byte strings of unknown encoding (guessed with chardet);
# note: relies on the Python 2 built-in 'unicode' type
def auto_decode(input):
if input:
if isinstance(input,(str,unicode)):
encoding = chardet.detect(input)['encoding']
return input.decode(encoding)
elif isinstance(input,list):
encoding = chardet.detect(''.join(input))['encoding']
ret = []
for x in input:
if x: x = x.decode(encoding)
ret.append(x)
return ret
else:
return input
else:
return input
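# usage sketch (Python 2 only, since it relies on the built-in 'unicode' type):
# auto_decode(b'caf\xc3\xa9') would let chardet guess the encoding (heuristically)
# and return the decoded text; a list of byte strings is decoded element by
# element with a single encoding guessed from the concatenated content.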
# ref: https://stackoverflow.com/questions/436220/how-to-determine-the-encoding-of-text
def get_encoding(filepath):
encoding = None
blob = open(filepath, 'rb').read()
try:
m = magic.open(magic.MAGIC_MIME_ENCODING)
m.load()
encoding = m.buffer(blob) # "utf-8" "us-ascii" etc
except Exception as e:
m = magic.Magic(mime_encoding=True)
encoding = m.from_buffer(blob)
return encoding
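# usage sketch (hypothetical file name): get_encoding('sample.TextGrid') returns
# the MIME encoding guessed by libmagic, e.g. 'utf-8', 'utf-16le' or 'us-ascii';
# the try/except covers the two common Python bindings of libmagic.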
# extend the original TextGrid reader to
# 1. support Praat Collection files
# 2. support Analor .or files (this functionality only works under Python 2)
class TextGridPlus(pympi.Praat.TextGrid):
def extractTextGridFromAnalorFile(self, ifile):
SuccessOrNot = False
try:
marshaller = javaobj.JavaObjectUnmarshaller(ifile)
except IOError:
ifile.seek(0, 0)
return SuccessOrNot
while True:
# get one object
pobj = marshaller.readObject()
if pobj == 'FIN' or \
pobj == '' :
break
if pobj == 'F0':
self.xmin, self.xmax = marshaller.readObject()
# check whether we are at the tiers' header
if pobj == 'TIRES':
# get tier number
tier_num = marshaller.readObject()
tier_num = struct.unpack('>i', tier_num)[0]
while tier_num:
# get the metadata of tier
tlims = marshaller.readObject()
typ = marshaller.readObject()
nom = auto_decode(marshaller.readObject())
mots = auto_decode(marshaller.readObject())
bornes = marshaller.readObject()
nomGuide = auto_decode(marshaller.readObject())
# translate the tier type name
# from the Analor to the Praat convention
if typ == 'INTERVALLE':
tier_type = 'IntervalTier'
elif typ == 'POINT':
tier_type = 'TextTier'
else:
raise Exception('Tiertype does not exist.')
# form a tier
tier = pympi.Praat.Tier(0,
0,
name=nom,
tier_type=tier_type)
self.tiers.append(tier)
tier.xmin = tlims[0]
tier.xmax = tlims[-1]
if tier.tier_type == 'IntervalTier':
for x1, x2, text in zip(bornes, bornes[1:], mots):
tier.intervals.append((x1, x2, text))
elif tier.tier_type == 'TextTier':
for x1, text in zip(bornes, mots):
tier.intervals.append((x1, text))
else:
raise Exception('Tiertype does not exist.')
# decrement the number of tiers remaining to process
if tier_num > 0:
tier_num -= 1
SuccessOrNot = True
ifile.seek(0, 0)
return SuccessOrNot
# extend the original class constructor with an additional argument 'analorFileEn'
# to control whether Analor file support (i.e. .or) is enabled
def __init__(self, file_path, codec, analorFileEn=False):
self.analorFileEn = analorFileEn
pympi.Praat.TextGrid.__init__(self, file_path=file_path, codec=codec)
def from_file(self, ifile, codec='ascii'):
"""Read textgrid from stream.
:param file ifile: Stream to read from.
:param str codec: Text encoding for the input. Note that this will be
ignored for binary TextGrids.
"""
# try as an Analor file (.or) if the support is enabled
isAnalorFile = False
if self.analorFileEn:
isAnalorFile = self.extractTextGridFromAnalorFile(ifile)
# try as a Praat TextGrid / Collection file
if not isAnalorFile:
# read a TextGrid or extract the TextGrid from a Collection in binary format
if ifile.read(12) == b'ooBinaryFile':
def bin2str(ifile):
textlen = struct.unpack('>h', ifile.read(2))[0]
# Single byte characters
if textlen >= 0:
return ifile.read(textlen).decode('ascii')
# Multi byte characters have initial len -1 and then \xff bytes
elif textlen == -1:
textlen = struct.unpack('>h', ifile.read(2))[0]
data = ifile.read(textlen * 2)
# Hack to go from number to unicode in python3 and python2
fun = unichr if 'unichr' in __builtins__ else chr
charlist = (data[i:i + 2]
for i in range(0, len(data), 2))
return u''.join(
fun(struct.unpack('>h', i)[0]) for i in charlist)
# the only difference is here: in the case of a Praat Collection,
# jump to the beginning of the embedded TextGrid object
if ifile.read(ord(
ifile.read(1))) == b'Collection': # skip oo type
self.jump2TextGridBin(ifile, codec)
self.xmin = struct.unpack('>d', ifile.read(8))[0]
self.xmax = struct.unpack('>d', ifile.read(8))[0]
ifile.read(1) # skip <exists>
self.tier_num = struct.unpack('>i', ifile.read(4))[0]
for i in range(self.tier_num):
tier_type = ifile.read(ord(ifile.read(1))).decode('ascii')
name = bin2str(ifile)
tier = pympi.Praat.Tier(0,
0,
name=name,
tier_type=tier_type)
self.tiers.append(tier)
tier.xmin = struct.unpack('>d', ifile.read(8))[0]
tier.xmax = struct.unpack('>d', ifile.read(8))[0]
nint = struct.unpack('>i', ifile.read(4))[0]
for i in range(nint):
x1 = struct.unpack('>d', ifile.read(8))[0]
if tier.tier_type == 'IntervalTier':
x2 = struct.unpack('>d', ifile.read(8))[0]
text = bin2str(ifile)
if tier.tier_type == 'IntervalTier':
tier.intervals.append((x1, x2, text))
elif tier.tier_type == 'TextTier':
tier.intervals.append((x1, text))
else:
raise Exception('Tiertype does not exist.')
# read a TextGrid file in long/ short text format
else:
def nn(ifile, pat):
line = next(ifile).decode(codec)
return pat.search(line).group(1)
regfloat = re.compile(r'([\d.]+)\s*$', flags=re.UNICODE)
regint = re.compile(r'([\d]+)\s*$', flags=re.UNICODE)
regstr = re.compile(r'"(.*)"\s*$', flags=re.UNICODE)
# Skip the Headers and empty line
next(ifile), next(ifile), next(ifile)
self.xmin = float(nn(ifile, regfloat))
self.xmax = float(nn(ifile, regfloat))
# Skip <exists>
line = next(ifile)
short = line.strip() == b'<exists>'
self.tier_num = int(nn(ifile, regint))
not short and next(ifile)
for i in range(self.tier_num):
not short and next(ifile) # skip item[]: and item[\d]:
tier_type = nn(ifile, regstr)
name = nn(ifile, regstr)
tier = pympi.Praat.Tier(0,
0,
name=name,
tier_type=tier_type)
self.tiers.append(tier)
tier.xmin = float(nn(ifile, regfloat))
tier.xmax = float(nn(ifile, regfloat))
for i in range(int(nn(ifile, regint))):
not short and next(ifile) # skip intervals [\d]
x1 = float(nn(ifile, regfloat))
if tier.tier_type == 'IntervalTier':
x2 = float(nn(ifile, regfloat))
t = nn(ifile, regstr)
tier.intervals.append((x1, x2, t))
elif tier.tier_type == 'TextTier':
t = nn(ifile, regstr)
tier.intervals.append((x1, t))
def jump2TextGridBin(self, ifile, codec='ascii', keyword=b'\x08TextGrid'):
# scan forward byte by byte until the embedded TextGrid marker is found
binstr = b''
while True:
byte = ifile.read(1)
if not byte: # end of file: no embedded TextGrid object was found
raise Exception('No embedded TextGrid object found in the Collection.')
binstr += byte
if len(binstr) > len(keyword):
binstr = binstr[1:]
if binstr == keyword:
break
# skip the embedded oo name; as in bin2str above, a length of -1 signals
# a multi-byte name whose real length follows (2 bytes per character)
lg = struct.unpack('>h', ifile.read(2))[0]
nbytes_per_char = 1
if lg == -1:
lg = struct.unpack('>h', ifile.read(2))[0]
nbytes_per_char = 2
ifile.read(lg * nbytes_per_char)
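# usage sketch (hypothetical file name): TextGridPlus behaves like
# pympi.Praat.TextGrid but also accepts binary Praat Collection files and, when
# javaobj is available, Analor .or files, e.g.
# tg = TextGridPlus('recording.or', codec='utf-8', analorFileEn=javaobj_installed)
# for tier in tg.tiers: print(tier.name)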
def one_to_many_pairing(file1, files2, thld=5):
matched = ''
maxlen = -1
doublon = False
for file2 in files2:
_, _, match_len = \
difflib.SequenceMatcher(None, file1.lower(), file2.lower()).\
find_longest_match(0, len(file1), 0, len(file2))
if match_len > max(thld, maxlen):
maxlen = match_len
matched = file2
doublon = False
elif match_len == maxlen:
doublon = True
# don't make a pair if, at the end, a duplicate (tied match) remains
if doublon: matched = ''
return matched
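# usage sketch (hypothetical names): one_to_many_pairing('sess01.conll',
# ['sess01.TextGrid', 'sess02.TextGrid']) would return 'sess01.TextGrid': its
# longest common substring with the query ('sess01.') is longer than both the
# threshold and the best competing match; a tie between candidates returns ''.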
def make_paires(files1, files2):
# find 1-to-1 file pairs (mutual best matches)
pairs = []
for f1 in files1:
f2 = one_to_many_pairing(f1, files2)
if f2:
if f1 == one_to_many_pairing(f2, files1):
pairs.append((f1, f2))
return pairs
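# usage note: make_paires(conll_files, textgrid_files) keeps only mutual best
# matches, i.e. (f1, f2) is retained when f2 is the best candidate for f1 and
# f1 is the best candidate for f2, so a TextGrid cannot be paired with several
# CoNLL-U files.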
def insert_to_basename(filename, inserted, new_ext_name=None):
basename, extension = os.path.splitext(filename)
if new_ext_name: extension = u'.' + new_ext_name
return basename + inserted + extension
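# example (hypothetical name): insert_to_basename('sample.TextGrid', '_export', 'csv')
# returns 'sample_export.csv'; without new_ext_name the original extension is kept.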
## 3. algorithmic
# source : https://stackoverflow.com/questions/2460177/edit-distance-in-python
def edit_distance(s1, s2):
m = len(s1) + 1
n = len(s2) + 1
tbl = {}
for i in range(m):
tbl[i, 0] = i
for j in range(n):
tbl[0, j] = j
for i in range(1, m):
for j in range(1, n):
cost = 0 if s1[i - 1] == s2[j - 1] else 1
tbl[i, j] = min(tbl[i, j - 1] + 1, tbl[i - 1, j] + 1,
tbl[i - 1, j - 1] + cost)
return tbl[m - 1, n - 1]
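# example: edit_distance('kitten', 'sitting') == 3 (classic Levenshtein distance,
# computed by dynamic programming over a table of prefix-to-prefix distances).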
def distance(s1, s2):
# strip macrosyntactic signs that are not present in the reference tier
macrosyntax_signs = re.compile(r"[\#\&\(\)\[\]\/\|\+\s\<\>]")
s1 = re.sub(macrosyntax_signs, "", s1.lower())
s2 = re.sub(macrosyntax_signs, "", s2.lower())
dist = edit_distance(s1, s2[:len(s1)])
return dist
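# illustrative example: distance('il y a #', 'IL Y A') == 0, since case and the
# macrosyntactic signs are stripped before comparing s1 with a prefix of s2 of
# the same length.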
def findTimes(tokens,
refTier,
lowerbound,
upperbound=-1,
thld=0.1,
pauseSign='#'):
sent = ' '.join(tokens)
intvs = refTier.get_all_intervals()
ref_tokens = [intv[-1] for intv in intvs]
best_dist = -1
best_begin_n = -1
best_end_n = -1
ref_tokens_sampled = []
best_begin_ref_sent = ''
best_end_ref_sent = ''
width = 2 * len(tokens)
# detect the temporal beginning
if upperbound < 0: # interpret a negative upper bound as the unbounded case
upperbound = len(ref_tokens)
for n in range(lowerbound, upperbound)[::-1]:
# check that n is a valid index
try:
ref_tokens[n]
except IndexError:
continue
# sample a window of reference tokens (slicing clamps at the end of the list)
ref_tokens_sampled = ref_tokens[n:n + width]
# check if the current token represents a pause
if ref_tokens[n] == pauseSign or not ref_tokens[n]:
continue # never align the beginning of the sentence on a pause or an empty interval
# search for the beginning
ref_sent = ' '.join(ref_tokens_sampled)
dist = distance(sent, ref_sent)
if best_dist < 0 or dist <= best_dist:
best_dist = dist
best_begin_n = n
best_begin_ref_sent = ref_sent
tmin = intvs[best_begin_n][0] # beginning time of the starting interval
# detect the true temporal end
best_dist = -1
best_sent = ''
width = 2 * len(tokens)
while width:
end_n = best_begin_n + width
ref_sent = ' '.join(ref_tokens[best_begin_n:end_n])
dist = distance(sent[::-1], ref_sent[::-1])
if best_dist < 0 or dist <= best_dist:
best_dist = dist
best_end_n = end_n
best_sent = ref_sent
width -= 1
# verify that the best distance is below thld * len(sent)**1.1 (by default ~10% of the sentence length)
deb_print(u"\t@findTimes sent to match : '{}'".format(sent))
if best_dist > thld * (len(sent)**1.1):
tmin = -1
tmax = -1
cursor_out = -1
deb_print(
u"\t@findTimes err : best dist. '{}' too large".format(best_dist))
else:
tmax = intvs[best_end_n - 1][1] # end time of the last interval
cursor_out = best_end_n
deb_print(u"\t@findTimes sent found : '{}'".format(
best_sent, tmin, tmax))
return [tmin, tmax, cursor_out, best_dist]
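# usage sketch (hypothetical variables): given the tokens of one CoNLL-U
# sentence and a reference token tier, e.g.
# tmin, tmax, cursor, dist = findTimes(tokens, tg.tiers[0], lowerbound=cursor)
# findTimes scans the [lowerbound, upperbound) slice of the reference tier for
# the best-matching start interval, then shrinks a window of at most
# 2 * len(tokens) intervals to locate the end; it returns tmin = tmax = -1 when
# the best distance exceeds thld * len(sent)**1.1.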