# alternativesuspensions.py
# Forked from CentreForCorpusResearch/clic-annotation.
import sys
import re
from lxml import etree
# Matches the empty alternative-quote tags <alt-qs/> and <alt-qe/>.  The
# capturing group makes split() keep the matched tags as separate list items.
regex = re.compile('(<alt-q[es]/>)')
# Splits the text between two quote tags into markup tags, single whitespace
# characters and the runs of text in between (the separators are kept).
_TOKEN_SPLIT = re.compile(r'(<.+?>|\s)')
# An opening or closing <s> tag without attributes.
_SENTENCE_TAG = re.compile('</?s>')
_LETTER = re.compile('[a-zA-Z]')
# A suspension with more than this many words is labelled long (sls/sle);
# one with 1..this many words is labelled short (sss/sse).
_SHORT_SUSPENSION_MAX_WORDS = 4


def _count_suspension_words(segment):
    """Count the words in the text found between <alt-qe/> and <alt-qs/>.

    A word is any non-tag token containing at least one ASCII letter.
    Returns None as soon as an <s> or </s> tag is met: the segment then
    crosses a sentence boundary and is not treated as a suspension.
    """
    words = 0
    for token in _TOKEN_SPLIT.split(segment):
        if _SENTENCE_TAG.search(token):
            return None
        if '<' not in token and _LETTER.search(token):
            words += 1
    return words


def annotate_suspensions(paragraph_string):
    """Return the serialised paragraph with suspension markers inserted.

    Every ``<alt-qe/>TEXT<alt-qs/>`` run whose TEXT holds no <s>/</s> tag and
    at least one word is wrapped: ``<sls/>``/``<sle/>`` when TEXT has more
    than four words, ``<sss/>``/``<sse/>`` otherwise.  All other text is
    returned unchanged.
    """
    pieces = regex.split(paragraph_string)
    # A candidate needs a piece on each side of the suspension text, so the
    # last two indexes can never start one.
    for i in range(len(pieces) - 2):
        if pieces[i] != '<alt-qe/>' or pieces[i + 2] != '<alt-qs/>':
            continue
        words = _count_suspension_words(pieces[i + 1])
        if not words:
            # Either a sentence boundary (None) or no lettered word (0).
            continue
        if words > _SHORT_SUSPENSION_MAX_WORDS:
            pieces[i] = '<alt-qe/><sls/>'
            pieces[i + 2] = '<sle/><alt-qs/>'
        else:
            pieces[i] = '<alt-qe/><sss/>'
            pieces[i + 2] = '<sse/><alt-qs/>'
    return ''.join(pieces)


def main(argv=None):
    """Annotate suspensions in the XML file named by argv[1]; print the result.

    ``argv`` defaults to ``sys.argv``.  Exits with a usage message when no
    input file is given.
    """
    argv = sys.argv if argv is None else argv
    if len(argv) < 2:
        sys.exit('usage: alternativesuspensions.py INPUT.xml')

    tree = etree.parse(argv[1])
    # NOTE(review): paragraphs are selected on <qe> children although the
    # tags matched above are <alt-qe/>/<alt-qs/> -- confirm this is intended.
    for p in tree.xpath('//p[s/qe]'):
        # tostring() returns ASCII bytes by default; decode so the str
        # patterns above work on both Python 2 and Python 3.  The tail is
        # left out because it is never used and non-whitespace tail text
        # would make the re-parse below fail.
        paragraph_string = etree.tostring(p, with_tail=False).decode('ascii')
        nodetree = etree.fromstring(annotate_suspensions(paragraph_string))
        # Swap the paragraph's children for the re-parsed, annotated ones.
        # Both loops iterate over copies because they mutate the parents.
        for child in list(p):
            p.remove(child)
        for node in list(nodetree):
            p.append(node)

    print(etree.tostring(tree).decode('ascii'))


if __name__ == '__main__':
    main()
#==============================================================================
# Rein's alternative:
# d. glue list of words
# para_new = ''.join(wlist)
#
# print new paragraph to appropriate place in XML tree
# nodetree = etree.fromstring('%s' % para_new)
# replace old paragraph nodes with new ones
# for c in para.getchildren():
# para.remove(c)
# for n in nodetree:
# para.append(n)
#
# new_tree = etree.tostring(tree)
# susp_tag.write(new_tree)
# break
#==============================================================================
# print etree.tostring(tree)