-
Notifications
You must be signed in to change notification settings - Fork 22
/
filtersentence_xml.py
319 lines (264 loc) · 11.6 KB
/
filtersentence_xml.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
import re
from datetime import datetime
from contextexception import ContextException
from parlphrases import parlPhrases
from resolvemembernames import memberList
# this code fits onto the paragraphs before the fixhtmlentities and
# performs difficult regular expression matching that can be
# used for embedded links.
# this code detects square bracket qnums [12345], standing order quotes,
# official report references (mostly in wranses), and hyperlinks (most of which
# are badly typed and full of spaces).
# the structure of each function is to search for an occurrence of the pattern.
# it sends the text before the match to the next function, it encodes the
# pattern itself however it likes, and sends the text after the match back to
# itself as a kind of recursion.
# in the future it should be possible to pick out direct references to
# other members of the house in speeches.
# This is solely here so that already existing links (which will only be correction links and links to deposited papers)
# can get through this tokenising stage without being mangled to death
# Pre-existing <a href> links (correction links and deposited-paper links);
# matched first so they pass through tokenisation unmangled.
rehreflink = re.compile(r'(<small>)?<a href="([^"]*)">(.*?)</a>(</small>)?')
# <small> is for 2008-09 Lords wrapping links in them, yuck. Plus this doesn't
# work if more than one link is so wrapped. XXX
# Question number in square brackets, e.g. " [12345]", anchored to end of para.
reqnum = re.compile(r"\s*\[(\d+)\]\s*$")
# Same, but unanchored (used to spot qnums stuffed at the front of a para).
refqnum = re.compile(r"\s*\[(\d+)\]\s*")
# Date phrase, e.g. "Monday, 14 June 2004"; weekday and year are optional.
# NOTE(review): this text is tokenised *before* HTML entities are fixed up,
# so "&nbsp;" is matched literally as a separator (the scrape had decoded it
# to a plain space, making the alternative dead) -- confirm against upstream.
redatephraseval = re.compile(
    r"(?:(?:%s),? )?(\d+(?: |&nbsp;)*(?:%s)( \d+)?)"
    % (parlPhrases.daysofweek, parlPhrases.monthsofyear)
)
def TokenDate(ldate, phrtok):
    """Tokenize a date phrase, recording its ISO date on the tokenizer.

    ldate is a match from redatephraseval: group(0) is the full phrase and
    group(2) the optional year.  When the year is absent it is taken from
    the sitting date (phrtok.sdate).  Stores the ISO date (or "" if the
    text will not parse) in phrtok.lastdate for later offrep references.
    Returns a ("phrase", attributes) token pair.
    """
    sdate_year = phrtok.sdate[0:4]
    # Entities are not yet fixed up here, so strip literal "&nbsp;"
    # separators (previously a no-op space-for-space replace after the
    # entity got decoded in transit).
    tdate = ldate.group(0).replace("&nbsp;", " ")
    if not ldate.group(2):
        tdate += " %s" % sdate_year
    try:
        lldate = datetime.strptime(tdate, "%A, %d %B %Y")
        phrtok.lastdate = lldate.date().isoformat()
    except ValueError:
        # Unparseable date text; leave the code attribute empty rather
        # than abort the whole paragraph.
        phrtok.lastdate = ""
    return ("phrase", ' class="date" code="%s"' % phrtok.lastdate)
# Standing Order references, e.g. "Standing Order No. 24 (Emergency debates)".
restandingo = re.compile(r"""(?x)
    (?:<b>)?
    Standing\sOrder\sNo\.\s*
    (
        \d+[A-Z]?                        # number+letter
        (?:\s*\(\d+\))?                  # bracketted number
        (?:\s*\([a-z]\))?                # bracketted letter
    )
    (?::?\s*
        \(([^()]*(?:\([^()]*\))?)\)      # inclusion of title for clarity
    )?
    (?:</b>)?
""")
# Marginal check: "Standing Order No" appears but the full pattern missed it.
restandingomarg = re.compile("Standing Order No")
def TokenStandingOrder(mstandingo, phrtok):
    """Turn a Standing Order match into a phrase token.

    Emits the order number as the code attribute, plus a tag-stripped
    title attribute when the match captured one.
    """
    number = mstandingo.group(1)
    title = mstandingo.group(2)
    if not title:
        return ("phrase", ' class="standing-order" code="%s"' % number)
    stripped_title = re.sub("<[^>]*>", "", title)
    return (
        "phrase",
        ' class="standing-order" code="%s" title="%s"' % (number, stripped_title),
    )
# Bare http(s) URLs; the lookbehind skips URLs already inside quoted hrefs.
rehtlink = re.compile(r"(?<![\"'])(https?://)([^\s]+)")
def TokenHttpLink(mhttp, phrtok):
    """Wrap a matched bare URL in an anchor token."""
    return ("a", ' href="%s"' % mhttp.group(0))
def TokenHrefLink(mhttp, phrtok):
    """Pass a pre-existing <a href> link through with no extra markup."""
    return "", ""
# Official Report column reference without a date (mostly in wranses), e.g.
# "<i>Official Report</i>, Commons, vol. 400, cols. 123-4W".
# NOTE(review): "&nbsp;" and "&#150;" are matched as literal entity text --
# this runs before entity fix-up.  (A bare-space alternative in (?x) mode
# would be ignored, so the decoded form was a dead branch.)
reoffrepw = re.compile(r"""(?ix)
    <i>\s*official(?:</i>|<i>|\s)*report            # Official Report
    (?:</i>|<i>|[,;\s])*
    (Commons|House\sof\sCommons|House\sof\sLords)?  # Optional house (1)
    (?:</i>|<i>|[,;\s])*
    (?:vol(?:ume|\.)\s\d+)?                         # Optional volume
    [,;]?
    \s*c(?:c|o|ol|olumn)?s?\.?                      # Various ways of saying "column"
    (?:[:\s]|&nbsp;)*(?:<i>)?
    (?:(W[AS]?)\s*)?                                # Optional column number prefix (2)
    (\d+(?:(?:&\#150;|-)\d+)?)                      # Column number or numbers (3)
    ([WHSA]*)                                       # Optional column suffix (4)
""")
def TokenOffRep(qoffrep, phrtok):
    """Tokenize a dateless Official Report column reference.

    Derives the section (westminhall/wms/wrans/lords/debates) from the
    column prefix/suffix letters and house name captured by reoffrepw,
    and builds an id from the most recently seen date (phrtok.lastdate).
    Returns a ("phrase", attributes) token pair.
    """
    loc1 = qoffrep.group(1)
    qcolprefix = qoffrep.group(2)
    qcolsuffix = qoffrep.group(4)
    if qcolprefix:
        qcolprefix = qcolprefix.upper()
    if qcolsuffix:
        qcolsuffix = qcolsuffix.upper()
    # Column may be a range, "123-4" or with a literal "&#150;" en-dash
    # entity.  The (?i) flag must lead the pattern: trailing inline flags
    # raise ValueError from Python 3.11.
    qcpart = re.match(r"(?i)(\d+)(?:(?:&\#150;|-)(\d+))?$", qoffrep.group(3))
    qcolnum = qcpart.group(1)
    if qcpart.group(2):
        # Ranges are abbreviated ("123-4" means 123 to 124); sanity-check
        # that the second number really follows on from the first.
        qcpartlead = qcpart.group(1)[len(qcpart.group(1)) - len(qcpart.group(2)):]
        if int(qcpartlead) >= int(qcpart.group(2)):
            print(" non-following column leadoff ", qoffrep.group(0))
    if qcolsuffix == "WH":
        sect = "westminhall"
    elif qcolprefix == "WS" or qcolsuffix == "WS":
        sect = "wms"
    elif qcolprefix == "WA" or qcolsuffix == "W" or qcolsuffix == "WA":
        sect = "wrans"
    elif loc1 == "House of Lords":
        sect = "lords"
    else:
        sect = "debates"
    offrepid = "%s/%s.%s" % (sect, phrtok.lastdate, qcolnum)
    return ("phrase", ' class="offrep" id="%s"' % offrepid)
# Date in the middle, so need to match before the date-only parsing...
# e.g. "<i>Official Report,</i> 12 May 2010; Vol. 510, c. 123."
# NOTE(review): "&nbsp;" is matched as literal entity text (pre entity
# fix-up); a bare-space alternative would be ignored in (?x) mode.
reoffrepwdate = re.compile(r"""(?ix)
    <i>\s*official(?:</i>|<i>|\s)*report  # Official Report
    (?:(?:</i>|<i>|,|\s)*(Westminster\sHall|House\sof\sLords|House\sof\sCommons))?  # Optionally followed by a chamber (1)
    [,;]?\s*(?:</i>)?[,;]?\s*
    (?:(Commons|Lords)[,;]?\s*)?              # Optionally followed by a House (2)
    (\d+(?:\s|&nbsp;)\S+\s\d+|\d+/\d+/\d+)    # The date (3)
    (?:[;,]\s*Vol\.?(?:\s|&nbsp;)*\d+\.?\s*)? # Optional volume number
    [,;]?
    (?:\s+|\s*c(?:c|o|ol|olumn)?s?\.?)        # Various ways of saying "column"
    (?:\s|&nbsp;)*(?:<i>)?
    (?:(W[AS]?)\s*)?                          # Optional column number prefix (4)
    (\d+)(?:(?:&\#150;|-)\d+)?                # Column number or numbers (5)
    ([WHS]*)                                  # Optional column number suffix (6)
""")
def TokenOffRepWDate(qoffrep, phrtok):
    """Tokenize an Official Report reference that carries its own date.

    qoffrep is a match from reoffrepwdate.  Parses the embedded date
    (dd/mm/yyyy is the Lords style), works out the section from chamber,
    house and column prefix/suffix, and returns a ("phrase", attributes)
    token pair with the section/date.column id.
    """
    loc1 = qoffrep.group(1)
    loc2 = qoffrep.group(2)
    # Entities are not yet fixed up; strip literal "&nbsp;" separators
    # (previously a no-op space-for-space replace after the entity got
    # decoded in transit).
    date = qoffrep.group(3).replace("&nbsp;", " ")
    qcolprefix = qoffrep.group(4)
    qcolnum = qoffrep.group(5)
    qcolsuffix = qoffrep.group(6)
    m = re.match(r"(\d+)/(\d+)/(\d+)", date)
    if m:
        lordsdate = True
        date = datetime.strptime(date, "%d/%m/%Y").date().isoformat()
    else:
        lordsdate = False
        date = datetime.strptime(date, "%d %B %Y").date().isoformat()
    if qcolprefix:
        qcolprefix = qcolprefix.upper()
    if qcolsuffix:
        qcolsuffix = qcolsuffix.upper()
    if loc1 == "Westminster Hall" or qcolsuffix == "WH":
        sect = "westminhall"
    elif qcolprefix == "WS" or qcolsuffix == "WS":
        sect = "wms"
    elif qcolprefix == "WA" or qcolsuffix == "W":
        sect = "wrans"
    elif loc1 == "House of Commons" or loc2 == "Commons":
        sect = "debates"
    elif loc1 == "House of Lords" or loc2 == "Lords" or lordsdate:
        sect = "lords"
    else:
        sect = "debates"
    offrepid = "%s/%s.%s" % (sect, date, qcolnum)
    return ("phrase", ' class="offrep" id="%s"' % offrepid)
# my hon. Friend the Member for Regent's Park and Kensington, North (Ms Buck)
# (sometimes there are spurious adjectives)
rehonfriend = re.compile(r"""(?ix)
    the\.?
    # Privy counsellors, barrister, armed forces, status, etc.
    (?:(?:\s|&.{4};)*(?:right\.?|rt\.|very|old|new|now|current|then|visiting|former|distinguished|hon\.?|honourable|and|learned|gallant|Labour|Liberal Democrat|Conservative|reverend|independent|excellent|poor|rude|courageous|wonderful|brutal|redoubtable|mute|present|pious|formidable|fragrant))*
    (?:\s|&.{4};)*
    member\sfor\s
    ([^(]{3,60}?)      # group 1 the name of the constituency
    \s*
    \(([^)]{5,60}?)(?:&\#(?:146|8217);s)?\)  # group 2 the name of the MP, inserted for clarity.
""")
# Marginal check: looks like a "member for" reference the full pattern missed.
# The (?i) flag must lead the pattern: trailing inline flags raise ValueError
# from Python 3.11.
rehonfriendmarg = re.compile(r"(?i)the\s+(hon\.\s*)?member for [^(]{0,60}\(")
def TokenHonFriend(mhonfriend, phrtok):
    """Resolve a "Member for <constituency> (<name>)" reference to a person id.

    mhonfriend is a match from rehonfriend: group(1) is the constituency,
    group(2) the member's name.  The pair is looked up via memberList for
    the sitting date phrtok.sdate; unresolved members get id "unknown".
    Returns a ("phrase", attributes) token pair.
    """
    # will match for ids
    orgname = mhonfriend.group(2)
    res = memberList.matchfullnamecons(
        orgname, mhonfriend.group(1), phrtok.sdate, alwaysmatchcons=False
    )
    if not res[0]:  # comes back as None
        nid = "unknown"
        mname = orgname
    else:
        nid = res[0]
        mname = res[1]
    assert not re.search("&", mname), mname
    # remove any xml entities from the name
    # NOTE(review): res[1] is used even when res[0] was None, so the emitted
    # name may be empty in the unresolved case -- confirm matchfullnamecons
    # always returns a usable name in res[1].
    orgname = res[1]
    return ("phrase", ' class="honfriend" person_id="%s" name="%s"' % (nid, orgname))
# the array of tokens which we will detect on the way through
# Each entry is (name, pattern, marginal-miss pattern or None, token builder).
# Order matters: earlier patterns take precedence over later ones, since the
# tokenizer only hands unmatched text down the chain.
tokenchain = [
    ("hreflink", rehreflink, None, TokenHrefLink),
    ("offrepwdate", reoffrepwdate, None, TokenOffRepWDate),
    ("date", redatephraseval, None, TokenDate),
    ("offrep", reoffrepw, None, TokenOffRep),
    ("standing order", restandingo, restandingomarg, TokenStandingOrder),
    ("httplink", rehtlink, None, TokenHttpLink),
    ("honfriend", rehonfriend, rehonfriendmarg, TokenHonFriend),
]
# this handles the chain of tokenization of a paragraph
class PhraseTokenize:
    """Splits a paragraph into (tag, attributes, text) tokens.

    Each pattern in tokenchain is tried in order; text between matches is
    handed down to the remaining patterns, and each match is rendered by
    its token-builder function.  GetPara() reassembles the result as XML.
    """

    # recurses over itc < len(tokenchain)
    def TokenizePhraseRecurse(self, qs, stex, itc):
        # end of the chain: emit the remaining text as a bare token
        if itc == len(tokenchain):
            self.toklist.append(("", "", stex))
            return
        # keep eating through the pieces for the same token
        while stex:
            # attempt to split the token
            mtoken = tokenchain[itc][1].search(stex)
            if mtoken:  # the and/or method fails with this
                headtex = stex[: mtoken.span(0)[0]]
            else:
                headtex = stex
            # check for marginals: the looser pattern matched where the
            # full one did not (hook kept for debugging missed references)
            if tokenchain[itc][2] and tokenchain[itc][2].search(headtex):
                pass
            # send the text before the match on up the token chain
            if headtex:
                self.TokenizePhraseRecurse(qs, headtex, itc + 1)
            # no more left
            if not mtoken:
                break
            # break up the token if it is there
            tokpair = tokenchain[itc][3](mtoken, self)
            self.toklist.append((tokpair[0], tokpair[1], mtoken.group(0)))
            # the tail part
            stex = stex[mtoken.span(0)[1]:]

    def __init__(self, date, stex):
        self.lastdate = ""
        self.toklist = []
        self.sdate = date
        # XML-escape stray ampersands that aren't already "&amp;".
        # (Fixed: the replacement string had collapsed to "&", making the
        # substitution a no-op.)
        stex = re.sub("&(?!amp;)", "&amp;", stex)
        # separate out any qnums at end of paragraph
        self.rmqnum = reqnum.search(stex)
        if self.rmqnum:
            stex = stex[: self.rmqnum.span(0)[0]]
        # separate out qnums stuffed into front of paragraph (by the grabber
        # of the speakername)
        frqnum = refqnum.match(stex)
        if frqnum:
            if self.rmqnum:
                raise ContextException(
                    "Found question number [%s] in para, but already found [%s] at end (this probably just means it is being quoted, and you just need to change [] to ()."
                    % (frqnum.group(1), self.rmqnum.group(1))
                )
            self.rmqnum = frqnum
            stex = stex[frqnum.span(0)[1]:]
            stex_nohtml = re.sub("<[^>]*>", "", stex)
            if len(stex_nohtml) < 10:
                raise ContextException(
                    "Removing question number from para appears to have removed all text (this probably just means a footnote marker is using [], just change to ())."
                )
        self.TokenizePhraseRecurse(date, stex, 0)

    def GetPara(self):
        """Reassemble the token list into a single XML string."""
        res = []
        for tok in self.toklist:
            if tok[0]:
                res.append("<%s%s>" % (tok[0], tok[1]))
                res.append(tok[2])
                res.append("</%s>" % tok[0])
            else:
                res.append(tok[2])
        return "".join(res)