-
Notifications
You must be signed in to change notification settings - Fork 13
/
parser.py
158 lines (129 loc) · 6.06 KB
/
parser.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
# -*- encoding=utf-8 -*-
import re
import logging
import datetime
from stanford_corenlp_pywrapper import sockwrap
def stanford_parse(coll, stories, stanford):
"""
Runs stories pulled from the MongoDB instance through CoreNLP. Updates
the database entry with the parsed sentences. Currently set to run the
first 6 sentences.
Parameters
----------
coll: pymongo.collection.Collection.
Collection within MongoDB that holds the scraped news stories.
stories: pymongo.cursor.Cursor.
Stories pulled from the MongoDB instance.
stanford: String.
Directory path for Stanford CoreNLP.
"""
logger = logging.getLogger('stanford')
logger.info('Setting up CoreNLP.')
print "\nSetting up StanfordNLP. The program isn't dead. Promise."
stanford_parser = sockwrap.SockWrap(mode='justparse',
configfile='stanford_config.ini',
corenlp_libdir=stanford)
total = stories.count()
print "Stanford setup complete. Starting parse of {} stories...".format(total)
logger.info('Finished CoreNLP setup.')
for story in stories:
#print 'Processing story {}. {}'.format(story['_id'],
# datetime.datetime.now())
logger.info('\tProcessing story {}'.format(story['_id']))
if story['stanford'] == 1:
#print '\tStory {} already parsed.'.format(story['_id'])
logger.info('\tStory {} already parsed.'.format(story['_id']))
pass
else:
content = _sentence_segmenter(story['content'])[:7]
parsed = []
for sent in content:
try:
stanford_result = stanford_parser.parse_doc(sent)
parsed.append(stanford_result['sentences'][0]['parse'])
except Exception as e:
print 'Error on story {}. ¯\_(ツ)_/¯. {}'.format(story['_id'],
e)
logger.warning('\tError on story {}. {}'.format(story['_id'],
e))
coll.update({"_id": story['_id']}, {"$set": {'parsed_sents': parsed,
'stanford': 1}})
print 'Done with StanfordNLP parse...\n\n'
logger.info('Done with CoreNLP parse.')
def _sentence_segmenter(paragr):
"""
Function to break a string 'paragraph' into a list of sentences based on
the following rules:
1. Look for terminal [.,?,!] followed by a space and [A-Z]
2. If ., check against abbreviation list ABBREV_LIST: Get the string
between the . and the previous blank, lower-case it, and see if it is in
the list. Also check for single-letter initials. If true, continue search
for terminal punctuation
3. Extend selection to balance (...) and "...". Reapply termination rules
4. Add to sentlist if the length of the string is between MIN_SENTLENGTH
and MAX_SENTLENGTH
5. Returns sentlist
Parameters
----------
paragr: String.
Content that will be split into constituent sentences.
Returns
-------
sentlist: List.
List of sentences.
"""
# this is relatively high because we are only looking for sentences that
# will have subject and object
MIN_SENTLENGTH = 100
MAX_SENTLENGTH = 512
# sentence termination pattern used in sentence_segmenter(paragr)
terpat = re.compile('[\.\?!]\s+[A-Z\"]')
# source: LbjNerTagger1.11.release/Data/KnownLists/known_title.lst from
# University of Illinois with editing
ABBREV_LIST = ['mrs.', 'ms.', 'mr.', 'dr.', 'gov.', 'sr.', 'rev.', 'r.n.',
'pres.', 'treas.', 'sect.', 'maj.', 'ph.d.', 'ed. psy.',
'proc.', 'fr.', 'asst.', 'p.f.c.', 'prof.', 'admr.',
'engr.', 'mgr.', 'supt.', 'admin.', 'assoc.', 'voc.',
'hon.', 'm.d.', 'dpty.', 'sec.', 'capt.', 'c.e.o.',
'c.f.o.', 'c.i.o.', 'c.o.o.', 'c.p.a.', 'c.n.a.', 'acct.',
'llc.', 'inc.', 'dir.', 'esq.', 'lt.', 'd.d.', 'ed.',
'revd.', 'psy.d.', 'v.p.', 'senr.', 'gen.', 'prov.',
'cmdr.', 'sgt.', 'sen.', 'col.', 'lieut.', 'cpl.', 'pfc.',
'k.p.h.', 'cent.', 'deg.', 'doz.', 'Fahr.', 'Cel.', 'F.',
'C.', 'K.', 'ft.', 'fur.', 'gal.', 'gr.', 'in.', 'kg.',
'km.', 'kw.', 'l.', 'lat.', 'lb.', 'lb per sq in.', 'long.',
'mg.', 'mm.,, m.p.g.', 'm.p.h.', 'cc.', 'qr.', 'qt.', 'sq.',
't.', 'vol.', 'w.', 'wt.']
sentlist = []
# controls skipping over non-terminal conditions
searchstart = 0
terloc = terpat.search(paragr)
while terloc:
isok = True
if paragr[terloc.start()] == '.':
if (paragr[terloc.start() - 1].isupper() and
paragr[terloc.start() - 2] == ' '):
isok = False # single initials
else:
# check abbreviations
loc = paragr.rfind(' ', 0, terloc.start() - 1)
if loc > 0:
if paragr[loc + 1:terloc.start() + 1].lower() in ABBREV_LIST:
isok = False
if paragr[:terloc.start()].count('(') != paragr[:terloc.start()].count(')'):
isok = False
if paragr[:terloc.start()].count('"') % 2 != 0:
isok = False
if isok:
if (len(paragr[:terloc.start()]) > MIN_SENTLENGTH and
len(paragr[:terloc.start()]) < MAX_SENTLENGTH):
sentlist.append(paragr[:terloc.start() + 2])
paragr = paragr[terloc.end() - 1:]
searchstart = 0
else:
searchstart = terloc.start() + 2
terloc = terpat.search(paragr, searchstart)
# add final sentence
if (len(paragr) > MIN_SENTLENGTH and len(paragr) < MAX_SENTLENGTH):
sentlist.append(paragr)
return sentlist